Comment 8 for bug 483391

Revision history for this message
toobuntu (toobuntu) wrote :

If this helps: for English OCR, I have some helper scripts. I don't remember where I found them, probably the Ubuntu Forums.

/ocr$ ls
ocr.sh pdf2tif usage.txt

/ocr$ cat usage.txt
to generate a txt from a pdf # uses pdf2tif as a helper script
./ocr.sh <filename>.pdf

to produce only a tif from a pdf
./pdf2tif <filename>.pdf

/ocr$ cat ocr.sh
#!/bin/sh
# ocr.sh - run tesseract OCR over the pages of a PDF.
#
# Usage: ./ocr.sh <filename>.pdf
#
# Takes one parameter, the path to a pdf file to be processed.
# Uses the custom script './pdf2tif' (looked up in the current directory)
# to generate the tif files at 300x300 dpi in the current directory,
# then runs $progdir/tesseract on every *.tif found there, passing the
# image name without its .tif suffix as tesseract's output base, and
# deletes any .raw and .map files left over for that base name.

# Set here rather than on the shebang line so it still applies when the
# script is started as 'sh ocr.sh'.
set -e

if [ "$#" -lt 1 ]; then
    echo "Usage: $(basename "$0") <filename>.pdf" 1>&2
    exit 1
fi

./pdf2tif "$1"

# edit this to point to wherever you've got your tesseract binary
progdir=/usr/bin

for j in *.tif; do
    # An unmatched glob is left as the literal string '*.tif'; skip it
    # instead of handing a non-existent file to tesseract.
    [ -e "$j" ] || continue

    x=$(basename "$j" .tif)
    "${progdir}/tesseract" "$j" "$x"

    # -f: if tesseract did not leave these files behind, a plain rm would
    # fail and 'set -e' would abort before the remaining images are done.
    rm -f "${x}.raw" "${x}.map"

    # un-comment next line if you want to remove the .tif files when done.
    # rm "$j"
done

/ocr$ cat pdf2tif
#!/bin/sh
# pdf2tif - convert a PDF to TIFF with Ghostscript's tiffg3 device
# at 300x300 dpi.
#
# Apparently derived from the pdf2ps wrapper script, per its original tag:
# $Id: pdf2ps 6300 2005-12-28 19:56:24Z giles $
#
# Usage: pdf2tif [gs-options] input.pdf [output.tif]
#   Leading arguments that start with '-' are passed through to gs.
#   When output.tif is omitted, the output name is the basename of
#   input.pdf with '.pdf' replaced by '.tif' (so it lands in the
#   current directory).
#   Exits 1 with a usage message on any other argument count.

# Set here rather than on the shebang line so it still applies when the
# script is started as 'sh pdf2tif'.
set -e

# This definition is changed on install to match the
# executable name set in the makefile
GS_EXECUTABLE=gs

# Collect every leading '-x...' argument into OPTIONS; stop at the first
# argument that is not an option (or when the arguments run out).
OPTIONS=""
while true; do
    case "$1" in
        -?*) OPTIONS="$OPTIONS $1" ;;
        *) break ;;
    esac
    shift
done

if [ "$#" -eq 2 ]; then
    outfile=$2
elif [ "$#" -eq 1 ]; then
    outfile=$(basename "$1" .pdf).tif
else
    echo "Usage: $(basename "$0") [gs-options] input.pdf [output.tif]" 1>&2
    exit 1
fi

# Doing an initial 'save' helps keep fonts from being flushed between pages.
# We have to include the options twice because -I only takes effect if it
# appears before other options.
# $OPTIONS is deliberately left unquoted so that it splits back into the
# individual options collected above.
exec "$GS_EXECUTABLE" $OPTIONS -q -dNOPAUSE -dBATCH -dSAFER -r300x300 -sDEVICE=tiffg3 "-sOutputFile=$outfile" $OPTIONS -c save pop -f "$1"