If this helps... For English OCR, I have some helper scripts. I don't remember where I found them, probably on the Ubuntu Forums.
/ocr$ ls
ocr.sh pdf2tif usage.txt
/ocr$ cat usage.txt
to generate a txt from a pdf # uses pdf2tif as a helper script
./ocr.sh <filename>.pdf
to produce only a tif from a pdf
./pdf2tif <filename>.pdf
/ocr$ cat ocr.sh
#! /bin/sh
# Usage: ./ocr.sh <filename>.pdf
#
# Takes one parameter, the path to a pdf file to be processed.
# Uses the custom script 'pdf2tif' (looked up in the current directory) to
# generate the tif file at 300x300 dpi and drop it in the current directory,
# then runs $progdir/tesseract on every .tif found there, deleting the .raw
# and .map files that tesseract drops.
set -e

if [ $# -lt 1 ]
then
  echo "Usage: $( basename "$0" ) input.pdf" 1>&2
  exit 1
fi

./pdf2tif "$1"

# edit this to point to wherever you've got your tesseract binary
progdir=/usr/bin

for j in *.tif
do
  # An unmatched glob is left as the literal string '*.tif'; skip it.
  [ -e "$j" ] || continue
  # Output base name: the tif name without its extension.
  x=$( basename "$j" .tif )
  "${progdir}/tesseract" "$j" "$x"
  # -f: do not fail (and abort under 'set -e') when tesseract left no
  # such file behind.
  rm -f "${x}.raw" "${x}.map"
  # un-comment next line if you want to remove the .tif files when done.
  # rm "${j}"
done
/ocr$ cat pdf2tif
#! /bin/sh
# pdf2tif - render a PDF to a 300x300 dpi TIFF using Ghostscript's
# tiffg3 device.
# Apparently adapted from pdf2ps, going by the original id line:
# $Id: pdf2ps 6300 2005-12-28 19:56:24Z giles $
#
# Usage: pdf2tif [gs-options] input.pdf [output.tif]
#   Leading arguments that start with '-' are handed to Ghostscript.
#   When output.tif is omitted, the output is named after the input file
#   (directory and .pdf suffix stripped, .tif appended) and is therefore
#   written relative to the current directory.
set -e

# This definition is changed on install to match the
# executable name set in the makefile
GS_EXECUTABLE=gs

# Gather leading '-x' style arguments; stop at the first non-option.
OPTIONS=""
while true
do
  case "$1" in
    -?*) OPTIONS="$OPTIONS $1" ;;
    *) break ;;
  esac
  shift
done

if [ $# -eq 2 ]
then
  outfile=$2
elif [ $# -eq 1 ]
then
  outfile=$( basename "$1" .pdf ).tif
else
  echo "Usage: $( basename "$0" ) [gs-options] input.pdf [output.tif]" 1>&2
  exit 1
fi

# Doing an initial 'save' helps keep fonts from being flushed between pages.
# We have to include the options twice because -I only takes effect if it
# appears before other options.
# $GS_EXECUTABLE and $OPTIONS are deliberately unquoted so that the collected
# options split back into separate words.
exec $GS_EXECUTABLE $OPTIONS -q -dNOPAUSE -dBATCH -dSAFER -r300x300 -sDEVICE=tiffg3 "-sOutputFile=$outfile" $OPTIONS -c save pop -f "$1"
If this helps... For English OCR, I have some helper scripts. I don't remember where I found them, probably on the Ubuntu Forums.
/ocr$ ls
ocr.sh pdf2tif usage.txt
/ocr$ cat usage.txt
to generate a txt from a pdf # uses pdf2tif as a helper script
./ocr.sh <filename>.pdf
to produce only a tif from a pdf
./pdf2tif <filename>.pdf
/ocr$ cat ocr.sh
#! /bin/sh
# Usage: ./ocr.sh <filename>.pdf
#
# Takes one parameter, the path to a pdf file to be processed.
# Uses the custom script 'pdf2tif' (looked up in the current directory) to
# generate the tif file at 300x300 dpi and drop it in the current directory,
# then runs $progdir/tesseract on every .tif found there, deleting the .raw
# and .map files that tesseract drops.
set -e

if [ $# -lt 1 ]
then
  echo "Usage: $( basename "$0" ) input.pdf" 1>&2
  exit 1
fi

./pdf2tif "$1"

# edit this to point to wherever you've got your tesseract binary
progdir=/usr/bin

for j in *.tif
do
  # An unmatched glob is left as the literal string '*.tif'; skip it.
  [ -e "$j" ] || continue
  # Output base name: the tif name without its extension. This must be set
  # before tesseract is invoked, since it is passed as the second argument.
  x=$( basename "$j" .tif )
  "${progdir}/tesseract" "$j" "$x"
  # -f: do not fail (and abort under 'set -e') when tesseract left no
  # such file behind.
  rm -f "${x}.raw" "${x}.map"
  # un-comment next line if you want to remove the .tif files when done.
  # rm "${j}"
done
/ocr$ cat pdf2tif
#! /bin/sh
# pdf2tif - render a PDF to a 300x300 dpi TIFF using Ghostscript's
# tiffg3 device.
# Apparently adapted from pdf2ps, going by the original id line:
# $Id: pdf2ps 6300 2005-12-28 19:56:24Z giles $
#
# Usage: pdf2tif [gs-options] input.pdf [output.tif]
#   Leading arguments that start with '-' are handed to Ghostscript.
#   When output.tif is omitted, the output is named after the input file
#   (directory and .pdf suffix stripped, .tif appended) and is therefore
#   written relative to the current directory.
set -e

# This definition is changed on install to match the
# executable name set in the makefile
GS_EXECUTABLE=gs

# Gather leading '-x' style arguments; stop at the first non-option.
OPTIONS=""
while true
do
  case "$1" in
    -?*) OPTIONS="$OPTIONS $1" ;;
    *) break ;;
  esac
  shift
done

if [ $# -eq 2 ]
then
  outfile=$2
elif [ $# -eq 1 ]
then
  outfile=$( basename "$1" .pdf ).tif
else
  echo "Usage: $( basename "$0" ) [gs-options] input.pdf [output.tif]" 1>&2
  exit 1
fi

# Doing an initial 'save' helps keep fonts from being flushed between pages.
# We have to include the options twice because -I only takes effect if it
# appears before other options.
# $GS_EXECUTABLE and $OPTIONS are deliberately unquoted so that the collected
# options split back into separate words.
exec $GS_EXECUTABLE $OPTIONS -q -dNOPAUSE -dBATCH -dSAFER -r300x300 -sDEVICE=tiffg3 "-sOutputFile=$outfile" $OPTIONS -c save pop -f "$1"