#!/bin/bash
#
# Tesseract OCR batchrun
#
# This script uses the xdialog GUI to ask for a directory. For every *.tif
# scan in that directory it thresholds the image to black/white, rotates it,
# splits it into two halves (booklet scans), runs Tesseract (language "nld")
# on each half and appends the recognised text to OCRText.txt inside the
# selected directory.
#
# Requires: xdialog, Netpbm (tifftopnm, pamditherbw, pamtopnm, pamflip,
#           pamdice, pnmtotiff) and tesseract.
#
# Exit status: 1 when a required tool is missing, no directory was chosen or
#              the directory/result file cannot be used; 0 once the batch has
#              run (per-image failures are reported on stderr).
#
# 2010-03-13 JvO, new script
# 2011-11-30 JvO, adapted for scanning booklets (rotate and split scan)
#

# -u: catch typos in variable names; pipefail: a failing stage anywhere in
# the Netpbm pipeline makes the whole pipeline fail. No -e on purpose: one
# bad scan must not abort the rest of the batch.
set -u -o pipefail

clear

#
# Image size (pixels)
# TODO get the picture info with Netpbm pamfile
#
## 300dpi scan, B/W (1bit)
original_width=1416
original_height=2854
width=$((original_width / 2))
height=$((original_height / 2))

#
# variables used
#
resultText="OCRText.txt" # the resulting text outputfile (in the input directory)

# Print a message on stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print a warning on stderr.
warn() {
  printf 'Warning: %s\n' "$*" >&2
}

#######################################
# OCR one booklet scan and append its text to the result file.
# Globals:   height, original_width, workdir, resultText (all read)
# Arguments: $1 - TIFF file to process
# Returns:   0 when both halves were recognised and appended, 1 otherwise
#######################################
ocr_booklet_scan() {
  local scan=$1
  local stem=${scan##*/}
  stem=${stem%.tif}
  # All intermediate files go to the private work directory, so nothing in
  # the scan directory can be overwritten.
  local pieces="${workdir}/${stem}"
  local half number tiff textbase
  local status=0

  #
  # preprocess TIFF image
  # modify this to your needs
  # Netpbm programs' overview:
  #   http://netpbm.sourceforge.net/doc/directory.html#converters
  #
  # Rotate and split the image file. After the rotation the scan lies on its
  # side, so tiles of (height x original_width) cut it into a left and a
  # right half, written as <stem>_0_0.pbm and <stem>_0_1.pbm.
  if ! tifftopnm < "$scan" \
    | pamditherbw -threshold -value 0.8 \
    | pamtopnm \
    | pamflip -rotate270 \
    | pamdice -width="$height" -height="$original_width" -outstem="$pieces"; then
    warn "could not preprocess ${scan}"
    rm -f -- "${pieces}"_*
    return 1
  fi

  # Each half is handled on its own, so the text of one half is still kept
  # when the other half fails.
  for half in 0 1; do
    number=$((half + 1))
    tiff="${workdir}/temp${number}.tif"
    textbase="${pieces}.${number}"

    # Convert the split PBM image to TIFF and perform the optical character
    # recognition, then append the text to the result text file.
    # Usage: tesseract imagename outputbase [-l lang] [configfile [[+|-]varfile]...]
    if pnmtotiff "${pieces}_0_${half}.pbm" > "$tiff" \
      && tesseract "$tiff" "$textbase" -l nld \
      && cat "${textbase}.txt" >> "$resultText"; then
      :
    else
      warn "OCR failed for half ${number} of ${scan}"
      status=1
    fi
  done

  # cleanup of this scan's intermediates (the work directory itself is
  # removed when the script exits)
  rm -f -- "${pieces}"_* "${pieces}".1.txt "${pieces}".2.txt

  return "$status"
}

# Fail early with a clear message instead of a cascade of "command not found".
for tool in xdialog tifftopnm pamditherbw pamtopnm pamflip pamdice \
  pnmtotiff tesseract; do
  command -v "$tool" > /dev/null || die "Required program not found: ${tool}"
done

#
# choose the directory with the images to be scanned
#
# A cancelled dialog must stop the script; otherwise the current directory
# would be processed and its OCRText.txt overwritten.
inputdir=$(xdialog --stdout --dselect ~/Desktop 24 80) \
  || die "No directory selected."
[[ -n "$inputdir" ]] || die "No directory selected."

#
# show settings
#
echo "Directory : $inputdir"
echo "Imagewidth: $width"
echo " Height: $height"

#
# process all (TIFF) images in the selected directory
#
cd -- "$inputdir" || die "Cannot enter directory: ${inputdir}"

# Private scratch directory, removed on every exit path.
workdir=$(mktemp -d) || die "Cannot create a temporary working directory."
trap 'rm -rf -- "$workdir"' EXIT

echo "" > "$resultText" || die "Cannot write ${resultText} in ${inputdir}"

# Without nullglob an empty directory would yield the literal "*.tif".
shopt -s nullglob
failed=0
for file in *.tif; do
  [[ -f "$file" ]] || continue # skip directories that happen to end in .tif

  ## the file being processed
  echo "Filename: ${file%.tif}"

  ## show image info
  ##tifftopnm < "$file" | pamfile

  if ! ocr_booklet_scan "$file"; then
    failed=$((failed + 1))
  fi
done

if ((failed > 0)); then
  warn "${failed} image(s) could not be processed completely."
fi

echo "=========="
echo "OCR Ready!"
echo "=========="

exit 0