-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathocr
More file actions
executable file
·67 lines (57 loc) · 1.43 KB
/
ocr
File metadata and controls
executable file
·67 lines (57 loc) · 1.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/bin/bash
#
# ocr - extract the text of PDF files, falling back to OCR when needed.
#
# Usage:
#   ocr              process every *.pdf in the current directory
#   ocr DIR          process every *.pdf inside DIR
#   ocr FILE.pdf     process a single PDF
#
# For each PDF a <name>.txt file is written to the current working
# directory. A PDF that contains the string "FontName" is first run through
# pdftotext. If the resulting text file is missing or no larger than
# MIN_TEXT_BYTES, the page images are extracted with pdfimages and
# recognised with tesseract: first on images cleaned by unpaper, then, if
# that still yields too little text, on the raw images.
#
# External commands used: grep, pdftotext, pdfimages, convert, unpaper,
# tesseract, nice.

# Unmatched globs must expand to nothing so that "no images" / "no PDFs"
# results in empty lists instead of literal patterns.
shopt -s nullglob

# A text file of this many bytes or fewer counts as "no usable text".
readonly MIN_TEXT_BYTES=1000

# Print the size in bytes of file $1, or 0 when it does not exist.
# Always printing a number keeps the numeric comparisons below valid even
# when no text file has been produced yet.
text_size() {
  if [[ -f "$1" ]]; then
    echo $(( $(wc -c < "$1") ))
  else
    echo 0
  fi
}

# Concatenate the per-page OCR results "$1-*.txt" into "$1.txt".
# When there are no per-page results the existing "$1.txt" is left alone
# (it is only created, empty, if it does not exist at all).
merge_page_texts() {
  local doc=$1
  local -a texts=("$doc"-*.txt)

  if (( ${#texts[@]} > 0 )); then
    cat -- "${texts[@]}" > "$doc.txt"
  elif [[ ! -e "$doc.txt" ]]; then
    : > "$doc.txt"
  fi
}

# Produce "$2.txt" in the current directory from the PDF "$1/$2.pdf".
#   $1 - directory containing the PDF
#   $2 - PDF file name without the ".pdf" suffix
process_document() {
  local dir=$1 doc=$2
  local src="$dir/$doc.pdf" out="$doc.txt"
  local image page stem
  local -a pages leftovers

  if grep -q FontName -- "$src"; then
    echo "Extracting with pdftotext on $doc.pdf..."
    pdftotext "$src" "$out"
  fi

  # Enough text already: nothing more to do.
  if (( $(text_size "$out") > MIN_TEXT_BYTES )); then
    return 0
  fi

  echo "Performing OCR on $doc.pdf..."
  pdfimages "$src" "$doc"

  # Normalise any .ppm images to .pbm so both passes see one format.
  for image in "$doc"-*[[:digit:]].ppm; do
    stem=${image%.ppm}
    convert "$stem.ppm" "$stem.pbm"
    rm -- "$stem.ppm"
  done

  # The trailing digit keeps the "-fixed.pbm" files written below out of
  # the page list.
  pages=("$doc"-*[[:digit:]].pbm)

  # try using "unpaper" first
  for page in "${pages[@]}"; do
    stem=${page%.pbm}
    unpaper "$page" "$stem-fixed.pbm"
    nice tesseract "$stem-fixed.pbm" "$stem"
  done
  merge_page_texts "$doc"

  # if that fails, try again without "unpaper"
  if (( $(text_size "$out") <= MIN_TEXT_BYTES )); then
    for page in "${pages[@]}"; do
      stem=${page%.pbm}
      echo "OCR on $stem.pbm"
      nice tesseract "$stem.pbm" "$stem"
    done
    # Merge again: this pass rewrote the per-page text files.
    merge_page_texts "$doc"
  fi

  # Remove intermediate images and per-page text files.
  # NOTE: these globs also match files of any other document whose name
  # starts with "<doc>-".
  leftovers=("$doc"-*.pbm "$doc"-*.txt)
  if (( ${#leftovers[@]} > 0 )); then
    rm -- "${leftovers[@]}"
  fi
  return 0
}

# Entry point.
#   $1 - optional: a directory containing PDFs, or a single *.pdf file.
#        Defaults to all *.pdf files in the current directory.
main() {
  local target=${1:-}
  local dir pdf
  local -a pdfs

  if [[ -z "$target" ]]; then
    dir=$PWD
    pdfs=(*.pdf)
  elif [[ "$target" == *.pdf ]]; then
    dir=$(dirname -- "$target")
    pdfs=("$target")
  else
    dir=$target
    pdfs=("$target"/*.pdf)
  fi

  if (( ${#pdfs[@]} == 0 )); then
    printf 'ocr: no PDF files found\n' >&2
    return 0
  fi

  for pdf in "${pdfs[@]}"; do
    process_document "$dir" "$(basename -- "$pdf" .pdf)"
  done
}

main "$@"