Skip to content
9 changes: 9 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,15 @@ Folder monitoring:

--> Every time a pdf file is added to `watch_directory` it will be OCR'ed

pypdfocr -w watch_directory --archive --archive_suffix _orig.pdf

--> Every time a pdf file is added to `watch_directory` it will be OCR'ed. The original is renamed with the
`_orig.pdf` suffix and the OCR'ed version takes over the original file name.

pypdfocr -w watch_directory --archive --archive_suffix _orig.pdf --initial_scan
--> Every time a pdf file is added to `watch_directory` it will be OCR'ed. The original is renamed with the
`_orig.pdf` suffix and the OCR'ed version takes over the original file name. PDFs already in the folder are also OCR'ed if they have not been processed yet.

Automatic filing:
~~~~~~~~~~~~~~~~~

Expand Down
47 changes: 36 additions & 11 deletions pypdfocr/pypdfocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,17 @@ def get_options(self, argv):
default=False, dest='match_using_filename', help='Use filename to match if contents did not match anything, before filing to default folder')


#--------------
# Watch Options
#--------------
p.add_argument('--archive', action='store_true',
dest='archive', help='Move the source document to an archive')
p.add_argument('--initial_scan', action='store_true',
dest='initial_scan', help='Include PDF documents already in folder if not processed')
p.add_argument('--archive_suffix',
dest='archive_suffix', help='Include PDF documents already in folder if not processed', default='_orig.pdf')


# Add flow option to single mode extract_images,preprocess,ocr,write

args = p.parse_args(argv)
Expand All @@ -173,6 +184,10 @@ def get_options(self, argv):
self.match_using_filename = args.match_using_filename
self.skip_preprocess = args.skip_preprocess

self.archive = args.archive
self.archive_suffix = args.archive_suffix
self.initial_scan = args.initial_scan

if self.debug:
logging.basicConfig(level=logging.DEBUG, format='%(message)s')

Expand Down Expand Up @@ -320,7 +335,11 @@ def run_conversion(self, pdf_filename):
"""
print ("Starting conversion of %s" % pdf_filename)
# Make the images for Tesseract
img_dpi, glob_img_filename = self.gs.make_img_from_pdf(pdf_filename)
try:
img_dpi, glob_img_filename = self.gs.make_img_from_pdf(pdf_filename)
except Exception, e:
print "Exception occurred in processing %s: %s" % (pdf_filename, e)
return

fns = glob.glob(glob_img_filename)

Expand All @@ -337,7 +356,8 @@ def run_conversion(self, pdf_filename):

# Generate new pdf with overlayed text
#ocr_pdf_filename = self.pdf.overlay_hocr(tiff_dpi, hocr_filename, pdf_filename)
ocr_pdf_filename = self.pdf.overlay_hocr_pages(img_dpi, hocr_filenames, pdf_filename)
ocr_pdf_filename = self.pdf.overlay_hocr_pages(img_dpi, hocr_filenames, pdf_filename,
archive=self.archive, archive_suffix=self.archive_suffix)

# Clean up the files
if not self.debug:
Expand Down Expand Up @@ -426,13 +446,15 @@ def go(self, argv):
if self.watch:
while True: # Make sure the watcher doesn't terminate
try:
py_watcher = PyPdfWatcher(self.watch_dir, self.config.get('watch'))
py_watcher = PyPdfWatcher(self.watch_dir, self.config.get('watch'),
archive=self.archive, initial_scan=self.initial_scan,
archive_suffix=self.archive_suffix)
for pdf_filename in py_watcher.start():
self._convert_and_file_email(pdf_filename)
except KeyboardInterrupt:
break
except Exception as e:
print traceback.print_exc(e)
traceback.print_exc(e)
py_watcher.stop()

else:
Expand All @@ -442,14 +464,17 @@ def _convert_and_file_email(self, pdf_filename):
"""
Helper function to run the conversion, then do the optional filing, and optional emailing.
"""
ocr_pdffilename = self.run_conversion(pdf_filename)
if self.enable_filing:
filing = self.file_converted_file(ocr_pdffilename, pdf_filename)
else:
filing = "None"
try:
ocr_pdffilename = self.run_conversion(pdf_filename)
if self.enable_filing:
filing = self.file_converted_file(ocr_pdffilename, pdf_filename)
else:
filing = "None"

if self.enable_email:
self._send_email(pdf_filename, ocr_pdffilename, filing)
if self.enable_email:
self._send_email(pdf_filename, ocr_pdffilename, filing)
except Exception, e:
print traceback.print_exc(e)

def main(): # pragma: no cover
script = PyPDFOCR()
Expand Down
10 changes: 5 additions & 5 deletions pypdfocr/pypdfocr_gs.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,12 +174,12 @@ def _run_gs(self, options, output_filename, pdf_filename):
out = subprocess.check_output(cmd, shell=True)

except subprocess.CalledProcessError as e:
print e.output
print "Exception running Ghostscript:\n\n", e.output

if "undefined in .getdeviceparams" in e.output:
error(self.msgs['GS_OUTDATED'])
raise(self.msgs['GS_OUTDATED'])
else:
error (self.msgs['GS_FAILED'])

raise(self.msgs['GS_FAILED'])

def make_img_from_pdf(self, pdf_filename):
self._get_dpi(pdf_filename) # No need to bother anymore
Expand All @@ -189,7 +189,6 @@ def make_img_from_pdf(self, pdf_filename):

filename, filext = os.path.splitext(pdf_filename)


# Create ancillary jpeg files to use later to calculate image dpi etc
# We no longer use these for the final image. Instead the text is merged
# directly with the original PDF. Yay!
Expand All @@ -213,6 +212,7 @@ def make_img_from_pdf(self, pdf_filename):
options = ' '.join(self.gs_options[self.img_format][1]) % {'dpi':self.output_dpi}
output_filename = '%s_%%d.%s' % (filename, self.img_file_ext)
self._run_gs(options, output_filename, pdf_filename)

for fn in glob.glob(globable_filename):
logging.info("Created image %s" % fn)
return (self.output_dpi, globable_filename)
Expand Down
22 changes: 21 additions & 1 deletion pypdfocr/pypdfocr_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def mergeRotateAroundPointPage(self,page, page2, rotation, tx, ty):
ctm[1][0], ctm[1][1],
ctm[2][0], ctm[2][1]])

def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename, archive=False, archive_suffix="_orig.pdf"):

logging.debug("Going to overlay following files onto %s" % orig_pdf_filename)
# Sort the hocr_filenames into natural keys!
Expand All @@ -87,6 +87,7 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
basename = os.path.splitext(pdf_basename)[0]
pdf_filename = os.path.join(pdf_dir, "%s_ocr.pdf" % (basename))


text_pdf_filenames = []
for img_filename, hocr_filename in hocr_filenames:
text_pdf_filename = self.overlay_hocr_page(dpi, hocr_filename, img_filename)
Expand All @@ -96,6 +97,16 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):

writer = PdfFileWriter()
orig = open(orig_pdf_filename, 'rb')
orig_reader = PdfFileReader(orig)

# Save the properties
pdf_info = orig_reader.getDocumentInfo()
if pdf_info is not None:
writer.addMetadata(pdf_info)

writer.addMetadata({ '/PyPDFOCR': 'True' })

# Loop through the pages
for orig_pg, text_pg_filename in zip(self.iter_pdf_page(orig), text_pdf_filenames):
text_file = open(text_pg_filename, 'rb')
text_pg = self.iter_pdf_page(text_file).next()
Expand Down Expand Up @@ -123,6 +134,15 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
for fn in text_pdf_filenames:
os.remove(fn)

print "Done on conversion: ", orig_pdf_filename
if archive:
original_filename = os.path.join(pdf_dir, "%s%s" % (basename, archive_suffix))
ocr_filename = orig_pdf_filename
print "Archiving PDF %s -> %s, %s -> %s" % (orig_pdf_filename, original_filename, pdf_filename, ocr_filename)
os.rename(orig_pdf_filename, original_filename)
os.rename(pdf_filename, ocr_filename)


logging.info("Created OCR'ed pdf as %s" % (pdf_filename))
return pdf_filename

Expand Down
61 changes: 57 additions & 4 deletions pypdfocr/pypdfocr_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,42 @@
import logging
import glob
import functools
import signal

from multiprocessing import Pool

TIMEOUT = 500

# Ugly hack to pass in object method to the multiprocessing library
# From http://www.rueckstiess.net/research/snippets/show/ca1d7d90
# Basically gets passed in a pair of (self, arg), and calls the method
def unwrap_self(arg, **kwarg):
    """Module-level trampoline used to run ``_run_preprocess`` via ``Pool.map``.

    ``arg`` is an ``(instance, value)`` pair; it is unpacked positionally into
    ``PyPreprocess._run_preprocess`` so the first element acts as ``self``.
    Any keyword arguments are forwarded unchanged.
    """
    target = PyPreprocess._run_preprocess
    return target(*arg, **kwarg)

class TimeoutError(Exception):
    """Signals that an operation ran past its allotted time."""


def handler(signum, frame):
    """Signal-handler callback that unconditionally raises ``TimeoutError``.

    ``signum`` and ``frame`` are the standard signal-handler arguments; neither
    is inspected.
    """
    timeout = TimeoutError()
    raise timeout

def which(program):
    """Locate an executable, similar to the shell's ``which`` command.

    If ``program`` contains a directory component it is checked exactly as
    given; otherwise every entry of the ``PATH`` environment variable is
    tried in order.

    Args:
        program: Bare command name, or a path to an executable.

    Returns:
        The path of the first candidate that is an existing file with execute
        permission, or ``None`` when nothing matches (including the case
        where ``PATH`` is not set at all).
    """
    import os

    def is_exe(fpath):
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    directory, _ = os.path.split(program)
    if directory:
        # An explicit location was supplied: never fall back to PATH.
        return program if is_exe(program) else None

    # Indexing os.environ["PATH"] raises KeyError in a stripped-down
    # environment; treat a missing PATH as "nothing to search".
    search_path = os.environ.get("PATH")
    if search_path is None:
        return None

    for entry in search_path.split(os.pathsep):
        # Entries may carry surrounding double quotes; drop them before joining.
        candidate = os.path.join(entry.strip('"'), program)
        if is_exe(candidate):
            return candidate

    return None

class PyPreprocess(object):
"""Class to wrap all the ImageMagick convert calls"""
Expand All @@ -51,12 +78,31 @@ def cmd(self, cmd_list):
cmd_list = ' '.join(cmd_list)
logging.debug("Running cmd: %s" % cmd_list)
try:
out = subprocess.check_output(cmd_list, stderr=subprocess.STDOUT, shell=True)
signal.signal(signal.SIGALRM, handler)
signal.alarm(TIMEOUT)
proc = subprocess.Popen(cmd_list, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, preexec_fn=os.setsid)
pid = proc.pid
(out, error) = proc.communicate()
signal.alarm(0)
logging.debug(out)
return out
except subprocess.CalledProcessError as e:
print e.output
self._warn("Could not run command %s" % cmd_list)
except TimeoutError, te:
print "Timeout exceeded PID", pid, cmd_list
os.killpg(pid, signal.SIGTERM)
# os.kill(pid, signal.SIGTERM)
finally:
signal.alarm(0)

if proc:
proc.terminate()
proc.kill()
print "Killing processes"

return None



def _run_preprocess(self, in_filename):
Expand All @@ -69,7 +115,8 @@ def _run_preprocess(self, in_filename):
else:
backslash = '\\'

c = ['convert',
convert = which('convert');
c = [convert,
'"%s"' % in_filename,
'-respect-parenthesis',
#'\\( $setcspace -colorspace gray -type grayscale \\)',
Expand All @@ -86,17 +133,23 @@ def _run_preprocess(self, in_filename):
]
logging.info("Preprocessing image %s for better OCR" % in_filename)
res = self.cmd(c)

if res is None:
return in_filename
else:
return out_filename
# Make sure the convert process did not die on us
if os.path.isfile(out_filename):
print "Filename does not exist: ", out_filename, " using ", in_filename
return out_filename

return in_filename

def preprocess(self, in_filenames):
fns = in_filenames

pool = Pool(processes=self.threads)
logging.info("Starting preprocessing parallel execution")
preprocessed_filenames = pool.map(unwrap_self,zip([self]*len(fns),fns))
preprocessed_filenames = pool.map(unwrap_self, zip([self]*len(fns),fns))
pool.close()
pool.join()
logging.info ("Completed preprocessing")
Expand Down
11 changes: 7 additions & 4 deletions pypdfocr/pypdfocr_tesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import os, sys
import logging
import subprocess
import signal
import glob
from subprocess import CalledProcessError
from multiprocessing import Pool
Expand All @@ -36,6 +37,9 @@ def error(text):
def unwrap_self(arg, **kwarg):
    """Module-level trampoline used to run ``make_hocr_from_pnm`` via ``Pool.map``.

    ``arg`` is an ``(instance, value)`` pair; it is unpacked positionally into
    ``PyTesseract.make_hocr_from_pnm`` so the first element acts as ``self``.
    Any keyword arguments are forwarded unchanged.
    """
    target = PyTesseract.make_hocr_from_pnm
    return target(*arg, **kwarg)

def init_worker():
    """Make the calling process ignore SIGINT.

    Passed as the ``initializer`` of the worker ``Pool``, so it runs once in
    each worker process.
    """
    ignore = signal.SIG_IGN
    signal.signal(signal.SIGINT, ignore)

class PyTesseract(object):
"""Class to wrap all the tesseract calls"""
def __init__(self, config):
Expand All @@ -44,7 +48,7 @@ def __init__(self, config):
"""
self.lang = 'eng'
self.required = "3.02.02"
self.threads = config.get('threads',4)
self.threads = config.get('threads', 4)

if "binary" in config: # Override location of binary
binary = config['binary']
Expand Down Expand Up @@ -129,12 +133,11 @@ def make_hocr_from_pnms(self, fns):

# Glob it
#fns = glob.glob(img_filename)
pool = Pool(processes=self.threads)
print("Making pool")
pool = Pool(processes=self.threads, initializer=init_worker)
hocr_filenames = pool.map(unwrap_self, zip([self]*len(fns), fns))
pool.close()
pool.join()
return zip(fns,hocr_filenames)
return zip(fns, hocr_filenames)


def make_hocr_from_pnm(self, img_filename):
Expand Down
Loading