virantha · albertcbrown · Nov 3, 2015 · Nov 3, 2015 · Nov 3, 2015 · Nov 3, 2015
diff --git a/README.rst b/README.rst
@@ -48,6 +48,15 @@ Folder monitoring:
 
     --> Every time a pdf file is added to `watch_directory` it will be OCR'ed
 
+    pypdfocr -w watch_directory --archive --archive_suffix _orig.pdf
+
+    --> Every time a pdf file is added to `watch_directory` it will be OCR'ed.  The original is renamed with the `_orig.pdf`
+    suffix (e.g. `scan.pdf` becomes `scan_orig.pdf`) and the OCR'ed version takes over the original filename.
+
+    pypdfocr -w watch_directory --archive --archive_suffix _orig.pdf --initial_scan
+    --> Every time a pdf file is added to `watch_directory` it will be OCR'ed.  The original is renamed with the `_orig.pdf`
+    suffix and the OCR'ed version takes over the original filename.  PDFs already in the folder are also OCR'ed if they have not been processed yet.
+
 Automatic filing:
 ~~~~~~~~~~~~~~~~~
 

diff --git a/pypdfocr/pypdfocr.py b/pypdfocr/pypdfocr.py
@@ -160,6 +160,17 @@ def get_options(self, argv):
             default=False, dest='match_using_filename', help='Use filename to match if contents did not match anything, before filing to default folder')
 
 
+        #--------------
+        # Watch Options
+        #--------------
+        p.add_argument('--archive', action='store_true',
+             dest='archive', help='Move the source document to an archive')
+        p.add_argument('--initial_scan', action='store_true',
+             dest='initial_scan', help='Include PDF documents already in folder if not processed')
+        p.add_argument('--archive_suffix',
+             dest='archive_suffix', help='Suffix appended to the base name (without extension) of the archived original; used with --archive', default='_orig.pdf')
+
+
         # Add flow option to single mode extract_images,preprocess,ocr,write
 
         args = p.parse_args(argv)
@@ -173,6 +184,10 @@ def get_options(self, argv):
         self.match_using_filename = args.match_using_filename
         self.skip_preprocess = args.skip_preprocess
 
+        self.archive = args.archive
+        self.archive_suffix = args.archive_suffix
+        self.initial_scan = args.initial_scan
+
         if self.debug:
             logging.basicConfig(level=logging.DEBUG, format='%(message)s')
 
@@ -320,7 +335,11 @@ def run_conversion(self, pdf_filename):
         """
         print ("Starting conversion of %s" % pdf_filename)
         # Make the images for Tesseract
-        img_dpi, glob_img_filename = self.gs.make_img_from_pdf(pdf_filename)
+        try:
+            img_dpi, glob_img_filename = self.gs.make_img_from_pdf(pdf_filename)
+        except Exception, e:
+            print "Exception occurred in processing %s: %s" % (pdf_filename, e)
+            return
 
         fns = glob.glob(glob_img_filename)
 
@@ -337,7 +356,8 @@ def run_conversion(self, pdf_filename):
 
         # Generate new pdf with overlayed text
         #ocr_pdf_filename = self.pdf.overlay_hocr(tiff_dpi, hocr_filename, pdf_filename)
-        ocr_pdf_filename = self.pdf.overlay_hocr_pages(img_dpi, hocr_filenames, pdf_filename)
+        ocr_pdf_filename = self.pdf.overlay_hocr_pages(img_dpi, hocr_filenames, pdf_filename,
+                                                       archive=self.archive, archive_suffix=self.archive_suffix)
 
         # Clean up the files
         if not self.debug:
@@ -426,13 +446,15 @@ def go(self, argv):
         if self.watch:
             while True:  # Make sure the watcher doesn't terminate
                 try:
-                    py_watcher = PyPdfWatcher(self.watch_dir, self.config.get('watch'))
+                    py_watcher = PyPdfWatcher(self.watch_dir, self.config.get('watch'),
+                                              archive=self.archive, initial_scan=self.initial_scan,
+                                              archive_suffix=self.archive_suffix)
                     for pdf_filename in py_watcher.start():
                         self._convert_and_file_email(pdf_filename)
                 except KeyboardInterrupt:
                     break
                 except Exception as e:
-                    print traceback.print_exc(e)
+                    traceback.print_exc(e)
                     py_watcher.stop()
 
         else:
@@ -442,14 +464,17 @@ def _convert_and_file_email(self, pdf_filename):
         """
             Helper function to run the conversion, then do the optional filing, and optional emailing.
         """
-        ocr_pdffilename = self.run_conversion(pdf_filename)
-        if self.enable_filing:
-            filing = self.file_converted_file(ocr_pdffilename, pdf_filename)
-        else:
-            filing = "None"
+        try:
+            ocr_pdffilename = self.run_conversion(pdf_filename)
+            if self.enable_filing:
+                filing = self.file_converted_file(ocr_pdffilename, pdf_filename)
+            else:
+                filing = "None"
 
-        if self.enable_email:
-            self._send_email(pdf_filename, ocr_pdffilename, filing)
+            if self.enable_email:
+                self._send_email(pdf_filename, ocr_pdffilename, filing)
+        except Exception:
+            traceback.print_exc()  # print_exc() writes the traceback itself and returns None, so it must not be print-ed
 
 def main(): # pragma: no cover 
     script = PyPDFOCR()

diff --git a/pypdfocr/pypdfocr_gs.py b/pypdfocr/pypdfocr_gs.py
@@ -174,12 +174,12 @@ def _run_gs(self, options, output_filename, pdf_filename):
             out = subprocess.check_output(cmd, shell=True)
 
         except subprocess.CalledProcessError as e:
-            print e.output
+            print "Exception running Ghostscript:\n\n", e.output
+
             if "undefined in .getdeviceparams" in e.output:
-                error(self.msgs['GS_OUTDATED'])
+                raise Exception(self.msgs['GS_OUTDATED'])  # a bare message cannot be raised (TypeError); wrap it so callers see the text
             else:
-                error (self.msgs['GS_FAILED'])
-
+                raise Exception(self.msgs['GS_FAILED'])
 
     def make_img_from_pdf(self, pdf_filename):
         self._get_dpi(pdf_filename) # No need to bother anymore
@@ -189,7 +189,6 @@ def make_img_from_pdf(self, pdf_filename):
 
         filename, filext = os.path.splitext(pdf_filename)
 
-
         # Create ancillary jpeg files to use later to calculate image dpi etc
         #   We no longer use these for the final image. Instead the text is merged
         #   directly with the original PDF.  Yay!
@@ -213,6 +212,7 @@ def make_img_from_pdf(self, pdf_filename):
         options = ' '.join(self.gs_options[self.img_format][1]) % {'dpi':self.output_dpi}
         output_filename = '%s_%%d.%s' % (filename, self.img_file_ext)
         self._run_gs(options, output_filename, pdf_filename)
+
         for fn in glob.glob(globable_filename):
             logging.info("Created image %s" % fn)
         return (self.output_dpi, globable_filename)

diff --git a/pypdfocr/pypdfocr_pdf.py b/pypdfocr/pypdfocr_pdf.py
@@ -76,7 +76,7 @@ def mergeRotateAroundPointPage(self,page, page2, rotation, tx, ty):
                                                  ctm[1][0], ctm[1][1],
                                                  ctm[2][0], ctm[2][1]])
 
-    def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
+    def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename, archive=False, archive_suffix="_orig.pdf"):
 
         logging.debug("Going to overlay following files onto %s" % orig_pdf_filename)
         # Sort the hocr_filenames into natural keys!
@@ -87,6 +87,7 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
         basename = os.path.splitext(pdf_basename)[0]
         pdf_filename = os.path.join(pdf_dir, "%s_ocr.pdf" % (basename))
 
+
         text_pdf_filenames = []
         for img_filename, hocr_filename in hocr_filenames:
             text_pdf_filename = self.overlay_hocr_page(dpi, hocr_filename, img_filename)
@@ -96,6 +97,16 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
 
         writer = PdfFileWriter()
         orig = open(orig_pdf_filename, 'rb')
+        orig_reader = PdfFileReader(orig)
+
+        # Save  the properties
+        pdf_info = orig_reader.getDocumentInfo()
+        if pdf_info is not None:
+            writer.addMetadata(pdf_info)
+
+        writer.addMetadata({ '/PyPDFOCR': 'True' })
+
+        # Loop through the pages
         for orig_pg, text_pg_filename in zip(self.iter_pdf_page(orig), text_pdf_filenames):
             text_file = open(text_pg_filename, 'rb')
             text_pg = self.iter_pdf_page(text_file).next()
@@ -123,6 +134,15 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
         for fn in text_pdf_filenames:
             os.remove(fn)
 
+        print "Done on conversion: ", orig_pdf_filename
+        if archive:
+            original_filename = os.path.join(pdf_dir, "%s%s" % (basename, archive_suffix))
+            ocr_filename = orig_pdf_filename
+            print "Archiving PDF %s -> %s, %s -> %s" % (orig_pdf_filename, original_filename, pdf_filename, ocr_filename)
+            os.rename(orig_pdf_filename, original_filename)
+            os.rename(pdf_filename, ocr_filename)
+            pdf_filename = ocr_filename  # the OCR'ed file now lives under the original name; log and return the path that exists
+
         logging.info("Created OCR'ed pdf as %s" % (pdf_filename))
         return pdf_filename
 

diff --git a/pypdfocr/pypdfocr_preprocess.py b/pypdfocr/pypdfocr_preprocess.py
@@ -25,15 +25,42 @@
 import logging
 import glob
 import functools
+import signal
 
 from multiprocessing import Pool
 
+TIMEOUT = 500
+
 # Ugly hack to pass in object method to the multiprocessing library
 # From http://www.rueckstiess.net/research/snippets/show/ca1d7d90
 # Basically gets passed in a pair of (self, arg), and calls the method
 def unwrap_self(arg, **kwarg):
     return PyPreprocess._run_preprocess(*arg, **kwarg)
 
+class TimeoutError(Exception):
+    """Raised by the module-level signal handler when the alarm set for a command expires."""
+
+
+def handler(signum, frame):  # signal handler (installed for SIGALRM in cmd): turns the signal into TimeoutError
+    raise TimeoutError()
+
+def which(program):
+    """Return the path of executable `program`, or None; bare names are looked up on PATH."""
+    import os
+    def is_exe(fpath):
+        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
+
+    if os.path.dirname(program):
+        # An explicit directory was given: test that exact path only
+        return program if is_exe(program) else None
+
+    # os.defpath stands in when PATH is unset, instead of raising KeyError
+    for path in os.environ.get("PATH", os.defpath).split(os.pathsep):
+        exe_file = os.path.join(path.strip('"'), program)
+        if is_exe(exe_file):
+            return exe_file
+
+    return None
 
 class PyPreprocess(object):
     """Class to wrap all the ImageMagick convert calls"""
@@ -51,12 +78,31 @@ def cmd(self, cmd_list):
             cmd_list = ' '.join(cmd_list)
         logging.debug("Running cmd: %s" % cmd_list)
         try:
-            out = subprocess.check_output(cmd_list, stderr=subprocess.STDOUT, shell=True)
+            signal.signal(signal.SIGALRM, handler)
+            signal.alarm(TIMEOUT)
+            proc = subprocess.Popen(cmd_list, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, preexec_fn=os.setsid)
+            pid = proc.pid
+            (out, error) = proc.communicate()
+            signal.alarm(0)
             logging.debug(out)
             return out
         except subprocess.CalledProcessError as e:
             print e.output
             self._warn("Could not run command %s" % cmd_list)
+        except TimeoutError, te:
+            print "Timeout exceeded PID", pid, cmd_list
+            os.killpg(pid, signal.SIGTERM)
+            # os.kill(pid, signal.SIGTERM)
+        finally:
+            signal.alarm(0)
+
+        if proc:
+            proc.terminate()
+            proc.kill()
+            print "Killing processes"
+
+        return None
+
 
 
     def _run_preprocess(self,  in_filename):
@@ -69,7 +115,8 @@ def _run_preprocess(self,  in_filename):
         else:
             backslash = '\\'
 
-        c = ['convert',
+        convert = which('convert') or 'convert'  # which() returns None when nothing is found; fall back to the bare name used before
+        c = [convert,
                 '"%s"' % in_filename,
                 '-respect-parenthesis',
                 #'\\( $setcspace -colorspace gray -type grayscale \\)',
@@ -86,17 +133,23 @@ def _run_preprocess(self,  in_filename):
                 ]
         logging.info("Preprocessing image %s for better OCR" % in_filename)
         res = self.cmd(c)
+
         if res is None:
             return in_filename
         else:
-            return out_filename
+            # Make sure the convert process did not die on us
+            if os.path.isfile(out_filename):
+                return out_filename
+
+            print "Filename does not exist: ", out_filename, " using ", in_filename
+            return in_filename
 
     def preprocess(self, in_filenames):
         fns = in_filenames
 
         pool = Pool(processes=self.threads)
         logging.info("Starting preprocessing parallel execution")
-        preprocessed_filenames = pool.map(unwrap_self,zip([self]*len(fns),fns))
+        preprocessed_filenames = pool.map(unwrap_self, zip([self]*len(fns),fns))
         pool.close()
         pool.join()
         logging.info ("Completed preprocessing")

diff --git a/pypdfocr/pypdfocr_tesseract.py b/pypdfocr/pypdfocr_tesseract.py
@@ -22,6 +22,7 @@
 import os, sys
 import logging
 import subprocess
+import signal
 import glob
 from subprocess import CalledProcessError
 from multiprocessing import Pool
@@ -36,6 +37,9 @@ def error(text):
 def unwrap_self(arg, **kwarg):
     return PyTesseract.make_hocr_from_pnm(*arg, **kwarg)
 
+def init_worker():  # passed as the Pool initializer below: sets SIGINT to be ignored in the process that runs it
+    signal.signal(signal.SIGINT, signal.SIG_IGN)
+
 class PyTesseract(object):
     """Class to wrap all the tesseract calls"""
     def __init__(self, config):
@@ -44,7 +48,7 @@ def __init__(self, config):
         """
         self.lang = 'eng'
         self.required = "3.02.02"
-        self.threads = config.get('threads',4)
+        self.threads = config.get('threads', 4)
 
         if "binary" in config:  # Override location of binary
             binary = config['binary']
@@ -129,12 +133,11 @@ def make_hocr_from_pnms(self, fns):
 
         # Glob it
         #fns = glob.glob(img_filename)
-        pool = Pool(processes=self.threads)
-        print("Making pool")
+        pool = Pool(processes=self.threads, initializer=init_worker)
         hocr_filenames = pool.map(unwrap_self, zip([self]*len(fns), fns))
         pool.close()
         pool.join()
-        return zip(fns,hocr_filenames)
+        return zip(fns, hocr_filenames)
 
 
     def make_hocr_from_pnm(self, img_filename):