diff --git a/README.rst b/README.rst index da1477f..f8fc341 100644 --- a/README.rst +++ b/README.rst @@ -48,6 +48,15 @@ Folder monitoring: --> Every time a pdf file is added to `watch_directory` it will be OCR'ed + pypdfocr -w watch_directory --archive --archive_suffix _orig.pdf + + --> Every time a pdf file is added to `watch_directory` it will be OCR'ed. The original will move to _orig and the + OCR'ed version will have its name + + pypdfocr -w watch_directory --archive --archive_suffix _orig.pdf --initial_scan + --> Every time a pdf file is added to `watch_directory` it will be OCR'ed. The original will move to _orig and the + OCR'ed version will have its name. All PDF's in the folder will be scanned and OCR'ed if they have not been already. + Automatic filing: ~~~~~~~~~~~~~~~~~ diff --git a/pypdfocr/pypdfocr.py b/pypdfocr/pypdfocr.py index 4ef136b..3ccd0bf 100644 --- a/pypdfocr/pypdfocr.py +++ b/pypdfocr/pypdfocr.py @@ -160,6 +160,17 @@ def get_options(self, argv): default=False, dest='match_using_filename', help='Use filename to match if contents did not match anything, before filing to default folder') + #-------------- + # Watch Options + #-------------- + p.add_argument('--archive', action='store_true', + dest='archive', help='Move the source document to an archive') + p.add_argument('--initial_scan', action='store_true', + dest='initial_scan', help='Include PDF documents already in folder if not processed') + p.add_argument('--archive_suffix', + dest='archive_suffix', help='Include PDF documents already in folder if not processed', default='_orig.pdf') + + # Add flow option to single mode extract_images,preprocess,ocr,write args = p.parse_args(argv) @@ -173,6 +184,10 @@ def get_options(self, argv): self.match_using_filename = args.match_using_filename self.skip_preprocess = args.skip_preprocess + self.archive = args.archive + self.archive_suffix = args.archive_suffix + self.initial_scan = args.initial_scan + if self.debug: logging.basicConfig(level=logging.DEBUG, format='%(message)s') @@ -320,7 +335,11 @@ def run_conversion(self, pdf_filename): """ print ("Starting conversion of %s" % pdf_filename) # Make the images for Tesseract - img_dpi, glob_img_filename = self.gs.make_img_from_pdf(pdf_filename) + try: + img_dpi, glob_img_filename = self.gs.make_img_from_pdf(pdf_filename) + except Exception, e: + print "Exception occurred in processing %s: %s" % (pdf_filename, e) + return fns = glob.glob(glob_img_filename) @@ -337,7 +356,8 @@ def run_conversion(self, pdf_filename): # Generate new pdf with overlayed text #ocr_pdf_filename = self.pdf.overlay_hocr(tiff_dpi, hocr_filename, pdf_filename) - ocr_pdf_filename = self.pdf.overlay_hocr_pages(img_dpi, hocr_filenames, pdf_filename) + ocr_pdf_filename = self.pdf.overlay_hocr_pages(img_dpi, hocr_filenames, pdf_filename, + archive=self.archive, archive_suffix=self.archive_suffix) # Clean up the files if not self.debug: @@ -426,13 +446,15 @@ def go(self, argv): if self.watch: while True: # Make sure the watcher doesn't terminate try: - py_watcher = PyPdfWatcher(self.watch_dir, self.config.get('watch')) + py_watcher = PyPdfWatcher(self.watch_dir, self.config.get('watch'), + archive=self.archive, initial_scan=self.initial_scan, + archive_suffix=self.archive_suffix) for pdf_filename in py_watcher.start(): self._convert_and_file_email(pdf_filename) except KeyboardInterrupt: break except Exception as e: - print traceback.print_exc(e) + traceback.print_exc(e) py_watcher.stop() else: @@ -442,14 +464,17 @@ def _convert_and_file_email(self, pdf_filename): """ Helper function to run the conversion, then do the optional filing, and optional emailing. """ - ocr_pdffilename = self.run_conversion(pdf_filename) - if self.enable_filing: - filing = self.file_converted_file(ocr_pdffilename, pdf_filename) - else: - filing = "None" + try: + ocr_pdffilename = self.run_conversion(pdf_filename) + if self.enable_filing: + filing = self.file_converted_file(ocr_pdffilename, pdf_filename) + else: + filing = "None" - if self.enable_email: - self._send_email(pdf_filename, ocr_pdffilename, filing) + if self.enable_email: + self._send_email(pdf_filename, ocr_pdffilename, filing) + except Exception, e: + print traceback.print_exc(e) def main(): # pragma: no cover script = PyPDFOCR() diff --git a/pypdfocr/pypdfocr_gs.py b/pypdfocr/pypdfocr_gs.py index 5599082..af289dc 100644 --- a/pypdfocr/pypdfocr_gs.py +++ b/pypdfocr/pypdfocr_gs.py @@ -174,12 +174,12 @@ def _run_gs(self, options, output_filename, pdf_filename): out = subprocess.check_output(cmd, shell=True) except subprocess.CalledProcessError as e: - print e.output + print "Exception running Ghostscript:\n\n", e.output + if "undefined in .getdeviceparams" in e.output: - error(self.msgs['GS_OUTDATED']) + raise(self.msgs['GS_OUTDATED']) else: - error (self.msgs['GS_FAILED']) - + raise(self.msgs['GS_FAILED']) def make_img_from_pdf(self, pdf_filename): self._get_dpi(pdf_filename) # No need to bother anymore @@ -189,7 +189,6 @@ def make_img_from_pdf(self, pdf_filename): filename, filext = os.path.splitext(pdf_filename) - # Create ancillary jpeg files to use later to calculate image dpi etc # We no longer use these for the final image. Instead the text is merged # directly with the original PDF. Yay! @@ -213,6 +212,7 @@ def make_img_from_pdf(self, pdf_filename): options = ' '.join(self.gs_options[self.img_format][1]) % {'dpi':self.output_dpi} output_filename = '%s_%%d.%s' % (filename, self.img_file_ext) self._run_gs(options, output_filename, pdf_filename) + for fn in glob.glob(globable_filename): logging.info("Created image %s" % fn) return (self.output_dpi, globable_filename) diff --git a/pypdfocr/pypdfocr_pdf.py b/pypdfocr/pypdfocr_pdf.py index b4e31e0..e7c8a39 100644 --- a/pypdfocr/pypdfocr_pdf.py +++ b/pypdfocr/pypdfocr_pdf.py @@ -76,7 +76,7 @@ def mergeRotateAroundPointPage(self,page, page2, rotation, tx, ty): ctm[1][0], ctm[1][1], ctm[2][0], ctm[2][1]]) - def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename): + def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename, archive=False, archive_suffix="_orig.pdf"): logging.debug("Going to overlay following files onto %s" % orig_pdf_filename) # Sort the hocr_filenames into natural keys! @@ -87,6 +87,7 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename): basename = os.path.splitext(pdf_basename)[0] pdf_filename = os.path.join(pdf_dir, "%s_ocr.pdf" % (basename)) + text_pdf_filenames = [] for img_filename, hocr_filename in hocr_filenames: text_pdf_filename = self.overlay_hocr_page(dpi, hocr_filename, img_filename) @@ -96,6 +97,16 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename): writer = PdfFileWriter() orig = open(orig_pdf_filename, 'rb') + orig_reader = PdfFileReader(orig) + + # Save the properties + pdf_info = orig_reader.getDocumentInfo() + if pdf_info is not None: + writer.addMetadata(pdf_info) + + writer.addMetadata({ '/PyPDFOCR': 'True' }) + + # Loop through the pages for orig_pg, text_pg_filename in zip(self.iter_pdf_page(orig), text_pdf_filenames): text_file = open(text_pg_filename, 'rb') text_pg = self.iter_pdf_page(text_file).next() @@ -123,6 +134,15 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename): for fn in text_pdf_filenames: os.remove(fn) + print "Done on conversion: ", orig_pdf_filename + if archive: + original_filename = os.path.join(pdf_dir, "%s%s" % (basename, archive_suffix)) + ocr_filename = orig_pdf_filename + print "Archiving PDF %s -> %s, %s -> %s" % (orig_pdf_filename, original_filename, pdf_filename, ocr_filename) + os.rename(orig_pdf_filename, original_filename) + os.rename(pdf_filename, ocr_filename) + + logging.info("Created OCR'ed pdf as %s" % (pdf_filename)) return pdf_filename diff --git a/pypdfocr/pypdfocr_preprocess.py b/pypdfocr/pypdfocr_preprocess.py index 39abc37..43e11a7 100644 --- a/pypdfocr/pypdfocr_preprocess.py +++ b/pypdfocr/pypdfocr_preprocess.py @@ -25,15 +25,42 @@ import logging import glob import functools +import signal from multiprocessing import Pool +TIMEOUT = 500 + # Ugly hack to pass in object method to the multiprocessing library # From http://www.rueckstiess.net/research/snippets/show/ca1d7d90 # Basically gets passed in a pair of (self, arg), and calls the method def unwrap_self(arg, **kwarg): return PyPreprocess._run_preprocess(*arg, **kwarg) +class TimeoutError(Exception): + pass + + +def handler(signum, frame): + raise TimeoutError() + +def which(program): + import os + def is_exe(fpath): + return os.path.isfile(fpath) and os.access(fpath, os.X_OK) + + fpath, fname = os.path.split(program) + if fpath: + if is_exe(program): + return program + else: + for path in os.environ["PATH"].split(os.pathsep): + path = path.strip('"') + exe_file = os.path.join(path, program) + if is_exe(exe_file): + return exe_file + + return None class PyPreprocess(object): """Class to wrap all the ImageMagick convert calls""" @@ -51,12 +78,31 @@ def cmd(self, cmd_list): cmd_list = ' '.join(cmd_list) logging.debug("Running cmd: %s" % cmd_list) try: - out = subprocess.check_output(cmd_list, stderr=subprocess.STDOUT, shell=True) + signal.signal(signal.SIGALRM, handler) + signal.alarm(TIMEOUT) + proc = subprocess.Popen(cmd_list, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, preexec_fn=os.setsid) + pid = proc.pid + (out, error) = proc.communicate() + signal.alarm(0) logging.debug(out) return out except subprocess.CalledProcessError as e: print e.output self._warn("Could not run command %s" % cmd_list) + except TimeoutError, te: + print "Timeout exceeded PID", pid, cmd_list + os.killpg(pid, signal.SIGTERM) + # os.kill(pid, signal.SIGTERM) + finally: + signal.alarm(0) + + if proc: + proc.terminate() + proc.kill() + print "Killing processes" + + return None + def _run_preprocess(self, in_filename): @@ -69,7 +115,8 @@ def _run_preprocess(self, in_filename): else: backslash = '\\' - c = ['convert', + convert = which('convert'); + c = [convert, '"%s"' % in_filename, '-respect-parenthesis', #'\\( $setcspace -colorspace gray -type grayscale \\)', @@ -86,17 +133,23 @@ def _run_preprocess(self, in_filename): ] logging.info("Preprocessing image %s for better OCR" % in_filename) res = self.cmd(c) + if res is None: return in_filename else: - return out_filename + # Make sure the convert process did not die on us + if os.path.isfile(out_filename): + print "Filename does not exist: ", out_filename, " using ", in_filename + return out_filename + + return in_filename def preprocess(self, in_filenames): fns = in_filenames pool = Pool(processes=self.threads) logging.info("Starting preprocessing parallel execution") - preprocessed_filenames = pool.map(unwrap_self,zip([self]*len(fns),fns)) + preprocessed_filenames = pool.map(unwrap_self, zip([self]*len(fns),fns)) pool.close() pool.join() logging.info ("Completed preprocessing") diff --git a/pypdfocr/pypdfocr_tesseract.py b/pypdfocr/pypdfocr_tesseract.py index 476d5cd..392a980 100644 --- a/pypdfocr/pypdfocr_tesseract.py +++ b/pypdfocr/pypdfocr_tesseract.py @@ -22,6 +22,7 @@ import os, sys import logging import subprocess +import signal import glob from subprocess import CalledProcessError from multiprocessing import Pool @@ -36,6 +37,9 @@ def error(text): def unwrap_self(arg, **kwarg): return PyTesseract.make_hocr_from_pnm(*arg, **kwarg) +def init_worker(): + signal.signal(signal.SIGINT, signal.SIG_IGN) + class PyTesseract(object): """Class to wrap all the tesseract calls""" def __init__(self, config): @@ -44,7 +48,7 @@ def __init__(self, config): """ self.lang = 'eng' self.required = "3.02.02" - self.threads = config.get('threads',4) + self.threads = config.get('threads', 4) if "binary" in config: # Override location of binary binary = config['binary'] @@ -129,12 +133,11 @@ def make_hocr_from_pnms(self, fns): # Glob it #fns = glob.glob(img_filename) - pool = Pool(processes=self.threads) - print("Making pool") + pool = Pool(processes=self.threads, initializer=init_worker) hocr_filenames = pool.map(unwrap_self, zip([self]*len(fns), fns)) pool.close() pool.join() - return zip(fns,hocr_filenames) + return zip(fns, hocr_filenames) def make_hocr_from_pnm(self, img_filename): diff --git a/pypdfocr/pypdfocr_watcher.py b/pypdfocr/pypdfocr_watcher.py index 73581e0..a495baf 100755 --- a/pypdfocr/pypdfocr_watcher.py +++ b/pypdfocr/pypdfocr_watcher.py @@ -15,7 +15,9 @@ from watchdog.events import LoggingEventHandler from watchdog.events import FileSystemEventHandler - +from PyPDF2 import PdfFileReader +from PyPDF2.utils import PdfReadError + class PyPdfWatcher(FileSystemEventHandler): """ Watch a folder for new pdf files. @@ -28,12 +30,20 @@ class PyPdfWatcher(FileSystemEventHandler): events = {} events_lock = Lock() - def __init__(self, monitor_dir, config): + def __init__(self, monitor_dir, config, archive=False, initial_scan=False, + archive_suffix="_orig.pdf"): FileSystemEventHandler.__init__(self) self.monitor_dir = monitor_dir + self.archive_suffix = archive_suffix + self.archive = archive + if not config: config = {} + # Scan initial folder + if initial_scan: + self.scan_folder() + self.scan_interval = config.get('scan_interval', 3) # If no updates in 3 seconds (or user specified option in config file) process file def start(self): @@ -48,11 +58,11 @@ def start(self): if newFile: yield newFile self.observer.join() - + def stop(self): self.observer.stop() - + def rename_file_with_spaces(self, pdf_filename): """ Rename any portion of a filename that has spaces in the basename with underscores. @@ -68,11 +78,43 @@ def rename_file_with_spaces(self, pdf_filename): newFilename = os.path.join(filepath, filename.replace(' ','_')) logging.debug("Renaming spaces") logging.debug("---> %s \n ------> %s" % (pdf_filename, newFilename)) - shutil.move(pdf_filename, newFilename) + shutil.move(pdf_filename, newFilename) return newFilename else: return pdf_filename + def check_file_for_processing(self, ev_path): + """ + This checks a path to see if it we should process it. + + :param ev_path: Fully qualified path to file to check + :return: True if it should be convertred. False if not + """ + if not ev_path.endswith(".pdf"): + return False + + if ev_path.endswith("_ocr.pdf"): + return False + + if self.archive_suffix and ev_path.endswith(self.archive_suffix): + return False + + try: + with open(ev_path, "rb") as f: + pdf = PdfFileReader(f) + pdf_info = pdf.getDocumentInfo() + + # It has been OCR'ed' + if pdf_info is not None and '/PyPDFOCR' in pdf_info: + return False + except IOError: + return False + except PdfReadError: + return False + + return True + + def check_for_new_pdf(self,ev_path): """ Called by the file watching api on any file creations/modifications. @@ -87,29 +129,29 @@ def check_for_new_pdf(self,ev_path): - Add it with the current time Otherwise: - + - If the file time is marked as -1, delete it from the dict - Else, update the time in the dict to the current time """ - if ev_path.endswith(".pdf"): - if not ev_path.endswith("_ocr.pdf"): - PyPdfWatcher.events_lock.acquire() - if not ev_path in PyPdfWatcher.events: - PyPdfWatcher.events[ev_path] = time.time() - logging.info ("Adding %s to event queue" % ev_path) - else: - if PyPdfWatcher.events[ev_path] == -1: - logging.info ( "%s removing from event queue" % (ev_path)) - del PyPdfWatcher.events[ev_path] - else: - newTime = time.time() - logging.debug ( "%s already in event queue, updating timestamp to %d" % (ev_path, newTime)) - PyPdfWatcher.events[ev_path] = newTime - PyPdfWatcher.events_lock.release() + result = self.check_file_for_processing(ev_path) + if not result: + return + + PyPdfWatcher.events_lock.acquire() + if not ev_path in PyPdfWatcher.events: + PyPdfWatcher.events[ev_path] = time.time() + logging.info ("Adding %s to event queue" % ev_path) + else: + if PyPdfWatcher.events[ev_path] == -1: + logging.info ( "%s removing from event queue" % (ev_path)) + del PyPdfWatcher.events[ev_path] + else: + newTime = time.time() + logging.debug ( "%s already in event queue, updating timestamp to %d" % (ev_path, newTime)) + PyPdfWatcher.events[ev_path] = newTime + PyPdfWatcher.events_lock.release() - - def on_created(self, event): logging.debug ("on_created: %s at time %d" % (event.src_path, time.time())) self.check_for_new_pdf(event.src_path) @@ -125,7 +167,7 @@ def on_modified(self, event): def check_queue(self): """ This function is called at regular intervals by :func:`start`. - + Iterate through the events, and if there is any with a timestamp greater than the scan_interval, return it and set its timestamp to -1 for purging later. @@ -148,5 +190,46 @@ def check_queue(self): PyPdfWatcher.events_lock.release() return None - - + def scan_folder(self): + path = os.path.abspath(self.monitor_dir) + dirs, files = self.separate_folder_contents(path)[:2] + self.scan_folder_internal(path, dirs, files) + + + def scan_folder_internal(self, root, dirs, files): + if files: + for name in files: + path = os.path.join(root, name) + + result = self.check_file_for_processing(path) + if not result: + continue + + PyPdfWatcher.events[path] = time.time() + + for pos, neg, name in self.enumerate2(dirs): + path = os.path.join(root, name) + + try: + dirs, files = self.separate_folder_contents(path)[:2] + except: + pass + else: + self.scan_folder_internal(path, dirs, files) + + def separate_folder_contents(self, path): + dirs, files, links = [], [], [] + for name in os.listdir(path): + path_name = os.path.join(path, name) + if os.path.isdir(path_name): + dirs.append(name) + elif os.path.isfile(path_name): + files.append(name) + elif os.path.islink(path_name): + links.append(name) + return dirs, files, links + + def enumerate2(self, sequence): + length = len(sequence) + for count, value in enumerate(sequence): + yield count, count - length, value