diff --git a/camelot/cli.py b/camelot/cli.py index 1715e6f1..428494b1 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -100,6 +100,9 @@ def cli(ctx, *args, **kwargs): @click.option( "-back", "--process_background", is_flag=True, help="Process background lines." ) +@click.option( + "-color", "--process_color_background", is_flag=True, help="Increase contrast for better background line processing." +) @click.option( "-scale", "--line_scale", diff --git a/camelot/image_processing.py b/camelot/image_processing.py index 08acb23e..8578a5bc 100644 --- a/camelot/image_processing.py +++ b/camelot/image_processing.py @@ -4,7 +4,7 @@ import numpy as np -def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): +def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2, process_color_background=False, saturation_threshold=5): """Thresholds an image using OpenCV's adaptiveThreshold. Parameters @@ -36,6 +36,17 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if process_background: + if process_color_background: + hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) + initial = hsv[:, :, 1] + hsv[initial > saturation_threshold, 0] = 0 + hsv[initial > saturation_threshold, 1] = 255 + hsv[initial > saturation_threshold, 2] = 0 + hsv[initial <= saturation_threshold, 0] = 128 + hsv[initial <= saturation_threshold, 1] = 0 + hsv[initial <= saturation_threshold, 2] = 255 + hsv[initial == 255, 1] = 0 + gray = cv2.cvtColor(hsv, cv2.COLOR_BGR2GRAY) threshold = cv2.adaptiveThreshold( gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c ) diff --git a/camelot/io.py b/camelot/io.py index a27a7c66..9399ba81 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -59,6 +59,8 @@ def read_pdf( to generate columns. process_background* : bool, optional (default: False) Process background lines. 
+ process_color_background* : bool, optional (default: False) + Increase contrast for better background line processing. line_scale* : int, optional (default: 15) Line size scaling factor. The larger the value the smaller the detected lines. Making it very large will lead to text diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 5469fac8..064bc8fe 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -99,6 +99,7 @@ def __init__( table_regions=None, table_areas=None, process_background=False, + process_color_background=False, line_scale=15, copy_text=None, shift_text=["l", "t"], @@ -116,6 +117,7 @@ def __init__( self.table_regions = table_regions self.table_areas = table_areas self.process_background = process_background + self.process_color_background = process_color_background self.line_scale = line_scale self.copy_text = copy_text self.shift_text = shift_text @@ -236,6 +238,7 @@ def scale_areas(areas): self.image, self.threshold = adaptive_threshold( self.imagename, process_background=self.process_background, + process_color_background=self.process_color_background, blocksize=self.threshold_blocksize, c=self.threshold_constant, ) diff --git a/camelot/utils.py b/camelot/utils.py index 3e8ab96b..5843a029 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -96,6 +96,7 @@ def download_url(url): stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"] lattice_kwargs = [ "process_background", + "process_color_background", "line_scale", "copy_text", "shift_text", diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst index b482022b..e1ef329c 100644 --- a/docs/user/advanced.rst +++ b/docs/user/advanced.rst @@ -33,6 +33,19 @@ To process background lines, you can pass ``process_background=True``. .. 
csv-table:: :file: ../_static/csv/background_lines.csv +If there's too little contrast between the table background color and the document background color, you can also try the experimental option ``process_color_background=True`` in combination with ``process_background=True``. + +:: + + >>> tables = camelot.read_pdf('background_lines.pdf', process_background=True, process_color_background=True) + >>> tables[1].df + +.. tip:: + Here's how you can do the same with the :ref:`command-line interface <cli>`. + :: + + $ camelot lattice -back -color background_lines.pdf + Visual debugging ----------------