diff --git a/camelot/cli.py b/camelot/cli.py index 8ff57135..76643adb 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -113,6 +113,12 @@ def cli(ctx, *args, **kwargs): @click.option( "-back", "--process_background", is_flag=True, help="Process background lines." ) +@click.option( + "-color", + "--process_color_background", + is_flag=True, + help="Increase contrast for better background line processing.", +) @click.option( "-scale", "--line_scale", diff --git a/camelot/image_processing.py b/camelot/image_processing.py index 94c0b58f..947dddb5 100644 --- a/camelot/image_processing.py +++ b/camelot/image_processing.py @@ -4,7 +4,14 @@ import numpy as np -def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): +def adaptive_threshold( + imagename, + process_background=False, + blocksize=15, + c=-2, + process_color_background=False, + saturation_threshold=5, +): """Thresholds an image using OpenCV's adaptiveThreshold. Parameters @@ -13,18 +20,23 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): Path to image file. process_background : bool, optional (default: False) Whether or not to process lines that are in background. + process_color_background : bool, optional (default: False) + Increase contrast for better background line processing. + saturation_threshold : int, optional (default: 5) + Saturation above which a pixel is treated as colored background. blocksize : int, optional (default: 15) Size of a pixel neighborhood that is used to calculate a threshold value for the pixel: 3, 5, 7, and so on. For more information, refer `OpenCV's adaptiveThreshold - `_. + `_. c : int, optional (default: -2) Constant subtracted from the mean or weighted mean. Normally, it is positive but may be zero or negative as well. For more information, refer `OpenCV's adaptiveThreshold - `_. + `_. 
+ Returns ------- @@ -35,10 +47,26 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): """ img = cv2.imread(imagename) gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - if not process_background: + if process_color_background: + hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) + initial = hsv[:, :, 1] + hsv[initial > saturation_threshold, 0] = 0 + hsv[initial > saturation_threshold, 1] = 255 + hsv[initial > saturation_threshold, 2] = 0 + hsv[initial <= saturation_threshold, 0] = 128 + hsv[initial <= saturation_threshold, 1] = 0 + hsv[initial <= saturation_threshold, 2] = 255 + hsv[initial == 255, 1] = 0 + gray = cv2.cvtColor(hsv, cv2.COLOR_BGR2GRAY) + elif not process_background: gray = np.invert(gray) threshold = cv2.adaptiveThreshold( - gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c + gray, + 255, + cv2.ADAPTIVE_THRESH_GAUSSIAN_C, + cv2.THRESH_BINARY, + blocksize, + c, ) return img, threshold diff --git a/camelot/io.py b/camelot/io.py index 11ec2d8a..b1d525fa 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -70,6 +70,8 @@ def read_pdf( to generate columns. process_background* : bool, optional (default: False) Process background lines. + process_color_background* : bool, optional (default: False) + Increase contrast for better background line processing. line_scale* : int, optional (default: 40) Line size scaling factor. The larger the value the smaller the detected lines. Making it very large will lead to text diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index d185860b..58cd9d7b 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -34,6 +34,10 @@ class Lattice(BaseParser): in PDF coordinate space. process_background : bool, optional (default: False) Process background lines. + process_color_background : bool, optional (default: False) + Increase contrast for better background line processing. 
+ saturation_threshold : int, optional (default: 5) + Saturation above which a pixel is treated as colored background. line_scale : int, optional (default: 15) Line size scaling factor. The larger the value the smaller the detected lines. Making it very large will lead to text @@ -85,6 +89,8 @@ def __init__( table_regions=None, table_areas=None, process_background=False, + process_color_background=False, + saturation_threshold=5, line_scale=15, copy_text=None, shift_text=None, @@ -105,6 +111,8 @@ def __init__( self.table_regions = table_regions self.table_areas = table_areas self.process_background = process_background + self.process_color_background = process_color_background + self.saturation_threshold = saturation_threshold self.line_scale = line_scale self.copy_text = copy_text self.shift_text = shift_text or ["l", "t"] @@ -230,6 +238,8 @@ def scale_areas(areas): self.pdf_image, self.threshold = adaptive_threshold( self.image_path, process_background=self.process_background, + process_color_background=self.process_color_background, + saturation_threshold=self.saturation_threshold, blocksize=self.threshold_blocksize, c=self.threshold_constant, ) diff --git a/camelot/utils.py b/camelot/utils.py index 4ed5bff2..44b185ab 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -130,6 +130,8 @@ def download_url(url: str) -> StrByteType | Path: text_kwargs = common_kwargs + ["columns", "edge_tol", "row_tol", "column_tol"] lattice_kwargs = common_kwargs + [ "process_background", + "process_color_background", + "saturation_threshold", "line_scale", "copy_text", "shift_text", diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst index 2fd696e3..cbc1b90e 100644 --- a/docs/user/advanced.rst +++ b/docs/user/advanced.rst @@ -35,6 +35,19 @@ To process background lines, you can pass ``process_background=True``. 
:file: ../_static/csv/background_lines.csv :class: full-width +If there's too little contrast between the table background color and the document background color, you can try the option ``process_color_background=True``. + +.. code-block:: pycon + + >>> tables = camelot.read_pdf('background_lines.pdf', process_background=True, process_color_background=True) + >>> tables[1].df + +.. tip:: + Here's how you can do the same with the :ref:`command-line interface <cli>`. + :: + + $ camelot lattice --process_color_background background_lines.pdf + Visual debugging ----------------