Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.8.4-dev0

* feat: **Use password** to load PDF with all modes

## 0.8.3

* fix: removed `layoutelement.from_lp_textblock()` and related tests as it's not used
Expand Down
Binary file added sample-docs/password.pdf
Binary file not shown.
16 changes: 16 additions & 0 deletions test_unstructured_inference/inference/test_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,21 @@ def mock_get_elements(self, *args, **kwargs):
assert page.image is None


@pytest.mark.slow()
def test_from_file_with_password(monkeypatch, mock_final_layout):
    """Load the password-protected sample PDF by path and as a binary file object.

    The path-based load runs first, before `get_model` is monkeypatched; the
    file-object load runs afterwards with the model lookup and the detection
    model class replaced by `MockLayoutModel`.
    """
    pdf_path = "sample-docs/password.pdf"
    pdf_password = "password"

    # Path-based entry point.
    layout_from_path = layout.DocumentLayout.from_file(pdf_path, password=pdf_password)
    assert layout_from_path

    # File-object entry point, with the model machinery mocked out.
    monkeypatch.setattr(layout, "get_model", lambda x: MockLayoutModel(mock_final_layout))
    detection_model_patch = patch(
        "unstructured_inference.inference.layout.UnstructuredObjectDetectionModel",
        MockLayoutModel,
    )
    with detection_model_patch, open(pdf_path, mode="rb") as pdf_file:
        layout_from_stream = layout.process_data_with_model(
            pdf_file,
            model_name="fake",
            password=pdf_password,
        )
        assert layout_from_stream


def test_from_image_file_raises_with_empty_fn():
    """`DocumentLayout.from_image_file` raises FileNotFoundError for an empty filename."""
    empty_filename = ""
    with pytest.raises(FileNotFoundError):
        layout.DocumentLayout.from_image_file(empty_filename)
Expand Down Expand Up @@ -544,6 +559,7 @@ def test_process_file_with_model_routing(monkeypatch, model_type, is_detection_m
detection_model=detection_model,
element_extraction_model=element_extraction_model,
fixed_layouts=None,
password=None,
pdf_image_dpi=200,
)

Expand Down
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.8.3" # pragma: no cover
__version__ = "0.8.4-dev0" # pragma: no cover
11 changes: 11 additions & 0 deletions unstructured_inference/inference/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def from_file(
filename: str,
fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
pdf_image_dpi: int = 200,
password: Optional[str] = None,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The best practice is to avoid passing the password around as a plain-text object, since that leaves it exposed to anything that can read it. I'd suggest at least trying hashing or encryption.

**kwargs,
) -> DocumentLayout:
"""Creates a DocumentLayout from a pdf file."""
Expand All @@ -62,6 +63,7 @@ def from_file(
pdf_image_dpi,
output_folder=temp_dir,
path_only=True,
password=password,
)
image_paths = cast(List[str], _image_paths)
number_of_pages = len(image_paths)
Expand Down Expand Up @@ -133,6 +135,7 @@ def __init__(
document_filename: Optional[Union[str, PurePath]] = None,
detection_model: Optional[UnstructuredObjectDetectionModel] = None,
element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,
password: Optional[str] = None,
):
if detection_model is not None and element_extraction_model is not None:
raise ValueError("Only one of detection_model and extraction_model should be passed.")
Expand All @@ -148,6 +151,7 @@ def __init__(
self.element_extraction_model = element_extraction_model
self.elements: Collection[LayoutElement] = []
self.elements_array: LayoutElements | None = None
self.password = password
# NOTE(alan): Dropped LocationlessLayoutElement that was created for chipper - chipper has
# locations now and if we need to support LayoutElements without bounding boxes we can make
# the bbox property optional
Expand Down Expand Up @@ -325,6 +329,7 @@ def from_image(
def process_data_with_model(
data: BinaryIO,
model_name: Optional[str],
password: Optional[str] = None,
**kwargs: Any,
) -> DocumentLayout:
"""Process PDF as file-like object `data` into a `DocumentLayout`.
Expand All @@ -339,6 +344,7 @@ def process_data_with_model(
layout = process_file_with_model(
file_path,
model_name,
password=password,
**kwargs,
)

Expand All @@ -351,6 +357,7 @@ def process_file_with_model(
is_image: bool = False,
fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
pdf_image_dpi: int = 200,
password: Optional[str] = None,
**kwargs: Any,
) -> DocumentLayout:
"""Processes pdf file with name filename into a DocumentLayout by using a model identified by
Expand Down Expand Up @@ -379,6 +386,7 @@ def process_file_with_model(
element_extraction_model=element_extraction_model,
fixed_layouts=fixed_layouts,
pdf_image_dpi=pdf_image_dpi,
password=password,
**kwargs,
)
)
Expand All @@ -390,6 +398,7 @@ def convert_pdf_to_image(
dpi: int = 200,
output_folder: Optional[Union[str, PurePath]] = None,
path_only: bool = False,
password: Optional[str] = None,
) -> Union[List[Image.Image], List[str]]:
"""Get the image renderings of the pdf pages using pdf2image"""

Expand All @@ -402,12 +411,14 @@ def convert_pdf_to_image(
dpi=dpi,
output_folder=output_folder,
paths_only=path_only,
userpw=password, # type: ignore
)
else:
images = pdf2image.convert_from_path(
filename,
dpi=dpi,
paths_only=path_only,
userpw=password, # type: ignore
)

return images
Loading