Skip to content

Commit a9ecafc

Browse files
committed
Merge branch 'release/0.3'
release 0.3
2 parents 8b3a4f9 + e1fc298 commit a9ecafc

File tree

18 files changed

+1776
-47
lines changed

18 files changed

+1776
-47
lines changed

CHANGELOG.md

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,16 @@
11
# CHANGELOG
22

3-
## 0.2.0
3+
## 0.3.0
4+
5+
### Sentence corpus creation
6+
7+
- Sentence corpora generated from TEI now include a line number field (`line_number`) based on the line beginning tag (`<lb>` `n` attribute)
8+
- Support for ALTO XML input as a zipfile with multiple pages
9+
- Skips non-ALTO files, logs warnings for invalid or empty xml
10+
- Yields sentence corpora indexed across pages; ordering based on natural sort of filenames
11+
- Improved logging output for `remarx-create-corpus` script, with optional verbose mode
12+
13+
## [0.2.0] - 2025-10-15
414

515
### Application
616

@@ -59,3 +69,4 @@ _Initial release._
5969
- Add GitHub Actions workflow to build and publish python package on PyPI when a new GitHub release is created
6070

6171
[0.1.0]: https://github.com/Princeton-CDH/remarx/tree/0.1
72+
[0.2.0]: https://github.com/Princeton-CDH/remarx/tree/0.2

pyproject.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ dependencies = [
3535
"voyager>=2.1.0",
3636
"fastapi",
3737
"uvicorn",
38+
"natsort>=8.4.0",
3839
]
3940

4041
[project.optional-dependencies]
@@ -81,6 +82,11 @@ omit = [
8182

8283
[tool.coverage.report]
8384
show_missing = true # Helpful for debugging
85+
exclude_lines = [
86+
"# pragma: no cover",
87+
# skip command-line configuration for main method on scripts
88+
"if __name__ == .__main__.:"
89+
]
8490

8591
[tool.coverage.html]
8692
directory = "coverage_html_report"

src/remarx/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,6 @@
55

66
from remarx import app, sentence
77

8-
__version__ = "0.2"
8+
__version__ = "0.3rc1"
99

1010
__all__ = ["__version__", "app", "sentence"]

src/remarx/sentence/corpus/__init__.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,17 @@
22
Functionality for loading and chunking input files for sentence corpus creation.
33
"""
44

5+
from remarx.sentence.corpus.alto_input import ALTOInput
56
from remarx.sentence.corpus.base_input import FileInput
67
from remarx.sentence.corpus.tei_input import TEI_TAG, TEIDocument, TEIinput, TEIPage
78
from remarx.sentence.corpus.text_input import TextInput
89

9-
__all__ = ["TEI_TAG", "FileInput", "TEIDocument", "TEIPage", "TEIinput", "TextInput"]
10+
__all__ = [
11+
"TEI_TAG",
12+
"ALTOInput",
13+
"FileInput",
14+
"TEIDocument",
15+
"TEIPage",
16+
"TEIinput",
17+
"TextInput",
18+
]
Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
"""
2+
Functionality related to parsing ALTO XML content packaged within a zipfile,
3+
with the goal of creating a sentence corpus with associated metadata from ALTO.
4+
"""
5+
6+
import logging
7+
import pathlib
8+
from collections.abc import Generator
9+
from dataclasses import dataclass
10+
from functools import cached_property
11+
from timeit import default_timer as time
12+
from typing import ClassVar
13+
from zipfile import ZipFile
14+
15+
from lxml import etree
16+
from natsort import natsorted
17+
from neuxml import xmlmap
18+
19+
from remarx.sentence.corpus.base_input import FileInput, SectionType
20+
21+
logger = logging.getLogger(__name__)
22+
23+
24+
ALTO_NAMESPACE_V4: str = "http://www.loc.gov/standards/alto/ns-v4#"
25+
26+
27+
class AltoXmlObject(xmlmap.XmlObject):
28+
"""
29+
Base :class:`neuxml.xmlmap.XmlObject` class for ALTO-XML content.
30+
"""
31+
32+
# alto namespace v4; we may eventually need to support other versions
33+
ROOT_NAMESPACES: ClassVar[dict[str, str]] = {"alto": ALTO_NAMESPACE_V4}
34+
35+
36+
class AltoBlock(AltoXmlObject):
37+
"""
38+
Base class for an ALTO element with position information.
39+
"""
40+
41+
vertical_position = xmlmap.FloatField("@VPOS")
42+
horizontal_position = xmlmap.FloatField("@HPOS")
43+
44+
45+
class TextLine(AltoBlock):
46+
"""
47+
Single line of text (`TextLine`) in an ALTO document
48+
"""
49+
50+
text_content = xmlmap.StringField("alto:String/@CONTENT")
51+
52+
def __str__(self) -> str:
53+
"""
54+
Override default string method to return text content of this line.
55+
"""
56+
return self.text_content
57+
58+
59+
class TextBlock(AltoBlock):
60+
"""
61+
Block of text with one or more lines.
62+
"""
63+
64+
lines = xmlmap.NodeListField("alto:TextLine", TextLine)
65+
66+
@cached_property
67+
def sorted_lines(self) -> list[TextLine]:
68+
"""
69+
Returns a list of TextLines for this block, sorted by vertical position.
70+
"""
71+
# there's no guarantee that xml document order follows page order,
72+
# so sort by @VPOS (may need further refinement for more complicated layouts)
73+
return sorted(self.lines, key=lambda line: line.vertical_position)
74+
75+
@property
76+
def text_content(self) -> str:
77+
"""
78+
Text contents of this block; newline-delimited content of
79+
each line within this block, sorted by vertical position.
80+
"""
81+
return "\n".join([line.text_content for line in self.sorted_lines])
82+
83+
84+
class AltoDocument(AltoXmlObject):
85+
"""
86+
:class:`neuxml.xmlmap.XmlObject` instance for a single ALTO XML file
87+
"""
88+
89+
blocks = xmlmap.NodeListField(".//alto:TextBlock", TextBlock)
90+
lines = xmlmap.NodeListField(".//alto:TextLine", TextLine)
91+
92+
def is_alto(self) -> bool:
93+
"""
94+
Check if this is an ALTO-XML document, based on the root element
95+
"""
96+
# parse with QName to access namespace and tag name without namespace
97+
root_element = etree.QName(self.node.tag)
98+
# both must match
99+
return (
100+
root_element.namespace == ALTO_NAMESPACE_V4
101+
and root_element.localname == "alto"
102+
)
103+
104+
@property
105+
def sorted_blocks(self) -> list[TextBlock]:
106+
"""
107+
Returns a list of TextBlocks for this page, sorted by vertical position.
108+
"""
109+
# there's no guarantee that xml document order follows page order,
110+
# so sort by @VPOS (may need further refinement for more complicated layouts).
111+
# NOTE: in some cases, a textblock may not have a VPOS attribute;
112+
# in that case, use the position for the first line
113+
# (text block id = eSc_dummyblock_, but appears to have real content)
114+
# if block has no line, sort text block last
115+
if not self.blocks:
116+
return []
117+
return sorted(
118+
self.blocks,
119+
key=lambda block: block.vertical_position
120+
or (
121+
block.sorted_lines[0].vertical_position if block.lines else float("inf")
122+
),
123+
)
124+
125+
def text_chunks(self) -> Generator[dict[str, str]]:
126+
"""
127+
Returns a generator of dictionaries with text content and section type,
128+
one dictionary per text block on the page.
129+
"""
130+
# yield by block, since in future we may set section type
131+
# based on block-level semantic tagging
132+
for block in self.sorted_blocks:
133+
yield {"text": block.text_content, "section_type": SectionType.TEXT.value}
134+
135+
136+
@dataclass
137+
class ALTOInput(FileInput):
138+
"""
139+
Preliminary FileInput implementation for ALTO XML delivered as a zipfile.
140+
Iterates through ALTO XML members in natural sort order and yields one text chunk per text block.
141+
"""
142+
143+
field_names: ClassVar[tuple[str, ...]] = (*FileInput.field_names, "section_type")
144+
"List of field names for sentences originating from ALTO XML content."
145+
146+
file_type: ClassVar[str] = ".zip"
147+
"Supported file extension for ALTO zipfiles (.zip)"
148+
149+
def get_text(self) -> Generator[dict[str, str], None, None]:
150+
"""
151+
Iterate over ALTO XML files contained in the zipfile and return
152+
a generator of text content.
153+
"""
154+
num_files = 0
155+
num_valid_files = 0
156+
157+
start = time()
158+
with ZipFile(self.input_file) as archive:
159+
# iterate over all files in the zipfile;
160+
# use natural sorting to process in logical order
161+
for zip_filepath in natsorted(archive.namelist()):
162+
num_files += 1
163+
base_filename = pathlib.Path(zip_filepath).name
164+
# ignore & log non-xml files
165+
if not base_filename.lower().endswith(".xml"):
166+
logger.info(
167+
f"Ignoring non-xml file included in ALTO zipfile: {zip_filepath}"
168+
)
169+
continue
170+
# if the file is .xml, attempt to open as an ALTO XML
171+
with archive.open(zip_filepath) as xmlfile:
172+
logger.info(f"Processing XML file {zip_filepath}")
173+
# zipfile archive open returns a file-like object
174+
try:
175+
alto_xmlobj = xmlmap.load_xmlobject_from_file(
176+
xmlfile, AltoDocument
177+
)
178+
except etree.XMLSyntaxError as err:
179+
logger.warning(f"Skipping {zip_filepath} : invalid XML")
180+
logger.debug(f"XML syntax error: {err}", exc_info=err)
181+
continue
182+
183+
if not alto_xmlobj.is_alto():
184+
# TODO: add unit test for this case
185+
logger.warning(
186+
f"Skipping non-ALTO XML file {zip_filepath} (root element {alto_xmlobj.node.tag})"
187+
)
188+
continue
189+
190+
num_valid_files += 1
191+
# report total # blocks, lines for each file as processed
192+
logger.debug(
193+
f"{base_filename}: {len(alto_xmlobj.blocks)} blocks, {len(alto_xmlobj.lines)} lines"
194+
)
195+
196+
# use the base name of the xml file as filename here, rather than the zipfile name for every chunk
197+
for chunk in alto_xmlobj.text_chunks():
198+
yield chunk | {"file": base_filename}
199+
200+
# warn if a document has no lines
201+
if len(alto_xmlobj.lines) == 0:
202+
logger.warning(
203+
f"No text lines found in ALTO XML file: {base_filename}"
204+
)
205+
206+
elapsed_time = time() - start
207+
logger.info(
208+
f"Processed {self.file_name} with {num_files} files ({num_valid_files} valid ALTO) in {elapsed_time:.1f} seconds"
209+
)
210+
211+
# error if no valid files were found
212+
if num_valid_files == 0:
213+
raise ValueError(f"No valid ALTO XML files found in {self.file_name}")

src/remarx/sentence/corpus/base_input.py

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,16 @@ def get_text(self) -> Generator[dict[str, str]]:
6060
"""
6161
raise NotImplementedError
6262

63+
def get_extra_metadata(
64+
self, chunk_info: dict[str, Any], _char_idx: int, sentence: str
65+
) -> dict[str, Any]:
66+
"""
67+
Hook method for subclasses to override to provide extra metadata for a sentence (e.g. line number).
68+
69+
:returns: Dictionary of additional metadata fields to include, or empty dict
70+
"""
71+
return {}
72+
6373
def get_sentences(self) -> Generator[dict[str, Any]]:
6474
"""
6575
Get sentences for this file, with associated metadata.
@@ -82,12 +92,20 @@ def get_sentences(self) -> Generator[dict[str, Any]]:
8292

8393
# character index is not included in output,
8494
# but may be useful for sub-chunk metadata (e.g., line number)
85-
yield chunk_info | {
86-
"text": sentence,
87-
"file": self.file_name,
88-
"sent_index": sentence_index,
89-
"sent_id": f"{self.file_name}:{sentence_index}",
90-
}
95+
96+
# specify input file name first;
97+
# chunk-specific filename takes precedence (e.g. alto file within zip)
98+
yield (
99+
{"file": self.file_name}
100+
| chunk_info
101+
| {
102+
"text": sentence,
103+
"sent_index": sentence_index,
104+
"sent_id": f"{self.file_name}:{sentence_index}",
105+
}
106+
# Include any extra metadata (subclass specific)
107+
| self.get_extra_metadata(chunk_info, _char_idx, sentence)
108+
)
91109

92110
# increment sentence index
93111
sentence_index += 1

src/remarx/sentence/corpus/create.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,12 @@
1212

1313
import argparse
1414
import csv
15+
import logging
1516
import pathlib
17+
import sys
1618

1719
from remarx.sentence.corpus.base_input import FileInput
20+
from remarx.utils import configure_logging
1821

1922

2023
def create_corpus(
@@ -56,9 +59,19 @@ def main() -> None:
5659
parser.add_argument(
5760
"output_csv", type=pathlib.Path, help="Path to output sentence corpus (CSV)"
5861
)
62+
parser.add_argument(
63+
"-v",
64+
"--verbose",
65+
action="store_true",
66+
help="Verbose output (debug logging)",
67+
default=False,
68+
)
5969

6070
args = parser.parse_args()
6171

72+
log_level = logging.DEBUG if args.verbose else logging.INFO
73+
74+
configure_logging(sys.stdout, log_level=log_level)
6275
create_corpus(
6376
args.input_file,
6477
args.output_csv,

0 commit comments

Comments
 (0)