|
| 1 | +""" |
| 2 | +Functionality related to parsing ALTO XML content packaged within a zipfile, |
| 3 | +with the goal of creating a sentence corpus with associated metadata from ALTO. |
| 4 | +""" |
| 5 | + |
| 6 | +import logging |
| 7 | +import pathlib |
| 8 | +from collections.abc import Generator |
| 9 | +from dataclasses import dataclass |
| 10 | +from functools import cached_property |
| 11 | +from timeit import default_timer as time |
| 12 | +from typing import ClassVar |
| 13 | +from zipfile import ZipFile |
| 14 | + |
| 15 | +from lxml import etree |
| 16 | +from natsort import natsorted |
| 17 | +from neuxml import xmlmap |
| 18 | + |
| 19 | +from remarx.sentence.corpus.base_input import FileInput, SectionType |
| 20 | + |
# module-level logger, named after this module
logger = logging.getLogger(__name__)


# XML namespace URI for version 4 of the ALTO schema
ALTO_NAMESPACE_V4: str = "http://www.loc.gov/standards/alto/ns-v4#"
| 25 | + |
| 26 | + |
class AltoXmlObject(xmlmap.XmlObject):
    """
    Common :class:`neuxml.xmlmap.XmlObject` parent class for ALTO-XML
    mappings; declares the ``alto`` namespace prefix.
    """

    # Only version 4 of the ALTO namespace is declared for now;
    # other versions may eventually need to be supported.
    ROOT_NAMESPACES: ClassVar[dict[str, str]] = {
        "alto": ALTO_NAMESPACE_V4,
    }
| 34 | + |
| 35 | + |
class AltoBlock(AltoXmlObject):
    """
    Shared parent class for ALTO elements that carry position attributes.
    """

    # position attributes, mapped as floats from ``@VPOS`` and ``@HPOS``
    vertical_position = xmlmap.FloatField("@VPOS")
    horizontal_position = xmlmap.FloatField("@HPOS")
| 43 | + |
| 44 | + |
class TextLine(AltoBlock):
    """
    Single line of text (`TextLine`) in an ALTO document.
    """

    # NOTE(review): this presumably maps only the first ``String`` child of
    # the line; ALTO files with one ``String`` per word would lose all but
    # the first word. Confirm against the ALTO exports being processed.
    text_content = xmlmap.StringField("alto:String/@CONTENT")

    def __str__(self) -> str:
        """
        Override default string method to return text content of this line.

        Returns an empty string when the line has no text content, since
        ``__str__`` must always return a :class:`str` (returning ``None``
        raises :class:`TypeError`).
        """
        return self.text_content or ""
| 57 | + |
| 58 | + |
class TextBlock(AltoBlock):
    """
    Block of text with one or more lines.
    """

    lines = xmlmap.NodeListField("alto:TextLine", TextLine)

    @cached_property
    def sorted_lines(self) -> list[TextLine]:
        """
        Returns a list of TextLines for this block, sorted by vertical position.

        Lines without a vertical position are sorted last, keeping their
        relative document order.
        """
        # there's no guarantee that xml document order follows page order,
        # so sort by @VPOS (may need further refinement for more complicated layouts)
        return sorted(
            self.lines,
            # a missing @VPOS maps to None, which cannot be compared with
            # floats; treat it as infinity so those lines sort to the end
            key=lambda line: (
                line.vertical_position
                if line.vertical_position is not None
                else float("inf")
            ),
        )

    @property
    def text_content(self) -> str:
        """
        Text contents of this block; newline-delimited content of
        each line within this block, sorted by vertical position.

        A line with no text content contributes an empty string.
        """
        # text content is None for a line without any String content,
        # and str.join only accepts strings
        return "\n".join(line.text_content or "" for line in self.sorted_lines)
| 82 | + |
| 83 | + |
class AltoDocument(AltoXmlObject):
    """
    :class:`neuxml.xmlmap.XmlObject` instance for a single ALTO XML file
    """

    blocks = xmlmap.NodeListField(".//alto:TextBlock", TextBlock)
    lines = xmlmap.NodeListField(".//alto:TextLine", TextLine)

    def is_alto(self) -> bool:
        """
        Check if this is an ALTO-XML document, based on the root element
        """
        # parse with QName to access namespace and tag name without namespace
        root_element = etree.QName(self.node.tag)
        # both must match
        return (
            root_element.namespace == ALTO_NAMESPACE_V4
            and root_element.localname == "alto"
        )

    @staticmethod
    def _block_sort_key(block: TextBlock) -> float:
        """
        Vertical sort position for a text block: the block's own position
        when it has one, otherwise the topmost known position of its lines,
        otherwise infinity so that the block sorts last.
        """
        # compare against None explicitly: a position of 0.0 is valid
        # (top of the page) but falsy
        if block.vertical_position is not None:
            return block.vertical_position
        line_positions = [
            line.vertical_position
            for line in block.lines
            if line.vertical_position is not None
        ]
        return min(line_positions, default=float("inf"))

    @property
    def sorted_blocks(self) -> list[TextBlock]:
        """
        Returns a list of TextBlocks for this page, sorted by vertical position.
        """
        # there's no guarantee that xml document order follows page order,
        # so sort by @VPOS (may need further refinement for more complicated layouts).
        # NOTE: in some cases, a textblock may not have a VPOS attribute;
        # in that case, use the position for the first line
        # (text block id = eSc_dummyblock_, but appears to have real content)
        # if block has no line, sort text block last
        return sorted(self.blocks, key=self._block_sort_key)

    def text_chunks(self) -> Generator[dict[str, str], None, None]:
        """
        Returns a generator of a dictionary of text content and section type,
        one dictionary per text block on the page.
        """
        # yield by block, since in future we may set section type
        # based on block-level semantic tagging
        for block in self.sorted_blocks:
            yield {"text": block.text_content, "section_type": SectionType.TEXT.value}
| 134 | + |
| 135 | + |
@dataclass
class ALTOInput(FileInput):
    """
    Preliminary FileInput implementation for ALTO XML delivered as a zipfile.
    Iterates through the ALTO XML members of the zipfile and yields one
    chunk of text per text block, labeled with the name of the XML file
    it came from.
    """

    field_names: ClassVar[tuple[str, ...]] = (*FileInput.field_names, "section_type")
    "List of field names for sentences originating from ALTO XML content."

    file_type: ClassVar[str] = ".zip"
    "Supported file extension for ALTO zipfiles (.zip)"

    @staticmethod
    def _load_alto_document(
        archive: ZipFile, zip_filepath: str
    ) -> "AltoDocument | None":
        """
        Load one zipfile member as an :class:`AltoDocument`. Logs a warning
        and returns None if the member is not well-formed XML or if its
        root element is not ALTO.
        """
        # zipfile archive open returns a file-like object; it is closed
        # as soon as parsing is done, before any text is yielded
        with archive.open(zip_filepath) as xmlfile:
            try:
                alto_xmlobj = xmlmap.load_xmlobject_from_file(xmlfile, AltoDocument)
            except etree.XMLSyntaxError as err:
                logger.warning(f"Skipping {zip_filepath} : invalid XML")
                logger.debug(f"XML syntax error: {err}", exc_info=err)
                return None

        if not alto_xmlobj.is_alto():
            # TODO: add unit test for this case
            logger.warning(
                f"Skipping non-ALTO XML file {zip_filepath} (root element {alto_xmlobj.node.tag})"
            )
            return None
        return alto_xmlobj

    def get_text(self) -> Generator[dict[str, str], None, None]:
        """
        Iterate over ALTO XML files contained in the zipfile and return
        a generator of text content.

        Each yielded dictionary has the text and section type for one text
        block, plus a ``file`` entry with the base name of the XML file.
        Non-XML members, malformed XML, and non-ALTO XML are logged and
        skipped.

        :raises ValueError: once all members have been processed, if none
            of them was a valid ALTO XML file
        """
        num_files = 0
        num_valid_files = 0

        start = time()
        with ZipFile(self.input_file) as archive:
            # the archive listing can include directory entries, which are
            # not files; leave them out of the count and the log
            member_paths = [
                info.filename for info in archive.infolist() if not info.is_dir()
            ]
            # use natural sorting to process in logical order
            for zip_filepath in natsorted(member_paths):
                num_files += 1
                base_filename = pathlib.Path(zip_filepath).name
                # ignore & log non-xml files
                if not base_filename.lower().endswith(".xml"):
                    logger.info(
                        f"Ignoring non-xml file included in ALTO zipfile: {zip_filepath}"
                    )
                    continue

                # if the file is .xml, attempt to open as an ALTO XML
                logger.info(f"Processing XML file {zip_filepath}")
                alto_xmlobj = self._load_alto_document(archive, zip_filepath)
                if alto_xmlobj is None:
                    continue

                num_valid_files += 1
                # count lines once; each access to the mapped field
                # evaluates its xpath again
                num_lines = len(alto_xmlobj.lines)
                # report total # blocks, lines for each file as processed
                logger.debug(
                    f"{base_filename}: {len(alto_xmlobj.blocks)} blocks, {num_lines} lines"
                )

                # use the base xml file as filename here, rather than zipfile for all
                for chunk in alto_xmlobj.text_chunks():
                    yield chunk | {"file": base_filename}

                # warn if a document has no lines
                if num_lines == 0:
                    logger.warning(
                        f"No text lines found in ALTO XML file: {base_filename}"
                    )

        elapsed_time = time() - start
        logger.info(
            f"Processed {self.file_name} with {num_files} files ({num_valid_files} valid ALTO) in {elapsed_time:.1f} seconds"
        )

        # error if no valid files were found
        if num_valid_files == 0:
            raise ValueError(f"No valid ALTO XML files found in {self.file_name}")
0 commit comments