Skip to content

Commit 0431a5b

Browse files
committed
Add markdown support
1 parent 6da7179 commit 0431a5b

File tree

1 file changed

+65
-0
lines changed

1 file changed

+65
-0
lines changed

llmstack/common/utils/text_extraction_service.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,18 @@ def table_html_to_text(table_html: str) -> str:
3131
return text
3232

3333

34+
def table_html_to_markdown(table_html: str) -> str:
    """Convert an HTML table into a GitHub-flavored markdown table.

    Every ``<tr>`` becomes one pipe-delimited row built from its ``<td>`` /
    ``<th>`` cells. The first row is treated as the header and is followed by
    the ``| --- |`` delimiter row that markdown requires for a table to render.

    Args:
        table_html: HTML markup containing the table rows. ``None`` or an
            empty string is tolerated.

    Returns:
        The markdown table terminated by a newline, or ``""`` when the input
        is empty or contains no cells.
    """
    # A missing HTML payload (None / "") has nothing to convert; bail out
    # before handing it to the parser, which rejects None.
    if not table_html:
        return ""

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(table_html, "html.parser")

    rows = []
    for row in soup.find_all("tr"):
        cells = [
            # Collapse inner whitespace/newlines and escape pipes so a cell
            # can never split or terminate its markdown row.
            " ".join(cell.get_text().split()).replace("|", "\\|")
            for cell in row.find_all(["td", "th"])
        ]
        if cells:
            rows.append(cells)

    if not rows:
        return ""

    # The header and delimiter rows must have the same number of cells, so
    # pad every row to the widest one.
    width = max(len(cells) for cells in rows)
    lines = []
    for index, cells in enumerate(rows):
        padded = cells + [""] * (width - len(cells))
        lines.append("| " + " | ".join(padded) + " |")
        if index == 0:
            lines.append("| " + " | ".join(["---"] * width) + " |")

    return "\n".join(lines) + "\n"
44+
45+
3446
class TextCanvas:
3547
def __init__(self, width: int, height: int):
3648
self.width = width
@@ -150,6 +162,50 @@ def formatted_text(self):
150162
else:
151163
return "\n".join([element.formatted_text for element in self.elements])
152164

165+
@property
def markdown(self):
    """Render ``self.elements`` as a single markdown string.

    Each element is formatted according to its ``element_type`` and followed
    by a newline. ``Table`` elements are converted from their HTML
    representation when ``provider_data`` carries one, and fall back to the
    element's plain text otherwise. Unknown element types are emitted as
    their plain text.

    Returns:
        The concatenated markdown for all elements, ``""`` when there are
        none.
    """
    # Per-type templates; "{}" is substituted with the element text.
    templates = {
        "Formula": "{}\n\n",
        "FigureCaption": "**{}**\n\n",
        "NarrativeText": "{}\n\n",
        "ListItem": "- {}",
        "Title": "# {}",
        "Address": "{}",
        "EmailAddress": "{}",
        "Image": "![Image metadata: {}](#)\n\n",
        "PageBreak": '<div class="pagebreak" />',
        "Header": "## {}",
        "Footer": "## {}",
        # Fences go on their own lines so multi-line snippets open and
        # close a proper code block.
        "CodeSnippet": "```\n{}\n```",
        "PageNumber": "Page No. {}",
        "UncategorizedText": "{}\n",
    }

    parts = []
    for element in self.elements:
        if element.element_type == "Table":
            table_html = None
            if element.provider_data and element.provider_data.get("type") == "Table":
                table_html = element.provider_data.get("metadata", {}).get("text_as_html")
            # Only convert when HTML is actually present; otherwise use the
            # plain text rather than passing None to the HTML converter.
            if table_html:
                rendered = table_html_to_markdown(table_html)
            else:
                rendered = f"{element.text}"
        else:
            rendered = templates.get(element.element_type, "{}").format(element.text)
        parts.append(rendered)
        parts.append("\n")

    return "".join(parts)
208+
153209

154210
class TextractResponse(BaseModel):
155211
pages: List[Page] = []
@@ -171,6 +227,14 @@ def formatted_text(self):
171227
text += f"\n--- Page Break (Pg {page.page_no})---\n"
172228
return text
173229

230+
@property
def markdown(self):
    """Render all pages as one markdown string.

    The markdown of each page in ``self.pages`` is emitted in order, and
    every page (including the last) is followed by a
    ``<div class="pagebreak" />`` marker.

    Returns:
        The combined markdown, ``""`` when there are no pages.
    """
    parts = []
    for page in self.pages:
        page_markdown = page.markdown
        parts.append(page_markdown)
        # Keep the marker on its own line even if the page text does not
        # end with a newline.
        if page_markdown and not page_markdown.endswith("\n"):
            parts.append("\n")
        # The blank line after the div ends the raw HTML block, so the next
        # page's first line is parsed as markdown instead of being glued
        # onto the marker.
        parts.append('<div class="pagebreak" />\n\n')
    return "".join(parts)
237+
174238

175239
class TextExtractionService(ABC):
176240
def __init__(self, provider) -> None:
@@ -242,6 +306,7 @@ def extract_from_bytes(self, file: bytes, **kwargs) -> TextractResponse:
242306
bottom_right=(box[2].x, box[2].y),
243307
bottom_left=(box[3].x, box[3].y),
244308
),
309+
element_type="UncategorizedText",
245310
)
246311
page_element.set_midpoint_normalized(page_width, page_height)
247312
page.elements.append(page_element)

0 commit comments

Comments
 (0)