@@ -31,6 +31,18 @@ def table_html_to_text(table_html: str) -> str:
31
31
return text
32
32
33
33
34
def table_html_to_markdown(table_html: str) -> str:
    """Convert an HTML table into a GitHub-flavoured Markdown table.

    Every ``<tr>`` becomes one pipe-delimited row built from its ``<td>`` and
    ``<th>`` cells. The first row is treated as the header and is followed by
    the ``| --- |`` delimiter row that Markdown renderers require in order to
    recognise the block as a table.

    Args:
        table_html: HTML markup containing the table rows. ``None`` or an
            empty string is tolerated and yields an empty result.

    Returns:
        The Markdown table terminated by a newline, or ``""`` when the input
        is empty or contains no cells.
    """
    # Guard before the lazy import: callers may pass a missing
    # ``text_as_html`` value, and there is nothing to parse in that case.
    if not table_html:
        return ""

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(table_html, "html.parser")

    rows = []
    for row in soup.find_all("tr"):
        cells = [
            # Collapse internal whitespace/newlines and escape pipes so cell
            # content cannot break the row layout.
            " ".join(cell.get_text(" ").split()).replace("|", "\\|")
            for cell in row.find_all(["td", "th"])
        ]
        if cells:
            rows.append(cells)

    if not rows:
        return ""

    # Pad short rows so every line has the same number of columns.
    width = max(len(cells) for cells in rows)
    lines = []
    for index, cells in enumerate(rows):
        padded = cells + [""] * (width - len(cells))
        lines.append("| " + " | ".join(padded) + " |")
        if index == 0:
            lines.append("| " + " | ".join(["---"] * width) + " |")

    return "\n".join(lines) + "\n"
44
+
45
+
34
46
class TextCanvas :
35
47
def __init__ (self , width : int , height : int ):
36
48
self .width = width
@@ -150,6 +162,50 @@ def formatted_text(self):
150
162
else :
151
163
return "\n " .join ([element .formatted_text for element in self .elements ])
152
164
165
@property
def markdown(self):
    """Render the elements in ``self.elements`` as one Markdown string.

    Each element is formatted according to its ``element_type`` and followed
    by a newline. Types without a dedicated template (including ``Address``
    and ``EmailAddress``) are emitted as plain text.

    Returns:
        str: The concatenated Markdown for all elements; ``""`` when there
        are no elements.
    """
    # One format template per element type; ``{}`` receives ``element.text``.
    templates = {
        "Formula": "{}\n\n",
        "FigureCaption": "**{}**\n\n",
        "NarrativeText": "{}\n\n",
        # The bullet must carry the item text (it was previously dropped).
        "ListItem": "- {}",
        "Title": "# {}",
        # Images have no Markdown representation here; keep the blank gap.
        "Image": "\n\n",
        "PageBreak": '<div class="pagebreak" />',
        "Header": "## {}",
        "Footer": "## {}",
        # Fences must sit on their own lines to form a code block.
        "CodeSnippet": "```\n{}\n```",
        "PageNumber": "Page No. {}",
        "UncategorizedText": "{}\n",
    }

    parts = []
    for element in self.elements:
        kind = element.element_type
        if kind == "Table":
            provider_data = element.provider_data
            table_html = None
            if provider_data and provider_data.get("type") == "Table":
                metadata = provider_data.get("metadata") or {}
                table_html = metadata.get("text_as_html")
            # Fall back to the raw text when no HTML rendition is available.
            if table_html:
                parts.append(table_html_to_markdown(table_html))
            else:
                parts.append(f"{element.text}")
        else:
            parts.append(templates.get(kind, "{}").format(element.text))
        parts.append("\n")
    return "".join(parts)
208
+
153
209
154
210
class TextractResponse (BaseModel ):
155
211
pages : List [Page ] = []
@@ -171,6 +227,14 @@ def formatted_text(self):
171
227
text += f"\n --- Page Break (Pg { page .page_no } )---\n "
172
228
return text
173
229
230
@property
def markdown(self):
    """Render every page in ``self.pages`` as one Markdown string.

    Each page's ``markdown`` is followed by a page-break ``<div>``. The
    marker is placed on its own line and followed by a blank line so that
    the first line of the next page is not absorbed into the HTML block.

    Returns:
        str: The concatenated Markdown of all pages; ``""`` when there are
        no pages.
    """
    page_break = '\n<div class="pagebreak" />\n\n'
    return "".join(f"{page.markdown}{page_break}" for page in self.pages)
237
+
174
238
175
239
class TextExtractionService (ABC ):
176
240
def __init__ (self , provider ) -> None :
@@ -242,6 +306,7 @@ def extract_from_bytes(self, file: bytes, **kwargs) -> TextractResponse:
242
306
bottom_right = (box [2 ].x , box [2 ].y ),
243
307
bottom_left = (box [3 ].x , box [3 ].y ),
244
308
),
309
+ element_type = "UncategorizedText" ,
245
310
)
246
311
page_element .set_midpoint_normalized (page_width , page_height )
247
312
page .elements .append (page_element )
0 commit comments