@@ -31,6 +31,18 @@ def table_html_to_text(table_html: str) -> str:
3131 return text
3232
3333
def table_html_to_markdown(table_html: str) -> str:
    """Convert an HTML table into a Markdown (GFM) table.

    Every ``<tr>`` that contains at least one ``<td>``/``<th>`` becomes one
    pipe-delimited line. The first such row is used as the header and is
    followed by the ``| --- |`` delimiter row, without which Markdown
    renderers show the rows as plain text instead of a table.

    Args:
        table_html: HTML markup containing the table rows.

    Returns:
        The Markdown table terminated by a newline, or ``""`` when the input
        is empty/``None`` or contains no cells.
    """
    # Nothing to parse: return early (also avoids handing None to the parser).
    if not table_html:
        return ""

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(table_html, "html.parser")

    rows = []
    for row in soup.find_all("tr"):
        cells = [
            # Collapse newlines/runs of whitespace and escape pipes so the
            # cell text cannot break the single-line row syntax.
            " ".join(cell.get_text().split()).replace("|", "\\|")
            for cell in row.find_all(["td", "th"])
        ]
        if cells:
            rows.append(cells)

    if not rows:
        return ""

    # Pad ragged rows so every line has the same number of columns.
    width = max(len(cells) for cells in rows)
    lines = []
    for index, cells in enumerate(rows):
        padded = cells + [""] * (width - len(cells))
        lines.append("| " + " | ".join(padded) + " |")
        if index == 0:
            # Header delimiter row required by the Markdown table syntax.
            lines.append("| " + " | ".join(["---"] * width) + " |")

    return "\n".join(lines) + "\n"
44+
45+
3446class TextCanvas :
3547 def __init__ (self , width : int , height : int ):
3648 self .width = width
@@ -150,6 +162,50 @@ def formatted_text(self):
150162 else :
151163 return "\n " .join ([element .formatted_text for element in self .elements ])
152164
@property
def markdown(self):
    """Render ``self.elements`` as a Markdown string.

    Each element is formatted according to its ``element_type`` and followed
    by a newline:

    * ``Formula`` / ``NarrativeText``: text followed by a blank line
    * ``FigureCaption``: bold text followed by a blank line
    * ``ListItem``: ``- text``
    * ``Title``: level-1 heading; ``Header`` / ``Footer``: level-2 heading
    * ``Image``: blank lines only
    * ``PageBreak``: ``<div class="pagebreak" />``
    * ``Table``: converted from ``provider_data["metadata"]["text_as_html"]``
      when available, otherwise the element text
    * ``CodeSnippet``: fenced code block
    * ``PageNumber``: ``Page No. <text>``
    * anything else: the element text as-is

    Returns:
        The concatenated Markdown for all elements (``""`` when there are
        none).
    """
    chunks = []
    for element in self.elements:
        kind = element.element_type
        if kind in ("Formula", "NarrativeText"):
            chunk = f"{element.text}\n\n"
        elif kind == "FigureCaption":
            chunk = f"**{element.text}**\n\n"
        elif kind == "ListItem":
            # Emit the item text with its bullet; a bare "-" would drop the
            # content and glue the next element onto the same line.
            chunk = f"- {element.text}"
        elif kind == "Title":
            chunk = f"# {element.text}"
        elif kind in ("Address", "EmailAddress"):
            chunk = f"{element.text}"
        elif kind == "Image":
            chunk = "\n\n"
        elif kind == "PageBreak":
            chunk = '<div class="pagebreak" />'
        elif kind == "Table":
            provider_data = element.provider_data or {}
            table_html = None
            if provider_data.get("type") == "Table":
                table_html = (provider_data.get("metadata") or {}).get("text_as_html")
            # Fall back to the plain text when no HTML rendering is attached,
            # instead of passing None to the HTML converter.
            if table_html:
                chunk = table_html_to_markdown(table_html)
            else:
                chunk = f"{element.text}"
        elif kind in ("Header", "Footer"):
            chunk = f"## {element.text}"
        elif kind == "CodeSnippet":
            # Fences go on their own lines so multi-line code closes properly.
            chunk = f"```\n{element.text}\n```"
        elif kind == "PageNumber":
            chunk = f"Page No. {element.text}"
        elif kind == "UncategorizedText":
            chunk = f"{element.text}\n"
        else:
            chunk = element.text
        chunks.append(chunk + "\n")
    return "".join(chunks)
208+
153209
154210class TextractResponse (BaseModel ):
155211 pages : List [Page ] = []
@@ -171,6 +227,14 @@ def formatted_text(self):
171227 text += f"\n --- Page Break (Pg { page .page_no } )---\n "
172228 return text
173229
@property
def markdown(self):
    """Render every page in ``self.pages`` as one Markdown string.

    Each page's ``markdown`` is followed by a ``<div class="pagebreak" />``
    marker. The marker is terminated by a blank line so that the next page's
    first element (for example a ``# `` heading) starts on its own line
    rather than being appended to the ``<div>`` line.

    Returns:
        The concatenated Markdown of all pages (``""`` when there are none).
    """
    page_break = '<div class="pagebreak" />\n\n'
    return "".join(f"{page.markdown}{page_break}" for page in self.pages)
237+
174238
175239class TextExtractionService (ABC ):
176240 def __init__ (self , provider ) -> None :
@@ -242,6 +306,7 @@ def extract_from_bytes(self, file: bytes, **kwargs) -> TextractResponse:
242306 bottom_right = (box [2 ].x , box [2 ].y ),
243307 bottom_left = (box [3 ].x , box [3 ].y ),
244308 ),
309+ element_type = "UncategorizedText" ,
245310 )
246311 page_element .set_midpoint_normalized (page_width , page_height )
247312 page .elements .append (page_element )
0 commit comments