Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
69d420c
resolve nest table
Feb 25, 2025
e7c3792
update extract table
Feb 25, 2025
f0347ff
remove table tail
Feb 25, 2025
5e17694
normalize line endings
Feb 25, 2025
c15dea1
update test case
Feb 25, 2025
d34a8a7
update format
Feb 25, 2025
87a2495
update format
Feb 25, 2025
9861090
update format
Feb 25, 2025
a77735f
change parse order
Feb 25, 2025
419b2c1
add list nest level
Feb 25, 2025
d46cb64
Merge branch 'ccprocessor:dev' into dev
dt-yy Feb 25, 2025
c40b1ea
fix pylint
Feb 25, 2025
06d251f
Merge branch 'dev' of https://github.com/dt-yy/llm-webkit-mirror into…
Feb 26, 2025
d863de3
Merge branch 'ccprocessor:dev' into dev
dt-yy Feb 27, 2025
6c7ca2d
update table nest spec.md
Feb 27, 2025
9785e05
Merge branch 'ccprocessor:dev' into dev
dt-yy Feb 28, 2025
9ef8f8a
Merge branch 'ccprocessor:dev' into dev
dt-yy Feb 28, 2025
9e15452
update parse order
Mar 3, 2025
4a61728
update parse order
Mar 3, 2025
0e239dc
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 3, 2025
1b0e1e9
update parse order
Mar 3, 2025
78ca028
update list标准
Mar 3, 2025
9eb13f5
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 3, 2025
efcd7a2
add table involve inline code
Mar 3, 2025
0776f6e
add test case
Mar 3, 2025
c04b2d8
Merge branch 'dev' of https://github.com/dt-yy/llm-webkit-mirror into…
Mar 3, 2025
3fda2a6
fix test case
Mar 3, 2025
a3b0bf6
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 5, 2025
e0196bf
add table tail
dt-yy Mar 5, 2025
f961665
加上table的tail处理
dt-yy Mar 5, 2025
cdae7a1
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 5, 2025
cdfd98c
add table tail test
dt-yy Mar 5, 2025
f8b523a
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 5, 2025
44844ee
fix test case
dt-yy Mar 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 15 additions & 12 deletions llm_web_kit/extractor/html/recognizer/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,11 +196,14 @@ def __check_table_include_math_code(self, raw_html: HtmlElement):
]
ele_res.extend(ccinterline_codes)
else:
ele_res.extend([
text.strip()
for text in self._build_html_tree(math_item[1]).itertext()
if text.strip()
])
texts = []
# 使用 itertext() 遍历所有文本片段
for text_segment in ele_item.itertext():
# 统一处理文本:去空白 + 替换字面 \n
cleaned_text = text_segment.strip().replace('\\n', '')
if cleaned_text: # 过滤空字符串
texts.append(cleaned_text)
ele_res.extend(texts)
return ele_res

def __simplify_td_th_content(self, elem: HtmlElement) -> None:
Expand All @@ -212,7 +215,8 @@ def __simplify_td_th_content(self, elem: HtmlElement) -> None:
parse_res.extend(math_res)
for item in list(elem.iterchildren()):
elem.remove(item)
elem.text = '<br>'.join(parse_res)
if parse_res:
elem.text = '<br>'.join(parse_res)
return
for child in elem.iter('td', 'th'):
self.__simplify_td_th_content(child)
Expand All @@ -227,18 +231,17 @@ def __get_table_body(self, table_type, table_root):
cleaned_attrs = {k: v for k, v in table_root.attrib.items() if k in allowed_attributes}
table_root.attrib.clear()
table_root.attrib.update(cleaned_attrs)
# text进行strip操作,tail去掉(有较多空换行)
# text进行strip操作,tail保留(部分内容留在tail中)
for elem in chain([table_root], table_root.iterdescendants()):
if elem.text:
elem.text = elem.text.strip()
if elem.tail:
elem.tail = None
if elem.text is not None:
elem.text = elem.text.strip().replace('\\n', '')
if elem.tail is not None:
elem.tail = elem.tail.strip().replace('\\n', '')
self.__simplify_td_th_content(table_root)
# 迭代
for child in table_root.iterchildren():
if child is not None:
self.__get_table_body(table_type, child)

return self._element_to_html(table_root)

def __do_extract_tables(self, root: HtmlElement) -> None:
Expand Down
Loading