Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
69d420c
resolve nest table
Feb 25, 2025
e7c3792
update extract table
Feb 25, 2025
f0347ff
remove table tail
Feb 25, 2025
5e17694
normalize line endings
Feb 25, 2025
c15dea1
update test case
Feb 25, 2025
d34a8a7
update format
Feb 25, 2025
87a2495
update format
Feb 25, 2025
9861090
update format
Feb 25, 2025
a77735f
change parse order
Feb 25, 2025
419b2c1
add list nest level
Feb 25, 2025
d46cb64
Merge branch 'ccprocessor:dev' into dev
dt-yy Feb 25, 2025
c40b1ea
fix pylint
Feb 25, 2025
06d251f
Merge branch 'dev' of https://github.com/dt-yy/llm-webkit-mirror into…
Feb 26, 2025
d863de3
Merge branch 'ccprocessor:dev' into dev
dt-yy Feb 27, 2025
6c7ca2d
update table nest spec.md
Feb 27, 2025
9785e05
Merge branch 'ccprocessor:dev' into dev
dt-yy Feb 28, 2025
9ef8f8a
Merge branch 'ccprocessor:dev' into dev
dt-yy Feb 28, 2025
9e15452
update parse order
Mar 3, 2025
4a61728
update parse order
Mar 3, 2025
0e239dc
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 3, 2025
1b0e1e9
update parse order
Mar 3, 2025
78ca028
update list标准
Mar 3, 2025
9eb13f5
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 3, 2025
efcd7a2
add table involve inline code
Mar 3, 2025
0776f6e
add test case
Mar 3, 2025
c04b2d8
Merge branch 'dev' of https://github.com/dt-yy/llm-webkit-mirror into…
Mar 3, 2025
3fda2a6
fix test case
Mar 3, 2025
a3b0bf6
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 5, 2025
e0196bf
add table tail
dt-yy Mar 5, 2025
f961665
加上table的tail处理
dt-yy Mar 5, 2025
cdae7a1
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 5, 2025
cdfd98c
add table tail test
dt-yy Mar 5, 2025
f8b523a
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 5, 2025
44844ee
fix test case
dt-yy Mar 5, 2025
8d8cfce
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 6, 2025
94f627f
remove enter in table
dt-yy Mar 6, 2025
d90dd07
:wq
dt-yy Mar 6, 2025
95120d0
remove print
dt-yy Mar 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llm_web_kit/extractor/html/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ def __get_cc_node(self, html:str) -> (str, str):
nodes = el.xpath(xpath_expr)
if len(nodes) == 0:
raise ValueError(f'html文本中没有cc标签: {html}') # TODO 异常处理
if len(nodes) > 1:
if len(nodes) > 2:
raise ValueError(f'html文本中包含多个cc标签: {html}') # TODO 异常处理
return element_to_html(nodes[0]), nodes[0].tag

Expand Down
17 changes: 15 additions & 2 deletions llm_web_kit/libs/html_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import html
import re
from copy import deepcopy

from lxml.html import HtmlElement, HTMLParser, fromstring, tostring
Expand Down Expand Up @@ -114,6 +115,18 @@ def iter_node(element: HtmlElement):
yield from iter_node(sub_element)


def _escape_table_cell(text: str) -> str:
"""转义表格单元格中的特殊字符.

比如 |、内容中的\n等
"""
# 首先处理换行符,将其替换为空格
text = re.sub(r'[\r\n]+', ' ', text)
# 转义竖线和点号,避免与markdown表格语法冲突
escaped = text.replace('|', '\\|')
return escaped


def html_to_markdown_table(table_html_source: str) -> str:
"""把html代码片段转换成markdown表格.

Expand All @@ -140,7 +153,7 @@ def html_to_markdown_table(table_html_source: str) -> str:

# 检查第一行是否是表头并获取表头内容
first_row_tags = rows[0].xpath('.//th | .//td')
headers = [tag.text_content().strip() for tag in first_row_tags]
headers = [_escape_table_cell(tag.text_content().strip()) for tag in first_row_tags]
# 如果表头存在,添加表头和分隔符,并保证表头与最大列数对齐
if headers:
while len(headers) < max_cols:
Expand All @@ -155,7 +168,7 @@ def html_to_markdown_table(table_html_source: str) -> str:

# 添加表格内容,跳过已被用作表头的第一行(如果有的话)
for row in rows[1:]:
columns = [td.text_content().strip() for td in row.xpath('.//td | .//th')]
columns = [_escape_table_cell(td.text_content().strip()) for td in row.xpath('.//td | .//th')]
# 如果这一行的列数少于最大列数,则补充空白单元格
while len(columns) < max_cols:
columns.append('')
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@
{"track_id": "legato_doc", "dataset_name": "test_pipeline_suit", "url": "https://www.test.com","data_source_category": "HTML", "path":"legato_docs.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "oracle_doc", "dataset_name": "test_pipeline_suit", "url": "https://docs.oracle.com/en-us/iaas/tools/java/3.57.1/com/oracle/bmc/integration/model/CustomEndpointDetails.html","data_source_category": "HTML", "path":"oracle_doc.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "table_involve_inline_code", "dataset_name": "test_table_involve_inline_code", "url": "https://docs.oracle.com/en-us/iaas/tools/java/3.57.1/com/oracle/bmc/integration/model/CustomEndpointDetails.html","data_source_category": "HTML", "path":"table_involve_inline_code.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "table_tail_text", "dataset_name": "test_table_tail_text", "url": "https://dchublists.com/?do=hublist&id=hub-975&language=en","data_source_category": "HTML", "path":"table_tail_text.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "table_tail_text", "dataset_name": "test_table_tail_text", "url": "https://dchublists.com/?do=hublist&id=hub-975&language=en","data_source_category": "HTML", "path":"table_tail_text.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "table_elem_include_enter", "dataset_name": "table_elem_include_enter", "url": "https://fardapaper.ir/financial-development-equity-capital","data_source_category": "HTML", "path":"test_table_elem_include_enter.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/llm_web_kit/extractor/html/recognizer/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ def test_table_involve_after_code(self):
parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html)
assert html_to_element(parts[0][0]).xpath(f'.//{CCTag.CC_TABLE}')[0].text is None

@unittest.skip(reason='在code模块解决了table嵌套多行代码问题')
def test_table_involve_code(self):
"""table involve code."""
for test_case in TEST_CASES:
Expand Down
19 changes: 18 additions & 1 deletion tests/llm_web_kit/extractor/test_extractor_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def setUp(self):
for line in f:
self.data_json.append(json.loads(line.strip()))

assert len(self.data_json) == 13
assert len(self.data_json) == 14

# Config for HTML extraction
self.config = {
Expand Down Expand Up @@ -369,4 +369,21 @@ def test_table_tail_text(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
print(content_md)
assert '| ID: 975' in content_md

def test_table_element_include_enter(self):
    """Table cells whose text contains line breaks stay on one markdown row.

    Runs the extractor chain on the fixture at index 13 of ``self.data_json``
    and checks that the expected markdown table appears verbatim in the
    rendered output.
    """
    chain = ExtractSimpleFactory.create(self.config)
    self.assertIsNotNone(chain)
    # NOTE(review): index 13 is presumably the "table_elem_include_enter"
    # fixture record — confirm against the JSONL test data.
    test_data = self.data_json[13]
    input_data = DataJson(test_data)
    result = chain.extract(input_data)
    content_md = result.get_content_list().to_mm_md()
    # Debug print removed: the assertion already reports a failure.
    assert """| عنوان فارسی | توسعه مالی و هزینه سرمایه حقوق سهامداران: شواهدی از چین |
|---|---|
| عنوان انگلیسی | Financial development and the cost of equity capital: Evidence from China |
| کلمات کلیدی : | &nbsp توسعه مالی؛ هزینه سرمایه حقوق سهامداران؛ قانون و امور مالی؛ چین |
| درسهای مرتبط | حسابداری |""" in content_md
Loading