Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
69d420c
resolve nest table
Feb 25, 2025
e7c3792
update extract table
Feb 25, 2025
f0347ff
remove table tail
Feb 25, 2025
5e17694
normalize line endings
Feb 25, 2025
c15dea1
update test case
Feb 25, 2025
d34a8a7
update format
Feb 25, 2025
87a2495
update format
Feb 25, 2025
9861090
update format
Feb 25, 2025
a77735f
change parse order
Feb 25, 2025
419b2c1
add list nest level
Feb 25, 2025
d46cb64
Merge branch 'ccprocessor:dev' into dev
dt-yy Feb 25, 2025
c40b1ea
fix pylint
Feb 25, 2025
06d251f
Merge branch 'dev' of https://github.com/dt-yy/llm-webkit-mirror into…
Feb 26, 2025
d863de3
Merge branch 'ccprocessor:dev' into dev
dt-yy Feb 27, 2025
6c7ca2d
update table nest spec.md
Feb 27, 2025
9785e05
Merge branch 'ccprocessor:dev' into dev
dt-yy Feb 28, 2025
9ef8f8a
Merge branch 'ccprocessor:dev' into dev
dt-yy Feb 28, 2025
9e15452
update parse order
Mar 3, 2025
4a61728
update parse order
Mar 3, 2025
0e239dc
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 3, 2025
1b0e1e9
update parse order
Mar 3, 2025
78ca028
update list标准
Mar 3, 2025
9eb13f5
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 3, 2025
efcd7a2
add table involve inline code
Mar 3, 2025
0776f6e
add test case
Mar 3, 2025
c04b2d8
Merge branch 'dev' of https://github.com/dt-yy/llm-webkit-mirror into…
Mar 3, 2025
3fda2a6
fix test case
Mar 3, 2025
a3b0bf6
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 5, 2025
e0196bf
add table tail
dt-yy Mar 5, 2025
f961665
加上table的tail处理
dt-yy Mar 5, 2025
cdae7a1
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 5, 2025
cdfd98c
add table tail test
dt-yy Mar 5, 2025
f8b523a
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 5, 2025
44844ee
fix test case
dt-yy Mar 5, 2025
8d8cfce
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 6, 2025
94f627f
remove enter in table
dt-yy Mar 6, 2025
d90dd07
:wq
dt-yy Mar 6, 2025
95120d0
remove print
dt-yy Mar 6, 2025
1c5ff71
remove print
dt-yy Mar 6, 2025
371800e
add exception
dt-yy Mar 6, 2025
11d0968
fix pylint
dt-yy Mar 6, 2025
d3f995f
修复table&list问题
dt-yy Mar 7, 2025
a7daf28
修复元素识别table&list问题
dt-yy Mar 7, 2025
07f1de4
fix pylint
dt-yy Mar 7, 2025
dbe26d6
解决list和table等问题
dt-yy Mar 7, 2025
661f294
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 10, 2025
45071b3
add list test
dt-yy Mar 10, 2025
50468bc
add list test
dt-yy Mar 10, 2025
77532d4
add list test
dt-yy Mar 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 36 additions & 11 deletions llm_web_kit/extractor/html/recognizer/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from lxml.etree import _Element as HtmlElement
from overrides import override

from llm_web_kit.exception.exception import HtmlListRecognizerException
from llm_web_kit.extractor.html.recognizer.recognizer import (
BaseHTMLElementRecognizer, CCTag)
from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType
Expand All @@ -22,13 +23,14 @@

Returns:
"""
ordered, content_list, _ = self.__get_attribute(parsed_content)
ordered, content_list, _, list_nest_level = self.__get_attribute(parsed_content)
ele_node = {
'type': DocElementType.LIST,
'raw_content': raw_html_segment,
'content': {
'items': content_list,
'ordered': ordered
'ordered': ordered,
'list_nest_level': list_nest_level
}
}

Expand Down Expand Up @@ -148,12 +150,35 @@
return list_nest_level, is_ordered, content_list, raw_html, tail_text

def __get_list_type(self, list_ele:HtmlElement) -> int:
    """Return the maximum nesting depth of a list element.

    ``list_ele`` itself always counts as level 1 (its own tag is not
    checked). Every list tag (``ul``, ``ol``, ``dl``, ``menu``, ``dir``)
    met on the way down adds one level; other elements such as ``li`` or
    ``dd`` are walked through without adding a level. The result is the
    largest level reached on any descendant chain.

    Examples:
        - a list without nested lists returns 1
        - a list with one level of nesting returns 2
        - a list with two levels of nesting returns 3

    Args:
        list_ele: the list HTML element to measure.

    Returns:
        int: the maximum nesting depth of the list.
    """
    list_tags = ('ul', 'ol', 'dl', 'menu', 'dir')

    deepest = 0
    # Explicit stack instead of recursion, so a very deep DOM cannot hit the
    # interpreter recursion limit. Each entry pairs an element with the
    # number of nested list tags between ``list_ele`` and that element
    # (the element itself included).
    pending = [(list_ele, 0)]
    while pending:
        node, depth = pending.pop()
        for child in node.iterchildren():
            # Only list tags open a new level; wrappers inherit the depth.
            child_depth = depth + 1 if child.tag in list_tags else depth
            if child_depth > deepest:
                deepest = child_depth
            pending.append((child, child_depth))
    # +1 accounts for ``list_ele`` itself.
    return deepest + 1

def __extract_list_item_text(self, root:HtmlElement) -> list[list]:
"""提取列表项的文本.
Expand Down Expand Up @@ -208,7 +233,7 @@
ordered = ele.attrib.get('ordered', 'False') in ['True', 'true']
content_list = json.loads(ele.text)
raw_html = ele.attrib.get('html')
return ordered, content_list, raw_html
list_nest_level = ele.attrib.get('list_nest_level', 0)
return ordered, content_list, raw_html, list_nest_level
else:
# TODO 抛出异常, 需要自定义
raise ValueError(f'{html}中没有cctitle标签')
raise HtmlListRecognizerException(f'{html}中没有cctitle标签')

Check warning on line 239 in llm_web_kit/extractor/html/recognizer/list.py

View check run for this annotation

Codecov / codecov/patch

llm_web_kit/extractor/html/recognizer/list.py#L239

Added line #L239 was not covered by tests
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
<!-- 第1层: dl 列表 -->
<dl>
<dt>外层列表项</dt>
<dd>
<!-- 第2层: ol 列表 -->
<ol>
<li>第二层列表项
<!-- 第3层: ul 列表 -->
<ul>
<li>第三层列表项 1</li>
<li>第三层列表项 2</li>
</ul>
</li>
<li>第二层其他项</li>
</ol>
</dd>

<dt>外层另一个列表项</dt>
<dd>
<!-- 第2层: menu 列表 -->
<menu>
<li>第二层菜单项
<!-- 第3层: dir 列表 -->
<dir>
<li>第三层目录项</li>
</dir>
</li>
</menu>
</dd>
</dl>
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@
{"track_id": "list_empty", "dataset_name": "test_list_empty", "url": "https://productcenter.ru/products/27276/naturalnoie-krymskoie-mylo-ruchnoi-raboty-39-raznovidnostiei","data_source_category": "HTML", "path":"test_list_empty.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "table_include_math_p", "dataset_name": "table_include_math_p", "url": "https://math.stackexchange.com/questions/458323/is-8327-1-a-prime-number?answertab=active","data_source_category": "HTML", "path":"table_include_math_p.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "table_include_table_math", "dataset_name": "table_include_table_math", "url": "https://test","data_source_category": "HTML", "path":"table_include_table_math.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "test_clean_tags", "dataset_name": "test_pipeline_suit", "url": "https://math.stackexchange.com/questions/4082284/solving-for-vector-contained-in-a-diagonal-matrix","data_source_category": "HTML", "path":"test_clean_tags.html", "file_bytes": 1000, "page_layout_type":"forum", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "test_clean_tags", "dataset_name": "test_pipeline_suit", "url": "https://math.stackexchange.com/questions/4082284/solving-for-vector-contained-in-a-diagonal-matrix","data_source_category": "HTML", "path":"test_clean_tags.html", "file_bytes": 1000, "page_layout_type":"forum", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "list_nest_three", "dataset_name": "list_nest_three", "url": "http://test.com","data_source_category": "HTML", "path":"list_nest_three.html", "file_bytes": 1000, "page_layout_type":"forum", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
12 changes: 11 additions & 1 deletion tests/llm_web_kit/extractor/test_extractor_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def setUp(self):
for line in f:
self.data_json.append(json.loads(line.strip()))

assert len(self.data_json) == 18
assert len(self.data_json) == 19

# Config for HTML extraction
self.config = {
Expand Down Expand Up @@ -434,3 +434,13 @@ def test_clean_tags(self):
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
self.assertNotIn('begingroup', content_md)

def test_list_nest_three(self):
    """Check that a three-level nested list reports ``list_nest_level`` 3.

    Runs the extractor chain on entry 18 of the loaded test data and reads
    the ``list_nest_level`` field from element ``[0][0]`` of the resulting
    content list.
    """
    chain = ExtractSimpleFactory.create(self.config)
    self.assertIsNotNone(chain)
    test_data = self.data_json[18]
    input_data = DataJson(test_data)
    result = chain.extract(input_data)
    result_content_list = result.get_content_list()._get_data()
    # int() is kept from the original check: the stored level is converted
    # before comparison, so a string value such as '3' is accepted too.
    nest_level = int(result_content_list[0][0]['content']['list_nest_level'])
    # unittest assertion rather than a bare ``assert``: it reports both
    # values on failure and is not stripped when Python runs with -O.
    self.assertEqual(nest_level, 3)