Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
67 commits
Select commit Hold shift + click to select a range
69d420c
resolve nest table
Feb 25, 2025
e7c3792
update extract table
Feb 25, 2025
f0347ff
remove table tail
Feb 25, 2025
5e17694
normalize line endings
Feb 25, 2025
c15dea1
update test case
Feb 25, 2025
d34a8a7
update format
Feb 25, 2025
87a2495
update format
Feb 25, 2025
9861090
update format
Feb 25, 2025
a77735f
change parse order
Feb 25, 2025
419b2c1
add list nest level
Feb 25, 2025
d46cb64
Merge branch 'ccprocessor:dev' into dev
dt-yy Feb 25, 2025
c40b1ea
fix pylint
Feb 25, 2025
06d251f
Merge branch 'dev' of https://github.com/dt-yy/llm-webkit-mirror into…
Feb 26, 2025
d863de3
Merge branch 'ccprocessor:dev' into dev
dt-yy Feb 27, 2025
6c7ca2d
update table nest spec.md
Feb 27, 2025
9785e05
Merge branch 'ccprocessor:dev' into dev
dt-yy Feb 28, 2025
9ef8f8a
Merge branch 'ccprocessor:dev' into dev
dt-yy Feb 28, 2025
9e15452
update parse order
Mar 3, 2025
4a61728
update parse order
Mar 3, 2025
0e239dc
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 3, 2025
1b0e1e9
update parse order
Mar 3, 2025
78ca028
update list标准
Mar 3, 2025
9eb13f5
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 3, 2025
efcd7a2
add table involve inline code
Mar 3, 2025
0776f6e
add test case
Mar 3, 2025
c04b2d8
Merge branch 'dev' of https://github.com/dt-yy/llm-webkit-mirror into…
Mar 3, 2025
3fda2a6
fix test case
Mar 3, 2025
a3b0bf6
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 5, 2025
e0196bf
add table tail
dt-yy Mar 5, 2025
f961665
加上table的tail处理
dt-yy Mar 5, 2025
cdae7a1
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 5, 2025
cdfd98c
add table tail test
dt-yy Mar 5, 2025
f8b523a
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 5, 2025
44844ee
fix test case
dt-yy Mar 5, 2025
8d8cfce
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 6, 2025
94f627f
remove enter in table
dt-yy Mar 6, 2025
d90dd07
:wq
dt-yy Mar 6, 2025
95120d0
remove print
dt-yy Mar 6, 2025
1c5ff71
remove print
dt-yy Mar 6, 2025
371800e
add exception
dt-yy Mar 6, 2025
11d0968
fix pylint
dt-yy Mar 6, 2025
d3f995f
修复table&list问题
dt-yy Mar 7, 2025
a7daf28
修复元素识别table&list问题
dt-yy Mar 7, 2025
07f1de4
fix pylint
dt-yy Mar 7, 2025
dbe26d6
解决list和table等问题
dt-yy Mar 7, 2025
661f294
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 10, 2025
45071b3
add list test
dt-yy Mar 10, 2025
50468bc
add list test
dt-yy Mar 10, 2025
77532d4
add list test
dt-yy Mar 10, 2025
b836984
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 10, 2025
1cefea5
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 13, 2025
b173d3c
improve performence
dt-yy Mar 13, 2025
cea3c48
update html strucr
dt-yy Mar 13, 2025
fb17ca3
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 13, 2025
e07a119
improve performence
dt-yy Mar 17, 2025
92ac356
remove more file
dt-yy Mar 17, 2025
fd4eb0d
update cccode
dt-yy Mar 17, 2025
b7eaf83
update perf improvement
dt-yy Mar 17, 2025
f6f880e
fix pylint
dt-yy Mar 17, 2025
10b7480
fix pylint
dt-yy Mar 17, 2025
8d44af6
update perf
dt-yy Mar 17, 2025
b3723b8
update perf
dt-yy Mar 17, 2025
a31953b
feat: fast tag code extract (#1)
NgZiming Mar 18, 2025
4c4587b
fix: wrong tag nums (#2)
NgZiming Mar 18, 2025
48329cd
improve perf
dt-yy Mar 20, 2025
7aec524
Merge branch 'dev' of https://github.com/dt-yy/llm-webkit-mirror into…
dt-yy Mar 20, 2025
d05fc10
improve perf
dt-yy Mar 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 22 additions & 16 deletions llm_web_kit/extractor/html/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import List, Tuple

import commentjson as json
from lxml.html import HtmlElement
from overrides import override

from llm_web_kit.config.cfg_reader import load_config
Expand All @@ -20,6 +21,7 @@
from llm_web_kit.extractor.html.recognizer.title import TitleRecognizer
from llm_web_kit.extractor.html.recognizer.video import VideoRecognizer
from llm_web_kit.input.datajson import ContentList, DataJson
from llm_web_kit.libs.doc_element_type import DocElementType
from llm_web_kit.libs.html_utils import element_to_html, html_to_element
from llm_web_kit.libs.path_lib import get_py_pkg_root_dir

Expand Down Expand Up @@ -92,12 +94,12 @@ def _do_extract(self, data_json: DataJson) -> DataJson:
page_layout_type:str = data_json.get('page_layout_type', HTMLPageLayoutType.LAYOUT_ARTICLE) # 默认是文章类型

main_html, method, title = self._extract_main_html(raw_html, base_url, page_layout_type)
parsed_html = [(main_html,raw_html)]
main_html_element = html_to_element(main_html)
parsed_html = [(main_html_element, raw_html)]
for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list,
self._extract_image,
self._extract_title, self._extract_paragraph]:
parsed_html = extract_func(base_url, parsed_html, raw_html)

content_list:ContentList = self._export_to_content_list(base_url, parsed_html, raw_html)
data_json['content_list'] = content_list
data_json['title'] = title
Expand All @@ -119,7 +121,7 @@ def _extract_main_html(self, raw_html:str, base_url:str, page_layout_type:str) -
dict_result = self.__magic_html_extractor.extract(raw_html, base_url=base_url, precision=False, html_type=page_layout_type)
return dict_result['html'], dict_result['xp_num'], dict_result.get('title', '')

def _extract_code(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]:
def _extract_code(self, base_url:str, html_lst:List[Tuple[HtmlElement, HtmlElement]], raw_html:str) -> List[Tuple[HtmlElement,HtmlElement]]:
"""从html文本中提取代码.

Args:
Expand Down Expand Up @@ -256,43 +258,43 @@ def __is_valid_node(self, node: dict) -> bool:
if not node:
raise HtmlFileExtractorException('node is empty')
node_type = node.get('type')
valid_types = {'list', 'code', 'equation-interline', 'image', 'table', 'title', 'paragraph'}
valid_types = {DocElementType.TITLE, DocElementType.LIST, DocElementType.CODE, DocElementType.EQUATION_INTERLINE, DocElementType.IMAGE, DocElementType.SIMPLE_TABLE, DocElementType.COMPLEX_TABLE, DocElementType.IMAGE, DocElementType.PARAGRAPH}
if node_type not in valid_types:
raise HtmlFileExtractorException(f'Invalid node type: {node_type}')
# 检查列表类型的节点
if node.get('type') == 'list':
if node.get('type') == DocElementType.LIST:
items = node.get('content', {}).get('items', [])
# 过滤掉None、空列表,以及只包含None或空值的列表
return bool(items) and any(
isinstance(item, (dict, list)) and bool(item)
for item in items)
# 检测code类型的节点
if node.get('type') == 'code':
if node.get('type') == DocElementType.CODE:
code_content = node.get('content', {}).get('code_content')
# 如果代码内容为None或空字符串,则视为无效节点
return bool(code_content and code_content.strip())
# 检测行间公式类型的节点
if node.get('type') == 'equation-interline':
if node.get('type') == DocElementType.EQUATION_INTERLINE:
math_content = node.get('content', {}).get('math_content')
# 如果公式内容为None或空字符串,则视为无效节点
return bool(math_content and math_content.strip())
# 检测image类型的节点
if node.get('type') == 'image':
if node.get('type') == DocElementType.IMAGE:
content = node.get('content', {})
# 检查url、path或data字段是否至少有一个不为空
return bool(content.get('url') or content.get('path') or content.get('data'))
# 检测table类型的节点
if node.get('type') == 'table':
if node.get('type') == DocElementType.SIMPLE_TABLE or node.get('type') == DocElementType.COMPLEX_TABLE:
html = node.get('content', {}).get('html')
# 如果表格的html内容为None或空字符串,则视为无效节点
return bool(html and html.strip())
# 检测title类型的节点
if node.get('type') == 'title':
if node.get('type') == DocElementType.TITLE:
title_content = node.get('content', {}).get('title_content')
# 如果标题内容为None或空字符串,则视为无效节点
return bool(title_content and title_content.strip())
# 检测段落类型的节点
if node.get('type') == 'paragraph':
if node.get('type') == DocElementType.PARAGRAPH:
content = node.get('content', [])
# 检查content列表是否存在且不为空,并且至少有一个非空的内容项
return bool(content) and any(
Expand All @@ -301,7 +303,7 @@ def __is_valid_node(self, node: dict) -> bool:
)
return True

def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> ContentList:
def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[HtmlElement,HtmlElement]], raw_html:str) -> ContentList:
"""将解析结果存入content_list格式中.

Args:
Expand All @@ -318,15 +320,17 @@ def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[str,str]], r
ccnode_html, cc_tag = self.__get_cc_node(parsed_html)
parser:BaseHTMLElementRecognizer = self.__to_content_list_mapper.get(cc_tag)
if parser:
node = parser.to_content_list_node(base_url, ccnode_html, raw_html)
raw_html_str = element_to_html(raw_html)
# raw_html_str = raw_html
node = parser.to_content_list_node(base_url, ccnode_html, raw_html_str)
if node and self.__is_valid_node(node):
one_page.append(node)
else:
raise HtmlFileExtractorException(f'无法识别的html标签:{cc_tag}, {parsed_html}')
content_list = ContentList([one_page]) # 对于网页来说仅有一页,如果多页,则剩下的每个都是一个论坛的回复
return content_list

def __get_cc_node(self, html:str) -> (str, str):
def __get_cc_node(self, html:HtmlElement) -> (HtmlElement, str):
"""获取html文本的根标签名。只获取一个,如果html文本中包含多个cc标签,则抛异常。

Args:
Expand All @@ -335,7 +339,8 @@ def __get_cc_node(self, html:str) -> (str, str):
Returns:
str: 根标签名
"""
el = html_to_element(html)
# el = html_to_element(html)
el = html
if el.tag in self.__to_content_list_mapper.keys():
return html, el.tag
else:
Expand All @@ -346,7 +351,8 @@ def __get_cc_node(self, html:str) -> (str, str):
raise HtmlFileExtractorException(f'html文本中没有cc标签: {html}')
if len(nodes) > 3:
raise HtmlFileExtractorException(f'html文本中包含多个cc标签: {html}')
return element_to_html(nodes[0]), nodes[0].tag
# return element_to_html(nodes[0]), nodes[0].tag
return nodes[0], nodes[0].tag

def __build_extractor(self):
"""
Expand Down
15 changes: 13 additions & 2 deletions llm_web_kit/extractor/html/recognizer/audio.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import List, Tuple

from lxml.html import HtmlElement
from overrides import override

from llm_web_kit.extractor.html.recognizer.recognizer import \
Expand All @@ -9,7 +10,7 @@
class AudioRecognizer(BaseHTMLElementRecognizer):
"""解析音频元素."""
@override
def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]:
def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement,HtmlElement]], raw_html:str) -> List[Tuple[HtmlElement,HtmlElement]]:
"""父类,解析音频元素.

Args:
Expand All @@ -22,5 +23,15 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html:
raise NotImplementedError

@override
def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict:
def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict:
"""
把音频元素转换为content list node.
Args:
base_url:
parsed_content:
raw_html_segment:

Returns:

"""
raise NotImplementedError
8 changes: 4 additions & 4 deletions llm_web_kit/extractor/html/recognizer/cc_math/common.py
Comment thread
e06084 marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,10 @@ class MATH_TYPE_PATTERN:
['\\[', '\\]'],
['$$', '$$'],
['[tex]', '[/tex]'], # 这个网站自定义的分割,https://www.physicsforums.com/threads/turning-to-a-single-logarithm-then-simply.269419/
['\\begin{equation}', '\\end{equation}'],
['\\begin{align}', '\\end{align}'],
['\\begin{alignat}', '\\end{alignat}'],
['\\begin{array}', '\\end{array}'],
# ['\\begin{equation}', '\\end{equation}'],
# ['\\begin{align}', '\\end{align}'],
# ['\\begin{alignat}', '\\end{alignat}'],
# ['\\begin{array}', '\\end{array}'],
# 添加通用的begin/end匹配
['\\begin{.*?}', '\\end{.*?}'],
],
Expand Down
38 changes: 21 additions & 17 deletions llm_web_kit/extractor/html/recognizer/cccode.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,17 @@
tag_pre_code)
from llm_web_kit.extractor.html.recognizer.recognizer import (
BaseHTMLElementRecognizer, CCTag)
from llm_web_kit.libs.html_utils import element_to_html, html_to_element


class CodeRecognizer(BaseHTMLElementRecognizer):
"""解析代码元素."""

@override
def recognize(
self,
base_url: str,
main_html_lst: List[Tuple[str, str]],
raw_html: str,
) -> List[Tuple[str, str]]:
main_html_lst: List[Tuple[HtmlElement, HtmlElement]],
raw_html: str
) -> List[Tuple[HtmlElement, HtmlElement]]:
"""父类,解析代码元素.

Args:
Expand All @@ -38,7 +36,8 @@ def recognize(
if self.is_cc_html(html):
rtn.append((html, raw_html))
continue
root: HtmlElement = html_to_element(html)
# root: HtmlElement = html_to_element(html)
root = html
while True:
# 最常见:
# <pre><code></code></pre>
Expand Down Expand Up @@ -77,31 +76,36 @@ def remove_empty_code(r: HtmlElement):
remove_empty_code(x)

remove_empty_code(root)

html_str: str = element_to_html(root)

rtn.extend(BaseHTMLElementRecognizer.html_split_by_tags(html_str, CCTag.CC_CODE))

# html_str: str = element_to_html(root)
rtn.extend(BaseHTMLElementRecognizer.html_split_by_tags(root, CCTag.CC_CODE))
return rtn

@override
def to_content_list_node(self, base_url:str, parsed_content: str, raw_html_segment:str) -> dict:
code_node: HtmlElement = html_to_element(parsed_content)
def to_content_list_node(self, base_url:str, parsed_content: HtmlElement, raw_html_segment:str) -> dict:
"""
把代码元素转换为content list node.
Args:
base_url:
parsed_content: HtmlElement对象
raw_html_segment:

Returns:

"""
d = {
'type': 'code',
# "bbox": [],
'raw_content': raw_html_segment,
'inline': code_node.get('inline', 'false') == 'true',
'inline': parsed_content.get('inline', 'false') == 'true',
'content': {
'code_content': code_node.text,
'code_content': parsed_content.text,
},
}

if lang := code_node.get('language', None):
if lang := parsed_content.get('language', None):
d['content']['language'] = lang

if by := code_node.get('by', None):
if by := parsed_content.get('by', None):
d['content']['by'] = by

return d
16 changes: 8 additions & 8 deletions llm_web_kit/extractor/html/recognizer/ccmath.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def __init__(self):
self.cm = CCMATH()

@override
def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_html: str) -> List[Tuple[str, str]]:
def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]:
"""父类,解析数学公式元素.

Args:
Expand Down Expand Up @@ -56,7 +56,7 @@ def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_htm
return result

@override
def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict:
def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict:
"""将content转换成content_list_node.
每种类型的html元素都有自己的content-list格式:参考 docs/specification/output_format/content_list_spec.md
例如代码的返回格式:
Expand All @@ -78,7 +78,7 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm
Returns:
dict: content_list_node
"""
tree = self._build_html_tree(parsed_content)
tree = parsed_content
if tree is None:
raise HtmlMathRecognizerException(f'Failed to load html: {parsed_content}')

Expand Down Expand Up @@ -125,7 +125,7 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
"""
# node是从cc_html中解析出来的lxml节点
self.cm.url = base_url
tree = self._build_html_tree(cc_html)
tree = cc_html
math_render_type = math_render.get_render_type()
if tree is None:
raise HtmlMathRecognizerException(f'Failed to load html: {cc_html}')
Expand Down Expand Up @@ -171,20 +171,20 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
# 保存处理后的html
# with open('math_physicsforums_1_processed.html', 'w') as f:
# f.write(self._element_to_html(tree))
return self.html_split_by_tags(self._element_to_html(tree), [CCTag.CC_MATH_INTERLINE])
return self.html_split_by_tags(tree, [CCTag.CC_MATH_INTERLINE])

def process_mathjax_html(self, cc_html: str, o_html: str, math_render: BaseMathRender, base_url: str) -> List[Tuple[str, str]]:
def process_mathjax_html(self, cc_html: HtmlElement, o_html: HtmlElement, math_render: BaseMathRender, base_url: str) -> List[Tuple[HtmlElement, HtmlElement]]:
"""处理mathjax有自定义标识符的数学公式."""
self.cm.url = base_url
try:
tree = self._build_html_tree(cc_html)
tree = cc_html
math_render.find_math(tree)

# with open('math_physicsforums_1_processed.html', 'w') as f:
# f.write(self._element_to_html(tree))
except Exception as e:
raise HtmlMathMathjaxRenderRecognizerException(f'处理mathjax有自定义标识符的数学公式失败: {e}')
return self.html_split_by_tags(self._element_to_html(tree), [CCTag.CC_MATH_INTERLINE])
return self.html_split_by_tags(tree, [CCTag.CC_MATH_INTERLINE])


if __name__ == '__main__':
Expand Down
Loading