Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
69d420c
resolve nest table
Feb 25, 2025
e7c3792
update extract table
Feb 25, 2025
f0347ff
remove table tail
Feb 25, 2025
5e17694
normalize line endings
Feb 25, 2025
c15dea1
update test case
Feb 25, 2025
d34a8a7
update format
Feb 25, 2025
87a2495
update format
Feb 25, 2025
9861090
update format
Feb 25, 2025
a77735f
change parse order
Feb 25, 2025
419b2c1
add list nest level
Feb 25, 2025
d46cb64
Merge branch 'ccprocessor:dev' into dev
dt-yy Feb 25, 2025
c40b1ea
fix pylint
Feb 25, 2025
06d251f
Merge branch 'dev' of https://github.com/dt-yy/llm-webkit-mirror into…
Feb 26, 2025
d863de3
Merge branch 'ccprocessor:dev' into dev
dt-yy Feb 27, 2025
6c7ca2d
update table nest spec.md
Feb 27, 2025
9785e05
Merge branch 'ccprocessor:dev' into dev
dt-yy Feb 28, 2025
9ef8f8a
Merge branch 'ccprocessor:dev' into dev
dt-yy Feb 28, 2025
9e15452
update parse order
Mar 3, 2025
4a61728
update parse order
Mar 3, 2025
0e239dc
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 3, 2025
1b0e1e9
update parse order
Mar 3, 2025
78ca028
update list标准
Mar 3, 2025
9eb13f5
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 3, 2025
efcd7a2
add table involve inline code
Mar 3, 2025
0776f6e
add test case
Mar 3, 2025
c04b2d8
Merge branch 'dev' of https://github.com/dt-yy/llm-webkit-mirror into…
Mar 3, 2025
3fda2a6
fix test case
Mar 3, 2025
a3b0bf6
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 5, 2025
e0196bf
add table tail
dt-yy Mar 5, 2025
f961665
加上table的tail处理
dt-yy Mar 5, 2025
cdae7a1
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 5, 2025
cdfd98c
add table tail test
dt-yy Mar 5, 2025
f8b523a
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 5, 2025
44844ee
fix test case
dt-yy Mar 5, 2025
8d8cfce
Merge branch 'ccprocessor:dev' into dev
dt-yy Mar 6, 2025
94f627f
remove enter in table
dt-yy Mar 6, 2025
d90dd07
:wq
dt-yy Mar 6, 2025
95120d0
remove print
dt-yy Mar 6, 2025
1c5ff71
remove print
dt-yy Mar 6, 2025
371800e
add exception
dt-yy Mar 6, 2025
11d0968
fix pylint
dt-yy Mar 6, 2025
d3f995f
修复table&list问题
dt-yy Mar 7, 2025
a7daf28
修复元素识别table&list问题
dt-yy Mar 7, 2025
07f1de4
fix pylint
dt-yy Mar 7, 2025
dbe26d6
解决list和table等问题
dt-yy Mar 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion llm_web_kit/extractor/extractor_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ def extract(self, data: DataJson) -> DataJson:
# Pre extractors
for pre_ext in self.__pre_extractors:
data = pre_ext.pre_extract(data)

# Main extractors
for ext in self.__extractors:
data = ext.extract(data)
Expand Down
71 changes: 63 additions & 8 deletions llm_web_kit/extractor/html/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from overrides import override

from llm_web_kit.config.cfg_reader import load_config
from llm_web_kit.exception.exception import HtmlFileExtractorException
from llm_web_kit.extractor.extractor import BaseFileFormatExtractor
from llm_web_kit.extractor.html.magic_html import GeneralExtractor
from llm_web_kit.extractor.html.recognizer.audio import AudioRecognizer
Expand All @@ -20,7 +21,6 @@
from llm_web_kit.extractor.html.recognizer.video import VideoRecognizer
from llm_web_kit.input.datajson import ContentList, DataJson
from llm_web_kit.libs.html_utils import element_to_html, html_to_element
from llm_web_kit.libs.logger import mylogger
from llm_web_kit.libs.path_lib import get_py_pkg_root_dir


Expand Down Expand Up @@ -245,6 +245,63 @@
lst = self.__paragraph_recognizer.recognize(base_url, html_lst, raw_html)
return lst

def __is_valid_node(self, node: dict) -> bool:
    """Check whether a content node actually carries content.

    Args:
        node (dict): content node; its ``type`` selects the rule that is
            applied and its ``content`` holds the payload that is inspected.

    Returns:
        bool: True if the payload for the node's type is non-empty,
            otherwise False.

    Raises:
        HtmlFileExtractorException: if ``node`` is empty, or if its ``type``
            is not one of the supported node types.
    """
    if not node:
        # NOTE(review): the coverage report attached to this change marks
        # this raise as not covered by tests.
        raise HtmlFileExtractorException('node is empty')

    node_type = node.get('type')
    valid_types = {'list', 'code', 'equation-interline', 'image', 'table', 'title', 'paragraph'}
    if node_type not in valid_types:
        # NOTE(review): the coverage report attached to this change marks
        # this raise as not covered by tests.
        raise HtmlFileExtractorException(f'Invalid node type: {node_type}')

    def has_text(value) -> bool:
        """Return True only for a string with non-whitespace characters."""
        return isinstance(value, str) and bool(value.strip())

    content = node.get('content')

    if node_type == 'paragraph':
        # A paragraph payload is a sequence of segments; it is valid when at
        # least one segment has non-blank text under the 'c' key. Malformed
        # segments are ignored instead of raising AttributeError.
        if not isinstance(content, (list, tuple)):
            return False
        return any(
            isinstance(segment, dict) and has_text(segment.get('c'))
            for segment in content
        )

    # Every other type reads keys from a dict payload. A missing, None or
    # non-dict payload is treated as "no content" rather than crashing.
    fields = content if isinstance(content, dict) else {}

    if node_type == 'list':
        items = fields.get('items')
        # Drop None, empty lists, and lists holding only None / empty values.
        return bool(items) and any(
            isinstance(item, (dict, list)) and bool(item)
            for item in items
        )

    if node_type == 'image':
        # At least one of url, path or data must be non-empty.
        return bool(fields.get('url') or fields.get('path') or fields.get('data'))

    # The remaining types are valid when one specific text field is non-blank.
    text_field = {
        'code': 'code_content',
        'equation-interline': 'math_content',
        'table': 'html',
        'title': 'title_content',
    }
    return has_text(fields.get(text_field[node_type]))

Check warning on line 303 in llm_web_kit/extractor/html/extractor.py

View check run for this annotation

Codecov / codecov/patch

llm_web_kit/extractor/html/extractor.py#L303

Added line #L303 was not covered by tests

def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> ContentList:
"""将解析结果存入content_list格式中.

Expand All @@ -263,12 +320,10 @@
parser:BaseHTMLElementRecognizer = self.__to_content_list_mapper.get(cc_tag)
if parser:
node = parser.to_content_list_node(base_url, ccnode_html, raw_html)
if node:
if node and self.__is_valid_node(node):
one_page.append(node)
else:
mylogger.warning(f'无法识别的html标签:{cc_tag}, {parsed_html}')
# TODO 开发成熟的时候,在这里抛出异常,让调用者记录下来,以便后续分析改进

raise HtmlFileExtractorException(f'无法识别的html标签:{cc_tag}, {parsed_html}')

Check warning on line 326 in llm_web_kit/extractor/html/extractor.py

View check run for this annotation

Codecov / codecov/patch

llm_web_kit/extractor/html/extractor.py#L326

Added line #L326 was not covered by tests
content_list = ContentList([one_page]) # 对于网页来说仅有一页,如果多页,则剩下的每个都是一个论坛的回复
return content_list

Expand All @@ -289,9 +344,9 @@
xpath_expr = ' | '.join(f'self::{tag} | .//{tag}' for tag in self.__to_content_list_mapper.keys())
nodes = el.xpath(xpath_expr)
if len(nodes) == 0:
raise ValueError(f'html文本中没有cc标签: {html}') # TODO 异常处理
if len(nodes) > 1:
raise ValueError(f'html文本中包含多个cc标签: {html}') # TODO 异常处理
raise HtmlFileExtractorException(f'html文本中没有cc标签: {html}')

Check warning on line 347 in llm_web_kit/extractor/html/extractor.py

View check run for this annotation

Codecov / codecov/patch

llm_web_kit/extractor/html/extractor.py#L347

Added line #L347 was not covered by tests
if len(nodes) > 3:
raise HtmlFileExtractorException(f'html文本中包含多个cc标签: {html}')

Check warning on line 349 in llm_web_kit/extractor/html/extractor.py

View check run for this annotation

Codecov / codecov/patch

llm_web_kit/extractor/html/extractor.py#L349

Added line #L349 was not covered by tests
return element_to_html(nodes[0]), nodes[0].tag

def __build_extractor(self):
Expand Down
1 change: 0 additions & 1 deletion llm_web_kit/extractor/html/recognizer/cccode.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ def recognize(
if self.is_cc_html(html):
rtn.append((html, raw_html))
continue

root: HtmlElement = html_to_element(html)
while True:
# 最常见:
Expand Down
179 changes: 96 additions & 83 deletions llm_web_kit/extractor/html/recognizer/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from overrides import override

from llm_web_kit.exception.exception import HtmlTableRecognizerException
from llm_web_kit.extractor.html.recognizer.cccode import CodeRecognizer
from llm_web_kit.extractor.html.recognizer.ccmath import MathRecognizer
from llm_web_kit.extractor.html.recognizer.recognizer import (
BaseHTMLElementRecognizer, CCTag)
Expand Down Expand Up @@ -68,7 +67,6 @@
:param table: lxml.html.HtmlElement 对象,表示一个 <table> 元素
:return: 如果表格为空,返回 True;否则返回 False
"""

def is_element_empty(elem):
# 检查元素本身的文本内容
if elem.text and elem.text.strip():
Expand Down Expand Up @@ -113,20 +111,19 @@
return False
return True

def __is_table_contain_img(self, tree) -> bool:
"""判断table元素是否包含图片."""
imgs = tree.xpath('//table//img')
if len(imgs) == 0:
return True
else:
return False

def __is_table_nested(self, tree) -> int:
"""获取表格元素的嵌套层级(非表格元素返回0,顶层表格返回1,嵌套表格返回层级数)."""
if tree.tag != 'table':
return 0 # 非表格元素返回0
# 计算祖先中的 table 数量(不包括自身),再加1表示自身层级
return len(tree.xpath('ancestor::table')) + 1
def __is_table_nested(self, element) -> int:
    """Return the deepest table nesting level found at or below ``element``.

    A non-table element yields 0. Otherwise every table in the subtree
    (``element`` itself included) gets a level of 1 plus the number of its
    ``<table>`` ancestors, and the largest of those levels is returned.
    """
    if element.tag != 'table':
        # NOTE(review): the coverage report attached to this change marks
        # this early return as not covered by tests.
        return 0
    # The table itself comes first, followed by every table nested inside it.
    depths = (
        len(tbl.xpath('ancestor::table')) + 1
        for tbl in [element, *element.xpath('.//table')]
    )
    # Each depth is at least 1, so the minimum result for a table is 1.
    return max(depths, default=1)

def __extract_tables(self, ele: str) -> List[Tuple[str, str]]:
"""提取html中的table元素."""
Expand All @@ -150,78 +147,93 @@
table_type = 'complex'
return table_type

def __extract_table_element(self, ele: HtmlElement) -> str:
"""提取表格的元素."""
for item in ele.iterchildren():
return self._element_to_html(item)

def __check_table_include_math_code(self, raw_html: HtmlElement):
"""check table中是否包含math."""
"""检查table中的内容,包括普通文本、数学公式和代码."""
math_html = self._element_to_html(raw_html)
ele_res = list()
math_recognizer = MathRecognizer()
math_res_parts = math_recognizer.recognize(base_url='', main_html_lst=[(math_html, math_html)],
raw_html=math_html)
code_recognizer = CodeRecognizer()
code_res_parts = code_recognizer.recognize(base_url='', main_html_lst=math_res_parts,
raw_html=math_html)
for math_item in code_res_parts:
math_res_parts = math_recognizer.recognize(
base_url='',
main_html_lst=[(math_html, math_html)],
raw_html=math_html
)
result = []
for math_item in math_res_parts:
ele_item = self._build_html_tree(math_item[0])
ccinline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INLINE}')
ccinline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE_INLINE}')
ccinterline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INTERLINE}')
ccinterline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE}')
if ccinline_math_node:
formulas = [
el.text if el.text.strip() else ''
for el in ccinline_math_node
]
ele_res.extend(formulas) # 添加字符串
elif ccinterline_math_node:
codes = [
el.text if el.text.strip() else ''
for el in ccinterline_math_node
]
ele_res.extend(codes)
elif ccinline_code_node:
inline_codes = [
el.text if el.text.strip() else ''
for el in ccinline_code_node
]
ele_res.extend(inline_codes)
elif ccinterline_code_node:
ccinterline_codes = [
el.text if el.text else ''
for el in ccinterline_code_node
]
ele_res.extend(ccinterline_codes)
else:
texts = []
# 使用 itertext() 遍历所有文本片段
for text_segment in ele_item.itertext():
# 统一处理文本:去空白 + 替换字面 \n
cleaned_text = text_segment.strip().replace('\\n', '')
if cleaned_text: # 过滤空字符串
texts.append(cleaned_text)
ele_res.extend(texts)
return ele_res

def __simplify_td_th_content(self, elem: HtmlElement) -> None:
"""简化 <td> 和 <th> 内容,仅保留文本内容."""
def process_node(node):
"""处理行内公式、行间公式、行间代码、行内代码."""
if node.tag == CCTag.CC_MATH_INLINE:
if node.text and node.text.strip():
result.append(f'${node.text.strip()}$')
if node.tail and node.tail.strip():
result.append(node.tail.strip())
# 处理行间公式
elif node.tag == CCTag.CC_MATH_INTERLINE:
if node.text and node.text.strip():
result.append(f'$${node.text.strip()}$$')
if node.tail and node.tail.strip():
result.append(node.tail.strip())

Check warning on line 175 in llm_web_kit/extractor/html/recognizer/table.py

View check run for this annotation

Codecov / codecov/patch

llm_web_kit/extractor/html/recognizer/table.py#L175

Added line #L175 was not covered by tests
# 处理行间代码
elif node.tag == CCTag.CC_CODE:
if node.text and node.text.strip():
result.append(f'```{node.text.strip()}```')
if node.tail and node.tail.strip():
result.append(node.tail.strip())

Check warning on line 181 in llm_web_kit/extractor/html/recognizer/table.py

View check run for this annotation

Codecov / codecov/patch

llm_web_kit/extractor/html/recognizer/table.py#L178-L181

Added lines #L178 - L181 were not covered by tests
# 处理行内代码
elif node.tag == CCTag.CC_CODE_INLINE:
if node.text and node.text.strip():
result.append(f'`{node.text.strip()}`')
if node.tail and node.tail.strip():
result.append(node.tail.strip())

Check warning on line 187 in llm_web_kit/extractor/html/recognizer/table.py

View check run for this annotation

Codecov / codecov/patch

llm_web_kit/extractor/html/recognizer/table.py#L187

Added line #L187 was not covered by tests
else:
# 提取当前节点的文本
if node.text and node.text.strip():
cleaned_text = node.text.strip().replace('\\n', '')
result.append(cleaned_text)
# 处理节点的tail(元素闭合后的文本)
if node.tail and node.tail.strip():
cleaned_tail = node.tail.strip().replace('\\n', '')
result.append(cleaned_tail)
# 递归处理子节点
for child in node:
process_node(child)
# 从根节点开始处理
process_node(ele_item)
return result

def __simplify_td_th_content(self, table_nest_level, elem: HtmlElement) -> None:
"""简化 <td> 和 <th> 内容,保留嵌套表格结构."""
if elem.tag in ['td', 'th']:
# 简化单元格中的元素
parse_res = list()
math_res = self.__check_table_include_math_code(elem)
parse_res.extend(math_res)
for item in list(elem.iterchildren()):
elem.remove(item)
if parse_res:
elem.text = '<br>'.join(parse_res)
parse_res = []
# 检查是否存在嵌套的表格
if table_nest_level > 1:
# 存在嵌套表格,递归处理子节点
for child in elem.iterchildren():
if child.tag == 'table':
# 对嵌套表格递归调用简化处理
self.__simplify_td_th_content(table_nest_level, child)
else:
# 处理非表格元素
math_res = self.__check_table_include_math_code(child)
parse_res.extend(math_res)
elem.remove(child)
# 将非表格内容拼接后放在表格前面
if parse_res:
elem.text = ' '.join(parse_res) + (elem.text or '')
else:
# 没有嵌套表格,直接简化
math_res = self.__check_table_include_math_code(elem)
parse_res.extend(math_res)
for item in list(elem.iterchildren()):
elem.remove(item)
if parse_res:
elem.text = ' '.join(parse_res)
return
for child in elem.iter('td', 'th'):
self.__simplify_td_th_content(child)
# 非 td/th 元素继续递归处理
for child in elem.iterchildren():
self.__simplify_td_th_content(table_nest_level, child)

def __get_table_body(self, table_type, table_root):
def __get_table_body(self, table_type, table_nest_level, table_root):
"""获取并处理table body,返回处理后的HTML字符串。"""
if table_type == 'empty':
return None
Expand All @@ -237,11 +249,12 @@
elem.text = elem.text.strip().replace('\\n', '')
if elem.tail is not None:
elem.tail = elem.tail.strip().replace('\\n', '')
self.__simplify_td_th_content(table_root)
# 单元格内的多标签内容进行简化,空格拼接,公式、代码识别
self.__simplify_td_th_content(table_nest_level, table_root)
# 迭代
for child in table_root.iterchildren():
if child is not None:
self.__get_table_body(table_type, child)
self.__get_table_body(table_type, table_nest_level, child)
return self._element_to_html(table_root)

def __do_extract_tables(self, root: HtmlElement) -> None:
Expand All @@ -251,7 +264,7 @@
table_type = self.__get_table_type(root)
table_nest_level = self.__is_table_nested(root)
tail_text = root.tail
table_body = self.__get_table_body(table_type, root)
table_body = self.__get_table_body(table_type, table_nest_level, root)
cc_element = self._build_cc_element(
CCTag.CC_TABLE, table_body, tail_text, table_type=table_type, table_nest_level=table_nest_level,
html=table_raw_html)
Expand Down
17 changes: 15 additions & 2 deletions llm_web_kit/libs/html_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import html
import re
from copy import deepcopy

from lxml.html import HtmlElement, HTMLParser, fromstring, tostring
Expand Down Expand Up @@ -114,6 +115,18 @@ def iter_node(element: HtmlElement):
yield from iter_node(sub_element)


def _escape_table_cell(text: str) -> str:
    """Make cell text safe to place inside a single markdown table row.

    Each run of carriage returns / line feeds becomes one space, so the cell
    stays on one line, and every pipe character is backslash-escaped so it is
    not read as a column separator. No other character is altered.
    """
    # Splitting on newline runs and re-joining with a space flattens the text.
    single_line = ' '.join(re.split(r'[\r\n]+', text))
    return single_line.replace('|', '\\|')


def html_to_markdown_table(table_html_source: str) -> str:
"""把html代码片段转换成markdown表格.

Expand All @@ -140,7 +153,7 @@ def html_to_markdown_table(table_html_source: str) -> str:

# 检查第一行是否是表头并获取表头内容
first_row_tags = rows[0].xpath('.//th | .//td')
headers = [tag.text_content().strip() for tag in first_row_tags]
headers = [_escape_table_cell(tag.text_content().strip()) for tag in first_row_tags]
# 如果表头存在,添加表头和分隔符,并保证表头与最大列数对齐
if headers:
while len(headers) < max_cols:
Expand All @@ -155,7 +168,7 @@ def html_to_markdown_table(table_html_source: str) -> str:

# 添加表格内容,跳过已被用作表头的第一行(如果有的话)
for row in rows[1:]:
columns = [td.text_content().strip() for td in row.xpath('.//td | .//th')]
columns = [_escape_table_cell(td.text_content().strip()) for td in row.xpath('.//td | .//th')]
# 如果这一行的列数少于最大列数,则补充空白单元格
while len(columns) < max_cols:
columns.append('')
Expand Down
Loading