From 9fec7d143c1dad8d5df5e45b52648f491b897caa Mon Sep 17 00:00:00 2001 From: Yanggq <1041206149@qq.com> Date: Tue, 2 Sep 2025 16:54:35 +0800 Subject: [PATCH] feat: add LayoutTableDetector --- llm_web_kit/extractor/html/pre_extractor.py | 87 +++++++++++++++------ 1 file changed, 65 insertions(+), 22 deletions(-) diff --git a/llm_web_kit/extractor/html/pre_extractor.py b/llm_web_kit/extractor/html/pre_extractor.py index 064430d3..b826d6c4 100644 --- a/llm_web_kit/extractor/html/pre_extractor.py +++ b/llm_web_kit/extractor/html/pre_extractor.py @@ -47,39 +47,82 @@ def _ensure_main_html(self, data_json: DataJson) -> DataJson: return data_json +class LayoutTableDetector: + """排版表格检测器,用于识别排版表格.""" + + def __init__(self, max_links_ratio=0.51): + self.max_links_ratio = max_links_ratio + + def is_layout_table(self, table): + """判断是否为排版表格.""" + return (self.is_form_layout_table(table) or + self.is_navigation_layout_table(table)) + + def is_form_layout_table(self, table): + """ + 检测规则1: 表单内的表格 + 原因: 表单内的表格通常是排版表格,不包含有用的数据 + """ + # 检查表格是否在form元素内 + parent = table.getparent() + while parent is not None: + if parent.tag == 'form': + return True + parent = parent.getparent() + return False + + def is_navigation_layout_table(self, table): + """ + 检测规则2: 链接占比超过51%的表格 + 原因: 链接过多的表格通常是导航排版表格,不是数据表格 + """ + # 获取所有单元格 + cells = table.xpath('.//td | .//th') + if not cells: + return False + + # 统计包含链接的单元格数量 + link_cells = 0 + for cell in cells: + if cell.xpath('.//a'): + link_cells += 1 + + # 计算链接占比 + link_ratio = link_cells / len(cells) + return link_ratio > self.max_links_ratio + + class HTMLFileFormatNoClipFilterTablePreExtractor(HTMLFileFormatFilterPreExtractor): + """HTML文件格式排版表格检测预处理器.""" + def __init__(self, config: dict): super().__init__(config) - @override - def _filter_by_rule(self, data_json: DataJson) -> bool: - if self.__remove_format_table(data_json): - return True - else: - return False - @override def _do_pre_extract(self, data_json: DataJson) -> DataJson: - pass # TODO - return data_json - - def __remove_format_table(self, data_json: DataJson): - """remove 排版table.""" + """检测并处理HTML内容中的排版表格.""" html_content = self._get_html_content(data_json) - return self.__do_remove_layout_table(html_content) + + html_tree = html_to_element(html_content) + layout_detector = LayoutTableDetector() + + # 找到所有表格并处理排版表格 + tables = html_tree.xpath('//table') + for table in tables: + if layout_detector.is_layout_table(table): + self._process_layout_table(table) + + # 更新处理后的HTML内容 + data_json['html'] = element_to_html(html_tree) + return data_json def _get_html_content(self, data_json: DataJson): return data_json['html'] - def __do_remove_layout_table(self, html_content: str): - """remove 排版table.""" - html_str = html_to_element(html_content) - first_structure = html_str.xpath('/html/body/table') != [] - second_structure = html_str.xpath('/html/body/center/table') != [] - if bool(first_structure and second_structure): - return True - else: - return False + def _process_layout_table(self, table): + # TODO: 排版表格的具体处理逻辑 + print("Detected layout table, processing...") + pass class HTMLFileFormatNoClipCleanTagsPreExtractor(HTMLFileFormatFilterPreExtractor):