From 9fec7d143c1dad8d5df5e45b52648f491b897caa Mon Sep 17 00:00:00 2001
From: Yanggq <1041206149@qq.com>
Date: Tue, 2 Sep 2025 16:54:35 +0800
Subject: [PATCH] feat: add LayoutTableDetector

---
 llm_web_kit/extractor/html/pre_extractor.py | 87 +++++++++++++++------
 1 file changed, 65 insertions(+), 22 deletions(-)

diff --git a/llm_web_kit/extractor/html/pre_extractor.py b/llm_web_kit/extractor/html/pre_extractor.py
index 064430d3..b826d6c4 100644
--- a/llm_web_kit/extractor/html/pre_extractor.py
+++ b/llm_web_kit/extractor/html/pre_extractor.py
@@ -47,39 +47,82 @@ def _ensure_main_html(self, data_json: DataJson) -> DataJson:
         return data_json
 
 
+class LayoutTableDetector:
+    """Detector used to identify layout tables."""
+
+    def __init__(self, max_links_ratio=0.51):
+        self.max_links_ratio = max_links_ratio
+
+    def is_layout_table(self, table):
+        """Return whether the table is a layout table (either rule below matches)."""
+        return (self.is_form_layout_table(table) or
+                self.is_navigation_layout_table(table))
+
+    def is_form_layout_table(self, table):
+        """
+        Rule 1: the table is nested inside a form element.
+        Rationale: tables inside forms are usually layout tables with no useful data.
+        """
+        # Walk up the ancestor chain looking for an enclosing form element
+        parent = table.getparent()
+        while parent is not None:
+            if parent.tag == 'form':
+                return True
+            parent = parent.getparent()
+        return False
+
+    def is_navigation_layout_table(self, table):
+        """
+        Rule 2: more than max_links_ratio (default 0.51) of the cells contain links.
+        Rationale: link-heavy tables are usually navigation layout tables, not data tables.
+        """
+        # Collect every descendant td/th cell (cells of nested tables are matched too)
+        cells = table.xpath('.//td | .//th')
+        if not cells:
+            return False
+
+        # Count the cells that contain at least one <a> element
+        link_cells = 0
+        for cell in cells:
+            if cell.xpath('.//a'):
+                link_cells += 1
+
+        # Ratio of link-bearing cells to all cells
+        link_ratio = link_cells / len(cells)
+        return link_ratio > self.max_links_ratio
+
+
 class HTMLFileFormatNoClipFilterTablePreExtractor(HTMLFileFormatFilterPreExtractor):
+    """Pre-extractor that detects layout tables in HTML-format input."""
+
     def __init__(self, config: dict):
         super().__init__(config)
 
-    @override
-    def _filter_by_rule(self, data_json: DataJson) -> bool:
-        if self.__remove_format_table(data_json):
-            return True
-        else:
-            return False
-
     @override
     def _do_pre_extract(self, data_json: DataJson) -> DataJson:
-        pass  # TODO
-        return data_json
-
-    def __remove_format_table(self, data_json: DataJson):
-        """remove 排版table."""
+        """Detect layout tables in the HTML content and process them."""
         html_content = self._get_html_content(data_json)
-        return self.__do_remove_layout_table(html_content)
+
+        html_tree = html_to_element(html_content)
+        layout_detector = LayoutTableDetector()
+
+        # Find every table in the document and process the layout ones
+        tables = html_tree.xpath('//table')
+        for table in tables:
+            if layout_detector.is_layout_table(table):
+                self._process_layout_table(table)
+
+        # Write the re-serialized tree back as the HTML content
+        data_json['html'] = element_to_html(html_tree)
+        return data_json
 
     def _get_html_content(self, data_json: DataJson):
         return data_json['html']
 
-    def __do_remove_layout_table(self, html_content: str):
-        """remove 排版table."""
-        html_str = html_to_element(html_content)
-        first_structure = html_str.xpath('/html/body/table') != []
-        second_structure = html_str.xpath('/html/body/center/table') != []
-        if bool(first_structure and second_structure):
-            return True
-        else:
-            return False
+    def _process_layout_table(self, table):
+        # TODO: implement the actual layout-table handling (currently only prints)
+        print("Detected layout table, processing...")
+        pass
 
 
 class HTMLFileFormatNoClipCleanTagsPreExtractor(HTMLFileFormatFilterPreExtractor):