ccprocessor · yogacc33 · Mar 10, 2025 · Feb 25, 2025 · Feb 25, 2025 · Feb 25, 2025
diff --git a/llm_web_kit/extractor/extractor_chain.py b/llm_web_kit/extractor/extractor_chain.py
@@ -46,7 +46,6 @@ def extract(self, data: DataJson) -> DataJson:
             # Pre extractors
             for pre_ext in self.__pre_extractors:
                 data = pre_ext.pre_extract(data)
-
             # Main extractors
             for ext in self.__extractors:
                 data = ext.extract(data)

diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py
@@ -5,6 +5,7 @@
 from overrides import override
 
 from llm_web_kit.config.cfg_reader import load_config
+from llm_web_kit.exception.exception import HtmlFileExtractorException
 from llm_web_kit.extractor.extractor import BaseFileFormatExtractor
 from llm_web_kit.extractor.html.magic_html import GeneralExtractor
 from llm_web_kit.extractor.html.recognizer.audio import AudioRecognizer
@@ -20,7 +21,6 @@
 from llm_web_kit.extractor.html.recognizer.video import VideoRecognizer
 from llm_web_kit.input.datajson import ContentList, DataJson
 from llm_web_kit.libs.html_utils import element_to_html, html_to_element
-from llm_web_kit.libs.logger import mylogger
 from llm_web_kit.libs.path_lib import get_py_pkg_root_dir
 
 
@@ -245,6 +245,63 @@
         lst = self.__paragraph_recognizer.recognize(base_url, html_lst, raw_html)
         return lst
 
+    def __is_valid_node(self, node: dict) -> bool:
+        """Check whether a content-list node carries usable content.
+
+        Args:
+            node (dict): content node with a 'type' key and a type-specific
+                'content' payload.
+
+        Returns:
+            bool: True if the payload for the node's type is non-empty,
+                False otherwise.
+
+        Raises:
+            HtmlFileExtractorException: if the node is empty or its type is
+                not one of the supported node types.
+        """
+        if not node:
+            raise HtmlFileExtractorException('node is empty')
+        node_type = node.get('type')
+        valid_types = {'list', 'code', 'equation-interline', 'image', 'table', 'title', 'paragraph'}
+        if node_type not in valid_types:
+            raise HtmlFileExtractorException(f'Invalid node type: {node_type}')
+
+        def has_text(value) -> bool:
+            # None, non-string values and whitespace-only strings count as empty.
+            return isinstance(value, str) and bool(value.strip())
+
+        # A missing or None 'content' is treated as an empty payload rather
+        # than crashing with AttributeError on the lookups below.
+        content = node.get('content')
+        if node_type == 'paragraph':
+            # Paragraph content is a list of segments; at least one segment
+            # must carry non-blank text under the 'c' key.
+            segments = content if isinstance(content, list) else []
+            return any(
+                isinstance(seg, dict) and has_text(seg.get('c'))
+                for seg in segments)
+        # Every other node type stores its payload in a dict.
+        if not isinstance(content, dict):
+            return False
+        if node_type == 'list':
+            # Valid when at least one item is a non-empty dict or list.
+            items = content.get('items') or []
+            return any(
+                isinstance(item, (dict, list)) and bool(item)
+                for item in items)
+        if node_type == 'image':
+            # Any one of url, path or data is enough to locate the image.
+            return bool(content.get('url') or content.get('path') or content.get('data'))
+        # The remaining types are valid when a single text field is non-blank.
+        text_field = {
+            'code': 'code_content',
+            'equation-interline': 'math_content',
+            'table': 'html',
+            'title': 'title_content',
+        }[node_type]
+        return has_text(content.get(text_field))
+
     def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> ContentList:
         """将解析结果存入content_list格式中.
 
@@ -263,12 +320,10 @@
             parser:BaseHTMLElementRecognizer = self.__to_content_list_mapper.get(cc_tag)
             if parser:
                 node = parser.to_content_list_node(base_url, ccnode_html, raw_html)
-                if node:
+                if node and self.__is_valid_node(node):
                     one_page.append(node)
             else:
-                mylogger.warning(f'无法识别的html标签：{cc_tag}, {parsed_html}')
-                # TODO 开发成熟的时候，在这里抛出异常，让调用者记录下来，以便后续分析改进
-
+                raise HtmlFileExtractorException(f'无法识别的html标签：{cc_tag}, {parsed_html}')
         content_list = ContentList([one_page])  # 对于网页来说仅有一页，如果多页，则剩下的每个都是一个论坛的回复
         return content_list
 
@@ -289,9 +344,9 @@
             xpath_expr = ' | '.join(f'self::{tag} | .//{tag}' for tag in self.__to_content_list_mapper.keys())
             nodes = el.xpath(xpath_expr)
             if len(nodes) == 0:
-                raise ValueError(f'html文本中没有cc标签: {html}')  # TODO 异常处理
-            if len(nodes) > 1:
-                raise ValueError(f'html文本中包含多个cc标签: {html}')  # TODO 异常处理
+                raise HtmlFileExtractorException(f'html文本中没有cc标签: {html}')
+            if len(nodes) > 3:
+                raise HtmlFileExtractorException(f'html文本中包含多个cc标签: {html}')
             return element_to_html(nodes[0]), nodes[0].tag
 
     def __build_extractor(self):

diff --git a/llm_web_kit/extractor/html/recognizer/cccode.py b/llm_web_kit/extractor/html/recognizer/cccode.py
@@ -38,7 +38,6 @@ def recognize(
             if self.is_cc_html(html):
                 rtn.append((html, raw_html))
                 continue
-
             root: HtmlElement = html_to_element(html)
             while True:
                 # 最常见:

diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py
@@ -5,7 +5,6 @@
 from overrides import override
 
 from llm_web_kit.exception.exception import HtmlTableRecognizerException
-from llm_web_kit.extractor.html.recognizer.cccode import CodeRecognizer
 from llm_web_kit.extractor.html.recognizer.ccmath import MathRecognizer
 from llm_web_kit.extractor.html.recognizer.recognizer import (
     BaseHTMLElementRecognizer, CCTag)
@@ -68,7 +67,6 @@
         :param table: lxml.html.HtmlElement 对象，表示一个 <table> 元素
         :return: 如果表格为空，返回 True；否则返回 False
         """
-
         def is_element_empty(elem):
             # 检查元素本身的文本内容
             if elem.text and elem.text.strip():
@@ -113,20 +111,19 @@
                 return False
         return True
 
-    def __is_table_contain_img(self, tree) -> bool:
-        """判断table元素是否包含图片."""
-        imgs = tree.xpath('//table//img')
-        if len(imgs) == 0:
-            return True
-        else:
-            return False
-
-    def __is_table_nested(self, tree) -> int:
-        """获取表格元素的嵌套层级（非表格元素返回0，顶层表格返回1，嵌套表格返回层级数）."""
-        if tree.tag != 'table':
-            return 0  # 非表格元素返回0
-        # 计算祖先中的 table 数量（不包括自身），再加1表示自身层级
-        return len(tree.xpath('ancestor::table')) + 1
+    def __is_table_nested(self, element) -> int:
+        """Return the deepest table nesting level found at or below element.
+
+        Returns 0 when element is not a <table>. Otherwise each table (the
+        element itself plus every descendant table) is assigned a level of
+        one plus its number of <table> ancestors, and the maximum is returned.
+        """
+        if element.tag != 'table':
+            return 0
+        # The element itself is a candidate too, so the result is at least 1.
+        candidates = [element, *element.xpath('.//table')]
+        depths = (len(tbl.xpath('ancestor::table')) + 1 for tbl in candidates)
+        return max(depths)
 
     def __extract_tables(self, ele: str) -> List[Tuple[str, str]]:
         """提取html中的table元素."""
@@ -150,78 +147,93 @@
             table_type = 'complex'
         return table_type
 
-    def __extract_table_element(self, ele: HtmlElement) -> str:
-        """提取表格的元素."""
-        for item in ele.iterchildren():
-            return self._element_to_html(item)
-
     def __check_table_include_math_code(self, raw_html: HtmlElement):
-        """check table中是否包含math."""
+        """Flatten raw_html into text fragments, in document order.
+
+        raw_html is converted with _element_to_html and passed through
+        MathRecognizer first; each returned item is then walked node by node.
+
+        Args:
+            raw_html (HtmlElement): element whose content is flattened.
+
+        Returns:
+            list[str]: stripped text fragments; the text of cc math and
+                code nodes is wrapped in its markdown delimiter.
+        """
         math_html = self._element_to_html(raw_html)
-        ele_res = list()
         math_recognizer = MathRecognizer()
-        math_res_parts = math_recognizer.recognize(base_url='', main_html_lst=[(math_html, math_html)],
-                                                   raw_html=math_html)
-        code_recognizer = CodeRecognizer()
-        code_res_parts = code_recognizer.recognize(base_url='', main_html_lst=math_res_parts,
-                                                   raw_html=math_html)
-        for math_item in code_res_parts:
+        math_res_parts = math_recognizer.recognize(
+            base_url='',
+            main_html_lst=[(math_html, math_html)],
+            raw_html=math_html
+        )
+        # Markdown delimiter placed on both sides of each cc node's text.
+        delimiters = (
+            (CCTag.CC_MATH_INLINE, '$'),
+            (CCTag.CC_MATH_INTERLINE, '$$'),
+            (CCTag.CC_CODE, '```'),
+            (CCTag.CC_CODE_INLINE, '`'),
+        )
+        result = []
+        for math_item in math_res_parts:
             ele_item = self._build_html_tree(math_item[0])
-            ccinline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INLINE}')
-            ccinline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE_INLINE}')
-            ccinterline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INTERLINE}')
-            ccinterline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE}')
-            if ccinline_math_node:
-                formulas = [
-                    el.text if el.text.strip() else ''
-                    for el in ccinline_math_node
-                ]
-                ele_res.extend(formulas)  # 添加字符串
-            elif ccinterline_math_node:
-                codes = [
-                    el.text if el.text.strip() else ''
-                    for el in ccinterline_math_node
-                ]
-                ele_res.extend(codes)
-            elif ccinline_code_node:
-                inline_codes = [
-                    el.text if el.text.strip() else ''
-                    for el in ccinline_code_node
-                ]
-                ele_res.extend(inline_codes)
-            elif ccinterline_code_node:
-                ccinterline_codes = [
-                    el.text if el.text else ''
-                    for el in ccinterline_code_node
-                ]
-                ele_res.extend(ccinterline_codes)
-            else:
-                texts = []
-                # 使用 itertext() 遍历所有文本片段
-                for text_segment in ele_item.itertext():
-                    # 统一处理文本：去空白 + 替换字面 \n
-                    cleaned_text = text_segment.strip().replace('\\n', '')
-                    if cleaned_text:  # 过滤空字符串
-                        texts.append(cleaned_text)
-                ele_res.extend(texts)
-        return ele_res
 
-    def __simplify_td_th_content(self, elem: HtmlElement) -> None:
-        """简化 <td> 和 <th> 内容，仅保留文本内容."""
+            def process_node(node):
+                """Append node's text, then its children, then its tail."""
+                delimiter = next((d for tag, d in delimiters if node.tag == tag), None)
+                if delimiter is not None:
+                    # cc nodes are treated as leaves: wrap the text, skip children.
+                    if node.text and node.text.strip():
+                        result.append(f'{delimiter}{node.text.strip()}{delimiter}')
+                    if node.tail and node.tail.strip():
+                        result.append(node.tail.strip())
+                    return
+                # Plain nodes: literal backslash-n sequences are dropped.
+                if node.text and node.text.strip():
+                    result.append(node.text.strip().replace('\\n', ''))
+                # Children go before the tail, because the tail is the text
+                # that follows the node's closing tag in the document.
+                for child in node:
+                    process_node(child)
+                if node.tail and node.tail.strip():
+                    result.append(node.tail.strip().replace('\\n', ''))
+
+            process_node(ele_item)
+        return result
+
+    def __simplify_td_th_content(self, table_nest_level, elem: HtmlElement) -> None:
+        """Flatten every <td>/<th> at or below elem to plain text, in place.
+
+        When table_nest_level is above 1, <table> children of a cell are kept
+        and simplified recursively instead of being flattened.
+        """
         if elem.tag in ['td', 'th']:
-            # 简化单元格中的元素
-            parse_res = list()
-            math_res = self.__check_table_include_math_code(elem)
-            parse_res.extend(math_res)
-            for item in list(elem.iterchildren()):
-                elem.remove(item)
-            if parse_res:
-                elem.text = '<br>'.join(parse_res)
+            if table_nest_level > 1:
+                parse_res = []
+                # Snapshot the children: the loop removes them from elem.
+                for child in list(elem.iterchildren()):
+                    if child.tag == 'table':
+                        self.__simplify_td_th_content(table_nest_level, child)
+                    else:
+                        parse_res.extend(self.__check_table_include_math_code(child))
+                        elem.remove(child)
+                if parse_res:
+                    # Keep the cell's own leading text first, then the flattened children.
+                    own_text = (elem.text or '').strip()
+                    elem.text = ' '.join(([own_text] if own_text else []) + parse_res)
+            else:
+                # No nested table: replace all children with the flattened text.
+                parse_res = self.__check_table_include_math_code(elem)
+                for item in list(elem.iterchildren()):
+                    elem.remove(item)
+                if parse_res:
+                    elem.text = ' '.join(parse_res)
             return
-        for child in elem.iter('td', 'th'):
-            self.__simplify_td_th_content(child)
+        # Not a cell: descend into the direct children.
+        for child in elem.iterchildren():
+            self.__simplify_td_th_content(table_nest_level, child)
 
-    def __get_table_body(self, table_type, table_root):
+    def __get_table_body(self, table_type, table_nest_level, table_root):
         """获取并处理table body，返回处理后的HTML字符串。"""
         if table_type == 'empty':
             return None
@@ -237,11 +249,12 @@
                 elem.text = elem.text.strip().replace('\\n', '')
             if elem.tail is not None:
                 elem.tail = elem.tail.strip().replace('\\n', '')
-        self.__simplify_td_th_content(table_root)
+        # 单元格内的多标签内容进行简化，空格拼接，公式、代码识别
+        self.__simplify_td_th_content(table_nest_level, table_root)
         # 迭代
         for child in table_root.iterchildren():
             if child is not None:
-                self.__get_table_body(table_type, child)
+                self.__get_table_body(table_type, table_nest_level, child)
         return self._element_to_html(table_root)
 
     def __do_extract_tables(self, root: HtmlElement) -> None:
@@ -251,7 +264,7 @@
             table_type = self.__get_table_type(root)
             table_nest_level = self.__is_table_nested(root)
             tail_text = root.tail
-            table_body = self.__get_table_body(table_type, root)
+            table_body = self.__get_table_body(table_type, table_nest_level, root)
             cc_element = self._build_cc_element(
                 CCTag.CC_TABLE, table_body, tail_text, table_type=table_type, table_nest_level=table_nest_level,
                 html=table_raw_html)

diff --git a/llm_web_kit/libs/html_utils.py b/llm_web_kit/libs/html_utils.py
@@ -1,4 +1,5 @@
 import html
+import re
 from copy import deepcopy
 
 from lxml.html import HtmlElement, HTMLParser, fromstring, tostring
@@ -114,6 +115,18 @@ def iter_node(element: HtmlElement):
             yield from iter_node(sub_element)
 
 
+def _escape_table_cell(text: str) -> str:
+    """Make cell text safe to embed in a single markdown table row.
+
+    Each run of CR/LF characters collapses to one space, and every pipe
+    character is backslash-escaped so it is not read as a column separator.
+    """
+    # Line breaks first: a markdown table row must stay on one line.
+    single_line = ' '.join(re.split(r'[\r\n]+', text))
+    # Only '|' is escaped; other characters pass through unchanged.
+    return single_line.replace('|', '\\|')
+
+
 def html_to_markdown_table(table_html_source: str) -> str:
     """把html代码片段转换成markdown表格.
 
@@ -140,7 +153,7 @@ def html_to_markdown_table(table_html_source: str) -> str:
 
     # 检查第一行是否是表头并获取表头内容
     first_row_tags = rows[0].xpath('.//th | .//td')
-    headers = [tag.text_content().strip() for tag in first_row_tags]
+    headers = [_escape_table_cell(tag.text_content().strip()) for tag in first_row_tags]
     # 如果表头存在，添加表头和分隔符，并保证表头与最大列数对齐
     if headers:
         while len(headers) < max_cols:
@@ -155,7 +168,7 @@ def html_to_markdown_table(table_html_source: str) -> str:
 
     # 添加表格内容，跳过已被用作表头的第一行（如果有的话）
     for row in rows[1:]:
-        columns = [td.text_content().strip() for td in row.xpath('.//td | .//th')]
+        columns = [_escape_table_cell(td.text_content().strip()) for td in row.xpath('.//td | .//th')]
         # 如果这一行的列数少于最大列数，则补充空白单元格
         while len(columns) < max_cols:
             columns.append('')