ccprocessor · dt-yy · Feb 25, 2025 · Feb 25, 2025 · Feb 25, 2025 · Feb 25, 2025
diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py
@@ -290,7 +290,7 @@ def __get_cc_node(self, html:str) -> (str, str):
             nodes = el.xpath(xpath_expr)
             if len(nodes) == 0:
                 raise ValueError(f'html文本中没有cc标签: {html}')  # TODO 异常处理
-            if len(nodes) > 1:
+            if len(nodes) > 2:
                 raise ValueError(f'html文本中包含多个cc标签: {html}')  # TODO 异常处理
             return element_to_html(nodes[0]), nodes[0].tag
 

diff --git a/llm_web_kit/libs/html_utils.py b/llm_web_kit/libs/html_utils.py
@@ -1,4 +1,5 @@
 import html
+import re
 from copy import deepcopy
 
 from lxml.html import HtmlElement, HTMLParser, fromstring, tostring
@@ -114,6 +115,18 @@ def iter_node(element: HtmlElement):
             yield from iter_node(sub_element)
 
 
+def _escape_table_cell(text: str) -> str:
+    """转义表格单元格中的特殊字符.
+
+    比如 |、内容中的\n等
+    """
+    # 首先处理换行符，将其替换为空格
+    text = re.sub(r'[\r\n]+', ' ', text)
+    # 转义竖线和点号，避免与markdown表格语法冲突
+    escaped = text.replace('|', '\\|')
+    return escaped
+
+
 def html_to_markdown_table(table_html_source: str) -> str:
     """把html代码片段转换成markdown表格.
 
@@ -140,7 +153,7 @@ def html_to_markdown_table(table_html_source: str) -> str:
 
     # 检查第一行是否是表头并获取表头内容
     first_row_tags = rows[0].xpath('.//th | .//td')
-    headers = [tag.text_content().strip() for tag in first_row_tags]
+    headers = [_escape_table_cell(tag.text_content().strip()) for tag in first_row_tags]
     # 如果表头存在，添加表头和分隔符，并保证表头与最大列数对齐
     if headers:
         while len(headers) < max_cols:
@@ -155,7 +168,7 @@ def html_to_markdown_table(table_html_source: str) -> str:
 
     # 添加表格内容，跳过已被用作表头的第一行（如果有的话）
     for row in rows[1:]:
-        columns = [td.text_content().strip() for td in row.xpath('.//td | .//th')]
+        columns = [_escape_table_cell(td.text_content().strip()) for td in row.xpath('.//td | .//th')]
         # 如果这一行的列数少于最大列数，则补充空白单元格
         while len(columns) < max_cols:
             columns.append('')

diff --git a/.../extractor/assets/extractor_chain_input/good_data/html/test_table_elem_include_enter.html b/.../extractor/assets/extractor_chain_input/good_data/html/test_table_elem_include_enter.html
diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl
@@ -10,4 +10,5 @@
 {"track_id": "legato_doc", "dataset_name": "test_pipeline_suit", "url": "https://www.test.com","data_source_category": "HTML",  "path":"legato_docs.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
 {"track_id": "oracle_doc", "dataset_name": "test_pipeline_suit", "url": "https://docs.oracle.com/en-us/iaas/tools/java/3.57.1/com/oracle/bmc/integration/model/CustomEndpointDetails.html","data_source_category": "HTML",  "path":"oracle_doc.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
 {"track_id": "table_involve_inline_code", "dataset_name": "test_table_involve_inline_code", "url": "https://docs.oracle.com/en-us/iaas/tools/java/3.57.1/com/oracle/bmc/integration/model/CustomEndpointDetails.html","data_source_category": "HTML",  "path":"table_involve_inline_code.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
-{"track_id": "table_tail_text", "dataset_name": "test_table_tail_text", "url": "https://dchublists.com/?do=hublist&id=hub-975&language=en","data_source_category": "HTML",  "path":"table_tail_text.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
+{"track_id": "table_tail_text", "dataset_name": "test_table_tail_text", "url": "https://dchublists.com/?do=hublist&id=hub-975&language=en","data_source_category": "HTML",  "path":"table_tail_text.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
+{"track_id": "table_elem_include_enter", "dataset_name": "table_elem_include_enter", "url": "https://fardapaper.ir/financial-development-equity-capital","data_source_category": "HTML",  "path":"test_table_elem_include_enter.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_simple_cc.html b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_simple_cc.html
diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py
@@ -166,6 +166,7 @@ def test_table_involve_after_code(self):
             parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html)
             assert html_to_element(parts[0][0]).xpath(f'.//{CCTag.CC_TABLE}')[0].text is None
 
+    @unittest.skip(reason='在code模块解决了table嵌套多行代码问题')
     def test_table_involve_code(self):
         """table involve code."""
         for test_case in TEST_CASES:

diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py
@@ -59,7 +59,7 @@ def setUp(self):
             for line in f:
                 self.data_json.append(json.loads(line.strip()))
 
-        assert len(self.data_json) == 13
+        assert len(self.data_json) == 14
 
         # Config for HTML extraction
         self.config = {
@@ -369,4 +369,21 @@ def test_table_tail_text(self):
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
         content_md = result.get_content_list().to_mm_md()
+        print(content_md)
         assert '| ID: 975' in content_md
+
+    def test_table_element_include_enter(self):
+        """table的元素中间有换行."""
+        chain = ExtractSimpleFactory.create(self.config)
+        self.assertIsNotNone(chain)
+        test_data = self.data_json[13]
+        # Create DataJson from test data
+        input_data = DataJson(test_data)
+        result = chain.extract(input_data)
+        content_md = result.get_content_list().to_mm_md()
+        print(content_md)
+        assert """| عنوان فارسی | توسعه مالی و هزینه سرمایه حقوق سهامداران: شواهدی از چین |
+|---|---|
+| عنوان انگلیسی | Financial development and the cost of equity capital: Evidence from China |
+| کلمات کلیدی : | &nbsp         توسعه مالی؛ هزینه سرمایه حقوق سهامداران؛ قانون و امور مالی؛ چین |
+| درسهای مرتبط | حسابداری |""" in content_md