Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions bench/config/data_config.jsonl

Large diffs are not rendered by default.

1,900 changes: 1,900 additions & 0 deletions bench/config/data_math_config.jsonl

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion bench/config/ours_config.jsonc
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.TestHTMLFileFormatFilterPreExtractor",
"class_init_kwargs": {
"html_parent_dir": "bench/data/"
"html_parent_dir": "bench/"
}
},
{
Expand Down
1 change: 0 additions & 1 deletion bench/config/ours_data_config.jsonl

This file was deleted.

2 changes: 1 addition & 1 deletion bench/data/groundtruth/math_physicsforums_1.jsonl

Large diffs are not rendered by default.

73 changes: 63 additions & 10 deletions bench/eval/ours.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,18 @@
import json
import os
from pathlib import Path
from typing import Dict, List, Tuple

from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory
from llm_web_kit.input.datajson import DataJson, DataJsonKey


def eval_ours_extract_html(chain_config: dict, html_data_path: str, filePath: str, page_layout_type: str = '', url: str = '') -> Tuple[str, List[Dict], str, dict]:
def eval_ours_extract_html(chain_config: dict, test_data: dict) -> Tuple[str, List[Dict], str, dict]:
chain = ExtractSimpleFactory.create(chain_config)
assert chain is not None

# Read test data
with open(html_data_path, 'r') as f:
test_data = json.loads(f.readline().strip())

# Create DataJson from test data
input_data = DataJson(test_data)
input_data.__setitem__('path', filePath)
input_data.__setitem__('page_layout_type', page_layout_type)
input_data.__setitem__('url', url)
# Create DataJson from test data
input_data = DataJson(test_data)

# Test extraction
result = chain.extract(input_data)
Expand All @@ -26,3 +21,61 @@ def eval_ours_extract_html(chain_config: dict, html_data_path: str, filePath: st
main_html = content_list.to_main_html()
content = content_list.to_nlp_md()
return content, content_list._get_data(), main_html, statics


if __name__ == '__main__':
    # Resolve the repository root (three levels up from bench/eval/ours.py).
    root = Path(__file__).parent.parent.parent
    from llm_web_kit.dataio.filebase import (FileBasedDataReader,
                                             FileBasedDataWriter)
    reader = FileBasedDataReader('')
    writer = FileBasedDataWriter('')

    # Ensure the top-level output directory exists.
    output_dir = f'{root}/bench/output/ours'
    os.makedirs(output_dir, exist_ok=True)

    # NOTE(review): the config file has a .jsonc extension but is parsed with
    # the plain json module — this only works if the file actually contains no
    # comments. TODO confirm.
    with open(f'{root}/bench/config/ours_config.jsonc', 'r') as f:
        chain_config = json.load(f)

    # Process the benchmark records one JSONL line at a time.
    with open(f'{root}/bench/config/data_math_config.jsonl', 'r') as f:
        for line in f:
            test_data = json.loads(line.strip())
            content, content_list, main_html, statics = eval_ours_extract_html(
                chain_config,
                test_data
            )
            print('处理数据:', test_data.get('track_id'))
            print('URL:', test_data.get('url'))
            print('统计信息:', statics)

            out = {
                'url': test_data.get('url'),
                'content': content,
                'main_html': main_html,
                'content_list': content_list,
                # Raw input HTML re-read from disk so each output record is
                # self-contained.
                'html': reader.read(
                    f'{root}/bench/{test_data.get("path")}'
                ).decode('utf-8'),
                'statics': statics
            }

            # Take the first two levels of `path` to group outputs by source.
            path = test_data.get('path', '')
            path_parts = path.split('/')
            if len(path_parts) >= 2:
                output_subdir = '/'.join(path_parts[:2])
            else:
                output_subdir = 'unknown'

            # Create the matching per-group output directory.
            output_dir = f'{root}/bench/output/ours/{output_subdir}'
            os.makedirs(output_dir, exist_ok=True)

            # Append this record to the group's JSONL output file.
            output_file = f'{output_dir}/data_math_output.jsonl'
            writer.append_write(
                output_file,
                json.dumps(out).encode('utf-8') + b'\n'
            )
            print(f'结果已追加到: {output_file}')
68 changes: 43 additions & 25 deletions bench/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from pathlib import Path

from bench.common.metrics import Metrics
from bench.common.result import Error_Item, Result_Detail, Result_Summary
from bench.common.result import Result_Detail, Result_Summary
from bench.eval.ours import eval_ours_extract_html
from llm_web_kit.dataio.filebase import (FileBasedDataReader,
FileBasedDataWriter)
Expand All @@ -23,12 +23,36 @@
sourcePath = os.path.join(root, 'data/all.json')
outputPath = os.path.join(root, 'output')
pipelineConfigPath = os.path.join(root, 'config/ours_config.jsonc')
pipeline_data_path = os.path.join(root, 'config/ours_data_config.jsonl')
pipeline_data_path = os.path.join(root, 'config/data_config.jsonl')

reader = FileBasedDataReader('')
writer = FileBasedDataWriter('')


def run_ours(pipelineConfigPath, pipeline_data_path, outputPath, statics_pre):
    """Run the 'ours' extractor over every record in a benchmark JSONL file.

    Relies on the module-level globals ``reader``, ``writer``, ``root`` and
    ``args``, and on the ``Statics`` helper imported at module scope.

    Args:
        pipelineConfigPath: path to the chain config file.
        pipeline_data_path: path to the JSONL file of benchmark records.
        outputPath: root directory for per-record output files.
        statics_pre: accumulator object; each record's statics are merged in.
    """
    with open(pipeline_data_path, 'r') as f:
        for line in f:
            print(line)
            data_json = json.loads(line.strip())
            # NOTE(review): eval_ours_extract_html's first parameter is named
            # chain_config and its other caller passes a parsed dict, but here
            # a config *path* string is passed — verify the factory accepts a
            # path, otherwise this should json-load the file first.
            content, content_list, main_html, statics = eval_ours_extract_html(pipelineConfigPath, data_json)
            out = {
                'url': data_json.get('url'),
                'content': content,
                'main_html': main_html,
                'content_list': content_list,
                # Raw input HTML re-read from disk so the record is
                # self-contained.
                'html': reader.read(
                    f'{root}/{data_json.get("path")}'
                ).decode('utf-8'),
                'statics': statics
            }
            Statics(statics).print()
            statics_pre.merge_statics(statics)
            writer.write(
                f'{outputPath}/{args.tool}/{data_json.get("track_id")}.jsonl',
                json.dumps(out).encode('utf-8') + b'\n'
            )


def main():
out = {}
task_id = str(uuid.uuid1())
Expand Down Expand Up @@ -71,33 +95,27 @@ def main():
if args.tool == 'magic_html':
from bench.eval.magic_html import eval_magic_html
output = eval_magic_html(html, fileName)
out = {
'url': url,
'content': output,
'html': html,
}
writer.write(
f'{outputPath}/{args.tool}/{fileName}.jsonl',
json.dumps(out).encode('utf-8') + b'\n'
)
elif args.tool == 'unstructured':
from bench.eval.unstructured_eval import eval_unstructured
output = eval_unstructured(html, fileName)
elif args.tool == 'ours':
try:
print(pipelineConfigPath)
print(pipeline_data_path)
print(f'{root}/data/{origin_filepath}')
output, content_list, main_html, statics = eval_ours_extract_html(pipelineConfigPath, pipeline_data_path, f'{root}/data/{origin_filepath}', layout_type, url)
out['content_list'] = content_list
out['main_html'] = main_html
out['statics'] = statics
Statics(statics).print()
statics_pre.merge_statics(statics)
except Exception as e:
summary.error_summary['count'] += 1
detail.result_detail['error_result'].append(Error_Item(
file_path=origin_filepath,
error_detail=str(e)
))
else:
raise ValueError(f'Invalid tool: {args.tool}')
out = {
'url': url,
'content': output,
'html': html,
}
writer.write(f'{outputPath}/{args.tool}/{fileName}.jsonl', json.dumps(out).encode('utf-8') + b'\n')
if args.tool == 'ours':
run_ours(pipelineConfigPath, pipeline_data_path, outputPath, statics_pre)

out['url'] = url
out['content'] = output
out['html'] = html
writer.write(f'{outputPath}/{args.tool}/{fileName}.jsonl', json.dumps(out).encode('utf-8') + b'\n')
summary.finish()
detail.finish()
statics_gt.print()
Expand Down
17 changes: 17 additions & 0 deletions llm_web_kit/dataio/filebase.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,20 @@

with open(fn_path, 'wb') as f:
f.write(data)

def append_write(self, path: str, data: bytes) -> None:
    """Append data to a file, creating parent directories as needed.

    Args:
        path (str): the path of file; if the path is relative, it will be
            joined with parent_dir.
        data (bytes): the data to append.
    """
    fn_path = path
    if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
        fn_path = os.path.join(self._parent_dir, path)

    # Create the parent directory if there is one. Using exist_ok=True
    # directly (instead of an os.path.exists() pre-check) avoids a TOCTOU
    # race; the dirname != '' guard is still needed because makedirs('')
    # raises.
    dir_name = os.path.dirname(fn_path)
    if dir_name != '':
        os.makedirs(dir_name, exist_ok=True)

    # 'ab' creates the file if missing and appends otherwise.
    with open(fn_path, 'ab') as f:
        f.write(data)
4 changes: 4 additions & 0 deletions llm_web_kit/exception/exception.jsonc
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@
"code": 31031100,
"message": "HTML math recognizer exception"
},
"HtmlMathMathjaxRenderRecognizerException": {
"code": 31031110,
"message": "HTML math mathjax render recognizer exception"
},
"HtmlCodeRecognizerException": {
"code": 31031200,
"message": "HTML code recognizer exception"
Expand Down
8 changes: 8 additions & 0 deletions llm_web_kit/exception/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,14 @@
super().__init__(custom_message, error_code)


class HtmlMathMathjaxRenderRecognizerException(HtmlRecognizerException):
    """Exception raised during math render."""

    def __init__(self, custom_message: str | None = None, error_code: int | None = None):
        # Fall back to the registered code for this exception type when the
        # caller did not supply one explicitly.
        resolved_code = error_code if error_code is not None else ErrorMsg.get_error_code(
            'HtmlRecognizer', 'HtmlMathMathjaxRenderRecognizerException')
        super().__init__(custom_message, resolved_code)

Check warning on line 242 in llm_web_kit/exception/exception.py

View check run for this annotation

Codecov / codecov/patch

llm_web_kit/exception/exception.py#L240-L242

Added lines #L240 - L242 were not covered by tests


class HtmlCodeRecognizerException(HtmlRecognizerException):
"""Exception raised during code content recognition."""
def __init__(self, custom_message: str | None = None, error_code: int | None = None):
Expand Down
31 changes: 0 additions & 31 deletions llm_web_kit/extractor/html/recognizer/cc_math/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,6 @@ class MathType:
HTMLMATH = 'htmlmath' # sub, sup, etc.


# 数学公式渲染器
class MathRender:
MATHJAX = 'mathjax'
KATEX = 'katex'


# node.text匹配结果:
class MathMatchRes:
ALLMATCH = 'all_match'
Expand Down Expand Up @@ -195,31 +189,6 @@ def extract_asciimath(self, s: str) -> str:
parsed = asciimath2tex.translate(s)
return parsed

def get_math_render(self, html: str) -> str:
"""获取数学公式渲染器.
示例:
MathJax:
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-MML-AM_CHTML"></script>
Katex:
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.13.11/dist/katex.min.css">
"""
tree = html_to_element(html)
if tree is None:
return None
# 检查 KaTeX
for link in tree.iter('link'):
if link.get('href') and 'katex' in link.get('href', '').lower():
return MathRender.KATEX
# 查找head标签
# head = tree.find('head')
# if head is not None:
# 检查 MathJax
for script in tree.iter('script'):
src = script.get('src', '').lower()
if src and ('mathjax' in src or 'asciimath' in src):
return MathRender.MATHJAX
return None

def get_equation_type(self, html: str) -> List[Tuple[str, str]]:
"""根据latex_config判断数学公式是行内还是行间公式.

Expand Down
Empty file.
Loading