ccprocessor · drunkpig · Mar 18, 2025 · Mar 18, 2025 · Mar 18, 2025
diff --git a/llm_web_kit/config/cfg_reader.py b/llm_web_kit/config/cfg_reader.py
@@ -3,6 +3,7 @@
 import commentjson as json
 
 from llm_web_kit.exception.exception import ModelResourceException
+from llm_web_kit.libs.path_lib import get_py_pkg_root_dir
 
 
 def load_config() -> dict:
@@ -41,3 +42,17 @@ def load_config() -> dict:
         config = json.load(f)
 
     return config
+
+
+def load_pipe_tpl(pipe_name: str) -> dict:
+    """Load a named pipe template from get_py_pkg_root_dir()/config/pipe_tpl.
+
+    Args:
+        pipe_name(str): Template file name without the '.jsonc' suffix, e.g. 'html'
+
+    Returns: pipe_tpl(dict): The content parsed from the '<pipe_name>.jsonc' file
+    """
+    pipe_tpl_path = os.path.join(get_py_pkg_root_dir(), 'config', 'pipe_tpl', f'{pipe_name}.jsonc')
+    with open(pipe_tpl_path, 'r', encoding='utf-8') as f:
+        pipe_tpl = json.load(f)  # `json` is commentjson here (see imports)
+    return pipe_tpl
diff --git a/llm_web_kit/config/pipe_tpl/ebook.jsonc b/llm_web_kit/config/pipe_tpl/ebook.jsonc
@@ -0,0 +1,25 @@
+{
+    "extractor_pipe": {
+        "pre_extractor": [
+            {
+                "enable": true,
+                "python_class": "llm_web_kit.extractor.ebook.pre_extractor.EBOOKFileFormatFilterPreExtractor",
+                "class_init_kwargs": {}
+            }
+        ],
+        "extractor": [
+            {
+                "enable": true,
+                "python_class": "llm_web_kit.extractor.ebook.extractor.EBOOKFileFormatExtractor",
+                "class_init_kwargs": {}
+            }
+        ],
+        "post_extractor": [
+            {
+                "enable": false,
+                "python_class": "llm_web_kit.extractor.ebook.post_extractor.EBOOKFileFormatPostExtractor",
+                "class_init_kwargs": {}
+            }
+        ]
+    }
+}
diff --git a/llm_web_kit/config/pipe_tpl/html-test.jsonc b/llm_web_kit/config/pipe_tpl/html-test.jsonc
@@ -0,0 +1,37 @@
+{
+    "extractor_pipe": {
+        "enable": true,
+        "validate_input_format": false,
+        "pre_extractor": [
+            {
+                "enable": true,
+                "python_class": "llm_web_kit.extractor.html.pre_extractor.TestHTMLFileFormatFilterPreExtractor",
+                "class_init_kwargs": {
+                    "html_parent_dir": "tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/"
+                }
+            },
+            {
+                "enable": true,
+                "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor"
+            },
+            {
+                "enable": true,
+                "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor",
+                "class_init_kwargs": {}
+            }
+        ],
+        "extractor": [
+            {
+                "enable": true,
+                "python_class": "llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor",
+                "class_init_kwargs": {}
+            }
+        ],
+        "post_extractor": [
+            {
+                "enable": true,
+                "python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor"
+            }
+        ]
+    }
+}
diff --git a/llm_web_kit/config/pipe_tpl/html.jsonc b/llm_web_kit/config/pipe_tpl/html.jsonc
@@ -0,0 +1,30 @@
+{
+    "extractor_pipe": {
+        "enable": true,
+        "validate_input_format": false,
+        "pre_extractor": [
+            {
+                "enable": true,
+                "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor"
+            },
+            {
+                "enable": true,
+                "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor",
+                "class_init_kwargs": {}
+            }
+        ],
+        "extractor": [
+            {
+                "enable": true,
+                "python_class": "llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor",
+                "class_init_kwargs": {}
+            }
+        ],
+        "post_extractor": [
+            {
+                "enable": true,
+                "python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor"
+            }
+        ]
+    }
+}
diff --git a/llm_web_kit/config/pipe_tpl/pdf.jsonc b/llm_web_kit/config/pipe_tpl/pdf.jsonc
@@ -0,0 +1,25 @@
+{
+    "extractor_pipe": {
+        "pre_extractor": [
+            {
+                "enable": true,
+                "python_class": "llm_web_kit.extractor.pdf.pre_extractor.PDFFileFormatFilterPreExtractor",
+                "class_init_kwargs": {}
+            }
+        ],
+        "extractor": [
+            {
+                "enable": true,
+                "python_class": "llm_web_kit.extractor.pdf.extractor.PDFFileFormatExtractor",
+                "class_init_kwargs": {}
+            }
+        ],
+        "post_extractor": [
+            {
+                "enable": false,
+                "python_class": "llm_web_kit.extractor.pdf.post_extractor.PDFFileFormatPostExtractor",
+                "class_init_kwargs": {}
+            }
+        ]
+    }
+}
diff --git a/llm_web_kit/extractor/html/post_extractor.py b/llm_web_kit/extractor/html/post_extractor.py
@@ -93,7 +93,7 @@ def __do_normalize_text(self, paragraph: list[dict]) -> list[dict]:
         for segment in paragraph:
             text = segment['c']
             text_type = segment['t']
-            if text_type == ParagraphTextType.TEXT:
+            if text_type not in [ParagraphTextType.CODE_INLINE]:  # skip code
                 segment['c'] = normalize_text_segment(text)
         return paragraph
 

diff --git a/llm_web_kit/html_layout_classify/classify-spot.sh b/llm_web_kit/html_layout_classify/classify-spot.sh
@@ -124,10 +124,6 @@ do
 
         if [ "$PD_COUNT" -lt "$MAX_PENDING_JOBS" ] && [ $spot_count -lt $MAX_JOBS ]; then
             # 如果PD任务数小于最大限制，则提交新任务
-            # tt=$(date '+%Y-%m-%d %H:%M:%S')
-            # total_spot_used=$(calculate_total_spot_used)
-            # total_reserved_idle=$(calculate_total_reserved_idle)
-            # echo -e "check  $partation spot \n tt:$tt \n total_spot_used: $total_spot_used\n total_reserved_idle: $total_reserved_idle \n PD_COUNT: $PD_COUNT"
             if [ $DEBUG -eq 1 ]; then
                 LOG_LEVEL=ERROR srun -p ${partation} --quotatype=spot --output=${SLURM_LOG_DIR}/logs/output_%j.out --export=ALL  --error=${SLURM_LOG_DIR}/error/error_%j.err -N 1 -n${TASK_NUM} --gres=gpu:1   python main.py  ${SERVER_ADDR} --result-save-dir ${RESULT_SAVE_DIR}
             else

diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md
@@ -54,7 +54,7 @@ your MATLAB code, add the `%#codegen` pragma
 to the top of your MATLAB file. When you edit your code in the MATLAB editor,
 the MATLAB Code Analyzer flags functions and constructs that
 are not supported for code generation. See Check Code Using the MATLAB Code Analyzer. When you use the MATLAB
-            Coder™ app,
+ Coder™ app,
 the app screens your code for code generation readiness. At the function
 line, you can use the Code Generation Readiness Tool. See Check Code Using the Code Generation Readiness Tool.
 
@@ -68,7 +68,7 @@ However, running the test file can slow the code generation. It is
 a best practice to pass the properties to the `-args` option
 so that `convertToSingle` does not run the test
 file to determine the argument properties. If you have a MATLAB
-            Coder license,
+ Coder license,
 you can use `coder.getArgTypes` to determine the
 argument properties. For example:
 
@@ -94,4 +94,4 @@ scfg = coder.config('single');
 scfg.TestBenchName = 'mytest';
 scfg.TestNumerics = true;
 scfg.LogIOForComparisonPlotting = true;
-```
+```
diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt
@@ -39,7 +39,7 @@ your MATLAB code, add the `%#codegen` pragma
 to the top of your MATLAB file. When you edit your code in the MATLAB editor,
 the MATLAB Code Analyzer flags functions and constructs that
 are not supported for code generation. See Check Code Using the MATLAB Code Analyzer. When you use the MATLAB
-            Coder™ app,
+ Coder™ app,
 the app screens your code for code generation readiness. At the function
 line, you can use the Code Generation Readiness Tool. See Check Code Using the Code Generation Readiness Tool.
 Use the `-args` Option to Specify Input Properties
@@ -51,7 +51,7 @@ However, running the test file can slow the code generation. It is
 a best practice to pass the properties to the `-args` option
 so that `convertToSingle` does not run the test
 file to determine the argument properties. If you have a MATLAB
-            Coder license,
+ Coder license,
 you can use `coder.getArgTypes` to determine the
 argument properties. For example:
 ```
@@ -73,4 +73,4 @@ scfg = coder.config('single');
 scfg.TestBenchName = 'mytest';
 scfg.TestNumerics = true;
 scfg.LogIOForComparisonPlotting = true;
-```
+```
diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_code.py b/tests/llm_web_kit/extractor/html/recognizer/test_code.py
@@ -1,6 +1,7 @@
 import unittest
 from pathlib import Path
 
+from llm_web_kit.config.cfg_reader import load_pipe_tpl
 from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory
 from llm_web_kit.extractor.html.recognizer.cccode import CodeRecognizer
 from llm_web_kit.extractor.html.recognizer.recognizer import CCTag
@@ -202,32 +203,7 @@
 class TestCodeRecognizer(unittest.TestCase):
     def setUp(self):
         self.rec = CodeRecognizer()
-        self.chain_config = {
-            'extractor_pipe': {
-                'enable': True,
-                'validate_input_format': True,
-                'pre_extractor': [
-                    {
-                        'enable': True,
-                        'python_class': 'llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterPreExtractor',
-                        'class_init_kwargs': {}
-                    }
-                ],
-                'extractor': [
-                    {
-                        'enable': True,
-                        'python_class': 'llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor',
-                        'class_init_kwargs': {}
-                    }
-                ],
-                'post_extractor': [
-                    {
-                        'enable': False,
-                        'python_class': 'llm_web_kit.extractor.html.post_extractor.HTMLFileFormatPostExtractor'
-                    }
-                ]
-            },
-        }
+        self.chain_config = load_pipe_tpl('html')
 
     def compare_code(self, expect: str, answer: str) -> None:
         self.assertEqual(expect, answer)

diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py
@@ -3,6 +3,7 @@
 import unittest
 from pathlib import Path
 
+from llm_web_kit.config.cfg_reader import load_pipe_tpl
 from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory
 from llm_web_kit.extractor.html.recognizer.recognizer import \
     BaseHTMLElementRecognizer
@@ -14,44 +15,7 @@ class TestTextParagraphRecognize(unittest.TestCase):
     def setUp(self):
         self.text_recognize = TextParagraphRecognizer()
         # Config for HTML extraction
-        self.config = {
-            'extractor_pipe': {
-                'enable': True,
-                'validate_input_format': False,
-                'pre_extractor': [
-                    {
-                        'enable': True,
-                        'python_class': 'llm_web_kit.extractor.html.pre_extractor.TestHTMLFileFormatFilterPreExtractor',
-                        'class_init_kwargs': {
-                            'html_parent_dir': 'tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/',
-                        },
-                    },
-                    {
-                        'enable': True,
-                        'python_class': 'llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor',
-                    },
-                    {
-                        'enable': True,
-                        'python_class': 'llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor',
-                        'class_init_kwargs': {},
-                    }
-                ],
-                'extractor': [
-                    {
-                        'enable': True,
-                        'python_class': 'llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor',
-                        'class_init_kwargs': {},
-                    }
-                ],
-                'post_extractor': [
-                    {
-                        'enable': False,
-                        'python_class': 'llm_web_kit.extractor.html.post_extractor.HTMLFileFormatPostExtractor',
-                        'class_init_kwargs': {},
-                    }
-                ],
-            }
-        }
+        self.config = load_pipe_tpl('html-test')
 
     def test_text_1(self):
         """
@@ -87,7 +51,7 @@ def test_text_2(self):
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
         content_md = result.get_content_list().to_mm_md()
-        assert content_md[:130] == '''For Swivel Hand Rivet Squeezer or any snap Type .187 Shank Diameter Squeezer\n  \n\n Instructions for Selecting Rivet Sets:\n\nTo devel'''
+        assert content_md[:130] == '''For Swivel Hand Rivet Squeezer or any snap Type .187 Shank Diameter Squeezer\n \n\n Instructions for Selecting Rivet Sets:\n\nTo develo'''
 
     def test_text_3(self):
         """
@@ -109,7 +73,7 @@ def test_text_3(self):
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
         content_md = result.get_content_list().to_mm_md()
-        assert content_md[443:669] == '''2.\n    The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the\n    material?\n\n\n\n\n\n 2. Relevant equations\n\n\n\n\n\n\n\n 3. The\n        attempt at a solution\n\n1. di=22.22\n\n\n\n2. Dont know'''
+        assert content_md[371:584] == '''2.\n The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the\n material?\n\n\n\n\n\n 2. Relevant equations\n\n\n\n\n\n\n\n 3. The\n attempt at a solution\n\n1. di=22.22\n\n\n\n2. Dont know'''
 
     def test_text_4(self):
         """

diff --git a/tests/llm_web_kit/extractor/html/test_HTMLStripSpacePostExtractor.py b/tests/llm_web_kit/extractor/html/test_HTMLStripSpacePostExtractor.py
@@ -58,7 +58,7 @@ def test_space_post_extractor(self):
         self.assertEqual(text_1_processed, text_1_expected)
 
         text_2_processed = processed[0][0]['content']['items'][0][0][1]['c']
-        text_2_expected = 'E=mc^2   '
+        text_2_expected = 'E=mc^2 '
         self.assertEqual(text_2_processed, text_2_expected)
 
         text_3_processed = processed[0][0]['content']['items'][0][0][2]['c']
@@ -71,7 +71,7 @@ def test_space_post_extractor(self):
         self.assertEqual(text_4_processed, text_4_expected)
 
         text_5_processed = processed[0][1]['content'][1]['c']
-        text_5_expected = 'E=mc^2  '
+        text_5_expected = 'E=mc^2 '
         self.assertEqual(text_5_processed, text_5_expected)
 
         text_6_processed = processed[0][1]['content'][2]['c']