diff --git a/llm_web_kit/config/cfg_reader.py b/llm_web_kit/config/cfg_reader.py index 5b648f76..24aa9948 100644 --- a/llm_web_kit/config/cfg_reader.py +++ b/llm_web_kit/config/cfg_reader.py @@ -3,6 +3,7 @@ import commentjson as json from llm_web_kit.exception.exception import ModelResourceException +from llm_web_kit.libs.path_lib import get_py_pkg_root_dir def load_config() -> dict: @@ -41,3 +42,17 @@ def load_config() -> dict: config = json.load(f) return config + + +def load_pipe_tpl(pipe_name: str) -> dict: + """Load a pipe template and return its parsed content. + + Args: + pipe_name(str): Template name without the extension; the file read is config/pipe_tpl/<pipe_name>.jsonc under get_py_pkg_root_dir() + + Returns: pipe_tpl(dict): The template as parsed by commentjson (imported as json); errors from open() or json.load() are not caught here + """ + pipe_tpl_path = os.path.join(get_py_pkg_root_dir(), 'config', 'pipe_tpl', f'{pipe_name}.jsonc') + with open(pipe_tpl_path, 'r', encoding='utf-8') as f: + pipe_tpl = json.load(f) + return pipe_tpl diff --git a/llm_web_kit/config/pipe_tpl/ebook.jsonc b/llm_web_kit/config/pipe_tpl/ebook.jsonc new file mode 100644 index 00000000..61bef065 --- /dev/null +++ b/llm_web_kit/config/pipe_tpl/ebook.jsonc @@ -0,0 +1,25 @@ +{ + "extractor_pipe": { + "pre_extractor": [ + { + "enable": true, + "python_class": "llm_web_kit.extractor.ebook.pre_extractor.EBOOKFileFormatFilterPreExtractor", + "class_init_kwargs": {} + } + ], + "extractor": [ + { + "enable": true, + "python_class": "llm_web_kit.extractor.ebook.extractor.EBOOKFileFormatExtractor", + "class_init_kwargs": {} + } + ], + "post_extractor": [ + { + "enable": false, + "python_class": "llm_web_kit.extractor.ebook.post_extractor.EBOOKFileFormatPostExtractor", + "class_init_kwargs": {} + } + ] + } +} diff --git a/llm_web_kit/config/pipe_tpl/html-test.jsonc b/llm_web_kit/config/pipe_tpl/html-test.jsonc new file mode 100644 index 00000000..e569be23 --- /dev/null +++ b/llm_web_kit/config/pipe_tpl/html-test.jsonc @@ -0,0 +1,37 @@ +{ + "extractor_pipe": { + "enable": true, + "validate_input_format": false, + "pre_extractor": [ + { + "enable": true, + 
"python_class": "llm_web_kit.extractor.html.pre_extractor.TestHTMLFileFormatFilterPreExtractor", + "class_init_kwargs": { + "html_parent_dir": "tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/" + } + }, + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor" + }, + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor", + "class_init_kwargs": {} + } + ], + "extractor": [ + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor", + "class_init_kwargs": {} + } + ], + "post_extractor": [ + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor" + } + ] + } +} diff --git a/llm_web_kit/config/pipe_tpl/html.jsonc b/llm_web_kit/config/pipe_tpl/html.jsonc new file mode 100644 index 00000000..30838458 --- /dev/null +++ b/llm_web_kit/config/pipe_tpl/html.jsonc @@ -0,0 +1,30 @@ +{ + "extractor_pipe": { + "enable": true, + "validate_input_format": false, + "pre_extractor": [ + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor" + }, + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor", + "class_init_kwargs": {} + } + ], + "extractor": [ + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor", + "class_init_kwargs": {} + } + ], + "post_extractor": [ + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor" + } + ] + } +} diff --git a/llm_web_kit/config/pipe_tpl/pdf.jsonc b/llm_web_kit/config/pipe_tpl/pdf.jsonc new file mode 100644 index 00000000..9fb50cc0 --- /dev/null +++ b/llm_web_kit/config/pipe_tpl/pdf.jsonc @@ -0,0 +1,25 @@ +{ + "extractor_pipe": { + "pre_extractor": [ + { + "enable": true, + 
"python_class": "llm_web_kit.extractor.pdf.pre_extractor.PDFFileFormatFilterPreExtractor", + "class_init_kwargs": {} + } + ], + "extractor": [ + { + "enable": true, + "python_class": "llm_web_kit.extractor.pdf.extractor.PDFFileFormatExtractor", + "class_init_kwargs": {} + } + ], + "post_extractor": [ + { + "enable": false, + "python_class": "llm_web_kit.extractor.pdf.post_extractor.PDFFileFormatPostExtractor", + "class_init_kwargs": {} + } + ] + } +} diff --git a/llm_web_kit/extractor/html/post_extractor.py b/llm_web_kit/extractor/html/post_extractor.py index 4f1517ce..944d932a 100644 --- a/llm_web_kit/extractor/html/post_extractor.py +++ b/llm_web_kit/extractor/html/post_extractor.py @@ -93,7 +93,7 @@ def __do_normalize_text(self, paragraph: list[dict]) -> list[dict]: for segment in paragraph: text = segment['c'] text_type = segment['t'] - if text_type == ParagraphTextType.TEXT: + if text_type not in [ParagraphTextType.CODE_INLINE]: # skip code segment['c'] = normalize_text_segment(text) return paragraph diff --git a/llm_web_kit/html_layout_classify/classify-spot.sh b/llm_web_kit/html_layout_classify/classify-spot.sh index 5fbd0758..c5ed3c95 100755 --- a/llm_web_kit/html_layout_classify/classify-spot.sh +++ b/llm_web_kit/html_layout_classify/classify-spot.sh @@ -124,10 +124,6 @@ do if [ "$PD_COUNT" -lt "$MAX_PENDING_JOBS" ] && [ $spot_count -lt $MAX_JOBS ]; then # 如果PD任务数小于最大限制,则提交新任务 - # tt=$(date '+%Y-%m-%d %H:%M:%S') - # total_spot_used=$(calculate_total_spot_used) - # total_reserved_idle=$(calculate_total_reserved_idle) - # echo -e "check $partation spot \n tt:$tt \n total_spot_used: $total_spot_used\n total_reserved_idle: $total_reserved_idle \n PD_COUNT: $PD_COUNT" if [ $DEBUG -eq 1 ]; then LOG_LEVEL=ERROR srun -p ${partation} --quotatype=spot --output=${SLURM_LOG_DIR}/logs/output_%j.out --export=ALL --error=${SLURM_LOG_DIR}/error/error_%j.err -N 1 -n${TASK_NUM} --gres=gpu:1 python main.py ${SERVER_ADDR} --result-save-dir ${RESULT_SAVE_DIR} else diff --git 
a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md index 6ede30d9..f5b9f660 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md @@ -54,7 +54,7 @@ your MATLAB code, add the `%#codegen` pragma to the top of your MATLAB file. When you edit your code in the MATLAB editor, the MATLAB Code Analyzer flags functions and constructs that are not supported for code generation. See Check Code Using the MATLAB Code Analyzer. When you use the MATLAB - Coder™ app, + Coder™ app, the app screens your code for code generation readiness. At the function line, you can use the Code Generation Readiness Tool. See Check Code Using the Code Generation Readiness Tool. @@ -68,7 +68,7 @@ However, running the test file can slow the code generation. It is a best practice to pass the properties to the `-args` option so that `convertToSingle` does not run the test file to determine the argument properties. If you have a MATLAB - Coder license, + Coder license, you can use `coder.getArgTypes` to determine the argument properties. For example: @@ -94,4 +94,4 @@ scfg = coder.config('single'); scfg.TestBenchName = 'mytest'; scfg.TestNumerics = true; scfg.LogIOForComparisonPlotting = true; -``` +``` \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt index 3ce15baa..be6b47b1 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt @@ -39,7 +39,7 @@ your MATLAB code, add the `%#codegen` pragma to the top of your MATLAB file. 
When you edit your code in the MATLAB editor, the MATLAB Code Analyzer flags functions and constructs that are not supported for code generation. See Check Code Using the MATLAB Code Analyzer. When you use the MATLAB - Coder™ app, + Coder™ app, the app screens your code for code generation readiness. At the function line, you can use the Code Generation Readiness Tool. See Check Code Using the Code Generation Readiness Tool. Use the `-args` Option to Specify Input Properties @@ -51,7 +51,7 @@ However, running the test file can slow the code generation. It is a best practice to pass the properties to the `-args` option so that `convertToSingle` does not run the test file to determine the argument properties. If you have a MATLAB - Coder license, + Coder license, you can use `coder.getArgTypes` to determine the argument properties. For example: ``` @@ -73,4 +73,4 @@ scfg = coder.config('single'); scfg.TestBenchName = 'mytest'; scfg.TestNumerics = true; scfg.LogIOForComparisonPlotting = true; -``` +``` \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_code.py b/tests/llm_web_kit/extractor/html/recognizer/test_code.py index 40f758c1..ea7a4fb7 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_code.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_code.py @@ -1,6 +1,7 @@ import unittest from pathlib import Path +from llm_web_kit.config.cfg_reader import load_pipe_tpl from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory from llm_web_kit.extractor.html.recognizer.cccode import CodeRecognizer from llm_web_kit.extractor.html.recognizer.recognizer import CCTag @@ -202,32 +203,7 @@ class TestCodeRecognizer(unittest.TestCase): def setUp(self): self.rec = CodeRecognizer() - self.chain_config = { - 'extractor_pipe': { - 'enable': True, - 'validate_input_format': True, - 'pre_extractor': [ - { - 'enable': True, - 'python_class': 'llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterPreExtractor', 
- 'class_init_kwargs': {} - } - ], - 'extractor': [ - { - 'enable': True, - 'python_class': 'llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor', - 'class_init_kwargs': {} - } - ], - 'post_extractor': [ - { - 'enable': False, - 'python_class': 'llm_web_kit.extractor.html.post_extractor.HTMLFileFormatPostExtractor' - } - ] - }, - } + self.chain_config = load_pipe_tpl('html') def compare_code(self, expect: str, answer: str) -> None: self.assertEqual(expect, answer) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py index 807a05db..a9d368a1 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py @@ -3,6 +3,7 @@ import unittest from pathlib import Path +from llm_web_kit.config.cfg_reader import load_pipe_tpl from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory from llm_web_kit.extractor.html.recognizer.recognizer import \ BaseHTMLElementRecognizer @@ -14,44 +15,7 @@ class TestTextParagraphRecognize(unittest.TestCase): def setUp(self): self.text_recognize = TextParagraphRecognizer() # Config for HTML extraction - self.config = { - 'extractor_pipe': { - 'enable': True, - 'validate_input_format': False, - 'pre_extractor': [ - { - 'enable': True, - 'python_class': 'llm_web_kit.extractor.html.pre_extractor.TestHTMLFileFormatFilterPreExtractor', - 'class_init_kwargs': { - 'html_parent_dir': 'tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/', - }, - }, - { - 'enable': True, - 'python_class': 'llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor', - }, - { - 'enable': True, - 'python_class': 'llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor', - 'class_init_kwargs': {}, - } - ], - 'extractor': [ - { - 'enable': True, - 'python_class': 'llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor', - 'class_init_kwargs': 
{}, - } - ], - 'post_extractor': [ - { - 'enable': False, - 'python_class': 'llm_web_kit.extractor.html.post_extractor.HTMLFileFormatPostExtractor', - 'class_init_kwargs': {}, - } - ], - } - } + self.config = load_pipe_tpl('html-test') def test_text_1(self): """ @@ -87,7 +51,7 @@ def test_text_2(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert content_md[:130] == '''For Swivel Hand Rivet Squeezer or any snap Type .187 Shank Diameter Squeezer\n \n\n Instructions for Selecting Rivet Sets:\n\nTo devel''' + assert content_md[:130] == '''For Swivel Hand Rivet Squeezer or any snap Type .187 Shank Diameter Squeezer\n \n\n Instructions for Selecting Rivet Sets:\n\nTo develo''' def test_text_3(self): """ @@ -109,7 +73,7 @@ def test_text_3(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert content_md[443:669] == '''2.\n The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the\n material?\n\n\n\n\n\n 2. Relevant equations\n\n\n\n\n\n\n\n 3. The\n attempt at a solution\n\n1. di=22.22\n\n\n\n2. Dont know''' + assert content_md[371:584] == '''2.\n The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the\n material?\n\n\n\n\n\n 2. Relevant equations\n\n\n\n\n\n\n\n 3. The\n attempt at a solution\n\n1. di=22.22\n\n\n\n2. 
Dont know''' def test_text_4(self): """ diff --git a/tests/llm_web_kit/extractor/html/test_HTMLStripSpacePostExtractor.py b/tests/llm_web_kit/extractor/html/test_HTMLStripSpacePostExtractor.py index c8986236..52d19855 100644 --- a/tests/llm_web_kit/extractor/html/test_HTMLStripSpacePostExtractor.py +++ b/tests/llm_web_kit/extractor/html/test_HTMLStripSpacePostExtractor.py @@ -58,7 +58,7 @@ def test_space_post_extractor(self): self.assertEqual(text_1_processed, text_1_expected) text_2_processed = processed[0][0]['content']['items'][0][0][1]['c'] - text_2_expected = 'E=mc^2 ' + text_2_expected = 'E=mc^2 ' self.assertEqual(text_2_processed, text_2_expected) text_3_processed = processed[0][0]['content']['items'][0][0][2]['c'] @@ -71,7 +71,7 @@ def test_space_post_extractor(self): self.assertEqual(text_4_processed, text_4_expected) text_5_processed = processed[0][1]['content'][1]['c'] - text_5_expected = 'E=mc^2 ' + text_5_expected = 'E=mc^2 ' self.assertEqual(text_5_processed, text_5_expected) text_6_processed = processed[0][1]['content'][2]['c'] diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 0fdf2da3..70ee7f26 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -16,6 +16,7 @@ from lxml import html +from llm_web_kit.config.cfg_reader import load_pipe_tpl from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory from llm_web_kit.extractor.html.recognizer.cc_math.common import MathType from llm_web_kit.input.datajson import DataJson @@ -62,44 +63,7 @@ def setUp(self): assert len(self.data_json) == 20 # Config for HTML extraction - self.config = { - 'extractor_pipe': { - 'enable': True, - 'validate_input_format': False, - 'pre_extractor': [ - { - 'enable': True, - 'python_class': 'llm_web_kit.extractor.html.pre_extractor.TestHTMLFileFormatFilterPreExtractor', - 'class_init_kwargs': { - 'html_parent_dir': 
'tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/', - }, - }, - { - 'enable': True, - 'python_class': 'llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor', - }, - { - 'enable': True, - 'python_class': 'llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor', - 'class_init_kwargs': {}, - } - ], - 'extractor': [ - { - 'enable': True, - 'python_class': 'llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor', - 'class_init_kwargs': {}, - } - ], - 'post_extractor': [ - { - 'enable': False, - 'python_class': 'llm_web_kit.extractor.html.post_extractor.HTMLFileFormatPostExtractor', - 'class_init_kwargs': {}, - } - ], - } - } + self.config = load_pipe_tpl('html-test') def test_html_pipeline(self): """Test HTML extractor with sample data.""" diff --git a/tests/llm_web_kit/extractor/test_extractor_chain_normal.py b/tests/llm_web_kit/extractor/test_extractor_chain_normal.py index 355ed50d..2f36d6eb 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain_normal.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain_normal.py @@ -3,6 +3,7 @@ import unittest from unittest.mock import MagicMock, patch +from llm_web_kit.config.cfg_reader import load_pipe_tpl from llm_web_kit.exception.exception import (ExtractorChainBaseException, ExtractorChainConfigException, ExtractorChainInputException, @@ -21,90 +22,13 @@ def setUp(self): self.base_path = os.path.dirname(os.path.abspath(__file__)) # Basic HTML config - self.html_config = { - 'extractor_pipe': { - 'pre_extractor': [ - { - 'enable': True, - 'python_class': 'llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterPreExtractor', - 'class_init_kwargs': {}, - }, - { - 'enable': True, - 'python_class': 'llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor', - 'class_init_kwargs': {}, - } - ], - 'extractor': [ - { - 'enable': True, - 'python_class': 'llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor', - 
'class_init_kwargs': {}, - } - ], - 'post_extractor': [ - { - 'enable': False, - 'python_class': 'llm_web_kit.extractor.html.post_extractor.HTMLFileFormatPostExtractor', - 'class_init_kwargs': {}, - } - ], - } - } + self.html_config = load_pipe_tpl('html') # Basic PDF config - self.pdf_config = { - 'extractor_pipe': { - 'pre_extractor': [ - { - 'enable': True, - 'python_class': 'llm_web_kit.extractor.pdf.pre_extractor.PDFFileFormatFilterPreExtractor', - 'class_init_kwargs': {}, - } - ], - 'extractor': [ - { - 'enable': True, - 'python_class': 'llm_web_kit.extractor.pdf.extractor.PDFFileFormatExtractor', - 'class_init_kwargs': {}, - } - ], - 'post_extractor': [ - { - 'enable': False, - 'python_class': 'llm_web_kit.extractor.pdf.post_extractor.PDFFileFormatPostExtractor', - 'class_init_kwargs': {}, - } - ], - } - } + self.pdf_config = load_pipe_tpl('pdf') # Basic EBOOK config - self.ebook_config = { - 'extractor_pipe': { - 'pre_extractor': [ - { - 'enable': True, - 'python_class': 'llm_web_kit.extractor.ebook.pre_extractor.EBOOKFileFormatFilterPreExtractor', - 'class_init_kwargs': {}, - } - ], - 'extractor': [ - { - 'enable': True, - 'python_class': 'llm_web_kit.extractor.ebook.extractor.EBOOKFileFormatExtractor', - 'class_init_kwargs': {}, - } - ], - 'post_extractor': [ - { - 'enable': False, - 'python_class': 'llm_web_kit.extractor.ebook.post_extractor.EBOOKFileFormatPostExtractor', - 'class_init_kwargs': {}, - } - ], - } - } + self.ebook_config = load_pipe_tpl('ebook') def test_factory_create(self): """Test factory creation with different inputs."""