Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions llm_web_kit/config/cfg_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import commentjson as json

from llm_web_kit.exception.exception import ModelResourceException
from llm_web_kit.libs.path_lib import get_py_pkg_root_dir


def load_config() -> dict:
Expand Down Expand Up @@ -41,3 +42,17 @@ def load_config() -> dict:
config = json.load(f)

return config


def load_pipe_tpl(pipe_name: str) -> dict:
    """Read a bundled pipe template and return its parsed content.

    The template file is looked up as ``config/pipe_tpl/<pipe_name>.jsonc``
    underneath the directory returned by ``get_py_pkg_root_dir()`` and is
    parsed with the module's ``json`` loader.

    Args:
        pipe_name(str): Base name of the template file, without the
            ``.jsonc`` extension.

    Returns:
        dict: The parsed content of the template file.
    """
    tpl_filename = f'{pipe_name}.jsonc'
    tpl_dir = os.path.join(get_py_pkg_root_dir(), 'config', 'pipe_tpl')
    tpl_file = os.path.join(tpl_dir, tpl_filename)
    with open(tpl_file, 'r', encoding='utf-8') as fp:
        return json.load(fp)
25 changes: 25 additions & 0 deletions llm_web_kit/config/pipe_tpl/ebook.jsonc
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"extractor_pipe": {
"pre_extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.ebook.pre_extractor.EBOOKFileFormatFilterPreExtractor",
"class_init_kwargs": {}
}
],
"extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.ebook.extractor.EBOOKFileFormatExtractor",
"class_init_kwargs": {}
}
],
"post_extractor": [
{
"enable": false,
"python_class": "llm_web_kit.extractor.ebook.post_extractor.EBOOKFileFormatPostExtractor",
"class_init_kwargs": {}
}
]
}
}
37 changes: 37 additions & 0 deletions llm_web_kit/config/pipe_tpl/html-test.jsonc
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"extractor_pipe": {
"enable": true,
"validate_input_format": false,
"pre_extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.TestHTMLFileFormatFilterPreExtractor",
"class_init_kwargs": {
"html_parent_dir": "tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/"
}
},
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor"
},
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor",
"class_init_kwargs": {}
}
],
"extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor",
"class_init_kwargs": {}
}
],
"post_extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor"
}
]
}
}
30 changes: 30 additions & 0 deletions llm_web_kit/config/pipe_tpl/html.jsonc
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"extractor_pipe": {
"enable": true,
"validate_input_format": false,
"pre_extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor"
},
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor",
"class_init_kwargs": {}
}
],
"extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor",
"class_init_kwargs": {}
}
],
"post_extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor"
}
]
}
}
25 changes: 25 additions & 0 deletions llm_web_kit/config/pipe_tpl/pdf.jsonc
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"extractor_pipe": {
"pre_extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.pdf.pre_extractor.PDFFileFormatFilterPreExtractor",
"class_init_kwargs": {}
}
],
"extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.pdf.extractor.PDFFileFormatExtractor",
"class_init_kwargs": {}
}
],
"post_extractor": [
{
"enable": false,
"python_class": "llm_web_kit.extractor.pdf.post_extractor.PDFFileFormatPostExtractor",
"class_init_kwargs": {}
}
]
}
}
2 changes: 1 addition & 1 deletion llm_web_kit/extractor/html/post_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def __do_normalize_text(self, paragraph: list[dict]) -> list[dict]:
for segment in paragraph:
text = segment['c']
text_type = segment['t']
if text_type == ParagraphTextType.TEXT:
if text_type not in [ParagraphTextType.CODE_INLINE]: # skip code
segment['c'] = normalize_text_segment(text)
return paragraph

Expand Down
4 changes: 0 additions & 4 deletions llm_web_kit/html_layout_classify/classify-spot.sh
Original file line number Diff line number Diff line change
Expand Up @@ -124,10 +124,6 @@ do

if [ "$PD_COUNT" -lt "$MAX_PENDING_JOBS" ] && [ $spot_count -lt $MAX_JOBS ]; then
# 如果PD任务数小于最大限制,则提交新任务
# tt=$(date '+%Y-%m-%d %H:%M:%S')
# total_spot_used=$(calculate_total_spot_used)
# total_reserved_idle=$(calculate_total_reserved_idle)
# echo -e "check $partation spot \n tt:$tt \n total_spot_used: $total_spot_used\n total_reserved_idle: $total_reserved_idle \n PD_COUNT: $PD_COUNT"
if [ $DEBUG -eq 1 ]; then
LOG_LEVEL=ERROR srun -p ${partation} --quotatype=spot --output=${SLURM_LOG_DIR}/logs/output_%j.out --export=ALL --error=${SLURM_LOG_DIR}/error/error_%j.err -N 1 -n${TASK_NUM} --gres=gpu:1 python main.py ${SERVER_ADDR} --result-save-dir ${RESULT_SAVE_DIR}
else
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ your MATLAB code, add the `%#codegen` pragma
to the top of your MATLAB file. When you edit your code in the MATLAB editor,
the MATLAB Code Analyzer flags functions and constructs that
are not supported for code generation. See Check Code Using the MATLAB Code Analyzer. When you use the MATLAB
Coder™ app,
Coder™ app,
the app screens your code for code generation readiness. At the function
line, you can use the Code Generation Readiness Tool. See Check Code Using the Code Generation Readiness Tool.

Expand All @@ -68,7 +68,7 @@ However, running the test file can slow the code generation. It is
a best practice to pass the properties to the `-args` option
so that `convertToSingle` does not run the test
file to determine the argument properties. If you have a MATLAB
Coder license,
Coder license,
you can use `coder.getArgTypes` to determine the
argument properties. For example:

Expand All @@ -94,4 +94,4 @@ scfg = coder.config('single');
scfg.TestBenchName = 'mytest';
scfg.TestNumerics = true;
scfg.LogIOForComparisonPlotting = true;
```
```
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ your MATLAB code, add the `%#codegen` pragma
to the top of your MATLAB file. When you edit your code in the MATLAB editor,
the MATLAB Code Analyzer flags functions and constructs that
are not supported for code generation. See Check Code Using the MATLAB Code Analyzer. When you use the MATLAB
Coder™ app,
Coder™ app,
the app screens your code for code generation readiness. At the function
line, you can use the Code Generation Readiness Tool. See Check Code Using the Code Generation Readiness Tool.
Use the `-args` Option to Specify Input Properties
Expand All @@ -51,7 +51,7 @@ However, running the test file can slow the code generation. It is
a best practice to pass the properties to the `-args` option
so that `convertToSingle` does not run the test
file to determine the argument properties. If you have a MATLAB
Coder license,
Coder license,
you can use `coder.getArgTypes` to determine the
argument properties. For example:
```
Expand All @@ -73,4 +73,4 @@ scfg = coder.config('single');
scfg.TestBenchName = 'mytest';
scfg.TestNumerics = true;
scfg.LogIOForComparisonPlotting = true;
```
```
28 changes: 2 additions & 26 deletions tests/llm_web_kit/extractor/html/recognizer/test_code.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import unittest
from pathlib import Path

from llm_web_kit.config.cfg_reader import load_pipe_tpl
from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory
from llm_web_kit.extractor.html.recognizer.cccode import CodeRecognizer
from llm_web_kit.extractor.html.recognizer.recognizer import CCTag
Expand Down Expand Up @@ -202,32 +203,7 @@
class TestCodeRecognizer(unittest.TestCase):
def setUp(self):
self.rec = CodeRecognizer()
self.chain_config = {
'extractor_pipe': {
'enable': True,
'validate_input_format': True,
'pre_extractor': [
{
'enable': True,
'python_class': 'llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterPreExtractor',
'class_init_kwargs': {}
}
],
'extractor': [
{
'enable': True,
'python_class': 'llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor',
'class_init_kwargs': {}
}
],
'post_extractor': [
{
'enable': False,
'python_class': 'llm_web_kit.extractor.html.post_extractor.HTMLFileFormatPostExtractor'
}
]
},
}
self.chain_config = load_pipe_tpl('html')

def compare_code(self, expect: str, answer: str) -> None:
self.assertEqual(expect, answer)
Expand Down
44 changes: 4 additions & 40 deletions tests/llm_web_kit/extractor/html/recognizer/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import unittest
from pathlib import Path

from llm_web_kit.config.cfg_reader import load_pipe_tpl
from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory
from llm_web_kit.extractor.html.recognizer.recognizer import \
BaseHTMLElementRecognizer
Expand All @@ -14,44 +15,7 @@ class TestTextParagraphRecognize(unittest.TestCase):
def setUp(self):
self.text_recognize = TextParagraphRecognizer()
# Config for HTML extraction
self.config = {
'extractor_pipe': {
'enable': True,
'validate_input_format': False,
'pre_extractor': [
{
'enable': True,
'python_class': 'llm_web_kit.extractor.html.pre_extractor.TestHTMLFileFormatFilterPreExtractor',
'class_init_kwargs': {
'html_parent_dir': 'tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/',
},
},
{
'enable': True,
'python_class': 'llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor',
},
{
'enable': True,
'python_class': 'llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor',
'class_init_kwargs': {},
}
],
'extractor': [
{
'enable': True,
'python_class': 'llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor',
'class_init_kwargs': {},
}
],
'post_extractor': [
{
'enable': False,
'python_class': 'llm_web_kit.extractor.html.post_extractor.HTMLFileFormatPostExtractor',
'class_init_kwargs': {},
}
],
}
}
self.config = load_pipe_tpl('html-test')

def test_text_1(self):
"""
Expand Down Expand Up @@ -87,7 +51,7 @@ def test_text_2(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
assert content_md[:130] == '''For Swivel Hand Rivet Squeezer or any snap Type .187 Shank Diameter Squeezer\n \n\n Instructions for Selecting Rivet Sets:\n\nTo devel'''
assert content_md[:130] == '''For Swivel Hand Rivet Squeezer or any snap Type .187 Shank Diameter Squeezer\n \n\n Instructions for Selecting Rivet Sets:\n\nTo develo'''

def test_text_3(self):
"""
Expand All @@ -109,7 +73,7 @@ def test_text_3(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
assert content_md[443:669] == '''2.\n The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the\n material?\n\n\n\n\n\n 2. Relevant equations\n\n\n\n\n\n\n\n 3. The\n attempt at a solution\n\n1. di=22.22\n\n\n\n2. Dont know'''
assert content_md[371:584] == '''2.\n The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the\n material?\n\n\n\n\n\n 2. Relevant equations\n\n\n\n\n\n\n\n 3. The\n attempt at a solution\n\n1. di=22.22\n\n\n\n2. Dont know'''

def test_text_4(self):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def test_space_post_extractor(self):
self.assertEqual(text_1_processed, text_1_expected)

text_2_processed = processed[0][0]['content']['items'][0][0][1]['c']
text_2_expected = 'E=mc^2 '
text_2_expected = 'E=mc^2 '
self.assertEqual(text_2_processed, text_2_expected)

text_3_processed = processed[0][0]['content']['items'][0][0][2]['c']
Expand All @@ -71,7 +71,7 @@ def test_space_post_extractor(self):
self.assertEqual(text_4_processed, text_4_expected)

text_5_processed = processed[0][1]['content'][1]['c']
text_5_expected = 'E=mc^2 '
text_5_expected = 'E=mc^2 '
self.assertEqual(text_5_processed, text_5_expected)

text_6_processed = processed[0][1]['content'][2]['c']
Expand Down
Loading