Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ repos:
rev: 5.0.4
hooks:
- id: flake8
args: [ "--max-line-length=2200", "--ignore=E131,E125,W503,W504,E203,E231,E702,E128,E402" ]
args: [ "--max-line-length=2200", "--ignore=E131,E125,W503,W504,E203,E231,E702,E128,E402,W604" ]
exclude: '^tests/.*/assets/'
- repo: https://github.com/PyCQA/isort
rev: 5.11.5
Expand Down Expand Up @@ -33,8 +33,8 @@ repos:
- id: end-of-file-fixer
exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*|\.ipynb$'
- id: requirements-txt-fixer
- id: double-quote-string-fixer
exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*|jupyter/domain_clustering/.*'
# - id: double-quote-string-fixer
# exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*|jupyter/domain_clustering/.*'
- id: check-merge-conflict
- id: fix-encoding-pragma
args: [ "--remove" ]
Expand Down
49 changes: 33 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,67 +78,84 @@ llm-web-kit is a python library that ..
### extract by magic_html+recognize

```python
from llm_web_kit.simple import extract_html_to_md, extract_html_to_mm_md
import traceback
from llm_web_kit.simple import extract_content_from_html_with_magic_html
from loguru import logger

def extract(url:str, html:str) -> str:
try:
nlp_md = extract_html_to_md(url, html)
# or mm_nlp_md = extract_html_to_mm_md(url, html)
nlp_md = extract_content_from_html_with_magic_html(url, html)
# or mm_nlp_md = extract_content_from_html_with_magic_html(url, html, 'mm_md')
return nlp_md
except Exception as e:
logger.exception(e)
return None

if __name__=="__main__":
url = ""
html = ""
html = '''<html><body>
<div class="options-div-0-0 option-box__items" style="display: none;">
<span class="bedroom-rate__title">Room Only Rate</span>
<span class="bedroom-rate__price">£1,230.00</span>
</div>
<p>正常内容</p>
</body></html>'''
markdown = extract(url, html)
print(markdown)
```

### only extract by recognize

```python
from llm_web_kit.simple import extract_html_to_md, extract_html_to_mm_md
import traceback
from llm_web_kit.simple import extract_content_from_main_html
from loguru import logger

def extract(url:str, raw_html:str) -> str:
def extract(url:str, html:str) -> str:
try:
nlp_md = extract_html_to_md(url, raw_html, clip_html=False)
# or mm_nlp_md = extract_html_to_mm_md(url, raw_html, clip_html=False)
nlp_md = extract_content_from_main_html(url, html)
# or mm_nlp_md = extract_content_from_main_html(url, html, 'mm_md')
return nlp_md
except Exception as e:
logger.exception(e)
return None

if __name__=="__main__":
url = ""
html = ""
html = '''<html><body>
<div class="options-div-0-0 option-box__items" style="display: none;">
<span class="bedroom-rate__title">Room Only Rate</span>
<span class="bedroom-rate__price">£1,230.00</span>
</div>
<p>正常内容</p>
</body></html>'''
markdown = extract(url, html)
print(markdown)
```

### only extract main_html by magic-html

```python
from llm_web_kit.simple import extract_main_html_by_maigic_html
import traceback
from llm_web_kit.simple import extract_main_html_only
from loguru import logger

def extract(url:str, html:str) -> str:
try:
main_html = extract_main_html_by_maigic_html(url, html)
# or mm_main_html = extract_pure_html_to_mm_md(url, html)
main_html = extract_main_html_only(url, html)
return main_html
except Exception as e:
logger.exception(e)
return None

if __name__=="__main__":
url = ""
html = ""
html = '''<html><body>
<div class="options-div-0-0 option-box__items" style="display: none;">
<span class="bedroom-rate__title">Room Only Rate</span>
<span class="bedroom-rate__price">£1,230.00</span>
</div>
<p>正常内容</p>
</body></html>'''
main_html = extract(url, html)
print(main_html)
```

### extract main_html by model response
Expand Down
19 changes: 15 additions & 4 deletions bench/config/ours_config.jsonc
Original file line number Diff line number Diff line change
Expand Up @@ -20,24 +20,35 @@
"extractor_pipe": {
"enable": true,
"validate_input_format": false,
"pre_extractor": [
"main_html_parser": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.TestHTMLFileFormatFilterPreExtractor",
"python_class": "llm_web_kit.extractor.html.main_html_parser.TestHTMLFileFormatFilterMainHtmlParser",
"class_init_kwargs": {
"html_parent_dir": "bench/"
}
},
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.main_html_parser.MagicHTMLMainHtmlParser",
"class_init_kwargs": {}
}],
"pre_extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipFilterTablePreExtractor",
"class_init_kwargs": {}
},
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor",
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipCleanTagsPreExtractor",
"class_init_kwargs": {}
}
],
"extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.extractor.MagicHTMLFIleFormatorExtractor",
"python_class": "llm_web_kit.extractor.html.extractor.NoClipHTMLFIleFormatorExtractor",
"class_init_kwargs": {}
}
],
Expand Down
4 changes: 2 additions & 2 deletions bench/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from bench.eval.ours import eval_ours_extract_html
from llm_web_kit.dataio.filebase import (FileBasedDataReader,
FileBasedDataWriter)
from llm_web_kit.extractor.html.extractor import MagicHTMLFIleFormatorExtractor
from llm_web_kit.extractor.html.main_html_parser import MagicHTMLMainHtmlParser
from llm_web_kit.libs.statics import Statics


Expand Down Expand Up @@ -100,7 +100,7 @@ def run_ours(config_path, data_path, output_path, statics_pre, reader, writer,
print(f'文件不存在或路径为空: {file_path}')

# 提取main_html
htmlExtractor = MagicHTMLFIleFormatorExtractor(
htmlExtractor = MagicHTMLMainHtmlParser(
chain_config)
main_html, method, title = htmlExtractor._extract_main_html(
html_content, data_json.get('url', ''),
Expand Down
17 changes: 12 additions & 5 deletions llm_web_kit/config/pipe_tpl/html-test.jsonc
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,35 @@
"extractor_pipe": {
"enable": true,
"validate_input_format": false,
"pre_extractor": [
"main_html_parser": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.TestHTMLFileFormatFilterPreExtractor",
"python_class": "llm_web_kit.extractor.html.main_html_parser.TestHTMLFileFormatFilterMainHtmlParser",
"class_init_kwargs": {
"html_parent_dir": "tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/"
}
},
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor"
"python_class": "llm_web_kit.extractor.html.main_html_parser.MagicHTMLMainHtmlParser",
"class_init_kwargs": {}
}
],
"pre_extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipFilterTablePreExtractor"
},
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor",
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipCleanTagsPreExtractor",
"class_init_kwargs": {}
}
],
"extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.extractor.MagicHTMLFIleFormatorExtractor",
"python_class": "llm_web_kit.extractor.html.extractor.NoClipHTMLFIleFormatorExtractor",
"class_init_kwargs": {}
}
],
Expand Down
13 changes: 10 additions & 3 deletions llm_web_kit/config/pipe_tpl/html.jsonc
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,28 @@
"extractor_pipe": {
"enable": true,
"validate_input_format": false,
"main_html_parser": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.main_html_parser.MagicHTMLMainHtmlParser",
"class_init_kwargs": {}
}
],
"pre_extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor"
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipFilterTablePreExtractor"
},
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor",
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipCleanTagsPreExtractor",
"class_init_kwargs": {}
}
],
"extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.extractor.MagicHTMLFIleFormatorExtractor",
"python_class": "llm_web_kit.extractor.html.extractor.NoClipHTMLFIleFormatorExtractor",
"class_init_kwargs": {}
}
],
Expand Down
18 changes: 18 additions & 0 deletions llm_web_kit/config/pipe_tpl/layout_batch_html.jsonc
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Extractor pipeline template that runs only the main_html_parser stage,
// using LayoutBatchMainHtmlParser; all later stage lists are empty.
{
"extractor_pipe": {
"enable": true,
"validate_input_format": false,
// Stage 1: select main_html using layout_batch
"main_html_parser": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.main_html_parser.LayoutBatchMainHtmlParser",
"class_init_kwargs": {}
}
],
// Stage 2 is not executed: pre_extractor, extractor and post_extractor are left empty
"pre_extractor": [],
"extractor": [],
"post_extractor": []
}
}
39 changes: 39 additions & 0 deletions llm_web_kit/config/pipe_tpl/layout_batch_noclip_html.jsonc
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Extractor pipeline template that selects main_html with
// LayoutBatchMainHtmlParser and then runs the NoClip pre-extractors,
// extractor and post-extractor listed below.
{
"extractor_pipe": {
"enable": true,
"validate_input_format": false,
// Stage 1: select main_html using layout_batch
"main_html_parser": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.main_html_parser.LayoutBatchMainHtmlParser",
"class_init_kwargs": {}
}
],
// Stage 2: extract the HTML into md
"pre_extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipFilterTablePreExtractor"
},
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipCleanTagsPreExtractor",
"class_init_kwargs": {}
}
],
"extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.extractor.NoClipHTMLFIleFormatorExtractor",
"class_init_kwargs": {}
}
],
"post_extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.post_extractor.ContentListStripSpacePostExtractor"
}
]
}
}
18 changes: 18 additions & 0 deletions llm_web_kit/config/pipe_tpl/llm_html.jsonc
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Extractor pipeline template that runs only the main_html_parser stage,
// using LLMMainHtmlParser; all later stage lists are empty.
{
"extractor_pipe": {
"enable": true,
"validate_input_format": false,
// Stage 1: select main_html using the LLM
"main_html_parser": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.main_html_parser.LLMMainHtmlParser",
"class_init_kwargs": {}
}
],
// Stage 2 is not executed: pre_extractor, extractor and post_extractor are left empty
"pre_extractor": [],
"extractor": [],
"post_extractor": []
}
}
39 changes: 39 additions & 0 deletions llm_web_kit/config/pipe_tpl/llm_noclip_html.jsonc
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Extractor pipeline template that selects main_html with LLMMainHtmlParser
// and then runs the NoClip pre-extractors, extractor and post-extractor
// listed below.
{
"extractor_pipe": {
"enable": true,
"validate_input_format": false,
// Stage 1: select main_html using the LLM
"main_html_parser": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.main_html_parser.LLMMainHtmlParser",
"class_init_kwargs": {}
}
],
// Stage 2: extract the HTML into md
"pre_extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipFilterTablePreExtractor"
},
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipCleanTagsPreExtractor",
"class_init_kwargs": {}
}
],
"extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.extractor.NoClipHTMLFIleFormatorExtractor",
"class_init_kwargs": {}
}
],
"post_extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.post_extractor.ContentListStripSpacePostExtractor"
}
]
}
}
17 changes: 17 additions & 0 deletions llm_web_kit/config/pipe_tpl/magic_html.jsonc
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// Extractor pipeline template that runs only the main_html_parser stage,
// using MagicHTMLMainHtmlParser; all later stage lists are empty.
{
"extractor_pipe": {
"enable": true,
"validate_input_format": false,
// Stage 1: select main_html using magic_html
"main_html_parser": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.main_html_parser.MagicHTMLMainHtmlParser",
"class_init_kwargs": {}
}],
// Stage 2 is not executed: pre_extractor, extractor and post_extractor are left empty
"pre_extractor": [],
"extractor": [],
"post_extractor": []
}
}
Loading