Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 22 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,14 +103,14 @@ def extract(url:str, html:str) -> str:

if __name__=="__main__":
url = ""
html = '''<html><body>
raw_html = '''<html><body>
<div class="options-div-0-0 option-box__items" style="display: none;">
<span class="bedroom-rate__title">Room Only Rate</span>
<span class="bedroom-rate__price">£1,230.00</span>
</div>
<p>正常内容</p>
</body></html>'''
markdown = extract(url, html)
markdown = extract(url, raw_html)
print(markdown)
```

Expand All @@ -120,26 +120,38 @@ if __name__=="__main__":
from llm_web_kit.simple import extract_content_from_main_html
from loguru import logger

def extract(url:str, html:str) -> str:
def extract(url:str, raw_html: str, main_html:str) -> str:
try:
nlp_md = extract_content_from_main_html(url, html)
# or mm_nlp_md = extract_content_from_main_html(url, html, 'mm_md')
# or nlp_md = extract_content_from_main_html(url, html, language='zh')
nlp_md = extract_content_from_main_html(url = url, raw_html = raw_html, main_html = main_html)
# or mm_nlp_md = extract_content_from_main_html(url = url, raw_html = raw_html, main_html = main_html, output_format='mm_md')
# or nlp_md = extract_content_from_main_html(url = url, raw_html = raw_html, main_html = main_html, language='zh')
return nlp_md
except Exception as e:
logger.exception(e)
return None

if __name__=="__main__":
url = ""
html = '''<html><body>
raw_html = '''<html>
<meta charset="utf-8"><meta content="IE=edge" http-equiv="X-UA-Compatible"><meta content="width=device-width,initial-scale=1,shrink-to-fit=no" name="viewport">
<script>MathJax={tex:{inlineMath:[["$","$"],["\\(","\\)"]],processEscapes:!0},svg:{fontCache:"global"}}</script><script async="" id="MathJax-script" src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-svg.js" type="text/javascript"></script>
<body>
<div class="options-div-0-0 option-box__items" style="display: none;">
<span class="bedroom-rate__title">Room Only Rate</span>
<span class="bedroom-rate__price">£1,230.00</span>
</div>
<p cc-select="true" class="mark-selected" data-anno-uid="anno-uid-wygjielbjln" style="">Are the filtrations after these steps. Why only these? Looking at $\mathcal{F}_1$, we can obtain probabilities for the following events:</p>
<p>正常内容</p>
</body></html>'''
main_html = '''<html><body>
<div class="options-div-0-0 option-box__items" style="display: none;">
<span class="bedroom-rate__title">Room Only Rate</span>
<span class="bedroom-rate__price">£1,230.00</span>
</div>
<p cc-select="true" class="mark-selected" data-anno-uid="anno-uid-wygjielbjln" style="">Are the filtrations after these steps. Why only these? Looking at $\mathcal{F}_1$, we can obtain probabilities for the following events:</p>
<p>正常内容</p>
</body></html>'''
markdown = extract(url, html)
markdown = extract(url, raw_html, main_html)
print(markdown)
```

Expand All @@ -159,14 +171,14 @@ def extract(url:str, html:str) -> str:

if __name__=="__main__":
url = ""
html = '''<html><body>
raw_html = '''<html><body>
<div class="options-div-0-0 option-box__items" style="display: none;">
<span class="bedroom-rate__title">Room Only Rate</span>
<span class="bedroom-rate__price">£1,230.00</span>
</div>
<p>正常内容</p>
</body></html>'''
main_html = extract(url, html)
main_html = extract(url, raw_html)
print(main_html)
```

Expand Down
43 changes: 23 additions & 20 deletions llm_web_kit/simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,13 @@ def get_extractor(pipe_tpl_name: str):
return ExtractorFactory._extractors[pipe_tpl_name]


def _extract_html(url: str, html_content: str, pipe_tpl: str, language: str = 'en') -> DataJson:
def _extract_html(url: str, raw_html: str, main_html: str, pipe_tpl: str, language: str = 'en') -> DataJson:
"""内部使用的统一HTML提取方法,返回处理后的DataJson对象.

Args:
url: 网页URL
html_content: 原始HTML内容(或main_html,取决于pipe_tpl)
raw_html: 原始HTML内容
main_html: 正文对应的main_html
pipe_tpl: 处理类型,支持:
# 只执行第一阶段:
- PipeTpl.MAGIC_HTML: 使用magic_html提取main_html
Expand All @@ -80,10 +81,11 @@ def _extract_html(url: str, html_content: str, pipe_tpl: str, language: str = 'e
input_data_dict = {
'track_id': str(uuid.uuid4()),
'url': url,
'html': html_content,
'html': raw_html,
'main_html': main_html,
'dataset_name': f'llm-web-kit-{pipe_tpl}',
'data_source_category': 'HTML',
'file_bytes': len(html_content),
'file_bytes': len(raw_html),
'language': language,
'meta_info': {'input_datetime': datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
}
Expand All @@ -96,35 +98,36 @@ def _extract_html(url: str, html_content: str, pipe_tpl: str, language: str = 'e
# SDK方法(三种使用场景)
# ========================================

def extract_main_html_only(url: str, html_content: str, parser_type: str = PipeTpl.MAGIC_HTML, language: str = 'en') -> str:
def extract_main_html_only(url: str, raw_html: str, pipe_tpl: str = PipeTpl.MAGIC_HTML, language: str = 'en') -> str:
"""场景1: 只执行第一阶段,抽取main_html.

Args:
url: 网页URL
html_content: 原始HTML内容
parser_type: 解析器类型,可选:PipeTpl.MAGIC_HTML, PipeTpl.LLM, PipeTpl.LAYOUT_BATCH
raw_html: 原始HTML内容
pipe_tpl: 解析器类型,可选:PipeTpl.MAGIC_HTML, PipeTpl.LLM, PipeTpl.LAYOUT_BATCH
language: 语言,可选:'en' 或 'zh'

Returns:
str: 提取的主要HTML内容
"""
result = _extract_html(url, html_content, parser_type, language)
result = _extract_html(url=url, raw_html=raw_html, main_html='', pipe_tpl=pipe_tpl, language=language)
return result.get('main_html', '')


def extract_content_from_main_html(url: str, main_html: str, output_format: str = 'md', language: str = 'en') -> str:
def extract_content_from_main_html(url: str, raw_html: str, main_html: str, output_format: str = 'md', language: str = 'en') -> str:
"""场景2: 只执行第二阶段,从main_html抽取结构化内容.

Args:
url: 网页URL
main_html: 已经抽取的主要HTML内容
raw_html: 原始HTML内容
main_html: 正文对应的main_html
output_format: 输出格式,'md' 或 'mm_md'
language: 语言,可选:'en' 或 'zh'

Returns:
str: 结构化的内容(markdown格式)
"""
result = _extract_html(url, main_html, PipeTpl.NOCLIP, language)
result = _extract_html(url=url, raw_html=raw_html, main_html=main_html, pipe_tpl=PipeTpl.NOCLIP, language=language)
content_list = result.get_content_list()

if output_format == 'md':
Expand All @@ -137,19 +140,19 @@ def extract_content_from_main_html(url: str, main_html: str, output_format: str
raise InvalidOutputFormatException(f'Invalid output format: {output_format}')


def extract_content_from_html_with_magic_html(url: str, html_content: str, output_format: str = 'md', language: str = 'en') -> str:
def extract_content_from_html_with_magic_html(url: str, raw_html: str, output_format: str = 'md', language: str = 'en') -> str:
"""场景3: 执行两个阶段,从magic_html抽取main_html,再从main_html抽取结构化内容.

Args:
url: 网页URL
html_content: 原始HTML内容
raw_html: 原始HTML内容
output_format: 输出格式,'md' 或 'mm_md'
language: 语言,可选:'en' 或 'zh'

Returns:
str: 结构化的内容(markdown格式)
"""
result = _extract_html(url, html_content, PipeTpl.MAGIC_HTML_NOCLIP, language)
result = _extract_html(url=url, raw_html=raw_html, main_html='', pipe_tpl=PipeTpl.MAGIC_HTML_NOCLIP, language=language)
content_list = result.get_content_list()

if output_format == 'md':
Expand All @@ -162,19 +165,19 @@ def extract_content_from_html_with_magic_html(url: str, html_content: str, outpu
raise InvalidOutputFormatException(f'Invalid output format: {output_format}')


def extract_content_from_html_with_llm(url: str, html_content: str, output_format: str = 'md', language: str = 'en') -> str:
def extract_content_from_html_with_llm(url: str, raw_html: str, output_format: str = 'md', language: str = 'en') -> str:
"""场景3: 执行两个阶段,从llm抽取main_html,再从main_html抽取结构化内容.

Args:
url: 网页URL
html_content: 原始HTML内容
raw_html: 原始HTML内容
output_format: 输出格式,'md' 或 'mm_md'
language: 语言,可选:'en' 或 'zh'

Returns:
str: 结构化的内容(markdown格式)
"""
result = _extract_html(url, html_content, PipeTpl.LLM_NOCLIP, language)
result = _extract_html(url=url, raw_html=raw_html, main_html='', pipe_tpl=PipeTpl.LLM_NOCLIP, language=language)
content_list = result.get_content_list()

if output_format == 'md':
Expand All @@ -187,19 +190,19 @@ def extract_content_from_html_with_llm(url: str, html_content: str, output_forma
raise InvalidOutputFormatException(f'Invalid output format: {output_format}')


def extract_content_from_html_with_layout_batch(url: str, html_content: str, output_format: str = 'md', language: str = 'en') -> str:
def extract_content_from_html_with_layout_batch(url: str, raw_html: str, output_format: str = 'md', language: str = 'en') -> str:
"""场景3: 执行两个阶段,从layout_batch抽取main_html,再从main_html抽取结构化内容.

Args:
url: 网页URL
html_content: 原始HTML内容
raw_html: 原始HTML内容
output_format: 输出格式,'md' 或 'mm_md'
language: 语言,可选:'en' 或 'zh'

Returns:
str: 结构化的内容(markdown格式)
"""
result = _extract_html(url, html_content, PipeTpl.LAYOUT_BATCH_NOCLIP, language)
result = _extract_html(url=url, raw_html=raw_html, main_html='', pipe_tpl=PipeTpl.LAYOUT_BATCH_NOCLIP, language=language)
content_list = result.get_content_list()

if output_format == 'md':
Expand Down
Loading