Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions bench/config/data_config.jsonl

Large diffs are not rendered by default.

1,900 changes: 1,900 additions & 0 deletions bench/config/data_math_config.jsonl

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion bench/config/ours_config.jsonc
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.TestHTMLFileFormatFilterPreExtractor",
"class_init_kwargs": {
"html_parent_dir": "bench/data/"
"html_parent_dir": "bench/"
}
},
{
Expand Down
1 change: 0 additions & 1 deletion bench/config/ours_data_config.jsonl

This file was deleted.

2 changes: 1 addition & 1 deletion bench/data/groundtruth/math_physicsforums_1.jsonl

Large diffs are not rendered by default.

73 changes: 63 additions & 10 deletions bench/eval/ours.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,18 @@
import json
import os
from pathlib import Path
from typing import Dict, List, Tuple

from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory
from llm_web_kit.input.datajson import DataJson, DataJsonKey


def eval_ours_extract_html(chain_config: dict, html_data_path: str, filePath: str, page_layout_type: str = '', url: str = '') -> Tuple[str, List[Dict], str, dict]:
def eval_ours_extract_html(chain_config: dict, test_data: dict) -> Tuple[str, List[Dict], str, dict]:
chain = ExtractSimpleFactory.create(chain_config)
assert chain is not None

# Read test data
with open(html_data_path, 'r') as f:
test_data = json.loads(f.readline().strip())

# Create DataJson from test data
input_data = DataJson(test_data)
input_data.__setitem__('path', filePath)
input_data.__setitem__('page_layout_type', page_layout_type)
input_data.__setitem__('url', url)
# Create DataJson from test data
input_data = DataJson(test_data)

# Test extraction
result = chain.extract(input_data)
Expand All @@ -26,3 +21,61 @@ def eval_ours_extract_html(chain_config: dict, html_data_path: str, filePath: st
main_html = content_list.to_main_html()
content = content_list.to_nlp_md()
return content, content_list._get_data(), main_html, statics


if __name__ == '__main__':
    # Resolve the repository root (three levels up from bench/eval/ours.py).
    root = Path(__file__).parent.parent.parent
    from llm_web_kit.dataio.filebase import (FileBasedDataReader,
                                             FileBasedDataWriter)
    reader = FileBasedDataReader('')
    writer = FileBasedDataWriter('')

    # Ensure the top-level output directory exists.
    output_dir = f'{root}/bench/output/ours'
    os.makedirs(output_dir, exist_ok=True)

    # NOTE(review): the config file has a .jsonc extension but is parsed with
    # the plain json module — this only works if the file actually contains no
    # comments. TODO confirm.
    with open(f'{root}/bench/config/ours_config.jsonc', 'r') as f:
        chain_config = json.load(f)

    # Process the benchmark records one JSONL line at a time.
    with open(f'{root}/bench/config/data_math_config.jsonl', 'r') as f:
        for line in f:
            test_data = json.loads(line.strip())
            content, content_list, main_html, statics = eval_ours_extract_html(
                chain_config,
                test_data
            )
            print('处理数据:', test_data.get('track_id'))
            print('URL:', test_data.get('url'))
            print('统计信息:', statics)

            out = {
                'url': test_data.get('url'),
                'content': content,
                'main_html': main_html,
                'content_list': content_list,
                # Raw input HTML re-read from disk so each output record is
                # self-contained.
                'html': reader.read(
                    f'{root}/bench/{test_data.get("path")}'
                ).decode('utf-8'),
                'statics': statics
            }

            # Take the first two levels of `path` to group outputs by source.
            path = test_data.get('path', '')
            path_parts = path.split('/')
            if len(path_parts) >= 2:
                output_subdir = '/'.join(path_parts[:2])
            else:
                output_subdir = 'unknown'

            # Create the matching per-group output directory.
            output_dir = f'{root}/bench/output/ours/{output_subdir}'
            os.makedirs(output_dir, exist_ok=True)

            # Append this record to the group's JSONL output file.
            output_file = f'{output_dir}/data_math_output.jsonl'
            writer.append_write(
                output_file,
                json.dumps(out).encode('utf-8') + b'\n'
            )
            print(f'结果已追加到: {output_file}')
68 changes: 43 additions & 25 deletions bench/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from pathlib import Path

from bench.common.metrics import Metrics
from bench.common.result import Error_Item, Result_Detail, Result_Summary
from bench.common.result import Result_Detail, Result_Summary
from bench.eval.ours import eval_ours_extract_html
from llm_web_kit.dataio.filebase import (FileBasedDataReader,
FileBasedDataWriter)
Expand All @@ -23,12 +23,36 @@
sourcePath = os.path.join(root, 'data/all.json')
outputPath = os.path.join(root, 'output')
pipelineConfigPath = os.path.join(root, 'config/ours_config.jsonc')
pipeline_data_path = os.path.join(root, 'config/ours_data_config.jsonl')
pipeline_data_path = os.path.join(root, 'config/data_config.jsonl')

reader = FileBasedDataReader('')
writer = FileBasedDataWriter('')


def run_ours(pipelineConfigPath, pipeline_data_path, outputPath, statics_pre):
    """Run the 'ours' extractor over every record in a benchmark JSONL file.

    Relies on the module-level globals ``reader``, ``writer``, ``root`` and
    ``args``, and on the ``Statics`` helper imported at module scope.

    Args:
        pipelineConfigPath: path to the chain config file.
        pipeline_data_path: path to the JSONL file of benchmark records.
        outputPath: root directory for per-record output files.
        statics_pre: accumulator object; each record's statics are merged in.
    """
    with open(pipeline_data_path, 'r') as f:
        for line in f:
            print(line)
            data_json = json.loads(line.strip())
            # NOTE(review): eval_ours_extract_html's first parameter is named
            # chain_config and its other caller passes a parsed dict, but here
            # a config *path* string is passed — verify the factory accepts a
            # path, otherwise this should json-load the file first.
            content, content_list, main_html, statics = eval_ours_extract_html(pipelineConfigPath, data_json)
            out = {
                'url': data_json.get('url'),
                'content': content,
                'main_html': main_html,
                'content_list': content_list,
                # Raw input HTML re-read from disk so the record is
                # self-contained.
                'html': reader.read(
                    f'{root}/{data_json.get("path")}'
                ).decode('utf-8'),
                'statics': statics
            }
            Statics(statics).print()
            statics_pre.merge_statics(statics)
            writer.write(
                f'{outputPath}/{args.tool}/{data_json.get("track_id")}.jsonl',
                json.dumps(out).encode('utf-8') + b'\n'
            )


def main():
out = {}
task_id = str(uuid.uuid1())
Expand Down Expand Up @@ -71,33 +95,27 @@ def main():
if args.tool == 'magic_html':
from bench.eval.magic_html import eval_magic_html
output = eval_magic_html(html, fileName)
out = {
'url': url,
'content': output,
'html': html,
}
writer.write(
f'{outputPath}/{args.tool}/{fileName}.jsonl',
json.dumps(out).encode('utf-8') + b'\n'
)
elif args.tool == 'unstructured':
from bench.eval.unstructured_eval import eval_unstructured
output = eval_unstructured(html, fileName)
elif args.tool == 'ours':
try:
print(pipelineConfigPath)
print(pipeline_data_path)
print(f'{root}/data/{origin_filepath}')
output, content_list, main_html, statics = eval_ours_extract_html(pipelineConfigPath, pipeline_data_path, f'{root}/data/{origin_filepath}', layout_type, url)
out['content_list'] = content_list
out['main_html'] = main_html
out['statics'] = statics
Statics(statics).print()
statics_pre.merge_statics(statics)
except Exception as e:
summary.error_summary['count'] += 1
detail.result_detail['error_result'].append(Error_Item(
file_path=origin_filepath,
error_detail=str(e)
))
else:
raise ValueError(f'Invalid tool: {args.tool}')
out = {
'url': url,
'content': output,
'html': html,
}
writer.write(f'{outputPath}/{args.tool}/{fileName}.jsonl', json.dumps(out).encode('utf-8') + b'\n')
if args.tool == 'ours':
run_ours(pipelineConfigPath, pipeline_data_path, outputPath, statics_pre)

out['url'] = url
out['content'] = output
out['html'] = html
writer.write(f'{outputPath}/{args.tool}/{fileName}.jsonl', json.dumps(out).encode('utf-8') + b'\n')
summary.finish()
detail.finish()
statics_gt.print()
Expand Down
17 changes: 17 additions & 0 deletions llm_web_kit/dataio/filebase.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,20 @@

with open(fn_path, 'wb') as f:
f.write(data)

def append_write(self, path: str, data: bytes) -> None:
    """Append data to a file, creating parent directories as needed.

    Args:
        path (str): the path of file; if the path is relative, it will be
            joined with parent_dir.
        data (bytes): the data to append.
    """
    fn_path = path
    if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
        fn_path = os.path.join(self._parent_dir, path)

    # Create the parent directory if there is one. Using exist_ok=True
    # directly (instead of an os.path.exists() pre-check) avoids a TOCTOU
    # race; the dirname != '' guard is still needed because makedirs('')
    # raises.
    dir_name = os.path.dirname(fn_path)
    if dir_name != '':
        os.makedirs(dir_name, exist_ok=True)

    # 'ab' creates the file if missing and appends otherwise.
    with open(fn_path, 'ab') as f:
        f.write(data)
4 changes: 4 additions & 0 deletions llm_web_kit/exception/exception.jsonc
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@
"code": 31031100,
"message": "HTML math recognizer exception"
},
"HtmlMathMathjaxRenderRecognizerException": {
"code": 31031110,
"message": "HTML math mathjax render recognizer exception"
},
"HtmlCodeRecognizerException": {
"code": 31031200,
"message": "HTML code recognizer exception"
Expand Down
8 changes: 8 additions & 0 deletions llm_web_kit/exception/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,14 @@
super().__init__(custom_message, error_code)


class HtmlMathMathjaxRenderRecognizerException(HtmlRecognizerException):
    """Exception raised during math render."""

    def __init__(self, custom_message: str | None = None, error_code: int | None = None):
        # Fall back to the registered code for this exception type when the
        # caller did not supply one explicitly.
        resolved_code = error_code if error_code is not None else ErrorMsg.get_error_code(
            'HtmlRecognizer', 'HtmlMathMathjaxRenderRecognizerException')
        super().__init__(custom_message, resolved_code)

Check warning on line 242 in llm_web_kit/exception/exception.py

View check run for this annotation

Codecov / codecov/patch

llm_web_kit/exception/exception.py#L240-L242

Added lines #L240 - L242 were not covered by tests


class HtmlCodeRecognizerException(HtmlRecognizerException):
"""Exception raised during code content recognition."""
def __init__(self, custom_message: str | None = None, error_code: int | None = None):
Expand Down
31 changes: 0 additions & 31 deletions llm_web_kit/extractor/html/recognizer/cc_math/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,6 @@ class MathType:
HTMLMATH = 'htmlmath' # sub, sup, etc.


# 数学公式渲染器
class MathRender:
MATHJAX = 'mathjax'
KATEX = 'katex'


# node.text匹配结果:
class MathMatchRes:
ALLMATCH = 'all_match'
Expand Down Expand Up @@ -195,31 +189,6 @@ def extract_asciimath(self, s: str) -> str:
parsed = asciimath2tex.translate(s)
return parsed

def get_math_render(self, html: str) -> str:
"""获取数学公式渲染器.
示例:
MathJax:
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-MML-AM_CHTML"></script>
Katex:
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.13.11/dist/katex.min.css">
"""
tree = html_to_element(html)
if tree is None:
return None
# 检查 KaTeX
for link in tree.iter('link'):
if link.get('href') and 'katex' in link.get('href', '').lower():
return MathRender.KATEX
# 查找head标签
# head = tree.find('head')
# if head is not None:
# 检查 MathJax
for script in tree.iter('script'):
src = script.get('src', '').lower()
if src and ('mathjax' in src or 'asciimath' in src):
return MathRender.MATHJAX
return None

def get_equation_type(self, html: str) -> List[Tuple[str, str]]:
"""根据latex_config判断数学公式是行内还是行间公式.

Expand Down
Empty file.
Loading