Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/pr_stage_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
PYTHONPATH: $PYTHONPATH:${{ github.workspace }}
strategy:
matrix:
python-version: [3.10.15]
python-version: [3.10.16]
steps:
- name: Checkout code
uses: actions/checkout@v4
Expand Down
6 changes: 6 additions & 0 deletions bench/data/all.json
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,12 @@
"groundtruth_filepath":"groundtruth/math_mathhelpforum_1.jsonl",
"layout_type": "forum"
},
"math_mathhelpforum_2": {
"url": "http://mathhelpforum.com/differential-geometry/82966-continuous-functions.html",
"origin_filepath":"origin/math_mathhelpforum_2.html",
"groundtruth_filepath":"groundtruth/math_mathhelpforum_2.jsonl",
"layout_type": "forum"
},
"math_stackexchange_1": {
"url": "https://math.stackexchange.com/questions/458323/is-8327-1-a-prime-number?answertab=active",
"origin_filepath":"origin/math_stackexchange_1.html",
Expand Down
1 change: 1 addition & 0 deletions bench/data/groundtruth/math_mathhelpforum_2.jsonl

Large diffs are not rendered by default.

359 changes: 359 additions & 0 deletions bench/data/origin/math_mathhelpforum_2.html

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions llm_web_kit/extractor/html/recognizer/cc_math/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,14 @@ class MATH_TYPE_PATTERN:
],
}

# Compatibility table for websites whose formulas are wrapped in malformed
# start/end delimiters.
# Each key is matched as a substring of the page URL, and each value is a list
# of [start, end] delimiter pairs that CCMATH.wrap_math_md_custom strips from a
# formula when the formula both starts with `start` and ends with `end`.
MATH_MD_CUSTOM_CONFIG = {
    'mathhelpforum.com': [
        ['<br />', '\\<br />'],  # escaped backslash: the end delimiter is a literal `\` followed by `<br />`
        ['<br />', '<br />'],
    ],
}

asciiMath_config = {
MATH_TYPE_PATTERN.INLINEMATH: [
[r'`', r'`'],
Expand Down Expand Up @@ -129,6 +137,10 @@ def text_strip(text):


class CCMATH():
def __init__(self):
    """Create a converter with an empty page URL."""
    # URL of the page being processed. wrap_math_md_custom checks which keys
    # of MATH_MD_CUSTOM_CONFIG occur in it; the empty default matches none of
    # the configured sites.
    self.url = ''

# end def
def wrap_math(self, s, display=False):
"""根据行间行内公式加上$$或$"""
s = re.sub(r'\s+', ' ', s)
Expand Down Expand Up @@ -160,8 +172,20 @@ def wrap_math_md(self, s):
return s.replace('\\[', '').replace('\\]', '').strip()
if s.startswith('`') and s.endswith('`'):
return s.replace('`', '').strip()
s = self.wrap_math_md_custom(s)
return s.strip()

# Walk MATH_MD_CUSTOM_CONFIG and, for every site key found in the page URL,
# strip that site's malformed formula delimiters.
def wrap_math_md_custom(self, s):
    """Strip site-specific malformed delimiters from around a formula.

    Every key of ``MATH_MD_CUSTOM_CONFIG`` that occurs as a substring of
    ``self.url`` contributes its ``[start, end]`` pairs. Pairs are tried in
    configuration order; whenever the current text starts with ``start`` and
    ends with ``end`` both are removed, and the remaining pairs are then
    checked against the shortened text.

    Args:
        s: Formula text, possibly wrapped in site-specific delimiters.

    Returns:
        The formula with all matching delimiter pairs removed, or ``s``
        unchanged when no URL is set or no configured site matches.
    """
    # A missing URL (None or '') cannot match any site; return early instead
    # of failing on the substring test below.
    page_url = self.url or ''
    if not page_url:
        return s

    for site, pairs in MATH_MD_CUSTOM_CONFIG.items():
        if site not in page_url:
            continue
        for start, end in pairs:
            if s.startswith(start) and s.endswith(end):
                # Use an explicit end index: `s[a:-len(end)]` would slice to
                # an empty string if `end` were ever configured as ''.
                s = s[len(start):len(s) - len(end)]
    return s

def wrap_math_space(self, s):
"""转义空格."""
s = s.strip()
Expand Down Expand Up @@ -441,3 +465,6 @@ def build_cc_exception_tag(self, text, math_type, math_render) -> str:
print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'<p>`(x+1)/x^2``1/3245`</p>'),None,True))
print(cm.replace_math('ccmath-interline','latex','',html_to_element(r'<p>start $$f(a,b,c) = (a^2+b^2+c^2)^3$$end</p>'),None,False))
print(cm.replace_math('ccmath-inline','latex','',html_to_element(r'<p>\( \newcommand{\norm}[1]{\| #1 \|}\)</p>'),None,False))
# cm.url = 'mathhelpforum.com'
# print(cm.wrap_math_md_custom(r'<br />\begin{align} a^2+b=c\end{align}\<br />'))
# print(cm.wrap_math_md_custom(r'<br />dz=\frac{1}{2}\frac{dx}{\cos ^2 x}<br />'))
24 changes: 12 additions & 12 deletions llm_web_kit/extractor/html/recognizer/ccmath.py
Comment thread
yogacc33 marked this conversation as resolved.
Outdated
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,13 @@
from llm_web_kit.libs.doc_element_type import DocElementType
from llm_web_kit.libs.html_utils import iter_node

cm = CCMATH()


class MathRecognizer(BaseHTMLElementRecognizer):
"""解析数学公式元素."""

def __init__(self):
super().__init__()
self.cm = CCMATH()

@override
def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_html: str) -> List[Tuple[str, str]]:
Expand All @@ -35,11 +34,12 @@ def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_htm
Returns: main_html_lst中发现有公式,则返回处理后的元素,标签更新为ccmath,否则原样返回.
"""
result = []
self.cm.url = base_url
# 获取数学公式渲染器
math_render = cm.get_math_render(raw_html)
math_render = self.cm.get_math_render(raw_html)
for cc_html, o_html in main_html_lst:
if not self.is_cc_html(cc_html):
result.extend(self.process_ccmath_html(cc_html, o_html, math_render))
result.extend(self.process_ccmath_html(cc_html, o_html, math_render, base_url))
else:
result.append((cc_html, o_html))

Expand Down Expand Up @@ -77,7 +77,7 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm
if len(inter_ele) > 0:
# 获取math_content
math_content = inter_ele[0].text
math_content = cm.wrap_math_md(math_content)
math_content = self.cm.wrap_math_md(math_content)

return {
'type': DocElementType.EQUATION_INTERLINE,
Expand All @@ -103,7 +103,7 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm
else:
raise HtmlMathRecognizerException(f'No ccmath element found in content: {parsed_content}')

def process_ccmath_html(self, cc_html: str, o_html: str, math_render: str) -> List[Tuple[str, str]]:
def process_ccmath_html(self, cc_html: str, o_html: str, math_render: str, base_url: str) -> List[Tuple[str, str]]:
"""处理数学公式,将外层标签修改为 ccmath.

Args:
Expand All @@ -127,36 +127,36 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: str) -> Li

# tag = span, class 为 math-containerm, 或者 mathjax 或者 wp-katex-eq
if node.tag == 'span' and node.get('class') and ('math-container' in node.get('class') or 'mathjax' in node.get('class') or 'wp-katex-eq' in node.get('class') or 'x-ck12-mathEditor' in node.get('class') or 'tex' in node.get('class')):
tag_common_modify.modify_tree(cm, math_render, original_html, node, parent)
tag_common_modify.modify_tree(self.cm, math_render, original_html, node, parent)

# script[type="math/tex"]
# if node.tag == 'script' and node.get('type') and 'math/tex' in node.get('type'):
# tag_common_modify.modify_tree(cm, math_render, original_html, node, parent)

# math tags
if node.tag == 'math' or node.tag.endswith(':math'):
tag_math.modify_tree(cm, math_render, original_html, node, parent)
tag_math.modify_tree(self.cm, math_render, original_html, node, parent)

# script[type="math/asciimath"]
# if node.tag == 'script' and node.get('type') == 'math/asciimath':
if node.tag in ('p','div') and node.text and '`' in node.text:
tag_asciimath.modify_tree(cm, math_render, original_html, node, parent)
tag_asciimath.modify_tree(self.cm, math_render, original_html, node, parent)

# Remove any .MathJax_Preview spans
if node.tag == 'span' and node.get('class') and 'MathJax_Preview' in node.get('class'):
pass

# img中的latex
if node.tag == 'img':
tag_img.modify_tree(cm, math_render, original_html, node, parent)
tag_img.modify_tree(self.cm, math_render, original_html, node, parent)

# span.katex
if node.tag == 'script' or 'math' == node.get('class') or 'katex' == node.get('class'):
tag_script.modify_tree(cm, math_render, original_html, node, parent)
tag_script.modify_tree(self.cm, math_render, original_html, node, parent)

# 14. 只处理只有一层的p标签
if node.tag == 'p' and len(node.getchildren()) == 0:
tag_common_modify.modify_tree(cm, math_render, original_html, node, parent)
tag_common_modify.modify_tree(self.cm, math_render, original_html, node, parent)
# 打印处理后的html
# print(self._element_to_html(tree))
return self.html_split_by_tags(self._element_to_html(tree), [CCTag.CC_MATH_INTERLINE])
Expand Down
16 changes: 15 additions & 1 deletion tests/llm_web_kit/extractor/html/recognizer/test_math.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,20 @@
'input': '',
'expected': ''
},

{
'input': r'<br />\begin{align} a^2+b=c\end{align}\<br />',
'url': 'mathhelpforum.com',
'expected': r'\begin{align} a^2+b=c\end{align}'
},
{
'input': r'<br />dz=\frac{1}{2}\frac{dx}{\cos ^2 x}<br />',
'url': 'mathhelpforum.com',
'expected': r'dz=\frac{1}{2}\frac{dx}{\cos ^2 x}'
},
{
'input': r'<br />\begin{align} a^2+b=c\end{align}\<br />',
'expected': r'<br />\begin{align} a^2+b=c\end{align}\<br />'
}
]

TEST_FIX_MATHML_SUPERSCRIPT = [
Expand Down Expand Up @@ -474,6 +487,7 @@ def test_wrap_math(self):
def test_wrap_math_md(self):
    """Check ``wrap_math_md`` against every case in ``TEST_WRAP_MATH_MD``.

    A case may carry an optional ``'url'`` key; it is assigned to
    ``self.ccmath.url`` before the call so that URL-dependent delimiter
    stripping is exercised. Cases without the key run with an empty URL.
    """
    for test_case in TEST_WRAP_MATH_MD:
        url = test_case.get('url', '')
        # Label the sub-test with the URL as well: some cases share the same
        # input and differ only by URL, so `input` alone is ambiguous in a
        # failure report.
        with self.subTest(input=test_case['input'], url=url):
            self.ccmath.url = url
            output_math = self.ccmath.wrap_math_md(test_case['input'])
            self.assertEqual(output_math, test_case['expected'])

Expand Down