From 6e8fb7156b11d44f523e104d8096918553b9fe02 Mon Sep 17 00:00:00 2001 From: Yanggq <1041206149@qq.com> Date: Thu, 4 Sep 2025 17:22:30 +0800 Subject: [PATCH 1/3] fix: escape '%' in MathML formula --- .../html/recognizer/cc_math/mmltex/tokens.xsl | 20 +++++++++++++++++-- .../html/recognizer/cc_math/tag_math.py | 3 +++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl b/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl index ad4c5243..7fd17fd4 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl +++ b/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl @@ -18,15 +18,31 @@ \mathrm{ - + } - + + + + + + + \% + + + + + + + + + + diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py index 66816c2d..78f51221 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py @@ -52,6 +52,9 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa mathml = re.sub(r'([^\s])\s+([^\s])', r'\1 \2', mathml) # remove extra spaces latex = cm.mml_to_latex(mathml) + # 处理未转义的%为\% + # if latex: + # latex = re.sub(r'(? 
Date: Thu, 4 Sep 2025 19:03:31 +0800 Subject: [PATCH 2/3] x --- .../extractor/html/recognizer/cc_math/mmltex/tokens.xsl | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl b/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl index 7fd17fd4..da9f3015 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl +++ b/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl @@ -31,14 +31,18 @@ - + + + \% - + + + From f948ed36e22c418a9993ea7d3a1d47435ab2d426 Mon Sep 17 00:00:00 2001 From: Yanggq <1041206149@qq.com> Date: Fri, 5 Sep 2025 15:27:34 +0800 Subject: [PATCH 3/3] =?UTF-8?q?=E4=B8=8D=E4=BF=AE=E6=94=B9=E6=A8=A1?= =?UTF-8?q?=E6=9D=BF=EF=BC=8C=E8=80=8C=E6=98=AF=E5=9C=A8=E6=8A=BD=E5=8F=96?= =?UTF-8?q?=E5=90=8E=E5=AF=B9=E7=99=BE=E5=88=86=E5=8F=B7=E8=BF=9B=E8=A1=8C?= =?UTF-8?q?=E8=BD=AC=E4=B9=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../html/recognizer/cc_math/mmltex/tokens.xsl | 20 ++----------------- .../html/recognizer/cc_math/tag_math.py | 4 ++-- .../assets/ccmath/math_percentage.html | 1 + .../assets/ccmath/math_percentage_1.html | 1 + .../ccmath/math_percentage_inline_1.html | 1 + .../extractor/html/recognizer/test_math.py | 8 ++++++++ 6 files changed, 15 insertions(+), 20 deletions(-) create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl b/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl index 7fd17fd4..ad4c5243 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl +++ 
b/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl @@ -18,31 +18,15 @@ \mathrm{ - + } - + - - - - - - \% - - - - - - - - - - diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py index 78f51221..aed792c9 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py @@ -53,8 +53,8 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa latex = cm.mml_to_latex(mathml) # 处理未转义的%为\% - # if latex: - # latex = re.sub(r'(? Mathematical Formulas with Percent Symbol

Mathematical Formulas Containing % Symbol

1. LaTeX Format Examples

Inline LaTeX formulas:

The percentage increase is calculated as $\frac{new - old}{old} \times 100\%$.

Display LaTeX formulas:

Compound interest formula:

$$A = P\left(1 + \frac{r\%}{n}\right)^{nt}$$

2. MathML Format Examples

Percentage change in MathML:

$$\Delta\% = \frac{\mathrm{new} - \mathrm{old}}{\mathrm{old}} \times 100\%$$

Discount percentage:

$\mathrm{Discount}\% = \frac{\mathrm{Original} - \mathrm{Sale}}{\mathrm{Original}} \times 100\%$ \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html new file mode 100644 index 00000000..b19496cf --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html @@ -0,0 +1 @@ +A = P\left(1 + \frac{r\%}{n}\right)^{nt} \Delta \%=\frac{\mathrm{new}-\mathrm{old}}{\mathrm{old}}×100\% \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html new file mode 100644 index 00000000..23c13989 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html @@ -0,0 +1 @@ +\frac{new - old}{old} \times 100\% \mathrm{Discount}\%=\frac{\mathrm{Original}-\mathrm{Sale}}{\mathrm{Original}}×100\% \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index f41af0c0..a6e1825e 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -216,6 +216,14 @@ 'base_url': 'https://convertoctopus.com/4-7-years-to-minutes', 'expected': 'assets/ccmath/math_class_math_1.html', 'expected_inline': 'assets/ccmath/math_class_math_inline_1.html' + }, + { + 'input': [ + 'assets/ccmath/math_percentage.html', + ], + 'base_url': 'https://test.com/', + 'expected': 'assets/ccmath/math_percentage_1.html', + 'expected_inline': 'assets/ccmath/math_percentage_inline_1.html' } ]