From 6e8fb7156b11d44f523e104d8096918553b9fe02 Mon Sep 17 00:00:00 2001
From: Yanggq <1041206149@qq.com>
Date: Thu, 4 Sep 2025 17:22:30 +0800
Subject: [PATCH 1/3] fix: escape '%' in MathML formula
---
.../html/recognizer/cc_math/mmltex/tokens.xsl | 20 +++++++++++++++++--
.../html/recognizer/cc_math/tag_math.py | 3 +++
2 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl b/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl
index ad4c5243..7fd17fd4 100644
--- a/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl
+++ b/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl
@@ -18,15 +18,31 @@
\mathrm{
-
+
}
-
+
+
+
+
+
+
+ \%
+
+
+
+
+
+
+
+
+
+
diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py
index 66816c2d..78f51221 100644
--- a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py
+++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py
@@ -52,6 +52,9 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa
mathml = re.sub(r'([^\s])\s+([^\s])', r'\1 \2', mathml) # remove extra spaces
latex = cm.mml_to_latex(mathml)
+ # 处理未转义的%为\%
+ # if latex:
+ # latex = re.sub(r'(?<!\\)%', r'\\%', latex)
Date: Thu, 4 Sep 2025 19:03:31 +0800
Subject: [PATCH 2/3] fix: adjust '%' escaping template in tokens.xsl
---
.../extractor/html/recognizer/cc_math/mmltex/tokens.xsl | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl b/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl
index 7fd17fd4..da9f3015 100644
--- a/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl
+++ b/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl
@@ -31,14 +31,18 @@
-
+
+
+
\%
-
+
+
+
From f948ed36e22c418a9993ea7d3a1d47435ab2d426 Mon Sep 17 00:00:00 2001
From: Yanggq <1041206149@qq.com>
Date: Fri, 5 Sep 2025 15:27:34 +0800
Subject: [PATCH 3/3] =?UTF-8?q?=E4=B8=8D=E4=BF=AE=E6=94=B9=E6=A8=A1?=
=?UTF-8?q?=E6=9D=BF=EF=BC=8C=E8=80=8C=E6=98=AF=E5=9C=A8=E6=8A=BD=E5=8F=96?=
=?UTF-8?q?=E5=90=8E=E5=AF=B9=E7=99=BE=E5=88=86=E5=8F=B7=E8=BF=9B=E8=A1=8C?=
=?UTF-8?q?=E8=BD=AC=E4=B9=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.../html/recognizer/cc_math/mmltex/tokens.xsl | 20 ++-----------------
.../html/recognizer/cc_math/tag_math.py | 4 ++--
.../assets/ccmath/math_percentage.html | 1 +
.../assets/ccmath/math_percentage_1.html | 1 +
.../ccmath/math_percentage_inline_1.html | 1 +
.../extractor/html/recognizer/test_math.py | 8 ++++++++
6 files changed, 15 insertions(+), 20 deletions(-)
create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage.html
create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html
create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html
diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl b/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl
index 7fd17fd4..ad4c5243 100644
--- a/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl
+++ b/llm_web_kit/extractor/html/recognizer/cc_math/mmltex/tokens.xsl
@@ -18,31 +18,15 @@
\mathrm{
-
+
}
-
+
-
-
-
-
-
- \%
-
-
-
-
-
-
-
-
-
-
diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py
index 78f51221..aed792c9 100644
--- a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py
+++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py
@@ -53,8 +53,8 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa
latex = cm.mml_to_latex(mathml)
# 处理未转义的%为\%
- # if latex:
- # latex = re.sub(r'(?<!\\)%', r'\\%', latex)
+ if latex:
+ latex = re.sub(r'(?<!\\)%', r'\\%', latex)
Mathematical Formulas with Percent Symbol
Mathematical Formulas Containing % Symbol
1. LaTeX Format Examples
Inline LaTeX formulas:
The percentage increase is calculated as $\frac{new - old}{old} \times 100\%$.
Display LaTeX formulas:
Compound interest formula:
$$A = P\left(1 + \frac{r\%}{n}\right)^{nt}$$
2. MathML Format Examples
Percentage change in MathML:
Discount percentage:
\ No newline at end of file
diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html
new file mode 100644
index 00000000..b19496cf
--- /dev/null
+++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html
@@ -0,0 +1 @@
+A = P\left(1 + \frac{r\%}{n}\right)^{nt}
\Delta \%=\frac{\mathrm{new}-\mathrm{old}}{\mathrm{old}}×100\%
\ No newline at end of file
diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html
new file mode 100644
index 00000000..23c13989
--- /dev/null
+++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html
@@ -0,0 +1 @@
+\frac{new - old}{old} \times 100\%
\mathrm{Discount}\%=\frac{\mathrm{Original}-\mathrm{Sale}}{\mathrm{Original}}×100\%
\ No newline at end of file
diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py
index f41af0c0..a6e1825e 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py
+++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py
@@ -216,6 +216,14 @@
'base_url': 'https://convertoctopus.com/4-7-years-to-minutes',
'expected': 'assets/ccmath/math_class_math_1.html',
'expected_inline': 'assets/ccmath/math_class_math_inline_1.html'
+ },
+ {
+ 'input': [
+ 'assets/ccmath/math_percentage.html',
+ ],
+ 'base_url': 'https://test.com/',
+ 'expected': 'assets/ccmath/math_percentage_1.html',
+ 'expected_inline': 'assets/ccmath/math_percentage_inline_1.html'
}
]