From e8c8e04c7fb27b33936c7fc269a0e5ef30a80f1c Mon Sep 17 00:00:00 2001 From: Yanggq <1041206149@qq.com> Date: Fri, 5 Sep 2025 15:38:31 +0800 Subject: [PATCH 1/4] fix: escape '%' in MathML formula v2 --- llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py | 3 +++ .../html/recognizer/assets/ccmath/math_percentage.html | 1 + .../html/recognizer/assets/ccmath/math_percentage_1.html | 1 + .../assets/ccmath/math_percentage_inline_1.html | 1 + tests/llm_web_kit/extractor/html/recognizer/test_math.py | 8 ++++++++ 5 files changed, 14 insertions(+) create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py index 66816c2d..aed792c9 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py @@ -52,6 +52,9 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa mathml = re.sub(r'([^\s])\s+([^\s])', r'\1 \2', mathml) # remove extra spaces latex = cm.mml_to_latex(mathml) + # 处理未转义的%为\% + if latex: + latex = re.sub(r'(?
The percentage increase is calculated as $\frac{new - old}{old} \times 100\%$.
Compound interest formula:
$$A = P\left(1 + \frac{r\%}{n}\right)^{nt}$$
Percentage change in MathML:
Discount percentage:
\ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html new file mode 100644 index 00000000..b19496cf --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html @@ -0,0 +1 @@ +A = P\left(1 + \frac{r\%}{n}\right)^{nt} \Delta \%=\frac{\mathrm{new}-\mathrm{old}}{\mathrm{old}}×100\% \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html new file mode 100644 index 00000000..23c13989 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html @@ -0,0 +1 @@ +\frac{new - old}{old} \times 100\% \mathrm{Discount}\%=\frac{\mathrm{Original}-\mathrm{Sale}}{\mathrm{Original}}×100\% \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index f41af0c0..a6e1825e 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -216,6 +216,14 @@ 'base_url': 'https://convertoctopus.com/4-7-years-to-minutes', 'expected': 'assets/ccmath/math_class_math_1.html', 'expected_inline': 'assets/ccmath/math_class_math_inline_1.html' + }, + { + 'input': [ + 'assets/ccmath/math_percentage.html', + ], + 'base_url': 'https://test.com/', + 'expected': 'assets/ccmath/math_percentage_1.html', + 'expected_inline': 'assets/ccmath/math_percentage_inline_1.html' } ] From 11d630fc0bf0c07967022cb9e86537794c11540c Mon Sep 17 00:00:00 2001 From: Yanggq <1041206149@qq.com> Date: Fri, 5 Sep 2025 16:04:50 +0800 Subject: [PATCH 2/4] fix: escape '%' in MathML formula v2 --- .../html/recognizer/cc_math/tag_math.py | 3 + 
.../assets/ccmath/math_percentage.html | 73 +++++++++++++++++++ .../assets/ccmath/math_percentage_1.html | 1 + .../ccmath/math_percentage_inline_1.html | 1 + .../extractor/html/recognizer/test_math.py | 8 ++ 5 files changed, 86 insertions(+) create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py index 66816c2d..aed792c9 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py @@ -52,6 +52,9 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa mathml = re.sub(r'([^\s])\s+([^\s])', r'\1 \2', mathml) # remove extra spaces latex = cm.mml_to_latex(mathml) + # 处理未转义的%为\% + if latex: + latex = re.sub(r'(? + + + + +The percentage increase is calculated as $\frac{new - old}{old} \times 100\%$.
+ +Compound interest formula:
+ $$A = P\left(1 + \frac{r\%}{n}\right)^{nt}$$ + +Percentage change in MathML:
+ + +Discount percentage:
+ + + + diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html new file mode 100644 index 00000000..b19496cf --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html @@ -0,0 +1 @@ +A = P\left(1 + \frac{r\%}{n}\right)^{nt} \Delta \%=\frac{\mathrm{new}-\mathrm{old}}{\mathrm{old}}×100\% \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html new file mode 100644 index 00000000..23c13989 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html @@ -0,0 +1 @@ +\frac{new - old}{old} \times 100\% \mathrm{Discount}\%=\frac{\mathrm{Original}-\mathrm{Sale}}{\mathrm{Original}}×100\% \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index f41af0c0..a6e1825e 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -216,6 +216,14 @@ 'base_url': 'https://convertoctopus.com/4-7-years-to-minutes', 'expected': 'assets/ccmath/math_class_math_1.html', 'expected_inline': 'assets/ccmath/math_class_math_inline_1.html' + }, + { + 'input': [ + 'assets/ccmath/math_percentage.html', + ], + 'base_url': 'https://test.com/', + 'expected': 'assets/ccmath/math_percentage_1.html', + 'expected_inline': 'assets/ccmath/math_percentage_inline_1.html' } ] From baacef88b8e3513cccf621871d3af6b0081c0afc Mon Sep 17 00:00:00 2001 From: Yanggq <1041206149@qq.com> Date: Fri, 5 Sep 2025 16:09:10 +0800 Subject: [PATCH 3/4] x --- .../html/recognizer/assets/ccmath/math_percentage_1.html | 3 ++- .../recognizer/assets/ccmath/math_percentage_inline_1.html | 3 
++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html index b19496cf..c877724c 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html @@ -1 +1,2 @@ -A = P\left(1 + \frac{r\%}{n}\right)^{nt} \Delta \%=\frac{\mathrm{new}-\mathrm{old}}{\mathrm{old}}×100\% \ No newline at end of file +A = P\left(1 + \frac{r\%}{n}\right)^{nt} +\Delta \%=\frac{\mathrm{new}-\mathrm{old}}{\mathrm{old}}×100\% \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html index 23c13989..1fcfe99a 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html @@ -1 +1,2 @@ -\frac{new - old}{old} \times 100\% \mathrm{Discount}\%=\frac{\mathrm{Original}-\mathrm{Sale}}{\mathrm{Original}}×100\% \ No newline at end of file +\frac{new - old}{old} \times 100\% +\mathrm{Discount}\%=\frac{\mathrm{Original}-\mathrm{Sale}}{\mathrm{Original}}×100\% \ No newline at end of file From 4caba9e6a2bf932c096ef0e9ddcab2804076ed1f Mon Sep 17 00:00:00 2001 From: Yanggq <1041206149@qq.com> Date: Fri, 5 Sep 2025 16:36:49 +0800 Subject: [PATCH 4/4] add --- .../assets/ccmath/math_percentage.html | 119 +++++++++++------- .../assets/ccmath/math_percentage_1.html | 3 +- 2 files changed, 77 insertions(+), 45 deletions(-) diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage.html index ad37d193..41d529c9 100644 --- 
a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage.html +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage.html @@ -16,58 +16,89 @@ -The percentage increase is calculated as $\frac{new - old}{old} \times 100\%$.
+The percentage increase is calculated as $\frac{new - old}{old} \times 100\%$.
-Compound interest formula:
- $$A = P\left(1 + \frac{r\%}{n}\right)^{nt}$$ +Compound interest formula:
+$$A = P\left(1 + \frac{r\%}{n}\right)^{nt}$$ -Percentage change in MathML:
- -Discount percentage:
-Discount percentage:
+Puyu badcase:
+