From e8c8e04c7fb27b33936c7fc269a0e5ef30a80f1c Mon Sep 17 00:00:00 2001 From: Yanggq <1041206149@qq.com> Date: Fri, 5 Sep 2025 15:38:31 +0800 Subject: [PATCH 1/4] fix: escape '%' in MathML formula v2 --- llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py | 3 +++ .../html/recognizer/assets/ccmath/math_percentage.html | 1 + .../html/recognizer/assets/ccmath/math_percentage_1.html | 1 + .../assets/ccmath/math_percentage_inline_1.html | 1 + tests/llm_web_kit/extractor/html/recognizer/test_math.py | 8 ++++++++ 5 files changed, 14 insertions(+) create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py index 66816c2d..aed792c9 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py @@ -52,6 +52,9 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa mathml = re.sub(r'([^\s])\s+([^\s])', r'\1 \2', mathml) # remove extra spaces latex = cm.mml_to_latex(mathml) + # 处理未转义的%为\% + if latex: + latex = re.sub(r'(? Mathematical Formulas with Percent Symbol

Mathematical Formulas Containing % Symbol

1. LaTeX Format Examples

Inline LaTeX formulas:

The percentage increase is calculated as $\frac{new - old}{old} \times 100\%$.

Display LaTeX formulas:

Compound interest formula:

$$A = P\left(1 + \frac{r\%}{n}\right)^{nt}$$

2. MathML Format Examples

Percentage change in MathML:

$$\Delta\% = \frac{\mathrm{new} - \mathrm{old}}{\mathrm{old}} \times 100\%$$

Discount percentage:

Discount % = Original - Sale Original × 100 % \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html new file mode 100644 index 00000000..b19496cf --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html @@ -0,0 +1 @@ +A = P\left(1 + \frac{r\%}{n}\right)^{nt} \Delta \%=\frac{\mathrm{new}-\mathrm{old}}{\mathrm{old}}×100\% \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html new file mode 100644 index 00000000..23c13989 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html @@ -0,0 +1 @@ +\frac{new - old}{old} \times 100\% \mathrm{Discount}\%=\frac{\mathrm{Original}-\mathrm{Sale}}{\mathrm{Original}}×100\% \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index f41af0c0..a6e1825e 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -216,6 +216,14 @@ 'base_url': 'https://convertoctopus.com/4-7-years-to-minutes', 'expected': 'assets/ccmath/math_class_math_1.html', 'expected_inline': 'assets/ccmath/math_class_math_inline_1.html' + }, + { + 'input': [ + 'assets/ccmath/math_percentage.html', + ], + 'base_url': 'https://test.com/', + 'expected': 'assets/ccmath/math_percentage_1.html', + 'expected_inline': 'assets/ccmath/math_percentage_inline_1.html' } ] From 11d630fc0bf0c07967022cb9e86537794c11540c Mon Sep 17 00:00:00 2001 From: Yanggq <1041206149@qq.com> Date: Fri, 5 Sep 2025 16:04:50 +0800 Subject: [PATCH 2/4] fix: escape '%' in MathML formula v2 --- 
.../html/recognizer/cc_math/tag_math.py | 3 + .../assets/ccmath/math_percentage.html | 73 +++++++++++++++++++ .../assets/ccmath/math_percentage_1.html | 1 + .../ccmath/math_percentage_inline_1.html | 1 + .../extractor/html/recognizer/test_math.py | 8 ++ 5 files changed, 86 insertions(+) create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py index 66816c2d..aed792c9 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py @@ -52,6 +52,9 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa mathml = re.sub(r'([^\s])\s+([^\s])', r'\1 \2', mathml) # remove extra spaces latex = cm.mml_to_latex(mathml) + # 处理未转义的%为\% + if latex: + latex = re.sub(r'(? + + + + + Mathematical Formulas with Percent Symbol + + + + + +

Mathematical Formulas Containing % Symbol

+ +

1. LaTeX Format Examples

+ +

Inline LaTeX formulas:

+

The percentage increase is calculated as $\frac{new - old}{old} \times 100\%$.

+ +

Display LaTeX formulas:

+

Compound interest formula:

+ $$A = P\left(1 + \frac{r\%}{n}\right)^{nt}$$ + +

2. MathML Format Examples

+ +

Percentage change in MathML:

+ + + Δ + % + = + + + new + - + old + + old + + × + 100 + % + + + +

Discount percentage:

+ + + Discount + % + = + + + Original + - + Sale + + Original + + × + 100 + % + + + + + diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html new file mode 100644 index 00000000..b19496cf --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html @@ -0,0 +1 @@ +A = P\left(1 + \frac{r\%}{n}\right)^{nt} \Delta \%=\frac{\mathrm{new}-\mathrm{old}}{\mathrm{old}}×100\% \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html new file mode 100644 index 00000000..23c13989 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html @@ -0,0 +1 @@ +\frac{new - old}{old} \times 100\% \mathrm{Discount}\%=\frac{\mathrm{Original}-\mathrm{Sale}}{\mathrm{Original}}×100\% \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index f41af0c0..a6e1825e 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -216,6 +216,14 @@ 'base_url': 'https://convertoctopus.com/4-7-years-to-minutes', 'expected': 'assets/ccmath/math_class_math_1.html', 'expected_inline': 'assets/ccmath/math_class_math_inline_1.html' + }, + { + 'input': [ + 'assets/ccmath/math_percentage.html', + ], + 'base_url': 'https://test.com/', + 'expected': 'assets/ccmath/math_percentage_1.html', + 'expected_inline': 'assets/ccmath/math_percentage_inline_1.html' } ] From baacef88b8e3513cccf621871d3af6b0081c0afc Mon Sep 17 00:00:00 2001 From: Yanggq <1041206149@qq.com> Date: Fri, 5 Sep 2025 16:09:10 +0800 Subject: [PATCH 3/4] x --- 
.../html/recognizer/assets/ccmath/math_percentage_1.html | 3 ++- .../recognizer/assets/ccmath/math_percentage_inline_1.html | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html index b19496cf..c877724c 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html @@ -1 +1,2 @@ -A = P\left(1 + \frac{r\%}{n}\right)^{nt} \Delta \%=\frac{\mathrm{new}-\mathrm{old}}{\mathrm{old}}×100\% \ No newline at end of file +A = P\left(1 + \frac{r\%}{n}\right)^{nt} +\Delta \%=\frac{\mathrm{new}-\mathrm{old}}{\mathrm{old}}×100\% \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html index 23c13989..1fcfe99a 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html @@ -1 +1,2 @@ -\frac{new - old}{old} \times 100\% \mathrm{Discount}\%=\frac{\mathrm{Original}-\mathrm{Sale}}{\mathrm{Original}}×100\% \ No newline at end of file +\frac{new - old}{old} \times 100\% +\mathrm{Discount}\%=\frac{\mathrm{Original}-\mathrm{Sale}}{\mathrm{Original}}×100\% \ No newline at end of file From 4caba9e6a2bf932c096ef0e9ddcab2804076ed1f Mon Sep 17 00:00:00 2001 From: Yanggq <1041206149@qq.com> Date: Fri, 5 Sep 2025 16:36:49 +0800 Subject: [PATCH 4/4] add --- .../assets/ccmath/math_percentage.html | 119 +++++++++++------- .../assets/ccmath/math_percentage_1.html | 3 +- 2 files changed, 77 insertions(+), 45 deletions(-) diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage.html 
b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage.html index ad37d193..41d529c9 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage.html +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage.html @@ -16,58 +16,89 @@ -

Mathematical Formulas Containing % Symbol

+

Mathematical Formulas Containing % Symbol

-

1. LaTeX Format Examples

+

1. LaTeX Format Examples

-

Inline LaTeX formulas:

-

The percentage increase is calculated as $\frac{new - old}{old} \times 100\%$.

+

Inline LaTeX formulas:

+

The percentage increase is calculated as $\frac{new - old}{old} \times 100\%$.

-

Display LaTeX formulas:

-

Compound interest formula:

- $$A = P\left(1 + \frac{r\%}{n}\right)^{nt}$$ +

Display LaTeX formulas:

+

Compound interest formula:

+$$A = P\left(1 + \frac{r\%}{n}\right)^{nt}$$ -

2. MathML Format Examples

+

2. MathML Format Examples

-

Percentage change in MathML:

- - - Δ - % - = - - - new - - - old - +

Percentage change in MathML:

+ + + Δ + % + = + + + new + - old - - × - 100 - % - - + + old + + × + 100 + % + + -

Discount percentage:

- - - Discount - % - = - - - Original - - - Sale - +

Discount percentage:

+ + + Discount + % + = + + Original - - × - 100 - % - - + - + Sale + + Original + + × + 100 + % + + +

Puyu badcase:

+ + + + + % + + Cell Death + = + + + ( + 1 + + ( + Post treatment cell counts + + + + + + / + initial cell counts + ) + ) + * + 100 + + + + diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html index c877724c..e29bbe09 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html @@ -1,2 +1,3 @@ A = P\left(1 + \frac{r\%}{n}\right)^{nt} -\Delta \%=\frac{\mathrm{new}-\mathrm{old}}{\mathrm{old}}×100\% \ No newline at end of file +\Delta \%=\frac{\mathrm{new}-\mathrm{old}}{\mathrm{old}}×100\% +\begin{array}{ll}\%\text{Cell Death}=& \left(1-\left(\text{Post treatment cell counts}\\ /\text{initial cell counts}\right)\right)*100\end{array} \ No newline at end of file