diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py index 66816c2d..aed792c9 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py @@ -52,6 +52,9 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa mathml = re.sub(r'([^\s])\s+([^\s])', r'\1 \2', mathml) # remove extra spaces latex = cm.mml_to_latex(mathml) + # 处理未转义的%为\% + if latex: + latex = re.sub(r'(? Mathematical Formulas with Percent Symbol

Mathematical Formulas Containing % Symbol

1. LaTeX Format Examples

Inline LaTeX formulas:

The percentage increase is calculated as $\frac{new - old}{old} \times 100\%$.

Display LaTeX formulas:

Compound interest formula:

$$A = P\left(1 + \frac{r\%}{n}\right)^{nt}$$

2. MathML Format Examples

Percentage change in MathML:

$$\Delta \% = \frac{\mathrm{new} - \mathrm{old}}{\mathrm{old}} \times 100\%$$

Discount percentage:

$\mathrm{Discount}\% = \frac{\mathrm{Original} - \mathrm{Sale}}{\mathrm{Original}} \times 100\%$ \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html new file mode 100644 index 00000000..b19496cf --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html @@ -0,0 +1 @@ +A = P\left(1 + \frac{r\%}{n}\right)^{nt} \Delta \%=\frac{\mathrm{new}-\mathrm{old}}{\mathrm{old}}×100\% \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html new file mode 100644 index 00000000..23c13989 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html @@ -0,0 +1 @@ +\frac{new - old}{old} \times 100\% \mathrm{Discount}\%=\frac{\mathrm{Original}-\mathrm{Sale}}{\mathrm{Original}}×100\% \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index f41af0c0..a6e1825e 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -216,6 +216,14 @@ 'base_url': 'https://convertoctopus.com/4-7-years-to-minutes', 'expected': 'assets/ccmath/math_class_math_1.html', 'expected_inline': 'assets/ccmath/math_class_math_inline_1.html' + }, + { + 'input': [ + 'assets/ccmath/math_percentage.html', + ], + 'base_url': 'https://test.com/', + 'expected': 'assets/ccmath/math_percentage_1.html', + 'expected_inline': 'assets/ccmath/math_percentage_inline_1.html' } ]