diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py index 66816c2d..aed792c9 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py @@ -52,6 +52,9 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa mathml = re.sub(r'([^\s])\s+([^\s])', r'\1 \2', mathml) # remove extra spaces latex = cm.mml_to_latex(mathml) + # 处理未转义的%为\% + if latex: + latex = re.sub(r'(? + +
+ + +The percentage increase is calculated as $\frac{new - old}{old} \times 100\%$.
+ +Compound interest formula:
+$$A = P\left(1 + \frac{r\%}{n}\right)^{nt}$$ + +Percentage change in MathML:
+ + +Discount percentage:
+ + +Puyu badcase:
+ + + diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html new file mode 100644 index 00000000..e29bbe09 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_1.html @@ -0,0 +1,3 @@ +A = P\left(1 + \frac{r\%}{n}\right)^{nt} +\Delta \%=\frac{\mathrm{new}-\mathrm{old}}{\mathrm{old}}×100\% +\begin{array}{ll}\%\text{Cell Death}=& \left(1-\left(\text{Post treatment cell counts}\\ /\text{initial cell counts}\right)\right)*100\end{array} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html new file mode 100644 index 00000000..1fcfe99a --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_percentage_inline_1.html @@ -0,0 +1,2 @@ +\frac{new - old}{old} \times 100\% +\mathrm{Discount}\%=\frac{\mathrm{Original}-\mathrm{Sale}}{\mathrm{Original}}×100\% \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index f41af0c0..a6e1825e 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -216,6 +216,14 @@ 'base_url': 'https://convertoctopus.com/4-7-years-to-minutes', 'expected': 'assets/ccmath/math_class_math_1.html', 'expected_inline': 'assets/ccmath/math_class_math_inline_1.html' + }, + { + 'input': [ + 'assets/ccmath/math_percentage.html', + ], + 'base_url': 'https://test.com/', + 'expected': 'assets/ccmath/math_percentage_1.html', + 'expected_inline': 'assets/ccmath/math_percentage_inline_1.html' } ]