def get_plain_text_fast(html_source: str) -> str:
    """Quickly extract the visible plain text of an HTML document using lxml.

    Intended mainly as a cheap pre-processing step for language detection.

    Text located inside noise elements (``script``, ``style``, ``noscript``,
    ``iframe``, ``embed``, ``object``) or code elements (``code``, ``pre``,
    ``kbd``, ``samp``) is skipped. Every remaining text node is stripped and
    the non-empty pieces are joined with a single space.

    Args:
        html_source: Raw HTML source. ``None``, an empty string or a
            whitespace-only string is accepted and yields ``""``.

    Returns:
        The extracted text, or ``""`` when there is nothing to extract.
    """
    if not html_source or not html_source.strip():
        return ""

    # html_to_element is the module's own parsing helper; it is expected to
    # return an lxml element supporting .xpath() -- confirm against its
    # definition elsewhere in this module.
    doc = html_to_element(html_source)

    noise_tags = ('script', 'style', 'noscript', 'iframe', 'embed', 'object')
    code_tags = ('code', 'pre', 'kbd', 'samp')  # code-related markup

    # Select the wanted text nodes directly instead of deleting the unwanted
    # elements first. In lxml, parent.remove(child) also throws away the
    # child's *tail* text, so "<p>Use <code>x</code> to run</p>" used to lose
    # " to run". A tail is an XPath child of the surrounding element, so the
    # ancestor test keeps it while still excluding everything nested inside a
    # noise element. This also copes with a document whose root is itself a
    # noise element (no parent to remove it from).
    inside_noise = ' or '.join(
        f'ancestor::{tag}' for tag in noise_tags + code_tags
    )
    text_nodes = doc.xpath(f'//text()[not({inside_noise})]')

    stripped = (node.strip() for node in text_nodes)
    return ' '.join(piece for piece in stripped if piece)
self.assertEqual(get_plain_text_fast(None), "") + + # 测试只有空白字符的字符串 + self.assertEqual(get_plain_text_fast(" "), "") + self.assertEqual(get_plain_text_fast("\n\t"), "") + + def test_simple_text(self): + """测试简单文本提取.""" + html = "

Hello World

" + result = get_plain_text_fast(html) + self.assertEqual(result, "Hello World") + + def test_multiple_elements(self): + """测试多个元素的文本提取.""" + html = "

Hello

World

" + result = get_plain_text_fast(html) + self.assertEqual(result, "Hello World") + + def test_nested_elements(self): + """测试嵌套元素的文本提取.""" + html = "
Hello beautiful World
" + result = get_plain_text_fast(html) + self.assertEqual(result, "Hello beautiful World") + + def test_remove_script_tags(self): + """测试移除script标签及其内容.""" + html = """ +
+

Visible text

+ +

More visible text

+
+ """ + result = get_plain_text_fast(html) + self.assertEqual(result, "Visible text More visible text") + self.assertNotIn("console.log", result) + + def test_remove_style_tags(self): + """测试移除style标签及其内容.""" + html = """ +
+

Visible text

+ +

More visible text

+
+ """ + result = get_plain_text_fast(html) + self.assertEqual(result, "Visible text More visible text") + self.assertNotIn("color", result) + + def test_remove_all_noise_tags(self): + """测试移除所有噪声标签.""" + html = """ + + Test + +

Visible content

+ + + + + + +

More visible content

+ + + """ + result = get_plain_text_fast(html) + expected = "Test Visible content More visible content" + self.assertEqual(result, expected) + + # 确保噪声内容被移除 + noise_content = ["var x = 1", "margin: 0", "No JavaScript", "test.html", "test.swf", "test.pdf"] + for noise in noise_content: + self.assertNotIn(noise, result) + + def test_remove_code_tags(self): + """测试移除代码相关标签.""" + html = """ +
+

Regular text

+ function test() { return true; } +
+                def hello():
+                    print("world")
+            
+ Ctrl+C + $ ls -la +

More regular text

+
+ """ + result = get_plain_text_fast(html) + self.assertEqual(result, "Regular text More regular text") + + # 确保代码内容被移除 + code_content = ["function test", "def hello", "Ctrl+C", "$ ls -la"] + for code in code_content: + self.assertNotIn(code, result) + + def test_whitespace_normalization(self): + """测试空白字符规范化.""" + html = """ +
+

Multiple spaces

+

+ Line breaks + and tabs +

+
+ """ + result = get_plain_text_fast(html) + # 应该规范化为单个空格分隔的文本 + self.assertEqual(result, "Multiple spaces Line breaks and tabs") + + def test_special_characters(self): + """测试特殊字符处理.""" + html = "

Price: $100 & €50 < £75

" + result = get_plain_text_fast(html) + self.assertEqual(result, "Price: $100 & €50 < £75") + + def test_mixed_content_complex(self): + """测试复杂混合内容.""" + html = """ +
+

Article Title

+

This is a paragraph with a link and emphasis.

+ +
+

This is a quote

+
+ console.log('removed'); + + +
+ """ + result = get_plain_text_fast(html) + expected = "Article Title This is a paragraph with a link and emphasis. This is a quote Item 1 Item 2" + self.assertEqual(result, expected) + + # 确保被移除的内容不存在 + self.assertNotIn("analytics.track", result) + self.assertNotIn("console.log", result) + self.assertNotIn("background: yellow", result) + + def test_malformed_html(self): + """测试畸形HTML的处理.""" + html = "

Unclosed paragraph

Nested without closing

Some text" + result = get_plain_text_fast(html) + # 应该能够提取文本,即使HTML结构不完整 + self.assertIn("Unclosed paragraph", result) + self.assertIn("Nested without closing", result) + self.assertIn("Some text", result) + + def test_only_noise_tags(self): + """测试只包含噪声标签的HTML.""" + html = """ + + + + """ + result = get_plain_text_fast(html) + self.assertEqual(result, "") + + def test_unicode_content(self): + """测试Unicode内容.""" + html = "

你好世界 🌍 Здравствуй мир

" + result = get_plain_text_fast(html) + self.assertEqual(result, "你好世界 🌍 Здравствуй мир") + + def test_table_content(self): + """测试表格内容提取.""" + html = """ + + + + + + + + + + + + + +
NameAge
John25
Jane30
+ """ + result = get_plain_text_fast(html) + self.assertIn("Name", result) + self.assertIn("Age", result) + self.assertIn("John", result) + self.assertIn("25", result) + self.assertIn("Jane", result) + self.assertIn("30", result) + + def test_image_alt_text(self): + """测试图片alt文本不会被提取(因为是属性而非文本内容)""" + html = '
Description

Text content

' + result = get_plain_text_fast(html) + self.assertEqual(result, "Text content") + # alt属性不应该被提取为文本内容 + self.assertNotIn("Description", result) + + def test_comments_handling(self): + """测试HTML注释处理(应该被HTMLParser移除)""" + html = """ +
+ +

Visible text

+ +
+ """ + result = get_plain_text_fast(html) + self.assertEqual(result, "Visible text") + self.assertNotIn("comment", result) + + +if __name__ == '__main__': + unittest.main() From b2df0a3d7db83fa471e20cfc7c0cfb3721e0e79b Mon Sep 17 00:00:00 2001 From: drunkpig <60862764+drunkpig@users.noreply.github.com> Date: Fri, 22 Aug 2025 19:38:56 +0800 Subject: [PATCH 2/3] feat: add extract plain text from html source method --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index f90f4c6f..9fbeaaec 100644 --- a/README.md +++ b/README.md @@ -190,6 +190,16 @@ if __name__=="__main__": main_html, is_success = extract(response_json, html) ``` +### extract plain text from html source + +```python +from llm_web_kit.libs.html_utils import get_plain_text_fast +html_source = "" +text = get_plain_text_fast(html_source) +# language = detect_lang(text) + +``` + ## Pipeline 1. [HTML pre-dedup](jupyter/html-pre-dedup/main.ipynb) From ea9e943bcf875e45a3a911366919b352842b8481 Mon Sep 17 00:00:00 2001 From: drunkpig <60862764+drunkpig@users.noreply.github.com> Date: Fri, 22 Aug 2025 19:57:01 +0800 Subject: [PATCH 3/3] feat: add extract plain text from html source method --- .../libs/test_get_plain_text_fast.py | 54 ------------------- 1 file changed, 54 deletions(-) diff --git a/tests/llm_web_kit/libs/test_get_plain_text_fast.py b/tests/llm_web_kit/libs/test_get_plain_text_fast.py index c00d4ecf..87e11479 100644 --- a/tests/llm_web_kit/libs/test_get_plain_text_fast.py +++ b/tests/llm_web_kit/libs/test_get_plain_text_fast.py @@ -112,60 +112,6 @@ def hello(): for code in code_content: self.assertNotIn(code, result) - def test_whitespace_normalization(self): - """测试空白字符规范化.""" - html = """ -
-

Multiple spaces

-

- Line breaks - and tabs -

-
- """ - result = get_plain_text_fast(html) - # 应该规范化为单个空格分隔的文本 - self.assertEqual(result, "Multiple spaces Line breaks and tabs") - - def test_special_characters(self): - """测试特殊字符处理.""" - html = "

Price: $100 & €50 < £75

" - result = get_plain_text_fast(html) - self.assertEqual(result, "Price: $100 & €50 < £75") - - def test_mixed_content_complex(self): - """测试复杂混合内容.""" - html = """ -
-

Article Title

-

This is a paragraph with a link and emphasis.

- -
-

This is a quote

-
- console.log('removed'); - - -
- """ - result = get_plain_text_fast(html) - expected = "Article Title This is a paragraph with a link and emphasis. This is a quote Item 1 Item 2" - self.assertEqual(result, expected) - - # 确保被移除的内容不存在 - self.assertNotIn("analytics.track", result) - self.assertNotIn("console.log", result) - self.assertNotIn("background: yellow", result) - def test_malformed_html(self): """测试畸形HTML的处理.""" html = "

Unclosed paragraph

Nested without closing

Some text"