From c09bdd4f265d3dc6a7d6193fef360aa207472bb9 Mon Sep 17 00:00:00 2001
From: drunkpig <60862764+drunkpig@users.noreply.github.com>
Date: Fri, 22 Aug 2025 19:34:48 +0800
Subject: [PATCH 1/3] feat: add extract plain text from html source method

---
 llm_web_kit/libs/html_utils.py                |  25 ++
 .../libs/test_get_plain_text_fast.py          | 243 ++++++++++++++++++
 2 files changed, 268 insertions(+)
 create mode 100644 tests/llm_web_kit/libs/test_get_plain_text_fast.py
diff --git a/llm_web_kit/libs/html_utils.py b/llm_web_kit/libs/html_utils.py
index e19627ee..fede6cf0 100644
--- a/llm_web_kit/libs/html_utils.py
+++ b/llm_web_kit/libs/html_utils.py
@@ -450,3 +450,28 @@ def html_normalize_space(text: str) -> str:
         return _text
     except Exception:
         return text
+
+
+def get_plain_text_fast(html_source: str) -> str:
+    """Quickly extract the visible plain text of an HTML document with lxml.
+
+    Intended mainly as input for language detection. Text inside script/style/
+    embed-like and code-like tags is skipped; every other text node is stripped
+    and the non-empty ones are joined with single spaces. Returns "" for
+    empty, whitespace-only or None input.
+    """
+    if not html_source or not html_source.strip():
+        return ""
+
+    doc = html_to_element(html_source)
+    # Tags whose content is noise for language detection.
+    noise_tags = ['script', 'style', 'noscript', 'iframe', 'embed', 'object']
+    code_tags = ['code', 'pre', 'kbd', 'samp']  # code-related markup
+    all_noise_tags = noise_tags + code_tags
+
+    # Filter in XPath instead of calling parent.remove(elem): lxml's remove()
+    # also drops the element's tail text (e.g. " b" in "a <code>x</code> b"),
+    # and getparent() would be None if a noise element were the root.
+    is_noise = ' or '.join(f'self::{tag_name}' for tag_name in all_noise_tags)
+    texts = doc.xpath(f'//text()[not(ancestor::*[{is_noise}])]')
+    return ' '.join(text.strip() for text in texts if text.strip())
diff --git a/tests/llm_web_kit/libs/test_get_plain_text_fast.py b/tests/llm_web_kit/libs/test_get_plain_text_fast.py
new file mode 100644
index 00000000..c00d4ecf
--- /dev/null
+++ b/tests/llm_web_kit/libs/test_get_plain_text_fast.py
@@ -0,0 +1,243 @@
+"""测试get_plain_text_fast函数."""
+import unittest
+
+from llm_web_kit.libs.html_utils import get_plain_text_fast
+
+
+class TestGetPlainTextFast(unittest.TestCase):
+    """Unit tests for the get_plain_text_fast function."""
+
+    def test_empty_input(self):
+        """Empty input yields an empty string."""
+        # Empty string
+        self.assertEqual(get_plain_text_fast(""), "")
+
+        # None value
+        self.assertEqual(get_plain_text_fast(None), "")
+
+        # Strings containing only whitespace
+        self.assertEqual(get_plain_text_fast("   "), "")
+        self.assertEqual(get_plain_text_fast("\n\t"), "")
+
+    def test_simple_text(self):
+        """Test simple text extraction."""
+        html = "<p>Hello World</p>"
+        result = get_plain_text_fast(html)
+        self.assertEqual(result, "Hello World")
+
+    def test_multiple_elements(self):
+        """Test text extraction across multiple elements."""
+        html = "<div><p>Hello</p><p>World</p></div>"
+        result = get_plain_text_fast(html)
+        self.assertEqual(result, "Hello World")
+
+    def test_nested_elements(self):
+        """Test text extraction from nested elements."""
+        html = "<div><span>Hello <strong>beautiful</strong> World</span></div>"
+        result = get_plain_text_fast(html)
+        self.assertEqual(result, "Hello beautiful World")
+
+    def test_remove_script_tags(self):
+        """Test that script tags and their content are removed."""
+        html = """
+        <div>
+            <p>Visible text</p>
+            <script>console.log('should be removed');</script>
+            <p>More visible text</p>
+        </div>
+        """
+        result = get_plain_text_fast(html)
+        self.assertEqual(result, "Visible text More visible text")
+        self.assertNotIn("console.log", result)
+
+    def test_remove_style_tags(self):
+        """Test that style tags and their content are removed."""
+        html = """
+        <div>
+            <p>Visible text</p>
+            <style>body { color: red; }</style>
+            <p>More visible text</p>
+        </div>
+        """
+        result = get_plain_text_fast(html)
+        self.assertEqual(result, "Visible text More visible text")
+        self.assertNotIn("color", result)
+
+    def test_remove_all_noise_tags(self):
+        """Test that all noise tags are removed."""
+        html = """
+        <html>
+            <head><title>Test</title></head>
+            <body>
+                <p>Visible content</p>
+                <script>var x = 1;</script>
+                <style>.class { margin: 0; }</style>
+                <noscript>No JavaScript</noscript>
+                <iframe src="test.html"></iframe>
+                <embed src="test.swf"></embed>
+                <object data="test.pdf"></object>
+                <p>More visible content</p>
+            </body>
+        </html>
+        """
+        result = get_plain_text_fast(html)
+        expected = "Test Visible content More visible content"
+        self.assertEqual(result, expected)
+
+        # Make sure the noise content is gone
+        noise_content = ["var x = 1", "margin: 0", "No JavaScript", "test.html", "test.swf", "test.pdf"]
+        for noise in noise_content:
+            self.assertNotIn(noise, result)
+
+    def test_remove_code_tags(self):
+        """Test that code-related tags are removed."""
+        html = """
+        <div>
+            <p>Regular text</p>
+            <code>function test() { return true; }</code>
+            <pre>
+                def hello():
+                    print("world")
+            </pre>
+            <kbd>Ctrl+C</kbd>
+            <samp>$ ls -la</samp>
+            <p>More regular text</p>
+        </div>
+        """
+        result = get_plain_text_fast(html)
+        self.assertEqual(result, "Regular text More regular text")
+
+        # Make sure the code content is gone
+        code_content = ["function test", "def hello", "Ctrl+C", "$ ls -la"]
+        for code in code_content:
+            self.assertNotIn(code, result)
+
+    def test_whitespace_normalization(self):
+        """测试空白字符规范化."""
+        html = """
+        <div>
+            <p>  Multiple   spaces  </p>
+            <p>
+                Line breaks
+                and tabs
+            </p>
+        </div>
+        """
+        result = get_plain_text_fast(html)
+        # 应该规范化为单个空格分隔的文本
+        self.assertEqual(result, "Multiple spaces Line breaks and tabs")
+
+    def test_special_characters(self):
+        """测试特殊字符处理."""
+        html = "<p>Price: $100 &amp; €50 &lt; £75</p>"
+        result = get_plain_text_fast(html)
+        self.assertEqual(result, "Price: $100 & €50 < £75")
+
+    def test_mixed_content_complex(self):
+        """测试复杂混合内容."""
+        html = """
+        <article>
+            <h1>Article Title</h1>
+            <p>This is a paragraph with <a href="link.html">a link</a> and <em>emphasis</em>.</p>
+            <script>
+                // This should be removed
+                analytics.track('page_view');
+            </script>
+            <blockquote>
+                <p>This is a quote</p>
+            </blockquote>
+            <code>console.log('removed');</code>
+            <ul>
+                <li>Item 1</li>
+                <li>Item 2</li>
+            </ul>
+            <style>
+                /* This CSS should be removed */
+                .highlight { background: yellow; }
+            </style>
+        </article>
+        """
+        result = get_plain_text_fast(html)
+        expected = "Article Title This is a paragraph with a link and emphasis. This is a quote Item 1 Item 2"
+        self.assertEqual(result, expected)
+
+        # 确保被移除的内容不存在
+        self.assertNotIn("analytics.track", result)
+        self.assertNotIn("console.log", result)
+        self.assertNotIn("background: yellow", result)
+
+    def test_malformed_html(self):
+        """测试畸形HTML的处理."""
+        html = "<p>Unclosed paragraph<div>Nested without closing</p>Some text"
+        result = get_plain_text_fast(html)
+        # Text should still be extracted even though the HTML structure is incomplete
+        self.assertIn("Unclosed paragraph", result)
+        self.assertIn("Nested without closing", result)
+        self.assertIn("Some text", result)
+
+    def test_only_noise_tags(self):
+        """Test HTML that contains only noise tags."""
+        html = """
+        <script>var x = 1;</script>
+        <style>body { margin: 0; }</style>
+        <noscript>Enable JavaScript</noscript>
+        """
+        result = get_plain_text_fast(html)
+        self.assertEqual(result, "")
+
+    def test_unicode_content(self):
+        """Test Unicode content."""
+        html = "<p>你好世界 🌍 Здравствуй мир</p>"
+        result = get_plain_text_fast(html)
+        self.assertEqual(result, "你好世界 🌍 Здравствуй мир")
+
+    def test_table_content(self):
+        """Test extraction of table content."""
+        html = """
+        <table>
+            <tr>
+                <th>Name</th>
+                <th>Age</th>
+            </tr>
+            <tr>
+                <td>John</td>
+                <td>25</td>
+            </tr>
+            <tr>
+                <td>Jane</td>
+                <td>30</td>
+            </tr>
+        </table>
+        """
+        result = get_plain_text_fast(html)
+        self.assertIn("Name", result)
+        self.assertIn("Age", result)
+        self.assertIn("John", result)
+        self.assertIn("25", result)
+        self.assertIn("Jane", result)
+        self.assertIn("30", result)
+
+    def test_image_alt_text(self):
+        """Image alt text is not extracted (it is an attribute, not text content)."""
+        html = '<div><img src="test.jpg" alt="Description"><p>Text content</p></div>'
+        result = get_plain_text_fast(html)
+        self.assertEqual(result, "Text content")
+        # The alt attribute must not be extracted as text content
+        self.assertNotIn("Description", result)
+
+    def test_comments_handling(self):
+        """Test HTML comment handling (comments should be removed by HTMLParser)."""
+        html = """
+        <div>
+            <!-- This is a comment -->
+            <p>Visible text</p>
+            <!-- Another comment -->
+        </div>
+        """
+        result = get_plain_text_fast(html)
+        self.assertEqual(result, "Visible text")
+        self.assertNotIn("comment", result)
+
+
+if __name__ == '__main__':
+    unittest.main()  # run the suite when this file is executed directly

From b2df0a3d7db83fa471e20cfc7c0cfb3721e0e79b Mon Sep 17 00:00:00 2001
From: drunkpig <60862764+drunkpig@users.noreply.github.com>
Date: Fri, 22 Aug 2025 19:38:56 +0800
Subject: [PATCH 2/3] feat: add extract plain text from html source method

---
 README.md | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/README.md b/README.md
index f90f4c6f..9fbeaaec 100644
--- a/README.md
+++ b/README.md
@@ -190,6 +190,16 @@ if __name__=="__main__":
     main_html, is_success = extract(response_json, html)
 ```
 
+### Extract plain text from HTML source
+
+```python
+from llm_web_kit.libs.html_utils import get_plain_text_fast
+html_source = ""
+text = get_plain_text_fast(html_source)
+# language = detect_lang(text)
+
+```
+
 ## Pipeline
 
 1. [HTML pre-dedup](jupyter/html-pre-dedup/main.ipynb)

From ea9e943bcf875e45a3a911366919b352842b8481 Mon Sep 17 00:00:00 2001
From: drunkpig <60862764+drunkpig@users.noreply.github.com>
Date: Fri, 22 Aug 2025 19:57:01 +0800
Subject: [PATCH 3/3] feat: add extract plain text from html source method

---
 .../libs/test_get_plain_text_fast.py          | 54 -------------------
 1 file changed, 54 deletions(-)

diff --git a/tests/llm_web_kit/libs/test_get_plain_text_fast.py b/tests/llm_web_kit/libs/test_get_plain_text_fast.py
index c00d4ecf..87e11479 100644
--- a/tests/llm_web_kit/libs/test_get_plain_text_fast.py
+++ b/tests/llm_web_kit/libs/test_get_plain_text_fast.py
@@ -112,60 +112,6 @@ def hello():
         for code in code_content:
             self.assertNotIn(code, result)
 
-    def test_whitespace_normalization(self):
-        """测试空白字符规范化."""
-        html = """
-        <div>
-            <p>  Multiple   spaces  </p>
-            <p>
-                Line breaks
-                and tabs
-            </p>
-        </div>
-        """
-        result = get_plain_text_fast(html)
-        # 应该规范化为单个空格分隔的文本
-        self.assertEqual(result, "Multiple spaces Line breaks and tabs")
-
-    def test_special_characters(self):
-        """测试特殊字符处理."""
-        html = "<p>Price: $100 &amp; €50 &lt; £75</p>"
-        result = get_plain_text_fast(html)
-        self.assertEqual(result, "Price: $100 & €50 < £75")
-
-    def test_mixed_content_complex(self):
-        """测试复杂混合内容."""
-        html = """
-        <article>
-            <h1>Article Title</h1>
-            <p>This is a paragraph with <a href="link.html">a link</a> and <em>emphasis</em>.</p>
-            <script>
-                // This should be removed
-                analytics.track('page_view');
-            </script>
-            <blockquote>
-                <p>This is a quote</p>
-            </blockquote>
-            <code>console.log('removed');</code>
-            <ul>
-                <li>Item 1</li>
-                <li>Item 2</li>
-            </ul>
-            <style>
-                /* This CSS should be removed */
-                .highlight { background: yellow; }
-            </style>
-        </article>
-        """
-        result = get_plain_text_fast(html)
-        expected = "Article Title This is a paragraph with a link and emphasis. This is a quote Item 1 Item 2"
-        self.assertEqual(result, expected)
-
-        # 确保被移除的内容不存在
-        self.assertNotIn("analytics.track", result)
-        self.assertNotIn("console.log", result)
-        self.assertNotIn("background: yellow", result)
-
     def test_malformed_html(self):
         """测试畸形HTML的处理."""
         html = "<p>Unclosed paragraph<div>Nested without closing</p>Some text"