def _remove_element_keep_tail(elem) -> None:
    """Detach ``elem`` (and its subtree) from the tree without losing its tail text.

    In lxml the text that follows an element's closing tag is stored on the
    element itself as ``.tail``, so a plain ``parent.remove(elem)`` would
    also discard that trailing text. The tail is therefore re-attached to
    the previous sibling (or to the parent's text) before removal.
    """
    parent = elem.getparent()
    if parent is None:
        # The root element has no parent and cannot be detached.
        return

    tail = elem.tail
    if tail:
        previous = elem.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or '') + tail
        else:
            parent.text = (parent.text or '') + tail
    parent.remove(elem)


def get_plain_text_fast(html_source: str) -> str:
    """Quickly extract the plain text of an HTML document using lxml.

    Mainly intended as a cheap pre-processing step for language detection.

    Args:
        html_source: raw HTML source. ``None``, an empty string or a
            whitespace-only string yields an empty result.

    Returns:
        The stripped, non-empty text nodes of the document joined by single
        spaces. The content of script/style/embedding tags and of
        code-related tags is excluded, while text that follows such a tag
        is kept.
    """
    if not html_source or not html_source.strip():
        return ""

    doc = html_to_element(html_source)

    # Step 1: drop unwanted tags together with their content.
    noise_tags = ['script', 'style', 'noscript', 'iframe', 'embed', 'object']
    code_tags = ['code', 'pre', 'kbd', 'samp']  # code-related markup
    all_noise_tags = noise_tags + code_tags

    for tag_name in all_noise_tags:
        for elem in doc.xpath(f'//{tag_name}'):
            # Keep the text following the tag; only the tag itself and its
            # content are noise.
            _remove_element_keep_tail(elem)

    # Step 2: collect every remaining text node.
    texts = doc.xpath('//text()')
    stripped_texts = (text.strip() for text in texts)
    return ' '.join(text for text in stripped_texts if text)
self.assertEqual(get_plain_text_fast(""), "") + + # 测试None值 + self.assertEqual(get_plain_text_fast(None), "") + + # 测试只有空白字符的字符串 + self.assertEqual(get_plain_text_fast(" "), "") + self.assertEqual(get_plain_text_fast("\n\t"), "") + + def test_simple_text(self): + """测试简单文本提取.""" + html = "
Hello World
" + result = get_plain_text_fast(html) + self.assertEqual(result, "Hello World") + + def test_multiple_elements(self): + """测试多个元素的文本提取.""" + html = "Hello
World
Visible text
+ +More visible text
+Visible text
+ +More visible text
+Visible content
+ + + + + + +More visible content
+ + + """ + result = get_plain_text_fast(html) + expected = "Test Visible content More visible content" + self.assertEqual(result, expected) + + # 确保噪声内容被移除 + noise_content = ["var x = 1", "margin: 0", "No JavaScript", "test.html", "test.swf", "test.pdf"] + for noise in noise_content: + self.assertNotIn(noise, result) + + def test_remove_code_tags(self): + """测试移除代码相关标签.""" + html = """ +Regular text
+function test() { return true; }
+
+ def hello():
+ print("world")
+
+ Ctrl+C
+ $ ls -la
+ More regular text
+Unclosed paragraph
你好世界 🌍 Здравствуй мир
" + result = get_plain_text_fast(html) + self.assertEqual(result, "你好世界 🌍 Здравствуй мир") + + def test_table_content(self): + """测试表格内容提取.""" + html = """ +| Name | +Age | +
|---|---|
| John | +25 | +
| Jane | +30 | +

Text content
Visible text
+ +