diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index fa24dd6d..cd7cd387 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -196,11 +196,14 @@ def __check_table_include_math_code(self, raw_html: HtmlElement): ] ele_res.extend(ccinterline_codes) else: - ele_res.extend([ - text.strip() - for text in self._build_html_tree(math_item[1]).itertext() - if text.strip() - ]) + texts = [] + # 使用 itertext() 遍历所有文本片段 + for text_segment in ele_item.itertext(): + # 统一处理文本:去空白 + 替换字面 \n + cleaned_text = text_segment.strip().replace('\\n', '') + if cleaned_text: # 过滤空字符串 + texts.append(cleaned_text) + ele_res.extend(texts) return ele_res def __simplify_td_th_content(self, elem: HtmlElement) -> None: @@ -212,7 +215,8 @@ def __simplify_td_th_content(self, elem: HtmlElement) -> None: parse_res.extend(math_res) for item in list(elem.iterchildren()): elem.remove(item) - elem.text = '
'.join(parse_res) + if parse_res: + elem.text = '
'.join(parse_res) return for child in elem.iter('td', 'th'): self.__simplify_td_th_content(child) @@ -227,18 +231,17 @@ def __get_table_body(self, table_type, table_root): cleaned_attrs = {k: v for k, v in table_root.attrib.items() if k in allowed_attributes} table_root.attrib.clear() table_root.attrib.update(cleaned_attrs) - # text进行strip操作,tail去掉(有较多空换行) + # text进行strip操作,tail保留(部分内容留在tail中) for elem in chain([table_root], table_root.iterdescendants()): - if elem.text: - elem.text = elem.text.strip() - if elem.tail: - elem.tail = None + if elem.text is not None: + elem.text = elem.text.strip().replace('\\n', '') + if elem.tail is not None: + elem.tail = elem.tail.strip().replace('\\n', '') self.__simplify_td_th_content(table_root) # 迭代 for child in table_root.iterchildren(): if child is not None: self.__get_table_body(table_type, child) - return self._element_to_html(table_root) def __do_extract_tables(self, root: HtmlElement) -> None: diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_tail_text.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_tail_text.html new file mode 100644 index 00000000..4044b9a3 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_tail_text.html @@ -0,0 +1,367 @@ + + + + + + + + + 🇷🇺 | Show hub - Big-Empty DC++ Dchublist NMDC and ADCs хабов Huburi Хаблист + + + + + + + + + + + + + + + + + + + + + + + + +
+ +

Big-Empty

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Client + https://dchublists.com/clients/FlylinkDC_x64.exe +
StatusOnline | ID: 975
URL + https://dchublists.com/hub-975 +
Address +NMDC | dchub://big-empty.ru +
ASN + Style-Com LLC +
Failover + Not available +
NameBig-Empty
Topic + Not available +
Description + Хаб сети Arbital +
Category + Not available +
Software + PtokaX 0.5.3.0 +
Owner + Self +
Location + RU Russian Federation +
Users + 25 | 55 +
Clones0
Share + 4.39 TB | 90.60 TB +
User limit10000
Share limit0 B
Slot limit0
Hub limit0
Reliability99.04%
Checked + 2024-12-09 03:06:01 | 2021-05-07 +
Votes + +0 | -0 | 0 +
Website + Not available +
Email + Not available +
+
+

Online users

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NickShare
Darv1n1.55 TB
PtokaX0 B
1975628.43 GB
AndyDesktop0 B
Crtyujgfdscvgjh35.54 GB
DaymarixZZZ37.57 GB
Evgeniy_D76.15 GB
Julia0 B
Kuzma0 B
Larsenv0 B
MAXMED8888888864.10 GB
Qwerty_ytr_R724237.12 GB
SERG_B149.65 GB
Sculli156.92 GB
Shareaza404613.03 GB
Soliton14.68 GB
Sweaborg794.15 GB
Viktor138283179.23 GB
[fly]Fire_dU3JR10.72 GB
[fly]Monkey_QGrFy124.72 GB
[fly]Moon_x7m61.13 GB
kotbaun0 B
marcs3.62 GB
minili59.30 GB
y2b4k698df328djei3261.82 GB
+
+
+ +

Comments

+ There are no comments for this hub, you can write one here. +
+
+ + + + +