diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py
index fa24dd6d..cd7cd387 100644
--- a/llm_web_kit/extractor/html/recognizer/table.py
+++ b/llm_web_kit/extractor/html/recognizer/table.py
@@ -196,11 +196,14 @@ def __check_table_include_math_code(self, raw_html: HtmlElement):
]
ele_res.extend(ccinterline_codes)
else:
- ele_res.extend([
- text.strip()
- for text in self._build_html_tree(math_item[1]).itertext()
- if text.strip()
- ])
+ texts = []
+ # 使用 itertext() 遍历所有文本片段
+ for text_segment in ele_item.itertext():
+ # 统一处理文本:去空白 + 替换字面 \n
+ cleaned_text = text_segment.strip().replace('\\n', '')
+ if cleaned_text: # 过滤空字符串
+ texts.append(cleaned_text)
+ ele_res.extend(texts)
return ele_res
def __simplify_td_th_content(self, elem: HtmlElement) -> None:
@@ -212,7 +215,8 @@ def __simplify_td_th_content(self, elem: HtmlElement) -> None:
parse_res.extend(math_res)
for item in list(elem.iterchildren()):
elem.remove(item)
- elem.text = '<br>'.join(parse_res)
+ if parse_res:
+ elem.text = '<br>'.join(parse_res)
return
for child in elem.iter('td', 'th'):
self.__simplify_td_th_content(child)
@@ -227,18 +231,17 @@ def __get_table_body(self, table_type, table_root):
cleaned_attrs = {k: v for k, v in table_root.attrib.items() if k in allowed_attributes}
table_root.attrib.clear()
table_root.attrib.update(cleaned_attrs)
- # text进行strip操作,tail去掉(有较多空换行)
+ # text进行strip操作,tail保留(部分内容留在tail中)
for elem in chain([table_root], table_root.iterdescendants()):
- if elem.text:
- elem.text = elem.text.strip()
- if elem.tail:
- elem.tail = None
+ if elem.text is not None:
+ elem.text = elem.text.strip().replace('\\n', '')
+ if elem.tail is not None:
+ elem.tail = elem.tail.strip().replace('\\n', '')
self.__simplify_td_th_content(table_root)
# 迭代
for child in table_root.iterchildren():
if child is not None:
self.__get_table_body(table_type, child)
-
return self._element_to_html(table_root)
def __do_extract_tables(self, root: HtmlElement) -> None:
diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_tail_text.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_tail_text.html
new file mode 100644
index 00000000..4044b9a3
--- /dev/null
+++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_tail_text.html
@@ -0,0 +1,367 @@
+
+
+
+
+
+
+
+
+ 🇷🇺 | Show hub - Big-Empty DC++ Dchublist NMDC and ADCs хабов Huburi Хаблист
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Big-Empty
+
+
+
+ | Client |
+
+ https://dchublists.com/clients/FlylinkDC_x64.exe
+ |
+
+
+ | Status |
+ Online | ID: 975 |
+
+
+ | URL |
+
+ https://dchublists.com/hub-975
+ |
+
+
+ | Address |
+
+NMDC | dchub://big-empty.ru
+ |
+
+
+ | ASN |
+
+ Style-Com LLC
+ |
+
+
+ | Failover |
+
+ Not available
+ |
+
+
+ | Name |
+ Big-Empty |
+
+ | Topic |
+
+ Not available
+ |
+
+
+ | Description |
+
+ Хаб сети Arbital
+ |
+
+
+ | Category |
+
+ Not available
+ |
+
+
+ | Software |
+
+ PtokaX 0.5.3.0
+ |
+
+ | Owner |
+
+ Self
+ |
+
+
+ | Location |
+
+ Russian Federation
+ |
+
+
+ | Users |
+
+ 25 | 55
+ |
+
+
+ | Clones |
+ 0 |
+
+
+ | Share |
+
+ 4.39 TB | 90.60 TB
+ |
+
+
+ | User limit |
+ 10000 |
+
+
+ | Share limit |
+ 0 B |
+
+
+ | Slot limit |
+ 0 |
+
+
+ | Hub limit |
+ 0 |
+
+
+ | Reliability |
+ 99.04% |
+
+
+ | Checked |
+
+ 2024-12-09 03:06:01 | 2021-05-07
+ |
+
+
+ | Votes |
+
+ +0 | -0 | 0
+ |
+
+
+ | Website |
+
+ Not available
+ |
+
+
+ | Email |
+
+ Not available
+ |
+
+
+
+
+
Online users
+
+
+
+
+ | Nick |
+ Share |
+
+
+
+ | Darv1n |
+ 1.55 TB |
+
+
+
+ | PtokaX |
+ 0 B |
+
+
+
+ | 1975 |
+ 628.43 GB |
+
+
+
+ | AndyDesktop |
+ 0 B |
+
+
+
+ | Crtyujgfdscvgjh |
+ 35.54 GB |
+
+
+
+ | DaymarixZZZ |
+ 37.57 GB |
+
+
+
+ | Evgeniy_D |
+ 76.15 GB |
+
+
+
+ | Julia |
+ 0 B |
+
+
+
+ | Kuzma |
+ 0 B |
+
+
+
+ | Larsenv |
+ 0 B |
+
+
+
+ | MAXMED88888888 |
+ 64.10 GB |
+
+
+
+ | Qwerty_ytr_R724 |
+ 237.12 GB |
+
+
+
+ | SERG_B |
+ 149.65 GB |
+
+
+
+ | Sculli |
+ 156.92 GB |
+
+
+
+ | Shareaza4046 |
+ 13.03 GB |
+
+
+
+ | Soliton |
+ 14.68 GB |
+
+
+
+ | Sweaborg |
+ 794.15 GB |
+
+
+
+ | Viktor138283 |
+ 179.23 GB |
+
+
+
+ | [fly]Fire_dU3JR |
+ 10.72 GB |
+
+
+
+ | [fly]Monkey_QGrFy |
+ 124.72 GB |
+
+
+
+ | [fly]Moon_x7m |
+ 61.13 GB |
+
+
+
+ | kotbaun |
+ 0 B |
+
+
+
+ | marcs |
+ 3.62 GB |
+
+
+
+ | minili |
+ 59.30 GB |
+
+
+
+ | y2b4k698df328djei3 |
+ 261.82 GB |
+
+
+
+
+
+
+
Comments
+ There are no comments for this hub, you can
write one here.
+
+
+
+
+
+
+