diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py
index da5f4d92..4c51bda0 100644
--- a/llm_web_kit/extractor/html/extractor.py
+++ b/llm_web_kit/extractor/html/extractor.py
@@ -91,7 +91,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson:
base_url:str = data_json['url']
page_layout_type:str = data_json.get('page_layout_type', HTMLPageLayoutType.LAYOUT_ARTICLE) # 默认是文章类型
- main_html, method = self._extract_main_html(raw_html, base_url, page_layout_type)
+ main_html, method, title = self._extract_main_html(raw_html, base_url, page_layout_type)
parsed_html = [(main_html,raw_html)]
for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list,
self._extract_image,
@@ -100,10 +100,11 @@ def _do_extract(self, data_json: DataJson) -> DataJson:
content_list:ContentList = self._export_to_content_list(base_url, parsed_html, raw_html)
data_json['content_list'] = content_list
+ data_json['title'] = title
return data_json
- def _extract_main_html(self, raw_html:str, base_url:str, page_layout_type:str) -> Tuple[str, str]:
+ def _extract_main_html(self, raw_html:str, base_url:str, page_layout_type:str) -> Tuple[str, str, str]:
"""从html文本中提取主要的内容.
Args:
@@ -115,9 +116,8 @@ def _extract_main_html(self, raw_html:str, base_url:str, page_layout_type:str) -
str1: 主要的内容
str2: 获得内容的方式,可对质量进行评估
"""
- # TODO: 从html文本中提取主要的内容
dict_result = self.__magic_html_extractor.extract(raw_html, base_url=base_url, precision=False, html_type=page_layout_type)
- return dict_result['html'], dict_result['xp_num']
+ return dict_result['html'], dict_result['xp_num'], dict_result.get('title', '')
def _extract_code(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]:
"""从html文本中提取代码.