diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index 4c51bda0..e28f088b 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -2,6 +2,7 @@ from typing import List, Tuple import commentjson as json +from lxml.html import HtmlElement from overrides import override from llm_web_kit.config.cfg_reader import load_config @@ -20,6 +21,7 @@ from llm_web_kit.extractor.html.recognizer.title import TitleRecognizer from llm_web_kit.extractor.html.recognizer.video import VideoRecognizer from llm_web_kit.input.datajson import ContentList, DataJson +from llm_web_kit.libs.doc_element_type import DocElementType from llm_web_kit.libs.html_utils import element_to_html, html_to_element from llm_web_kit.libs.path_lib import get_py_pkg_root_dir @@ -92,12 +94,12 @@ def _do_extract(self, data_json: DataJson) -> DataJson: page_layout_type:str = data_json.get('page_layout_type', HTMLPageLayoutType.LAYOUT_ARTICLE) # 默认是文章类型 main_html, method, title = self._extract_main_html(raw_html, base_url, page_layout_type) - parsed_html = [(main_html,raw_html)] + main_html_element = html_to_element(main_html) + parsed_html = [(main_html_element, raw_html)] for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list, self._extract_image, self._extract_title, self._extract_paragraph]: parsed_html = extract_func(base_url, parsed_html, raw_html) - content_list:ContentList = self._export_to_content_list(base_url, parsed_html, raw_html) data_json['content_list'] = content_list data_json['title'] = title @@ -119,7 +121,7 @@ def _extract_main_html(self, raw_html:str, base_url:str, page_layout_type:str) - dict_result = self.__magic_html_extractor.extract(raw_html, base_url=base_url, precision=False, html_type=page_layout_type) return dict_result['html'], dict_result['xp_num'], dict_result.get('title', '') - def _extract_code(self, base_url:str, html_lst:List[Tuple[str,str]], 
raw_html:str) -> List[Tuple[str,str]]: + def _extract_code(self, base_url:str, html_lst:List[Tuple[HtmlElement, HtmlElement]], raw_html:str) -> List[Tuple[HtmlElement,HtmlElement]]: """从html文本中提取代码. Args: @@ -256,43 +258,43 @@ def __is_valid_node(self, node: dict) -> bool: if not node: raise HtmlFileExtractorException('node is empty') node_type = node.get('type') - valid_types = {'list', 'code', 'equation-interline', 'image', 'table', 'title', 'paragraph'} + valid_types = {DocElementType.TITLE, DocElementType.LIST, DocElementType.CODE, DocElementType.EQUATION_INTERLINE, DocElementType.IMAGE, DocElementType.SIMPLE_TABLE, DocElementType.COMPLEX_TABLE, DocElementType.PARAGRAPH} if node_type not in valid_types: raise HtmlFileExtractorException(f'Invalid node type: {node_type}') # 检查列表类型的节点 - if node.get('type') == 'list': + if node.get('type') == DocElementType.LIST: items = node.get('content', {}).get('items', []) # 过滤掉None、空列表,以及只包含None或空值的列表 return bool(items) and any( isinstance(item, (dict, list)) and bool(item) for item in items) # 检测code类型的节点 - if node.get('type') == 'code': + if node.get('type') == DocElementType.CODE: code_content = node.get('content', {}).get('code_content') # 如果代码内容为None或空字符串,则视为无效节点 return bool(code_content and code_content.strip()) # 检测行间公式类型的节点 - if node.get('type') == 'equation-interline': + if node.get('type') == DocElementType.EQUATION_INTERLINE: math_content = node.get('content', {}).get('math_content') # 如果公式内容为None或空字符串,则视为无效节点 return bool(math_content and math_content.strip()) # 检测image类型的节点 - if node.get('type') == 'image': + if node.get('type') == DocElementType.IMAGE: content = node.get('content', {}) # 检查url、path或data字段是否至少有一个不为空 return bool(content.get('url') or content.get('path') or content.get('data')) # 检测table类型的节点 - if node.get('type') == 'table': + if node.get('type') == DocElementType.SIMPLE_TABLE or node.get('type') == DocElementType.COMPLEX_TABLE: html = node.get('content', {}).get('html') # 
如果表格的html内容为None或空字符串,则视为无效节点 return bool(html and html.strip()) # 检测title类型的节点 - if node.get('type') == 'title': + if node.get('type') == DocElementType.TITLE: title_content = node.get('content', {}).get('title_content') # 如果标题内容为None或空字符串,则视为无效节点 return bool(title_content and title_content.strip()) # 检测段落类型的节点 - if node.get('type') == 'paragraph': + if node.get('type') == DocElementType.PARAGRAPH: content = node.get('content', []) # 检查content列表是否存在且不为空,并且至少有一个非空的内容项 return bool(content) and any( @@ -301,7 +303,7 @@ def __is_valid_node(self, node: dict) -> bool: ) return True - def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> ContentList: + def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[HtmlElement,HtmlElement]], raw_html:str) -> ContentList: """将解析结果存入content_list格式中. Args: @@ -318,7 +320,9 @@ def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[str,str]], r ccnode_html, cc_tag = self.__get_cc_node(parsed_html) parser:BaseHTMLElementRecognizer = self.__to_content_list_mapper.get(cc_tag) if parser: - node = parser.to_content_list_node(base_url, ccnode_html, raw_html) + raw_html_str = element_to_html(raw_html) + # raw_html_str = raw_html + node = parser.to_content_list_node(base_url, ccnode_html, raw_html_str) if node and self.__is_valid_node(node): one_page.append(node) else: @@ -326,7 +330,7 @@ def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[str,str]], r content_list = ContentList([one_page]) # 对于网页来说仅有一页,如果多页,则剩下的每个都是一个论坛的回复 return content_list - def __get_cc_node(self, html:str) -> (str, str): + def __get_cc_node(self, html:HtmlElement) -> Tuple[HtmlElement, str]: """获取html文本的根标签名。只获取一个,如果html文本中包含多个cc标签,则抛异常。 Args: @@ -335,7 +339,8 @@ def __get_cc_node(self, html:str) -> (str, str): Returns: str: 根标签名 """ - el = html_to_element(html) + # el = html_to_element(html) + el = html if el.tag in self.__to_content_list_mapper.keys(): return html, el.tag else: @@ -346,7 
+351,8 @@ def __get_cc_node(self, html:str) -> (str, str): raise HtmlFileExtractorException(f'html文本中没有cc标签: {html}') if len(nodes) > 3: raise HtmlFileExtractorException(f'html文本中包含多个cc标签: {html}') - return element_to_html(nodes[0]), nodes[0].tag + # return element_to_html(nodes[0]), nodes[0].tag + return nodes[0], nodes[0].tag def __build_extractor(self): """ diff --git a/llm_web_kit/extractor/html/recognizer/audio.py b/llm_web_kit/extractor/html/recognizer/audio.py index 24acc343..f9e74a7b 100644 --- a/llm_web_kit/extractor/html/recognizer/audio.py +++ b/llm_web_kit/extractor/html/recognizer/audio.py @@ -1,5 +1,6 @@ from typing import List, Tuple +from lxml.html import HtmlElement from overrides import override from llm_web_kit.extractor.html.recognizer.recognizer import \ @@ -9,7 +10,7 @@ class AudioRecognizer(BaseHTMLElementRecognizer): """解析音频元素.""" @override - def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: + def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement,HtmlElement]], raw_html:str) -> List[Tuple[HtmlElement,HtmlElement]]: """父类,解析音频元素. Args: @@ -22,5 +23,15 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html: raise NotImplementedError @override - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: + """ + 把音频元素转换为content list node. 
+ Args: + base_url: + parsed_content: + raw_html_segment: + + Returns: + + """ raise NotImplementedError diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/common.py b/llm_web_kit/extractor/html/recognizer/cc_math/common.py index 2cd91a19..08d3f9e2 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/common.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/common.py @@ -86,10 +86,10 @@ class MATH_TYPE_PATTERN: ['\\[', '\\]'], ['$$', '$$'], ['[tex]', '[/tex]'], # 这个网站自定义的分割,https://www.physicsforums.com/threads/turning-to-a-single-logarithm-then-simply.269419/ - ['\\begin{equation}', '\\end{equation}'], - ['\\begin{align}', '\\end{align}'], - ['\\begin{alignat}', '\\end{alignat}'], - ['\\begin{array}', '\\end{array}'], + # ['\\begin{equation}', '\\end{equation}'], + # ['\\begin{align}', '\\end{align}'], + # ['\\begin{alignat}', '\\end{alignat}'], + # ['\\begin{array}', '\\end{array}'], # 添加通用的begin/end匹配 ['\\begin{.*?}', '\\end{.*?}'], ], diff --git a/llm_web_kit/extractor/html/recognizer/cccode.py b/llm_web_kit/extractor/html/recognizer/cccode.py index 4a638fee..fe5744a7 100644 --- a/llm_web_kit/extractor/html/recognizer/cccode.py +++ b/llm_web_kit/extractor/html/recognizer/cccode.py @@ -7,19 +7,17 @@ tag_pre_code) from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) -from llm_web_kit.libs.html_utils import element_to_html, html_to_element class CodeRecognizer(BaseHTMLElementRecognizer): """解析代码元素.""" - @override def recognize( self, base_url: str, - main_html_lst: List[Tuple[str, str]], - raw_html: str, - ) -> List[Tuple[str, str]]: + main_html_lst: List[Tuple[HtmlElement, HtmlElement]], + raw_html: str + ) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析代码元素. Args: @@ -38,7 +36,8 @@ def recognize( if self.is_cc_html(html): rtn.append((html, raw_html)) continue - root: HtmlElement = html_to_element(html) + # root: HtmlElement = html_to_element(html) + root = html while True: # 最常见: #
@@ -77,31 +76,36 @@ def remove_empty_code(r: HtmlElement): remove_empty_code(x) remove_empty_code(root) - - html_str: str = element_to_html(root) - - rtn.extend(BaseHTMLElementRecognizer.html_split_by_tags(html_str, CCTag.CC_CODE)) - + # html_str: str = element_to_html(root) + rtn.extend(BaseHTMLElementRecognizer.html_split_by_tags(root, CCTag.CC_CODE)) return rtn @override - def to_content_list_node(self, base_url:str, parsed_content: str, raw_html_segment:str) -> dict: - code_node: HtmlElement = html_to_element(parsed_content) + def to_content_list_node(self, base_url:str, parsed_content: HtmlElement, raw_html_segment:str) -> dict: + """ + 把代码元素转换为content list node. + Args: + base_url: + parsed_content: HtmlElement对象 + raw_html_segment: + + Returns: + """ d = { 'type': 'code', # "bbox": [], 'raw_content': raw_html_segment, - 'inline': code_node.get('inline', 'false') == 'true', + 'inline': parsed_content.get('inline', 'false') == 'true', 'content': { - 'code_content': code_node.text, + 'code_content': parsed_content.text, }, } - if lang := code_node.get('language', None): + if lang := parsed_content.get('language', None): d['content']['language'] = lang - if by := code_node.get('by', None): + if by := parsed_content.get('by', None): d['content']['by'] = by return d diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py index 91ba602d..62701417 100644 --- a/llm_web_kit/extractor/html/recognizer/ccmath.py +++ b/llm_web_kit/extractor/html/recognizer/ccmath.py @@ -26,7 +26,7 @@ def __init__(self): self.cm = CCMATH() @override - def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_html: str) -> List[Tuple[str, str]]: + def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析数学公式元素. 
Args: @@ -56,7 +56,7 @@ def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_htm return result @override - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: """将content转换成content_list_node. 每种类型的html元素都有自己的content-list格式:参考 docs/specification/output_format/content_list_spec.md 例如代码的返回格式: @@ -78,7 +78,7 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm Returns: dict: content_list_node """ - tree = self._build_html_tree(parsed_content) + tree = parsed_content if tree is None: raise HtmlMathRecognizerException(f'Failed to load html: {parsed_content}') @@ -125,7 +125,7 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe """ # node是从cc_html中解析出来的lxml节点 self.cm.url = base_url - tree = self._build_html_tree(cc_html) + tree = cc_html math_render_type = math_render.get_render_type() if tree is None: raise HtmlMathRecognizerException(f'Failed to load html: {cc_html}') @@ -171,20 +171,20 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe # 保存处理后的html # with open('math_physicsforums_1_processed.html', 'w') as f: # f.write(self._element_to_html(tree)) - return self.html_split_by_tags(self._element_to_html(tree), [CCTag.CC_MATH_INTERLINE]) + return self.html_split_by_tags(tree, [CCTag.CC_MATH_INTERLINE]) - def process_mathjax_html(self, cc_html: str, o_html: str, math_render: BaseMathRender, base_url: str) -> List[Tuple[str, str]]: + def process_mathjax_html(self, cc_html: HtmlElement, o_html: HtmlElement, math_render: BaseMathRender, base_url: str) -> List[Tuple[HtmlElement, HtmlElement]]: """处理mathjax有自定义标识符的数学公式.""" self.cm.url = base_url try: - tree = self._build_html_tree(cc_html) + tree = cc_html math_render.find_math(tree) # with open('math_physicsforums_1_processed.html', 'w') as f: # 
f.write(self._element_to_html(tree)) except Exception as e: raise HtmlMathMathjaxRenderRecognizerException(f'处理mathjax有自定义标识符的数学公式失败: {e}') - return self.html_split_by_tags(self._element_to_html(tree), [CCTag.CC_MATH_INTERLINE]) + return self.html_split_by_tags(tree, [CCTag.CC_MATH_INTERLINE]) if __name__ == '__main__': diff --git a/llm_web_kit/extractor/html/recognizer/code/tag_code.py b/llm_web_kit/extractor/html/recognizer/code/tag_code.py index 98d9aa3f..760d312b 100644 --- a/llm_web_kit/extractor/html/recognizer/code/tag_code.py +++ b/llm_web_kit/extractor/html/recognizer/code/tag_code.py @@ -1,3 +1,5 @@ +from typing import Optional + from lxml.html import HtmlElement from llm_web_kit.extractor.html.recognizer.code.common import ( @@ -29,111 +31,6 @@ def __is_all_chars_in_code_element(node: HtmlElement) -> bool: return full_text == code_text -def __group_code_by_distance( - root: HtmlElement, - node_paths: list[list[str]], - dist: list[list[int]], -) -> list[str]: - father = list(range(len(node_paths))) - - def get_father(x: int) -> int: - if father[x] == x: - return x - father[x] = get_father(father[x]) - return father[x] - - edges: list[tuple[int, int, int]] = [] - root_paths: list[list[str]] = [] - for i in range(len(node_paths)): - root_paths.append(node_paths[i]) - for j in range(i + 1, len(node_paths)): - edges.append((dist[i][j], i, j)) - edges = sorted(edges) - - used_edge = 0 - meet = set() - for edge in edges: - _, i, j = edge - i = get_father(i) - j = get_father(j) - if i != j and (i, j) not in meet: - common_node_idx = min(len(root_paths[i]), len(root_paths[j])) - for idx, (x, y) in enumerate(zip(root_paths[i], root_paths[j])): - if idx == 0: - continue - if x != y: - common_node_idx = idx - break - maybe_tree_root = __get_html_element(root, root_paths[i][:common_node_idx]) - - if len(maybe_tree_root.xpath(f'.//{CCTag.CC_CODE}|.//{CCTag.CC_CODE_INLINE}')) > 0: - meet.add((i, j)) - continue - - if not __is_all_chars_in_code_element(maybe_tree_root): 
- meet.add((i, j)) - continue - - root_paths[i] = root_paths[i][:common_node_idx] - used_edge += 1 - father[j] = i - - root_paths = [ - root_path for i, root_path in enumerate(root_paths) if i == get_father(i) - ] - - removed = set() - root_paths_joined = sorted( - list(set(['/'.join(root_path) for root_path in root_paths])) - ) - for x in root_paths_joined: - for y in root_paths_joined: - if len(x) < len(y) and y.startswith(x): - removed.add(y) - return [x for x in root_paths_joined if x not in removed] - - -def __compute_distance_matrix(node_paths: list[list[str]]) -> list[list[int]]: - """ - 计算节点路径的距离矩阵,具体步骤: - 1. 创建距离矩阵,计算每两个节点之间的距离 - 2. 距离计算方法:从共同祖先节点到两个节点的路径长度之和 - 例如: - 节点1路径:/html/body/div/code - 节点2路径:/html/body/pre/code - 共同祖先到 body,距离为 2(div->code) + 2(pre->code) = 4 - 节点1和节点2的距离为 4 - - 距离矩阵(对称矩阵): - [0, 1, 2, 3], - [1, 0, 1, 2], - [2, 1, 0, 1], - [3, 2, 1, 0] - - Args: - node_paths: 节点路径 - - Returns: - list[list[int]]: 距离矩阵 - """ - def get_lca_depth(path1: list[str], path2: list[str]) -> int: - for i, (x, y) in enumerate(zip(path1, path2)): - if x != y: - return i - return min(len(path1), len(path2)) - - n = len(node_paths) - dist = [[0] * n for _ in range(n)] - - for i in range(n): - for j in range(i + 1, n): - lca_depth = get_lca_depth(node_paths[i], node_paths[j]) - d = len(node_paths[i]) + len(node_paths[j]) - 2 * lca_depth - dist[i][j] = dist[j][i] = d - - return dist - - def __get_code_node_paths(html_el: HtmlElement) -> list[list[str]]: """获取 html_el 中所有 code 标签的路径 只获取最外层的code标签, 如果code标签内还有code标签,则不获取。 @@ -223,6 +120,49 @@ def __detect_inline_code(root: HtmlElement, node_paths: list[list[str]]) -> tupl return new_node_paths, inline_code +def __group_code(root: HtmlElement, node_paths: list[list[str]]) -> list[str]: + root_paths = [] + + def next_parent(code_node: HtmlElement, code_tags: int) -> tuple[Optional[HtmlElement], int]: + parent: Optional[HtmlElement] = code_node.getparent() + while parent is not None: + new_code_tags = 
len(parent.xpath('.//code')) + if new_code_tags == code_tags: + parent = parent.getparent() + else: + return parent, new_code_tags + return None, 0 + + while len(node_paths): + code_node = __get_html_element(root, node_paths[0]) + code_tags = len(code_node.xpath('.//code')) + + parent, new_code_tags = next_parent(code_node, code_tags) + while parent is not None: + if not __is_all_chars_in_code_element(parent): + break + + if len(parent.xpath(f'.//{CCTag.CC_CODE}|.//{CCTag.CC_CODE_INLINE}')) > 0: + break + + code_node = parent + code_tags = new_code_tags + + parent, new_code_tags = next_parent(code_node, code_tags) + + root_path = code_node.getroottree().getpath(code_node) + root_paths.append(root_path) + + new_node_path = [] + for node_path in node_paths: + if '/'.join(node_path).startswith(root_path): + continue + new_node_path.append(node_path) + node_paths = new_node_path + + return root_paths + + def modify_tree(root: HtmlElement) -> None: """将 html 树中所有 code 标签转换为代码块. @@ -239,8 +179,8 @@ def modify_tree(root: HtmlElement) -> None: elif len(node_paths) == 1: tree_roots = ['/'.join(node_paths[0])] else: - dist_matrix = __compute_distance_matrix(node_paths) # 计算距离矩阵 - tree_roots = __group_code_by_distance(root, node_paths, dist_matrix) # 根据距离矩阵,对code标签进行分组 + tree_roots = __group_code(root, node_paths) # 根据距离矩阵,对code标签进行分组 + tree_roots = sorted(tree_roots) nodes = __get_code_blocks_nodes(root, tree_roots) # 获取所有需要被转换为代码块的节点,并进行标签替换 for node in nodes: diff --git a/llm_web_kit/extractor/html/recognizer/image.py b/llm_web_kit/extractor/html/recognizer/image.py index 7be5b862..af362110 100644 --- a/llm_web_kit/extractor/html/recognizer/image.py +++ b/llm_web_kit/extractor/html/recognizer/image.py @@ -18,7 +18,7 @@ class ImageRecognizer(BaseHTMLElementRecognizer): IMG_LABEL = ['.jpg', '.jpeg', '.png', '.gft', '.webp', '.bmp', '.svg', 'data:image', '.gif'] # '.pdf' @override - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> 
dict: + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: """将content转换成content_list_node. 每种类型的html元素都有自己的content-list格式:参考 docs/specification/output_format/content_list_spec.md 例如代码的返回格式: @@ -43,7 +43,8 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm Returns: dict: content_list_node """ - html_obj = self._build_html_tree(parsed_content) + # html_obj = self._build_html_tree(parsed_content) + html_obj = parsed_content if html_obj.tag == CCTag.CC_IMAGE: return self.__ccimg_to_content_list(raw_html_segment, html_obj) @@ -66,7 +67,7 @@ def __ccimg_to_content_list(self, raw_html_segment: str, html_obj: HtmlElement) return result @override - def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_html: str) -> List[Tuple[str, str]]: + def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析图片元素. 
Args: @@ -88,9 +89,10 @@ def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_htm ccimg_html.append(html_li) return ccimg_html - def __parse_html_img(self, base_url: str, html_str: Tuple[str, str]) -> List[Tuple[str, str]]: + def __parse_html_img(self, base_url: str, html_str: Tuple[HtmlElement, HtmlElement]) -> List[Tuple[HtmlElement, HtmlElement]]: """解析html,获取img标签.""" - html_obj = self._build_html_tree(html_str[0]) + # html_obj = self._build_html_tree(html_str[0]) + html_obj = html_str[0] image_related_selectors = [ '//*[contains(@class, "image-embed") or contains(@id, "image-embed")]', # 可能包含嵌入图片的自定义标签 '//*[starts-with(@src, "data:image/") and not(self::img)]', @@ -168,7 +170,8 @@ def __parse_img_elements(self, base_url: str, img_elements: HtmlElement, html_ob self._replace_element(elem, new_ccimage) if is_valid_img: - updated_html = self._element_to_html(html_obj) + # updated_html = self._element_to_html(html_obj) + updated_html = html_obj return (updated_html, img_tag) else: return (None, None) diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index 7694ba1a..2615f60e 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -1,7 +1,7 @@ import json from typing import Any, List, Tuple -from lxml.etree import _Element as HtmlElement +from lxml.html import HtmlElement from overrides import override from llm_web_kit.exception.exception import HtmlListRecognizerException @@ -13,7 +13,7 @@ class ListRecognizer(BaseHTMLElementRecognizer): """解析列表元素.""" - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: """专化为列表元素的解析. 
Args: @@ -23,6 +23,8 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm Returns: """ + if not isinstance(parsed_content, HtmlElement): + raise HtmlListRecognizerException(f'parsed_content 必须是 HtmlElement 类型,而不是 {type(parsed_content)}') ordered, content_list, _, list_nest_level = self.__get_attribute(parsed_content) ele_node = { 'type': DocElementType.LIST, @@ -37,7 +39,7 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm return ele_node @override - def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_html: str) -> List[Tuple[str, str]]: + def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析列表元素. Args: @@ -57,7 +59,7 @@ def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_htm new_html_lst.extend(lst) return new_html_lst - def _extract_list(self, raw_html: str) -> List[Tuple[str, str]]: + def _extract_list(self, raw_html: HtmlElement) -> List[Tuple[HtmlElement, HtmlElement]]: """提取列表元素. 不支持嵌套列表,如果有嵌套的情况,则内部列表将作为一个单独的段落,内部列表的每个列表项作为一个单独的句子,使用句号结尾。 列表在html中有以下几个标签: @@ -70,12 +72,13 @@ def _extract_list(self, raw_html: str) -> List[Tuple[str, str]]: Returns: List[Tuple[str, str]]: 列表元素, 第一个str是xxx, 第二个str是原始的html内容 """ - tree = self._build_html_tree(raw_html) + # tree = self._build_html_tree(raw_html) + tree = raw_html self.__do_extract_list(tree) # 最后切割html - new_html = self._element_to_html(tree) + # new_html = self._element_to_html(tree) + new_html = tree lst = self.html_split_by_tags(new_html, CCTag.CC_LIST) - return lst def __do_extract_list(self, root:HtmlElement) -> None: @@ -219,7 +222,7 @@ def __extract_list_item_text_recusive(el: HtmlElement) -> list[list]: return text_paragraph - def __get_attribute(self, html:str) -> Tuple[bool, dict, str]: + def __get_attribute(self, html:HtmlElement) -> Tuple[bool, dict, str]: """获取element的属性. 
Args: @@ -228,7 +231,8 @@ def __get_attribute(self, html:str) -> Tuple[bool, dict, str]: Returns: Tuple[str]: 第一个元素是是否有序; 第二个元素是个python list,内部是文本和行内公式,具体格式参考list的content_list定义。第三个元素是列表原始的html内容 """ - ele = self._build_html_tree(html) + # ele = self._build_html_tree(html) + ele = html if ele is not None and ele.tag == CCTag.CC_LIST: ordered = ele.attrib.get('ordered', 'False') in ['True', 'true'] content_list = json.loads(ele.text) diff --git a/llm_web_kit/extractor/html/recognizer/recognizer.py b/llm_web_kit/extractor/html/recognizer/recognizer.py index 736b3637..6ab1a5ef 100644 --- a/llm_web_kit/extractor/html/recognizer/recognizer.py +++ b/llm_web_kit/extractor/html/recognizer/recognizer.py @@ -29,7 +29,7 @@ class BaseHTMLElementRecognizer(ABC): """基本的元素解析类.""" @abstractmethod - def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: + def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html:str) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析html中的元素. Args: @@ -38,11 +38,12 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html: raw_html: 原始完整的html Returns: + List[Tuple[HtmlElement, HtmlElement]]: 处理后的HTML元素列表 """ raise NotImplementedError @abstractmethod - def to_content_list_node(self, base_url:str, parsed_content: str, raw_html_segment:str) -> dict: + def to_content_list_node(self, base_url:str, parsed_content: HtmlElement, raw_html_segment:str) -> dict: """将content转换成content_list_node. 
每种类型的html元素都有自己的content-list格式:参考 docs/specification/output_format/content_list_spec.md 例如代码的返回格式: @@ -119,7 +120,7 @@ def _replace_element(self, element:HtmlElement, cc_element:HtmlElement) -> None: replace_element(element, cc_element) @staticmethod - def html_split_by_tags(html_segment: str, split_tag_names:str | list) -> List[Tuple[str,str]]: + def html_split_by_tags(root: HtmlElement, split_tag_names:str | list) -> List[Tuple[HtmlElement,HtmlElement]]: """根据split_tag_name将html分割成不同的部分. Args: @@ -127,7 +128,7 @@ def html_split_by_tags(html_segment: str, split_tag_names:str | list) -> List[Tu split_tag_names: str|list: 分割标签名, 例如 'p' 或者 'div' 或者 ['p', 'div'] """ copy_attri = True # 是否copy 父节点的属性 - root = html_to_element(html_segment) + # root = html_to_element(html_segment) if isinstance(split_tag_names, str): # 如果参数是str,转换成list split_tag_names = [split_tag_names] @@ -184,7 +185,8 @@ def __split_node(elem: HtmlElement): for sub_elem in elem: if sub_elem.tag in split_tag_names: # previous elements - nodes = raw_nodes = element_to_html(path[0]) + # nodes = raw_nodes = element_to_html(path[0]) + nodes = raw_nodes = path[0] if not __is_element_text_empty(path[0]): yield nodes, raw_nodes @@ -196,7 +198,11 @@ def __split_node(elem: HtmlElement): if not html_source_segment: mylogger.error(f'{sub_elem.tag} has no html attribute') # TODO raise exception - nodes, raw_nodes = element_to_html(path[0]), html_source_segment + # nodes, raw_nodes = element_to_html(path[0]), html_source_segment + if html_source_segment: + nodes, raw_nodes = path[0], html_to_element(html_source_segment) + else: + nodes, raw_nodes = path[0], None # if not __is_element_text_empty(path[0]): yield nodes, raw_nodes # 这个地方无需检查是否为空,因为这个是分割元素,必须返还 @@ -213,7 +219,8 @@ def __split_node(elem: HtmlElement): copied.tail = elem.tail if not path: - nodes = raw_nodes = element_to_html(copied) + nodes = raw_nodes = copied + # raw_nodes = element_to_html(copied) if not __is_element_text_empty(copied): yield nodes, 
raw_nodes @@ -221,30 +228,38 @@ def __split_node(elem: HtmlElement): return rtn @staticmethod - def is_cc_html(html: str, tag_name: str | list = None) -> bool: + def is_cc_html(el: HtmlElement, tag_name: str | list = None) -> bool: """判断html片段是否是cc标签. 判断的时候由于自定义ccmath等标签可能会含有父标签,因此要逐层判断tagname. 含有父html 完整路径的如:...,这种情况也会被识别为cc标签. - TODO 保证进来的cc标签没有父标签,只有一个根标签。 + Args: - html: str: html片段 + el: str|HtmlElement: html片段或HtmlElement对象 tag_name: str|list: cc标签,如ccmath, cccode, 如果指定了那么就只检查这几个标签是否在html里,否则检查所有cc标签 """ - # cc标签是指自定义标签,例如等,输入html片段,判断是否是cc标签 - el = html_to_element(html) if el is None: return False + # 默认cc标签列表 + default_tag_names = [ + CCTag.CC_CODE, CCTag.CC_MATH_INTERLINE, CCTag.CC_IMAGE, CCTag.CC_VIDEO, + CCTag.CC_AUDIO, CCTag.CC_TABLE, CCTag.CC_LIST, CCTag.CC_TEXT, CCTag.CC_TITLE + ] + + # 确定需要检查的标签集合 if tag_name: if isinstance(tag_name, str): - tag_to_check = [tag_name] + tags = {tag_name} else: - tag_to_check = tag_name + tags = set(tag_name) else: - tag_to_check = [CCTag.CC_CODE, CCTag.CC_MATH_INTERLINE, CCTag.CC_IMAGE, CCTag.CC_VIDEO, CCTag.CC_AUDIO, CCTag.CC_TABLE, CCTag.CC_LIST, CCTag.CC_TEXT, CCTag.CC_TITLE] + tags = set(default_tag_names) + + # 如果当前元素的标签匹配,直接返回True + if el.tag in tags: + return True - for tag in tag_to_check: - if el.tag == tag or el.xpath(f'.//{tag}') : - return True - return False + # 构建XPath表达式,检查子元素是否包含目标标签 + xpath_expr = ' or '.join([f'self::{tag}' for tag in tags]) + return bool(el.xpath(f'.//*[{xpath_expr}]')) diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index db9351a0..3effe18a 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -9,6 +9,7 @@ from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) from llm_web_kit.libs.doc_element_type import DocElementType +from llm_web_kit.libs.html_utils import remove_element class 
TableRecognizer(BaseHTMLElementRecognizer): @@ -16,12 +17,13 @@ class TableRecognizer(BaseHTMLElementRecognizer): def __init__(self): super().__init__() + self.math_recognizer = MathRecognizer() @override def recognize(self, base_url: str, - main_html_lst: List[Tuple[str, str]], - raw_html: str) -> List[Tuple[str, str]]: + main_html_lst: List[Tuple[HtmlElement, HtmlElement]], + raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析表格元素. Args: @@ -30,6 +32,7 @@ def recognize(self, raw_html: 原始完整的html Returns: + List[Tuple[HtmlElement, HtmlElement]]: 处理后的HTML元素列表 """ final_result = list() for cc_html, o_html in main_html_lst: @@ -41,66 +44,59 @@ def recognize(self, return final_result @override - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: - if not parsed_content: - raise HtmlTableRecognizerException(f'table parsed_content{parsed_content}为空') + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: + if not isinstance(parsed_content, HtmlElement): + raise HtmlTableRecognizerException(f'parsed_content 必须是 HtmlElement 类型,而不是 {type(parsed_content)}') + table_type, table_nest_level, table_body = self.__get_attribute(parsed_content) + + # 确保 table_body 不为 None 且是字符串类型 + html_content = table_body if table_body is not None else '' + # 使用传入的 raw_html_segment 或将 parsed_content 转换为字符串 + if table_type: + cc_table_type = DocElementType.COMPLEX_TABLE + else: + cc_table_type = DocElementType.SIMPLE_TABLE d = { - 'type': DocElementType.TABLE, - # "bbox": [], + 'type': cc_table_type, 'raw_content': raw_html_segment, 'content': { - 'html': table_body, - }, + 'html': html_content, + 'is_complex': table_type, + 'table_nest_level': table_nest_level + } } - d['content']['is_complex'] = table_type - d['content']['table_nest_level'] = table_nest_level return d - def __is_contain_cc_html(self, cc_html: str) -> bool: + def __is_contain_cc_html(self, cc_html: HtmlElement) -> 
bool: """判断html片段是否是cc标签.""" return BaseHTMLElementRecognizer.is_cc_html(cc_html) - def __is_table_empty(self, table) -> bool: - """检查表格是否为空(递归检查嵌套元素) - - :param table: lxml.html.HtmlElement 对象,表示一个 元素 - :return: 如果表格为空,返回 True;否则返回 False - """ - def is_element_empty(elem): - # 检查元素本身的文本内容 - if elem.text and elem.text.strip(): - return False - # 检查所有子元素 - for child in elem.iterchildren(): - # 如果是嵌套表格,递归检查表格是否为空 - if child.tag == 'table': - if not self.__is_table_empty(child): - return False - # 其他元素需要递归检查 - elif not is_element_empty(child): - return False - # 检查尾部文本(如 后的文本) - if elem.tail and elem.tail.strip(): - return False - return True - - # 检查所有单元格 - for cell in table.xpath('.//td | .//th'): - # 检查单元格内容 + def __is_table_empty(self, table: HtmlElement) -> bool: + """table是否为空.""" + # 合并单元格查询 + cells = table.xpath('.//td | .//th') + for cell in cells: if cell.text and cell.text.strip(): return False - # 递归检查子元素 - if not is_element_empty(cell): - return False + stack = [cell] + while stack: + elem = stack.pop() + if elem.text and elem.text.strip(): + return False + if elem.tail and elem.tail.strip(): + return False + # 添加子元素到栈中(倒序保证处理顺序) + stack.extend(reversed(elem.getchildren())) return True - def __is_simple_table(self, tree) -> bool: + def __is_simple_table(self, tree: HtmlElement) -> bool: """处理table元素,判断是是否复杂:是否包含合并单元格.""" - cells = tree.xpath('.//td') + tree.xpath('.//th') + print('tree', self._element_to_html(tree)) + cells = tree.xpath('.//td | .//th') for cell in cells: - colspan_str = cell.get('colspan', '1') - rowspan_str = cell.get('rowspan', '1') + colspan_str = cell.get('colspan', '1').strip('"\'\\') + rowspan_str = cell.get('rowspan', '1').strip('"\'\\') try: colspan = int(colspan_str) rowspan = int(rowspan_str) @@ -111,34 +107,44 @@ def __is_simple_table(self, tree) -> bool: return False return True - def __is_table_nested(self, element) -> int: - """计算表格的嵌套层级(非表格返回0,根据原始table判断的.""" + def __is_table_nested(self, element: HtmlElement) -> int: + 
"""计算表格的嵌套层级.""" if element.tag != 'table': return 0 - # 获取当前表格下所有的表格(包括自身) - all_tables = [element] + element.xpath('.//table') - max_level = 1 # 初始层级为1(当前表格) - # 计算每个表格的层级,取最大值 - for table in all_tables: - ancestor_count = len(table.xpath('ancestor::table')) - level = ancestor_count + 1 - max_level = max(max_level, level) + + # 初始化栈结构:存储(当前元素, 当前层级) + stack = [(element, 1)] + max_level = 1 + + # 深度优先遍历 + while stack: + current, current_level = stack.pop() + # 更新最大层级 + max_level = max(max_level, current_level) + # 遍历子元素(倒序保证处理顺序) + for child in reversed(current.getchildren()): + if child.tag == 'table': + # 遇到子表格时层级+1 + stack.append((child, current_level + 1)) + else: + # 非表格元素保持当前层级 + stack.append((child, current_level)) return max_level - def __extract_tables(self, ele: str) -> List[Tuple[str, str]]: + def __extract_tables(self, tree: HtmlElement) -> List[Tuple[HtmlElement, HtmlElement]]: """提取html中的table元素.""" - tree = self._build_html_tree(ele) self.__do_extract_tables(tree) - new_html = self._element_to_html(tree) + new_html = tree lst = self.html_split_by_tags(new_html, CCTag.CC_TABLE) return lst def __get_table_type(self, child: HtmlElement) -> str: """获取table的类型.""" + assert isinstance(child, HtmlElement) empty_flag = self.__is_table_empty(child) - level = self.__is_table_nested(child) if empty_flag: return 'empty' + level = self.__is_table_nested(child) # 是否跨行跨列 flag = (self.__is_simple_table(child) and level < 2) if flag: @@ -149,16 +155,16 @@ def __get_table_type(self, child: HtmlElement) -> str: def __check_table_include_math_code(self, raw_html: HtmlElement): """检查table中的内容,包括普通文本、数学公式和代码.""" - math_html = self._element_to_html(raw_html) - math_recognizer = MathRecognizer() - math_res_parts = math_recognizer.recognize( + math_raw_html = self._element_to_html(raw_html) + math_html = raw_html + math_res_parts = self.math_recognizer.recognize( base_url='', main_html_lst=[(math_html, math_html)], - raw_html=math_html + raw_html=math_raw_html ) result = [] 
for math_item in math_res_parts: - ele_item = self._build_html_tree(math_item[0]) + ele_item = math_item[0] def process_node(node): """处理行内公式、行间公式、行间代码、行内代码.""" @@ -216,16 +222,16 @@ def __simplify_td_th_content(self, table_nest_level, elem: HtmlElement) -> None: # 处理非表格元素 math_res = self.__check_table_include_math_code(child) parse_res.extend(math_res) - elem.remove(child) + remove_element(child) # 将非表格内容拼接后放在表格前面 if parse_res: - elem.text = ' '.join(parse_res) + (elem.text or '') + elem.text = ' '.join(parse_res) else: # 没有嵌套表格,直接简化 math_res = self.__check_table_include_math_code(elem) parse_res.extend(math_res) for item in list(elem.iterchildren()): - elem.remove(item) + remove_element(item) if parse_res: elem.text = ' '.join(parse_res) return @@ -244,7 +250,7 @@ def __get_table_body(self, table_type, table_nest_level, table_root): table_root.attrib.clear() table_root.attrib.update(cleaned_attrs) # text进行strip操作,tail保留(部分内容留在tail中) - for elem in chain([table_root], table_root.iterdescendants()): + for elem in chain([table_root], table_root.iterchildren()): if elem.text is not None: elem.text = elem.text.strip().replace('\\n', '') if elem.tail is not None: @@ -273,9 +279,9 @@ def __do_extract_tables(self, root: HtmlElement) -> None: for child in root.iterchildren(): self.__do_extract_tables(child) - def __get_attribute(self, html: str) -> Tuple[bool, Any, Any]: + def __get_attribute(self, ele: HtmlElement) -> Tuple[bool, Any, Any]: """获取element的属性.""" - ele = self._build_html_tree(html) + # ele = self._build_html_tree(html) if ele is not None and ele.tag == CCTag.CC_TABLE: table_type = ele.attrib.get('table_type') table_nest_level = ele.attrib.get('table_nest_level') @@ -283,7 +289,7 @@ def __get_attribute(self, html: str) -> Tuple[bool, Any, Any]: table_body = ele.text return table_flag, table_nest_level, table_body else: - raise HtmlTableRecognizerException(f'{html}中没有cctable标签') + raise HtmlTableRecognizerException(f'{ele}中没有cctable标签') def 
__get_content_list_table_type(self, table_type): """complex|simple 转为True|False.""" diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py index f39ffb19..a47a393d 100644 --- a/llm_web_kit/extractor/html/recognizer/text.py +++ b/llm_web_kit/extractor/html/recognizer/text.py @@ -2,14 +2,14 @@ import string from typing import List, Tuple -from lxml import etree +from lxml import html from lxml.html import HtmlElement from overrides import override from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType -from llm_web_kit.libs.html_utils import element_to_html +from llm_web_kit.libs.html_utils import element_to_html, html_to_element special_symbols = [ # TODO 从文件读取 '®', # 注册商标符号 @@ -42,27 +42,28 @@ class TextParagraphRecognizer(BaseHTMLElementRecognizer): """解析文本段落元素.""" @override - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: """ 把文本段落元素转换为content list node. Args: base_url: - parsed_content: + parsed_content: 可能是字符串或HtmlElement对象 raw_html_segment: Returns: """ - el = self._build_html_tree(parsed_content) + # 如果是字符串则转换为HtmlElement,否则直接使用 + el = parsed_content node = { 'type': DocElementType.PARAGRAPH, - 'raw_content': el.attrib.get('html', ''), + 'raw_content': raw_html_segment, 'content': json.loads(el.text), } return node @override - def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: + def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement | str, HtmlElement | str]], raw_html:str) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析文本段落元素. 
Args: @@ -73,31 +74,32 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html: Returns: """ new_html_lst = [] - for html, raw_html in main_html_lst: - if self.is_cc_html(html): - new_html_lst.append((html, raw_html)) + for html_element, raw_html_element in main_html_lst: + # 如果是字符串则转换为 HtmlElement + if self.is_cc_html(html_element): + new_html_lst.append((html_element, raw_html_element)) else: - root_el = self._build_html_tree(html) - lst = list(self.__extract_paragraphs(root_el)) - # 然后对lst[Element, raw_html] 进行处理. 提出Element里的文字,做成<>标签 + lst = list(self.__extract_paragraphs(html_element)) new_lst = self.__to_cctext_lst(lst) new_html_lst.extend(new_lst) return new_html_lst - def __to_cctext_lst(self, lst: List[Tuple[HtmlElement, str]]) -> List[Tuple[str, str]]: + def __to_cctext_lst(self, lst: List[Tuple[HtmlElement | str, HtmlElement | str]]) -> List[Tuple[HtmlElement, HtmlElement]]: """将lst[Element, raw_html] 进行处理. 提出Element里的文字,做成<>标签. Args: - lst: List[Tuple[HtmlElement, str]]: Element和raw_html组成的列表 + lst: List[Tuple[HtmlElement | str, HtmlElement | str]]: Element和raw_html组成的列表 """ new_lst = [] for el, raw_html in lst: - para_text = self.__get_paragraph_text(el) - if para_text: - cctext_el = self._build_cc_element(CCTag.CC_TEXT, json.dumps(para_text, ensure_ascii=False, indent=4), '', html=raw_html) - cc_node_html = self._element_to_html(cctext_el) - new_lst.append((cc_node_html, raw_html)) + # 如果是字符串则转换为 HtmlElement + el_element = html_to_element(el) if isinstance(el, str) else el + raw_html_element = html_to_element(raw_html) if isinstance(raw_html, str) else raw_html + para_text = self.__get_paragraph_text(el_element) + if para_text: + cctext_el = self._build_cc_element(CCTag.CC_TEXT, json.dumps(para_text, ensure_ascii=False, indent=4), '', html=element_to_html(raw_html_element)) + new_lst.append((cctext_el, raw_html_element)) return new_lst def __combine_text(self, text1:str, text2:str, lang='en') -> str: @@ -172,7 +174,7 @@ def 
__extract_paragraphs(self, root: HtmlElement): 解析后的文本段落元素 """ path: List[HtmlElement] = [] - parser = etree.HTMLParser(collect_ids=False, encoding='utf-8', remove_comments=True, remove_pis=True) + parser = html.HTMLParser(collect_ids=False, encoding='utf-8', remove_comments=True, remove_pis=True) def is_contain_readable_text(text): return text.strip() if text else text @@ -223,12 +225,18 @@ def helper(elem: HtmlElement): path[-1].append(copied) path.append(copied) + # elem直接有text,则直接添加返回 + if has_direct_text(elem): + rebuild_path() + path[-1].append(copy_helper(elem)) + yield path[0], path[0] + rebuild_path() for sub_elem in elem: if has_direct_text(sub_elem) or (sub_elem.tag == 'p' and has_text(sub_elem)): rebuild_path() path[-1].append(copy_helper(sub_elem)) - yield path[0], element_to_html(path[0]) - + # yield path[0], element_to_html(path[0]) + yield path[0], path[0] # detach the yielded tree rebuild_path() continue diff --git a/llm_web_kit/extractor/html/recognizer/title.py b/llm_web_kit/extractor/html/recognizer/title.py index 8f2043db..9b18bdb4 100644 --- a/llm_web_kit/extractor/html/recognizer/title.py +++ b/llm_web_kit/extractor/html/recognizer/title.py @@ -1,6 +1,7 @@ from typing import List, Tuple -from lxml.etree import _Element as HtmlElement +# from lxml.etree import _Element as HtmlElement +from lxml.html import HtmlElement from overrides import override from llm_web_kit.extractor.html.recognizer.recognizer import ( @@ -12,7 +13,7 @@ class TitleRecognizer(BaseHTMLElementRecognizer): """解析多级标题元素.""" @override - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: """将html转换成content_list_node. 
Args: @@ -37,8 +38,8 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm return cctitle_content_node @override - def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: - """父类,解析多级标题元素. + def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]: + """父类,解析标题元素. Args: base_url: str: 基础url @@ -46,9 +47,12 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html: raw_html: 原始完整的html Returns: + List[Tuple[HtmlElement, HtmlElement]]: 处理后的HTML元素列表 """ new_html_lst = [] for html, raw_html in main_html_lst: + if isinstance(html, str): + html = self._build_html_tree(html) if self.is_cc_html(html): new_html_lst.append((html, raw_html)) else: @@ -56,22 +60,19 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html: new_html_lst.extend(lst) return new_html_lst - def _extract_title(self, raw_html:str) -> List[Tuple[str,str]]: - """ - 提取多级标题元素 + def _extract_title(self, raw_html: HtmlElement) -> List[Tuple[HtmlElement, HtmlElement]]: + """提取多级标题元素 Args: - raw_html: + raw_html: HtmlElement对象 Returns: - List[Tuple[str,str]]: 多级标题元素, 第一个str是xxx, 第二个str是原始的html内容 - + List[Tuple[HtmlElement, HtmlElement]]: 多级标题元素列表 """ - tree = self._build_html_tree(raw_html) - self.__do_extract_title(tree) # 遍历这个tree, 找到所有h1, h2, h3, h4, h5, h6标签, 并得到其对应的原始的html片段 + tree = raw_html + self.__do_extract_title(tree) # 遍历这个tree, 找到所有h1, h2, h3, h4, h5, h6标签 # 最后切割html - new_html = self._element_to_html(tree) + new_html = tree lst = self.html_split_by_tags(new_html, CCTag.CC_TITLE) - return lst def __do_extract_title(self, root:HtmlElement) -> None: @@ -137,9 +138,10 @@ def __extract_title_text_recusive(el: HtmlElement, with_tail: bool = True) -> li return ' '.join(blk for blk in blks if blk) - def __get_attribute(self, html:str) -> Tuple[int, str]: + def __get_attribute(self, 
html:HtmlElement) -> Tuple[int, str]: """获取element的属性.""" - ele = self._build_html_tree(html) + # ele = self._build_html_tree(html) + ele = html # 找到cctitle标签 if ele is not None: level = ele.attrib.get('level') diff --git a/llm_web_kit/extractor/html/recognizer/video.py b/llm_web_kit/extractor/html/recognizer/video.py index 227736a1..bed7df5a 100644 --- a/llm_web_kit/extractor/html/recognizer/video.py +++ b/llm_web_kit/extractor/html/recognizer/video.py @@ -1,5 +1,6 @@ from typing import List, Tuple +from lxml.html import HtmlElement from overrides import override from llm_web_kit.extractor.html.recognizer.recognizer import \ @@ -9,7 +10,7 @@ class VideoRecognizer(BaseHTMLElementRecognizer): """解析视元素.""" @override - def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: + def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement,HtmlElement]], raw_html:str) -> List[Tuple[HtmlElement,HtmlElement]]: """父类,解析视频元素. Args: @@ -22,5 +23,5 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html: raise NotImplementedError @override - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: raise NotImplementedError diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py index 49b36d0a..26246b58 100644 --- a/llm_web_kit/input/datajson.py +++ b/llm_web_kit/input/datajson.py @@ -5,8 +5,10 @@ from overrides import override +from llm_web_kit.exception.exception import ExtractorChainInputException from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType -from llm_web_kit.libs.html_utils import (get_element_text, html_to_element, +from llm_web_kit.libs.html_utils import (element_to_html, get_element_text, + html_to_element, html_to_markdown_table, table_cells_count) @@ -51,11 +53,13 @@ def 
__init__(self): self.__list_item_start = '-' # md里的列表项前缀 self.__list_para_prefix = ' ' # 两个空格,md里的列表项非第一个段落的前缀:如果多个段落的情况,第二个以及之后的段落前缀 self.__md_special_chars = ['#', '`', ] # TODO: 先去掉$,会影响行内公式,后面再处理 + self.__nodes_document_type = [DocElementType.MM_NODE_LIST, DocElementType.PARAGRAPH, DocElementType.LIST, DocElementType.SIMPLE_TABLE, DocElementType.COMPLEX_TABLE, DocElementType.TITLE, DocElementType.IMAGE, DocElementType.AUDIO, DocElementType.VIDEO, DocElementType.CODE, DocElementType.EQUATION_INTERLINE] + self.__inline_types_document_type = [ParagraphTextType.EQUATION_INLINE, ParagraphTextType.CODE_INLINE] def to_html(self): raise NotImplementedError('This method must be implemented by the subclass.') - def to_txt(self, exclude_nodes=DocElementType.MM_NODE_LIST): + def to_txt(self, exclude_nodes=DocElementType.MM_NODE_LIST, exclude_inline_types=[]): """把content_list转化为txt格式. Args: @@ -68,7 +72,7 @@ def to_txt(self, exclude_nodes=DocElementType.MM_NODE_LIST): for page in content_lst: for content_lst_node in page: if content_lst_node['type'] not in exclude_nodes: - txt_content = self.__content_lst_node_2_txt(content_lst_node) + txt_content = self.__content_lst_node_2_txt(content_lst_node, exclude_inline_types) if txt_content and len(txt_content) > 0: text_blocks.append(txt_content) @@ -76,7 +80,7 @@ def to_txt(self, exclude_nodes=DocElementType.MM_NODE_LIST): txt = txt.strip() + self.__text_end # 加上结尾换行符 return txt - def __to_md(self, exclude_nodes=[]): + def __to_md(self, exclude_nodes=[], exclude_inline_types=[]): """把content_list转化为md格式. 
Args: @@ -89,7 +93,7 @@ def __to_md(self, exclude_nodes=[]): for page in content_lst: for content_lst_node in page: if content_lst_node['type'] not in exclude_nodes: - txt_content = self.__content_lst_node_2_md(content_lst_node) + txt_content = self.__content_lst_node_2_md(content_lst_node, exclude_inline_types) if txt_content and len(txt_content) > 0: md_blocks.append(txt_content) @@ -97,15 +101,31 @@ def __to_md(self, exclude_nodes=[]): md = md.strip() + self.__text_end # 加上结尾换行符 return md - def to_nlp_md(self, MM_NODE_LIST=[]): - if MM_NODE_LIST: - md = self.__to_md(exclude_nodes=MM_NODE_LIST) - else: - md = self.__to_md(exclude_nodes=DocElementType.MM_NODE_LIST) + def __validate_exclude_nodes(self, exclude_nodes, exclude_inline_types): + if isinstance(exclude_nodes, str): + exclude_nodes = [exclude_nodes] + if isinstance(exclude_inline_types, str): + exclude_inline_types = [exclude_inline_types] + if not isinstance(exclude_nodes, list): + raise ExtractorChainInputException('exclude_nodes must be a list type.') + if not isinstance(exclude_inline_types, list): + raise ExtractorChainInputException('exclude_inline_types must be a list type.') + for node in exclude_nodes: + if node not in self.__nodes_document_type: + raise ExtractorChainInputException(f'exclude_nodes contains invalid element type: {node}') + for inline_type in exclude_inline_types: + if inline_type not in self.__inline_types_document_type: + raise ExtractorChainInputException(f'exclude_inline_types contains invalid inline type: {inline_type}') + return exclude_nodes, exclude_inline_types + + def to_nlp_md(self, exclude_nodes=[], exclude_inline_types=[]): + exclude_nodes, exclude_inline_types = self.__validate_exclude_nodes(exclude_nodes, exclude_inline_types) + md = self.__to_md(exclude_nodes + DocElementType.MM_NODE_LIST, exclude_inline_types) return md - def to_mm_md(self): - md = self.__to_md() + def to_mm_md(self, exclude_nodes=[], exclude_inline_types=[]): + 
self.__validate_exclude_nodes(exclude_nodes, exclude_inline_types) + md = self.__to_md(exclude_nodes, exclude_inline_types) return md def to_main_html(self) -> str: @@ -121,9 +141,11 @@ def to_main_html(self) -> str: for page in content_lst: for content_lst_node in page: raw_html = content_lst_node['raw_content'] - if raw_html: - html += raw_html - + if isinstance(raw_html, str): + html_segment = raw_html # 直接使用字符串 + else: + html_segment = element_to_html(raw_html) # 转换HtmlElement为字符串 + html += html_segment return html def to_json(self, pretty=False) -> str: @@ -140,7 +162,7 @@ def to_dict(self) -> dict: def _get_data(self) -> List[Dict]: raise NotImplementedError('This method must be implemented by the subclass.') - def __content_lst_node_2_md(self, content_lst_node: dict) -> str: + def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types: list = []) -> str: """把content_list里定义的每种元素块转化为markdown格式. Args: @@ -202,7 +224,7 @@ def __content_lst_node_2_md(self, content_lst_node: dict) -> str: return md_title elif node_type == DocElementType.PARAGRAPH: paragraph_el_lst = content_lst_node['content'] - one_para = self.__join_one_para(paragraph_el_lst) + one_para = self.__join_one_para(paragraph_el_lst, exclude_inline_types) return one_para elif node_type == DocElementType.LIST: items_paras = [] @@ -210,7 +232,7 @@ def __content_lst_node_2_md(self, content_lst_node: dict) -> str: for item_idx, item in enumerate(content_lst_node['content']['items']): paras_of_item = [] for para in item: - one_para = self.__join_one_para(para) + one_para = self.__join_one_para(para, exclude_inline_types) paras_of_item.append(one_para) # 由于markdown的列表项里可以有多个段落,这里拼装成md列表段落格式 list_prefix = f'{item_idx + 1}.' 
if is_ordered else self.__list_item_start # 有序列表和无需列表前缀 @@ -218,7 +240,7 @@ def __content_lst_node_2_md(self, content_lst_node: dict) -> str: items_paras.append(item_paras_md) md_list = '\n'.join(items_paras) return md_list - elif node_type == DocElementType.TABLE: + elif node_type == DocElementType.SIMPLE_TABLE: # 对文本格式来说,普通表格直接转为md表格,复杂表格返还原始html html_table = content_lst_node['content']['html'] if html_table is not None: @@ -227,12 +249,15 @@ def __content_lst_node_2_md(self, content_lst_node: dict) -> str: if cells_count <= 1: # 单个单元格的表格,直接返回文本 text = get_element_text(html_to_element(html_table)).strip() return text - is_complex = content_lst_node['content']['is_complex'] - if is_complex: - return html_table - else: - md_table = html_to_markdown_table(html_table) - return md_table + md_table = html_to_markdown_table(html_table) + return md_table + else: + return '' + elif node_type == DocElementType.COMPLEX_TABLE: + html_table = content_lst_node['content']['html'] + if html_table is not None: + html_table = html_table.strip() + return html_table else: return '' else: @@ -274,7 +299,7 @@ def __para_2_md_list_item(self, paras_of_item: list, list_prefix: str) -> str: return md_list_item - def __content_lst_node_2_txt(self, content_lst_node: dict) -> str: + def __content_lst_node_2_txt(self, content_lst_node: dict, exclude_inline_types=[]) -> str: """把content_list里定义的每种元素块转化为纯文本格式. 
Args: @@ -330,35 +355,38 @@ def __content_lst_node_2_txt(self, content_lst_node: dict) -> str: return title_content elif node_type == DocElementType.PARAGRAPH: paragraph_el_lst = content_lst_node['content'] - one_para = self.__join_one_para(paragraph_el_lst) + one_para = self.__join_one_para(paragraph_el_lst, exclude_inline_types) return one_para elif node_type == DocElementType.LIST: items_paras = [] for item in content_lst_node['content']['items']: paras_of_item = [] for para in item: - one_para = self.__join_one_para(para) + one_para = self.__join_one_para(para, exclude_inline_types) paras_of_item.append(one_para) items_paras.append(paras_of_item) items_paras = [self.__txt_para_splitter.join(item) for item in items_paras] return self.__txt_para_splitter.join(items_paras) # 对于txt格式来说一个列表项里多个段落没啥问题,但是对于markdown来说,多个段落要合并成1个,否则md格式无法表达。 - elif node_type == DocElementType.TABLE: + elif node_type == DocElementType.SIMPLE_TABLE: # 对文本格式来说,普通表格直接转为md表格,复杂表格返还原始html html_table = content_lst_node['content']['html'] if html_table is not None: html_table = html_table.strip() - is_complex = content_lst_node['content']['is_complex'] - if is_complex: - return html_table - else: - md_table = html_to_markdown_table(html_table) - return md_table + md_table = html_to_markdown_table(html_table) + return md_table + else: + return '' + elif node_type == DocElementType.COMPLEX_TABLE: + html_table = content_lst_node['content']['html'] + if html_table is not None: + html_table = html_table.strip() + return html_table else: return '' else: raise ValueError(f'content_lst_node contains invalid element type: {node_type}') # TODO: 自定义异常 - def __join_one_para(self, para: list) -> str: + def __join_one_para(self, para: list, exclude_inline_types: list = []) -> str: """把一个段落的元素块连接起来. 
Args: @@ -368,6 +396,8 @@ def __join_one_para(self, para: list) -> str: """ one_para = [] for el in para: + if el['t'] in exclude_inline_types: + continue if el['t'] == ParagraphTextType.TEXT: c = el['c'] if not c or not c.strip(): @@ -393,10 +423,10 @@ def _validate(self, json_obj: dict): json_obj (dict): _description_ """ if not isinstance(json_obj, dict): - raise ValueError('json_obj must be a dict type.') + raise ExtractorChainInputException('json_obj must be a dict type.') if DataJsonKey.CONTENT_LIST in json_obj: if not isinstance(json_obj.get(DataJsonKey.CONTENT_LIST, ''), list): - raise ValueError('content_list must be a list type.') + raise ExtractorChainInputException('content_list must be a list type.') class ContentList(StructureMapper): diff --git a/llm_web_kit/libs/doc_element_type.py b/llm_web_kit/libs/doc_element_type.py index c3c63fdb..dd962ed7 100644 --- a/llm_web_kit/libs/doc_element_type.py +++ b/llm_web_kit/libs/doc_element_type.py @@ -8,7 +8,8 @@ class ParagraphTextType(object): class DocElementType(object): PARAGRAPH = 'paragraph' LIST = 'list' - TABLE = 'table' + SIMPLE_TABLE = 'simple_table' + COMPLEX_TABLE = 'complex_table' EQUATION_INTERLINE = 'equation-interline' CODE = 'code' TITLE = 'title' diff --git a/llm_web_kit/libs/statics.py b/llm_web_kit/libs/statics.py index df640617..006cdcc8 100644 --- a/llm_web_kit/libs/statics.py +++ b/llm_web_kit/libs/statics.py @@ -94,10 +94,10 @@ def process_list_items(items, parent_type): elif element_type == DocElementType.LIST: # 使用递归函数处理列表项 process_list_items(element['content']['items'], DocElementType.LIST) - elif element_type == DocElementType.TABLE: + elif element_type == DocElementType.COMPLEX_TABLE: # 统计复杂表格数量 if element.get('content', {}).get('is_complex', False): - item_type = f'{DocElementType.TABLE}.complex' + item_type = f'{DocElementType.COMPLEX_TABLE}.complex' current_count = self.statics.get(item_type, 0) self.statics[item_type] = current_count + 1 diff --git a/llm_web_kit/tools/cli.py 
b/llm_web_kit/tools/cli.py index c2260e1f..ca8a41b4 100644 --- a/llm_web_kit/tools/cli.py +++ b/llm_web_kit/tools/cli.py @@ -58,7 +58,6 @@ def cli(input_path, output_path, debug_mode): extractor = HTMLFileFormatExtractor({}) data_e = extractor.extract(DataJson(input_data)) output_json = data_e.to_json() - if output_path: output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) diff --git a/tests/llm_web_kit/cli_sdk/test_cli_sdk.py b/tests/llm_web_kit/cli_sdk/test_cli_sdk.py index 6aad22ba..f0085a69 100644 --- a/tests/llm_web_kit/cli_sdk/test_cli_sdk.py +++ b/tests/llm_web_kit/cli_sdk/test_cli_sdk.py @@ -73,11 +73,12 @@ def test_process_html_file_path(self, runner, json_with_file_path, tmp_path): def test_stdout_output(self, runner, json_with_html_path): """测试输出到标准输出.""" + print('json_with_html_path', json_with_html_path) result = runner.invoke(cli, ['-i', str(json_with_html_path)]) assert result.exit_code == 0 assert result.output - + print('result.output', result.output) output_data = json.loads(result.output) assert 'content_list' in output_data assert isinstance(output_data['content_list'], list) diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/content_list_empty.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/content_list_empty.html new file mode 100644 index 00000000..30fce8a5 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/content_list_empty.html @@ -0,0 +1 @@ +北京大平层,奶油风浪漫到家!
\n-
\n设计案例: 168m轻法式大平层设计
\n项目地址:北京市大兴区
\n-
\n在这个168平方米的轻法式大平层设计中,全屋以浪漫的奶白色为主色调,搭配驼色,营造出空间的呼吸感。客餐厅一体设计,地面铺满柔光砖,裸调的高级质感扑面而来。
\n
\n转角沙发与充满设计感的小型休闲椅相搭配,家居格调瞬间提升。威尼斯棕大理石餐桌的加入,为餐厅增添了更多的层次感和温柔。坐在沙发上,可以一览餐厅和厨房的空间,增加了互动性。
\n
\n墙面采用暖白色,搭配一些局部的原木色护墙板,让空间的视觉效果更加灵动,不易产生疲劳感。阳光透过窗户洒进室内,整个空间显得格外治愈,喜欢这种明亮纯粹的家。 diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/exclude_complex_table.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/exclude_complex_table.html new file mode 100644 index 00000000..e0b1d2bb --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/exclude_complex_table.html @@ -0,0 +1,528 @@ +\n\n \n \n\n \n\n\n + +\n \n WikiProcessors – smartmontools\n \n + \n + \n + \n + \n + \n + \n + \n + \n + \n + \n + \n + \n \n + \n + \n + \n + \n + \n + \n + + \n + \n + + \n +\n + +\n\t
\n\t\t
\n\t\t\t\n\n +
\n
\n

smartmontools +

\n
\n
\n
\n \n \n \n
\n \n + \n
\n \n
\n \n
\n

Context Navigation

\n \n +
\n +
\n
\n
\n
\n + + + \n + + \n
Version 3 (modified by trac, 5 years ago)\n (diff)
\n

\n--\n

\n\n
\n
\n \n
+

Wiki Processors

\n

\nProcessors are WikiMacros designed to provide alternative markup + formats for the Wiki engine. Processors + can be thought of as macro functions to process user-edited text. \n

\n

+ \nWiki processors can be used in any Wiki text throughout Trac, such as:\n

\n +

Using Processors

\n

\nTo use a processor on a block of + text, first delimit the lines using a Wiki code block:\n

\n +
{{{\nThe lines\nthat should be processed...\n}}}\n
+

\nImmediately after the {{{ or on the line just below, add #! + followed by the processor name:\n

\n +
{{{\n#!processorname\nThe lines\nthat should be processed...\n}}}\n
+

\nThis is the \"shebang\" notation, familiar to most UNIX users.\n

\n

\nBesides + their content, some Wiki processors can also accept parameters, which are then + given as key=value pairs after the processor name and on the same line. If + value has to contain space, as it's often the case for the style parameter, + a quoted string can be used (key=\"value with space\").\n

\n

\nAs some + processors are meant to process Wiki markup, it's quite possible to nest + processor blocks.\nYou may want to indent the content of nested blocks for increased + clarity, this extra indentation will be ignored when processing the content.\n

\n

Examples

\n\n + + + + + + + \n + + + \n + + \n + + + \n + + \n + + +
Wiki Markup Display \n
+
\n

\nExample 1: Inserting raw + HTML\n

\n
\n +
+
{{{\n#!html\n<h1 style=\"color: grey\">This is raw HTML</h1>\n}}}\n
+
+

This is raw HTML

\n +
+
\n

\nExample 2: Highlighted + Python code in a <div> block with custom style\n

\n +
\n +
+
{{{#!div style=\"background: #ffd; border: 3px ridge\"\n\nThis is an example of embedded \"code\" block:\n\n  {{{\n  #!python\n  def hello():\n      return \"world\"\n  }}}\n\n}}}\n
+
+
+

\nThis is an example of embedded \"code\" block:\n

\n
+
+
def hello():\n    return \"world\"\n
+
+
+
+
+
\n

\nExample 3: Searching tickets + from a wiki page, by keywords.\n

\n
\n +
+
{{{\n#!html\n<form action=\"/query\" method=\"get\"><div>\n<input type=\"text\" name=\"keywords\" value=\"~\" size=\"30\"/>\n<input type=\"submit\" value=\"Search by Keywords\"/>\n<!-- To control what fields show up use hidden fields\n<input type=\"hidden\" name=\"col\" value=\"id\"/>\n<input type=\"hidden\" name=\"col\" value=\"summary\"/>\n<input type=\"hidden\" name=\"col\" value=\"status\"/>\n<input type=\"hidden\" name=\"col\" value=\"milestone\"/>\n<input type=\"hidden\" name=\"col\" value=\"version\"/>\n<input type=\"hidden\" name=\"col\" value=\"owner\"/>\n<input type=\"hidden\" name=\"col\" value=\"priority\"/>\n<input type=\"hidden\" name=\"col\" value=\"component\"/>\n-->\n</div></form>\n}}}\n
+
+
+
\n\n\n
+
\n +
\n

Available Processors

\n

\nThe following + processors are included in the Trac distribution:\n

\n\n + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
#!default Present the text verbatim in a preformatted text block. This is the same as + specifying no processor name (and no #!). \n
#!comment Do not process the text in this section, i.e. contents exist only in the plain + text - not in the rendered page. \n
#!rtl Introduce a Right-To-Left block with appropriate CSS direction and styling. + (since 0.12.2) \n
\n
HTML + related \n
#!html Insert custom HTML in a wiki page. \n
#!htmlcomment Insert an HTML comment in a wiki page. (since 0.12) \n
Note that #!html blocks have to be self-contained, i.e. + you can't start an HTML element in one block and close it later in a second + block. Use the following processors for achieving a similar effect. \n
#!div Wrap wiki content inside a <div> element. \n
#!span Wrap wiki content inside a <span> element. \n
#!td Wrap wiki content inside a <td> element. (since 0.12) \n
#!th Wrap wiki content inside a <th> element. (since 0.12) \n
#!tr Can optionally be used for wrapping #!td and #!th + blocks, either for specifying row attributes or better visual grouping. + (since 0.12) \n
#!table Can optionally be used for wrapping #!tr, #!td and + #!th blocks, for specifying table attributes. One current + limitation however is that tables cannot be nested. (since 0.12) \n +
See WikiHtml for example usage + and more details about these processors. \n
\n
Other Markups \n
#!rst Trac support for Restructured Text. See WikiRestructuredText. \n
#!textile Supported if Textile is installed. See a Textile reference. \n
\n
Code Highlighting Support + \n
#!c
#!cpp + (C++)
#!python
+ #!perl
#!ruby +
#!php
+ #!asp
#!java +
#!js (Javascript)
+ #!sql
#!xml + (XML or HTML)
#!sh (Bourne/Bash shell) +
etc.
Trac includes processors to provide inline syntax highlighting for source code + in various languages.

Trac relies on Pygments for + syntax coloring.

See TracSyntaxColoring for information + about which languages are supported and how to enable support for more + languages. \n
\n
\n

\nSince 1.1.2 the default, coding highlighting and MIME-type processors support + the argument lineno for adding line numbering to the code block. When a + value is specified, as in lineno=3, the numbering will start at the + specified value. When used in combination with the lineno argument, the + marks argument is also supported for highlighting lines. A single line + number, set of line numbers and range of line numbers are allowed. For example, + marks=3, marks=3-6, marks=3,5,7 and + marks=3-5,7 are all allowed. The specified values are relative to the + numbered lines, so if lineno=2 is specified to start the line numbering at + 2, marks=2 will result in the first line being highlighted.\n

\n

+ \nUsing the MIME type as processor, it is possible to syntax-highlight the same + languages that are supported when browsing source code.\n

\n\n + + + + + + + + + + + +
MIME Type Processors \n
+

\nSome examples:\n

\n +
{{{#!text/html\n<h1>text</h1>\n}}}\n
+
+

\nThe result will be syntax highlighted HTML code:\n

\n
+
+
<h1>text</h1>\n
+
+
+

\nThe same is valid for all other mime types + supported.\n

\n +
+
{{{#!diff\n--- Version 55\n+++ Version 56\n@@ -115,8 +115,9 @@\n     name='TracHelloWorld', version='1.0',\n     packages=find_packages(exclude=['*.tests*']),\n-    entry_points = \"\"\"\n-        [trac.plugins]\n-        helloworld = myplugs.helloworld\n-    \"\"\",\n+    entry_points = {\n+        'trac.plugins': [\n+            'helloworld = myplugs.helloworld',\n+        ],\n+    },\n )\n}}}\n
+
+

\n#!diff has a particularly nice renderer:\n +

\n
+
\n\n
    \n \n
  • \n

    \n + Version\n \n

    \n \n \n \n \n \n + \n + + + + \n \n \n \n \n \n + \n \n \n \n \n \n + \n\n \n \n \n \n \n \n + + \n \n \n \n + + \n \n \n \n \n + + \n \n \n \n + \n \n\n \n\n \n\n \n\n \n + + \n \n\n \n\n \n\n \n\n \n \n \n \n \n + \n + + \n + + \n + + \n + + \n + + + \n + + \n + + \n + \n \n \n + + \n + + \n + + \n + + \n + + \n + + \n + + \n + + \n + + \n + \n \n \n \n\n \n + + \n \n\n \n \n \n \n \n \n + + \n \n \n \n + \n \n\n \n\n \n\n \n\n \n \n \n \n +
    \n \n \n \n \n \n  
    115115    + name='TracHelloWorld', version='1.0', +
    116116    + packages=find_packages(exclude=['*.tests*']), +
    117     entry_points = + \"\"\"
    118         + [trac.plugins]
    119         + helloworld = myplugs.helloworld
    120     \"\"\", +
     117    entry_points = + {
     118        + 'trac.plugins': [
     119        +     'helloworld = + myplugs.helloworld',
     120        + ],
     121    },
    121 + 122)
    \n
  • \n \n
\n\n
+
+
\n

\nLine numbers can be added to code blocks and lines can be highlighted + (since 1.1.2).\n

\n +
{{{#!python lineno=3 marks=3,9-10,16\ndef expand_markup(stream, ctxt=None):\n    \"\"\"A Genshi stream filter for expanding `genshi.Markup` events.\n\n    Note: Expansion may not be possible if the fragment is badly\n    formed, or partial.\n    \"\"\"\n    for event in stream:\n        if isinstance(event[1], Markup):\n            try:\n                for subevent in HTML(event[1]):\n                    yield subevent\n            except ParseError:\n                yield event\n        else:\n            yield event\n}}}\n
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Line 
3def expand_markup(stream, ctxt=None):
4    \"\"\"A Genshi stream filter for expanding + `genshi.Markup` events.
5
6    Note: Expansion may not be possible if the + fragment is badly
7    formed, or partial.
8    \"\"\"
9    for event in stream:
10        if isinstance(event[1], Markup):
11            try:
12                for subevent in HTML(event[1]):
13                    yield subevent
14            except ParseError:
15                yield event
16        else:
17            yield event
+
+

\nFor more processor macros developed and/or contributed by users, visit the Trac + Hacks community site.\n

\n

\nDeveloping processors is no different from + Wiki macros. In fact, they work the same way, only the usage syntax differs. See WikiMacros#DevelopingCustomMacros + for more information.\n

\n +
\n

\nSee also: WikiMacros, WikiHtml, WikiRestructuredText, TracSyntaxColoring, WikiFormatting, TracGuide\n

\n +
\n \n \n \n \n
\n \n\n \n + + \n + + \n
\n

Download in other formats:

\n \n
\n + \n
+
\n \"Trac\n

Powered by Trac + 1.2.5
\n By Edgewall Software. +

\n

Validator: Check + XHTML

\n +
\n\n\t\t\n \n\n
\n +\n + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_is_short.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_is_short.html new file mode 100644 index 00000000..a7126065 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_is_short.html @@ -0,0 +1 @@ +\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nA plain blog about politics: Acceptable\n\n\n\n\n\n\n\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n\n
\n
\n
\n
\n
\n
\n

Monday, December 5, 2011

\n
\n
\n
\n\n\n\n

\nAcceptable\n

\n
\n
\n
\n
\nSince I've commented quite a bit on polling that as I read it shows Mitt Romney broadly acceptable to most Republican voters, I definitely need to say something about a new poll today that doesn't exactly show that. Gallup got around to actually asking that very question (\"Please tell me if you would find ___ to be an acceptable  nominee for president from the Republican Party, or not\"). The answers mostly showed the weakness of the field, with six of the eight candidates asked about scoring well below 50% acceptable. But the clear most-acceptable candidate is Newt Gingrich, with a 62/34 acceptable/not acceptable ratio, while Romney is only at 54/41.
\n
\nThere are a lot of ways to look at this, but overall it's certainly a piece of evidence that the anti-Romney vote is, well, around 40%. Only a piece of evidence, however. It's not clear how hard these kinds of numbers might be, in either direction. On the positive side, it seems unlikely that Newt would remain over 60% once more Republicans know that he's been lobbying for Freddie Mac, and supported the individual mandate on health insurance, and made a climate change ad with Nancy Pelosi, and all the rest of it. On the other hand, it's certainly possible that the \"unacceptable\" answers are awful soft, for Romney and for everyone else.
\n
\nIn particular, as Greg pointed out, Romney only does three points better on the \"acceptable\" scale with moderate Republicans than does Newt. This isn't the first indication we've had that Romney isn't doing as well with moderate Republicans as one would think he should be. Whether that means he has some room to grow or that he's just not an appealing politician is, I guess, still entirely up in the air at this point.
\n
\nI still overall don't see a low cap on Romney's support, but of course all the evidence counts, and polling in general begins to be a little more important the closer we get to actual voting. I'll be continuing to track anything more we get on this one.\n
\n
\n\n
\n
\n\n

14 comments:

\n
\n\n \n\n\n\n", + src=\"https://whatsknow.com/wp-content/cache/autoptimize/js/autoptimize_fe6b5f33f1d030f29a946c59f754e0ce.js\"> \n\n\n\n \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_code.py b/tests/llm_web_kit/extractor/html/recognizer/test_code.py index ea7a4fb7..ba8b3f38 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_code.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_code.py @@ -245,12 +245,12 @@ def test_code_rec(self): base_url = test_case['input'][1] print(base_url) raw_html = raw_html_path.read_text() - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) - parts = [ - part[0] - for part in parts - if CCTag.CC_CODE in part[0] or CCTag.CC_CODE_INLINE in part[0] - ] + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + # parts = [ + # part[0] + # for part in parts + # if CCTag.CC_CODE in part[0] or CCTag.CC_CODE_INLINE in part[0] + # ] # for part in parts: # part_el = html_to_element(part) # answer = get_element_text(part_el).strip() @@ -259,7 +259,7 @@ def test_code_rec(self): # print("--------------------------------------------------") answers = [] for part in parts: - part_el = html_to_element(part) + part_el = part[0] cccodes = part_el.xpath(f'.//{CCTag.CC_CODE}') + part_el.xpath( f'.//{CCTag.CC_CODE_INLINE}' ) @@ -532,4 +532,4 @@ def test_lineno_4(self):
""" # 无须检查内容,只要不爆错就可以了 - _ = self.rec.recognize('', [(html, html)], html) + _ = self.rec.recognize('', [(html_to_element(html), html_to_element(html))], html) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_image.py b/tests/llm_web_kit/extractor/html/recognizer/test_image.py index ab3cd733..9f374848 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_image.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_image.py @@ -3,6 +3,7 @@ from llm_web_kit.extractor.html.recognizer.image import ImageRecognizer from llm_web_kit.extractor.html.recognizer.recognizer import CCTag +from llm_web_kit.libs.html_utils import html_to_element TEST_CASES_HTML = [ { @@ -98,7 +99,7 @@ def test_recognize(self): raw_html_path = base_dir.joinpath(test_case['input']) base_url = test_case['base_url'] raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.img_recognizer.recognize(base_url, [(raw_html, raw_html)], raw_html) + parts = self.img_recognizer.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) self.assertEqual(len(parts), test_case['expected']) ccimg_datas = [ccimg[0] for ccimg in parts if CCTag.CC_IMAGE in ccimg[0] and 'by="svg"' not in ccimg[0]] if ccimg_datas: @@ -109,7 +110,7 @@ def test_recognize(self): def test_to_content_list_node(self): for test_case in TEST_CC_CASE: try: - res = self.img_recognizer.to_content_list_node(test_case['url'], test_case['parsed_content'], + res = self.img_recognizer.to_content_list_node(test_case['url'], html_to_element(test_case['parsed_content']), test_case['html']) self.assertEqual(res, test_case['expected']) self.assertEqual(res['content']['alt'], test_case['alt']) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_list.py b/tests/llm_web_kit/extractor/html/recognizer/test_list.py index 2cc10aac..0696618f 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_list.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_list.py @@ -2,6 +2,7 @@ 
import unittest from llm_web_kit.extractor.html.recognizer.list import ListRecognizer +from llm_web_kit.libs.html_utils import html_to_element class TestSimpleListRecognize(unittest.TestCase): @@ -17,10 +18,10 @@ def setUp(self): self.__complex_list_content = file.read() def test_simple_list(self): - html_part = self.__list_recognize.recognize('http://url.com', [(self.__simple_list_content, self.__complex_list_content)], self.__simple_list_content) + html_part = self.__list_recognize.recognize('http://url.com', [(html_to_element(self.__simple_list_content), html_to_element(self.__complex_list_content))], self.__simple_list_content) assert len(html_part) == 6 def test_complex_list(self): # TODO: Fix this test - html_part = self.__list_recognize.recognize('http://url.com', [(self.__simple_list_content, self.__complex_list_content)], self.__complex_list_content) + html_part = self.__list_recognize.recognize('http://url.com', [(html_to_element(self.__simple_list_content), html_to_element(self.__complex_list_content))], self.__complex_list_content) assert len(html_part) == 6 diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index fa9d7614..eb5fbbf4 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -4,7 +4,7 @@ from llm_web_kit.exception.exception import HtmlMathRecognizerException from llm_web_kit.extractor.html.recognizer.ccmath import CCMATH, MathRecognizer from llm_web_kit.extractor.html.recognizer.recognizer import CCTag -from llm_web_kit.libs.html_utils import html_to_element +from llm_web_kit.libs.html_utils import element_to_html, html_to_element TEST_CASES = [ # 基本公式测试用例 @@ -365,17 +365,14 @@ def test_math_recognizer(self): with self.subTest(input=test_case['input'], raw_html=test_case['raw_html']): output_html = self.math_recognizer.recognize( 'https://www.baidu.com', - test_case['input'], + 
[(html_to_element(test_case['input'][0][0]), html_to_element(test_case['input'][0][1]))], test_case['raw_html'] ) - print(output_html) expect_len = len(test_case['expected']) self.assertEqual(len(output_html), len(test_case['expected']), msg=f'result is: {len(output_html)}, expected is: {expect_len}') for i in range(len(output_html)): expect = test_case['expected'][i][0] - print(output_html[i][0]) - print(expect) - self.assertEqual(output_html[i][0], expect, msg=f'result is: {output_html[i][0]}, expected is: {expect}') + self.assertEqual(element_to_html(output_html[i][0]), expect, msg=f'result is: {output_html[i][0]}, expected is: {expect}') def test_math_recognizer_html(self): for test_case in TEST_CASES_HTML: @@ -383,7 +380,7 @@ def test_math_recognizer_html(self): # print('raw_html_path::::::::', raw_html_path) base_url = test_case['base_url'] raw_html = raw_html_path.read_text() - parts = self.math_recognizer.recognize(base_url, [(raw_html, raw_html)], raw_html) + parts = self.math_recognizer.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) # print(parts) # 将parts列表中第一个元素拼接保存到文件,带随机数 # import random @@ -391,8 +388,10 @@ def test_math_recognizer_html(self): # for part in parts: # f.write(str(part[0])) # 检查行间公式抽取正确性 - parts = [part[0] for part in parts if CCTag.CC_MATH_INTERLINE in part[0]] - print(len(parts)) + new_parts = [] + for part in parts: + new_parts.append((element_to_html(part[0]), element_to_html(part[1]))) + parts = [part[0] for part in new_parts if CCTag.CC_MATH_INTERLINE in part[0]] expect_text = base_dir.joinpath(test_case['expected']).read_text().strip() expect_formulas = [formula for formula in expect_text.split('\n') if formula] self.assertEqual(len(parts), len(expect_formulas)) @@ -410,10 +409,7 @@ def test_math_recognizer_html(self): # self.write_to_html(answers, test_case['input'][0]) # 检查行内公式抽取正确性 if test_case.get('expected_inline', None): - print('expected_inline::::::::', 
test_case['expected_inline']) parts = [part[0] for part in parts if CCTag.CC_MATH_INLINE in part[0]] - print(len(parts)) - print(parts) def write_to_html(self, answers, file_name): file_name = file_name.split('.')[0] @@ -427,9 +423,11 @@ def test_to_content_list_node(self): with self.subTest(input=test_case['input']): output_node = self.math_recognizer.to_content_list_node( test_case['input'][0], - test_case['input'][1], + html_to_element(test_case['input'][1]), test_case['input'][2] ) + print('output_node::::::::', output_node) + print(test_case['expected']) self.assertEqual(output_node, test_case['expected']) # 测试没有ccmath标签的情况 @@ -441,7 +439,7 @@ def test_to_content_list_node(self): with self.assertRaises(HtmlMathRecognizerException) as exc_info: self.math_recognizer.to_content_list_node( invalid_content[0], - invalid_content[1], + html_to_element(invalid_content[1]), invalid_content[2] ) self.assertIn('No ccmath element found in content', str(exc_info.exception)) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_para.py b/tests/llm_web_kit/extractor/html/recognizer/test_para.py index 42e988bd..adb38c59 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_para.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_para.py @@ -20,21 +20,21 @@ def test_recognize_simple_para(self): html = f.read() # 执行识别 - result = self.recognizer.recognize('', [(html, html)], html) + result = self.recognizer.recognize('', [(html_to_element(html), html_to_element(html))], html) # 验证结果 self.assertEqual(len(result), 2) # 应该识别出2个段落 # 验证第一个段落 first_para = result[0][0] - ccel = html_to_element(first_para) + ccel = first_para jso = json.loads(ccel.text) self.assertEqual(jso[0]['c'], '质量方程') self.assertEqual(jso[0]['t'], 'text') # 验证第二个段落 second_para = result[1][0] - text = html_to_element(second_para).text + text = second_para.text jso = json.loads(text) self.assertEqual(jso[0]['c'], '爱因斯坦的方程') self.assertEqual(jso[0]['t'], 'text') diff --git 
a/tests/llm_web_kit/extractor/html/recognizer/test_recognizer.py b/tests/llm_web_kit/extractor/html/recognizer/test_recognizer.py index 7bedf512..86b303e6 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_recognizer.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_recognizer.py @@ -3,6 +3,7 @@ from llm_web_kit.extractor.html.recognizer.recognizer import \ BaseHTMLElementRecognizer +from llm_web_kit.libs.html_utils import element_to_html, html_to_element class TestBaseHTMLElementRecognizer(unittest.TestCase): @@ -10,51 +11,50 @@ def test_html_split_by_tags_1(self): with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/image.html', 'r') as file: html_content = file.read() - result = BaseHTMLElementRecognizer.html_split_by_tags(html_content, ['img']) + result = BaseHTMLElementRecognizer.html_split_by_tags(html_to_element(html_content), ['img']) assert len(result) == 7 def test_html_split_by_tags_2(self): with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/cccode.html', 'r') as file: html_content = file.read() - result = BaseHTMLElementRecognizer.html_split_by_tags(html_content, ['cccode']) + result = BaseHTMLElementRecognizer.html_split_by_tags(html_to_element(html_content), ['cccode']) assert len(result) == 3 def test_html_split_by_tags_3(self): with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/raw_html_attr.html', 'r') as file: html_content = file.read() - - result = BaseHTMLElementRecognizer.html_split_by_tags(html_content, ['ccmath']) + result = BaseHTMLElementRecognizer.html_split_by_tags(html_to_element(html_content), ['ccmath']) assert len(result) == 2 - assert result[0][1] == '$E=MC^2$' + assert element_to_html(result[0][1]) == '$E=MC^2$' def test_html_split_by_tags_with_parent_nodes(self): """测试是否能够正确带上父节点.""" with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/with_parent_nodes.html', 'r') as file: html_content = file.read() - result_with_parent = 
BaseHTMLElementRecognizer.html_split_by_tags(html_content, 'cccode') + result_with_parent = BaseHTMLElementRecognizer.html_split_by_tags(html_to_element(html_content), 'cccode') assert len(result_with_parent) == 7 - assert result_with_parent[0][0] == """
+ assert element_to_html(result_with_parent[0][0]) == """
这里是text 这里是span
""" - assert result_with_parent[2][0] == '
print("BBBBBB")
' - assert result_with_parent[3][0] == """
+ assert element_to_html(result_with_parent[2][0]) == '
print("BBBBBB")
' + assert element_to_html(result_with_parent[3][0]) == """
这里是tail

这里是div text 这里是span2

""" - result = BaseHTMLElementRecognizer.html_split_by_tags(html_content, 'cccode') + result = BaseHTMLElementRecognizer.html_split_by_tags(html_to_element(html_content), 'cccode') assert len(result) == 7 def test_is_cctag(self): with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/iscctag.html', 'r') as file: html_content = file.read() - assert BaseHTMLElementRecognizer.is_cc_html(html_content, 'cccode') - assert BaseHTMLElementRecognizer.is_cc_html(html_content, 'ccmath') - assert BaseHTMLElementRecognizer.is_cc_html(html_content, 'ccimage') - assert not BaseHTMLElementRecognizer.is_cc_html(html_content, 'ccvideo') - assert not BaseHTMLElementRecognizer.is_cc_html(html_content, 'cctitle') - assert BaseHTMLElementRecognizer.is_cc_html(html_content, ['cccode', 'ccxxx']) + assert BaseHTMLElementRecognizer.is_cc_html(html_to_element(html_content), 'cccode') + assert BaseHTMLElementRecognizer.is_cc_html(html_to_element(html_content), 'ccmath') + assert BaseHTMLElementRecognizer.is_cc_html(html_to_element(html_content), 'ccimage') + assert not BaseHTMLElementRecognizer.is_cc_html(html_to_element(html_content), 'ccvideo') + assert not BaseHTMLElementRecognizer.is_cc_html(html_to_element(html_content), 'cctitle') + assert BaseHTMLElementRecognizer.is_cc_html(html_to_element(html_content), ['cccode', 'ccxxx']) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 6e91c85e..2470c060 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -45,7 +45,7 @@ def test_involve_cctale(self): raw_html_path = base_dir.joinpath(test_case['input'][0]) base_url = test_case['input'][1] raw_html = raw_html_path.read_text() - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) 
self.assertEqual(len(parts), 4) def test_not_involve_table(self): @@ -54,7 +54,7 @@ def test_not_involve_table(self): raw_html_path = base_dir.joinpath(test_case['input'][1]) base_url = test_case['input'][1] raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) self.assertEqual(len(parts), 1) def test_only_involve_table(self): @@ -63,9 +63,9 @@ def test_only_involve_table(self): raw_html_path = base_dir.joinpath(test_case['input'][2]) base_url = test_case['input'][1] raw_html = raw_html_path.read_text() - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) self.assertEqual(len(parts), 2) - table_body = html_to_element(parts[1][0]).text_content() + table_body = parts[1][0].text_content() assert table_body == r'
Mrs S Hindle
ShowCCRCC
Driffield 5th October 2006CH. Ricksbury Royal HeroCH. Keyingham Branwell
Manchester 16th January 2008CH. Lochbuie GeordieMerryoth Maeve
Darlington 20th September 2009CH. Maibee Make BelieveCH. Loranka Just Like Heaven JW
Blackpool 22nd June 2012CH. Loranka Sherrie BabyDear Magic Touch De La Fi Au Songeur
Welsh Kennel Club 2014Brymarden Carolina SunriseCh. Wandris Evan Elp Us
Welsh Kennel Club 2014Ch. Charnell Clematis of SalegreenCH. Byermoor Queens Maid
' def test_table_include_img_label(self): @@ -74,9 +74,9 @@ def test_table_include_img_label(self): raw_html_path = base_dir.joinpath(test_case['input'][6]) base_url = test_case['input'][1] raw_html = raw_html_path.read_text() - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 - simple_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}')[0] + simple_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}')[0] simple_table_type = simple_table_tag.attrib assert simple_table_type['table_type'] == 'simple' @@ -86,9 +86,9 @@ def test_cc_simple_table(self): raw_html_path = base_dir.joinpath(test_case['input'][7]) base_url = test_case['input'][8] raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 - content = html_to_element(parts[1][0]).text_content() + content = parts[1][0].text_content() assert content == r'
Рейтинг:Рейтинг 5.00 из 5 на основе опроса 3 пользователей
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84 ₽
' def test_cc_complex_table(self): @@ -97,11 +97,11 @@ def test_cc_complex_table(self): raw_html_path = base_dir.joinpath(test_case['input'][8]) base_url = test_case['input'][8] raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 - content = html_to_element(parts[1][0]).text_content() + content = parts[1][0].text_content() assert content == r'
ফেব্রুয়ারি ২০২৪
সোমমঙ্গলবুধবৃহশুক্রশনিরবি
« জানুয়ারি
১০১১
১২১৩১৪১৫১৬১৭১৮
১৯২০২১২২২৩২৪২৫
২৬২৭২৮২৯
' - table_type = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}')[0] + table_type = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}')[0] assert table_type.attrib['table_type'] == 'complex' def test_simple_complex_table(self): @@ -110,12 +110,12 @@ def test_simple_complex_table(self): raw_html_path = base_dir.joinpath(test_case['input'][3]) base_url = test_case['input'][1] raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) - simple_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}')[0] + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + simple_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}')[0] simple_table_type = simple_table_tag.attrib assert simple_table_type['table_type'] == 'simple' assert simple_table_type == {'table_type': 'simple', 'table_nest_level': '1', 'html': '\n \n \n \n \n \n \n \n \n
12
34
\n\n'} - complex_table_tag = html_to_element(parts[2][0]).xpath(f'.//{CCTag.CC_TABLE}')[0] + complex_table_tag = parts[2][0].xpath(f'.//{CCTag.CC_TABLE}')[0] complex_table_type = complex_table_tag.attrib assert complex_table_type['table_type'] == 'complex' assert complex_table_type == {'table_type': 'complex', 'table_nest_level': '1', 'html': '\n \n \n \n \n \n \n \n \n \n \n \n \n \n
123
4
567
\n '} @@ -127,9 +127,10 @@ def test_table_to_content_list_node_simple(self): base_url = test_case['input'][1] raw_html = raw_html_path.read_text(encoding='utf-8') parsed_content = raw_html - result = self.rec.to_content_list_node(base_url, parsed_content, raw_html) + result = self.rec.to_content_list_node(base_url, html_to_element(parsed_content), raw_html) expect = base_dir.joinpath(test_case['expected'][0]) expect_json = expect.read_text(encoding='utf-8') + print(result) assert result['type'] == json.loads(expect_json)['type'] assert result['content']['is_complex'] == json.loads(expect_json)['content']['is_complex'] assert result['raw_content'] == json.loads(expect_json)['raw_content'] @@ -142,7 +143,7 @@ def test_table_to_content_list_node_complex(self): raw_html_path = base_dir.joinpath(test_case['input'][5]) expect_path = base_dir.joinpath(test_case['expected'][1]) raw_html = raw_html_path.read_text(encoding='utf-8') - result = self.rec.to_content_list_node(expect_path, raw_html, raw_html) + result = self.rec.to_content_list_node(expect_path, html_to_element(raw_html), raw_html) fr = open(expect_path, 'r', encoding='utf-8') expect_result = json.loads(fr.read()) assert result == expect_result @@ -153,9 +154,11 @@ def test_table_involve_equation(self): raw_html_path = base_dir.joinpath(test_case['input'][9]) base_url = 'https://en.m.wikipedia.org/wiki/Variance' raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) - complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') - print(complex_table_tag[0].text) + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + complex_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}') + assert complex_table_tag[0].text == r'
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution${\displaystyle \Pr \,(X=k)={\binom {n}{k}}p^{k}(1-p)^{n-k}}$${\displaystyle np}$${\displaystyle np(1-p)}$
Geometric distribution${\displaystyle \Pr \,(X=k)=(1-p)^{k-1}p}$${\displaystyle {\frac {1}{p}}}$${\displaystyle {\frac {(1-p)}{p^{2}}}}$
Normal distribution${\displaystyle f\left(x\mid \mu ,\sigma ^{2}\right)={\frac {1}{\sqrt {2\pi \sigma ^{2}}}}e^{-{\frac {(x-\mu )^{2}}{2\sigma ^{2}}}}}$${\displaystyle \mu }$${\displaystyle \sigma ^{2}}$
Uniform distribution (continuous)${\displaystyle f(x\mid a,b)={\begin{cases}{\frac {1}{b-a}}&{\text{for }}a\leq x\leq b,\\[3pt]0&{\text{for }}xb\end{cases}}}$${\displaystyle {\frac {a+b}{2}}}$${\displaystyle {\frac {(b-a)^{2}}{12}}}$
Exponential distribution${\displaystyle f(x\mid \lambda )=\lambda e^{-\lambda x}}$${\displaystyle {\frac {1}{\lambda }}}$${\displaystyle {\frac {1}{\lambda ^{2}}}}$
Poisson distribution${\displaystyle f(k\mid \lambda )={\frac {e^{-\lambda }\lambda ^{k}}{k!}}}$${\displaystyle \lambda }$${\displaystyle \lambda }$
' + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + complex_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}') assert complex_table_tag[0].text == r'
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution${\displaystyle \Pr \,(X=k)={\binom {n}{k}}p^{k}(1-p)^{n-k}}$${\displaystyle np}$${\displaystyle np(1-p)}$
Geometric distribution${\displaystyle \Pr \,(X=k)=(1-p)^{k-1}p}$${\displaystyle {\frac {1}{p}}}$${\displaystyle {\frac {(1-p)}{p^{2}}}}$
Normal distribution${\displaystyle f\left(x\mid \mu ,\sigma ^{2}\right)={\frac {1}{\sqrt {2\pi \sigma ^{2}}}}e^{-{\frac {(x-\mu )^{2}}{2\sigma ^{2}}}}}$${\displaystyle \mu }$${\displaystyle \sigma ^{2}}$
Uniform distribution (continuous)${\displaystyle f(x\mid a,b)={\begin{cases}{\frac {1}{b-a}}&{\text{for }}a\leq x\leq b,\\[3pt]0&{\text{for }}xb\end{cases}}}$${\displaystyle {\frac {a+b}{2}}}$${\displaystyle {\frac {(b-a)^{2}}{12}}}$
Exponential distribution${\displaystyle f(x\mid \lambda )=\lambda e^{-\lambda x}}$${\displaystyle {\frac {1}{\lambda }}}$${\displaystyle {\frac {1}{\lambda ^{2}}}}$
Poisson distribution${\displaystyle f(k\mid \lambda )={\frac {e^{-\lambda }\lambda ^{k}}{k!}}}$${\displaystyle \lambda }$${\displaystyle \lambda }$
' def test_table_involve_after_code(self): @@ -164,8 +167,8 @@ def test_table_involve_after_code(self): raw_html_path = base_dir.joinpath(test_case['input'][10]) base_url = 'https://en.m.wikipedia.org/wiki/Variance' raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) - assert html_to_element(parts[0][0]).xpath(f'.//{CCTag.CC_TABLE}')[0].text is None + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + assert parts[0][0].xpath(f'.//{CCTag.CC_TABLE}')[0].text is None @unittest.skip(reason='在code模块解决了table嵌套多行代码问题') def test_table_involve_code(self): @@ -174,8 +177,8 @@ def test_table_involve_code(self): raw_html_path = base_dir.joinpath(test_case['input'][11]) base_url = 'https://en.m.wikipedia.org/wiki/Variance' raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) - complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + complex_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}') expect_path = base_dir.joinpath(test_case['expected'][3]) content = open(expect_path, 'r', encoding='utf-8').read() assert complex_table_tag[0].text == content.strip('\n') @@ -187,8 +190,8 @@ def test_table_involve_complex_code(self): raw_html_path = base_dir.joinpath(test_case['input'][12]) base_url = 'https://en.m.wikipedia.org/wiki/Variance' raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) - complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + complex_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}') expect_path = 
base_dir.joinpath(test_case['expected'][3]) content = open(expect_path, 'r', encoding='utf-8').read() assert complex_table_tag[0].text == content.strip('\n') diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py index a9d368a1..7c85ddd7 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py @@ -9,6 +9,7 @@ BaseHTMLElementRecognizer from llm_web_kit.extractor.html.recognizer.text import TextParagraphRecognizer from llm_web_kit.input.datajson import DataJson +from llm_web_kit.libs.html_utils import element_to_html, html_to_element class TestTextParagraphRecognize(unittest.TestCase): @@ -28,8 +29,8 @@ def test_text_1(self): assert self.text_recognize._TextParagraphRecognizer__combine_text('知识乱象\n', '中共中央政治局召开会议审议《成-2020年10月16日新闻联播', 'zh')[:7] == '知识乱象\n中共' - result = self.text_recognize.recognize('http://www.baidu.com', [(html_content, html_content)], html_content) - assert result[909][0][1413:1422] == '知识乱象\\n 中共' + result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content) + assert '知识乱象\\n 中共' in element_to_html(result[908][0]) def test_text_2(self): """ @@ -150,8 +151,8 @@ def test_text_7(self): """ with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text7.html', 'r') as file: html_content = file.read() - result = self.text_recognize.recognize('http://www.baidu.com', [(html_content, html_content)], html_content) - assert '1) A man takes 5 hrs and 45 mins to walk to a certain place and ride back' in result[0][0] and BaseHTMLElementRecognizer.is_cc_html(result[0][0]) + result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content) + assert '1) A man takes 5 hrs and 45 mins to walk to a certain place and ride back' 
in element_to_html(result[0][0]) and BaseHTMLElementRecognizer.is_cc_html(result[0][0]) def test_text_8(self): """ @@ -162,8 +163,8 @@ def test_text_8(self): """ with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text8.html', 'r') as file: html_content = file.read() - result = self.text_recognize.recognize('http://www.baidu.com', [(html_content, html_content)], html_content) - assert "40xy' -ln(x^8) = 0\\n\\n\\nInitial Condition: y(1)=31" in result[0][0] and BaseHTMLElementRecognizer.is_cc_html(result[0][0]) + result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content) + assert "40xy' -ln(x^8) = 0\\n\\n\\nInitial Condition: y(1)=31" in element_to_html(result[0][0]) and BaseHTMLElementRecognizer.is_cc_html(result[0][0]) def test_text_9(self): """ @@ -174,8 +175,8 @@ def test_text_9(self): """ with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text9.html', 'r') as file: html_content = file.read() - result = self.text_recognize.recognize('http://www.baidu.com', [(html_content, html_content)], html_content) - assert '1) Consider the formula f(x)=lim(n-->infinity)((x^n)/(1+x^n)).\\n Let D={x:f(x) is an element of R}. Calculate f(x) for all x elements of D and determine where f: D-->R is continuous.\\n\\n 2) Let f: D-->R and suppose that f(x) greater than equal 0 for all x elements of D. Define sqrt(f)-->R by (sqrt(f))(x) = sqrt(f(x)). If f is continuous at c elements of D, prove that sqrt(f) is continuous at c.' in result[50][0] and BaseHTMLElementRecognizer.is_cc_html(result[50][0]) + result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content) + assert '1) Consider the formula f(x)=lim(n-->infinity)((x^n)/(1+x^n)).\\n Let D={x:f(x) is an element of R}. 
Calculate f(x) for all x elements of D and determine where f: D-->R is continuous.\\n\\n 2) Let f: D-->R and suppose that f(x) greater than equal 0 for all x elements of D. Define sqrt(f)-->R by (sqrt(f))(x) = sqrt(f(x)). If f is continuous at c elements of D, prove that sqrt(f) is continuous at c.' in element_to_html(result[50][0]) and BaseHTMLElementRecognizer.is_cc_html(result[50][0]) def test_text_10(self): """ diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_title.py b/tests/llm_web_kit/extractor/html/recognizer/test_title.py index d3eedc2d..8cc8eeeb 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_title.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_title.py @@ -4,6 +4,7 @@ import pytest from llm_web_kit.extractor.html.recognizer.title import TitleRecognizer +from llm_web_kit.libs.html_utils import element_to_html @pytest.fixture @@ -17,9 +18,9 @@ def test_title_recognizer(title_recognizer): result = title_recognizer.recognize('http://www.baidu.com', [(html_content, html_content)], html_content) assert len(result) == 10 - assert result[0][0] == """大模型好,大模型棒1""" - assert result[6][0] == """大模型好,大模型棒5 大模型很棒""" @@ -27,5 +28,5 @@ def test_title_tails_and_levels(title_recognizer): html_content = """

TEST:import *TEST

Tail

aaa

""" result = title_recognizer.recognize('http://www.baidu.com', [(html_content, html_content)], html_content) assert len(result) == 2 - assert result[0][0] == '
TEST: `import *` TEST
' + assert element_to_html(result[0][0]) == '
TEST: `import *` TEST
' pass diff --git a/tests/llm_web_kit/extractor/html/test_ContentListStaticsPostExtractor.py b/tests/llm_web_kit/extractor/html/test_ContentListStaticsPostExtractor.py index da4a4d7e..cd7196c9 100644 --- a/tests/llm_web_kit/extractor/html/test_ContentListStaticsPostExtractor.py +++ b/tests/llm_web_kit/extractor/html/test_ContentListStaticsPostExtractor.py @@ -54,7 +54,7 @@ def setUp(self): } }, { - 'type': 'table', + 'type': 'complex_table', 'raw_content': '', 'content': { 'html': '
12
', @@ -75,5 +75,5 @@ def test_content_list_statics_post_extractor(self): self.assertEqual(data_json.get(DataJsonKey.METAINFO, {}).get(DataJsonKey.STATICS, {}).get('paragraph.text'), 2) self.assertEqual(data_json.get(DataJsonKey.METAINFO, {}).get(DataJsonKey.STATICS, {}).get('paragraph.equation-inline'), 1) self.assertEqual(data_json.get(DataJsonKey.METAINFO, {}).get(DataJsonKey.STATICS, {}).get('equation-interline'), 1) - self.assertEqual(data_json.get(DataJsonKey.METAINFO, {}).get(DataJsonKey.STATICS, {}).get('table'), 1) - self.assertEqual(data_json.get(DataJsonKey.METAINFO, {}).get(DataJsonKey.STATICS, {}).get('table.complex'), 1) + self.assertEqual(data_json.get(DataJsonKey.METAINFO, {}).get(DataJsonKey.STATICS, {}).get('complex_table'), 1) + self.assertEqual(data_json.get(DataJsonKey.METAINFO, {}).get(DataJsonKey.STATICS, {}).get('complex_table.complex'), 1) diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 01fb611a..271bcfdd 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -60,7 +60,7 @@ def setUp(self): for line in f: self.data_json.append(json.loads(line.strip())) - assert len(self.data_json) == 21 + assert len(self.data_json) == 24 # Config for HTML extraction self.config = load_pipe_tpl('html-test') @@ -105,13 +105,13 @@ def test_html_pipeline(self): # 然后是simple table html_content = html_content_list[4] - self.assertEqual(html_content['type'], DocElementType.TABLE) + self.assertEqual(html_content['type'], DocElementType.SIMPLE_TABLE) self.assertEqual(html_content['content']['is_complex'], False) assert html_content['content']['html'].startswith('' not in content_txt + assert '' not in content_txt + + def test_para_is_short(self): + """测试para识别后内容太短.""" + chain = ExtractSimpleFactory.create(self.config) + self.assertIsNotNone(chain) + test_data = self.data_json[22] + input_data = DataJson(test_data) + 
result = chain.extract(input_data) + content_txt = result.get_content_list().to_nlp_md() + print('content_txt', content_txt) + assert len(content_txt) == 3983 + def test_xml_tag(self): """测试xml标签.""" chain = ExtractSimpleFactory.create(self.config) self.assertIsNotNone(chain) - test_data = self.data_json[20] + test_data = self.data_json[23] input_data = DataJson(test_data) result = chain.extract(input_data) result_md = result.get_content_list().to_mm_md() diff --git a/tests/llm_web_kit/input/test_datajson.py b/tests/llm_web_kit/input/test_datajson.py index c967e330..413a657a 100644 --- a/tests/llm_web_kit/input/test_datajson.py +++ b/tests/llm_web_kit/input/test_datajson.py @@ -2,7 +2,9 @@ import pytest +from llm_web_kit.exception.exception import ExtractorChainInputException from llm_web_kit.input.datajson import ContentList, DataJson, DataJsonKey +from llm_web_kit.libs.doc_element_type import DocElementType def test_datajson_init(): @@ -98,14 +100,71 @@ def test_datajson_serialization(): def test_datajson_validation(): # Test invalid input type - with pytest.raises(ValueError): + with pytest.raises(ExtractorChainInputException): DataJson([]) # List instead of dict # Test invalid content_list type - with pytest.raises(ValueError): + with pytest.raises(ExtractorChainInputException): DataJson({DataJsonKey.CONTENT_LIST: 'invalid'}) # String instead of list +def test_datajson_exclude_nodes_to_nlp_md(): + data = { + DataJsonKey.DATASET_NAME: 'test_dataset', + DataJsonKey.FILE_FORMAT: 'html', + DataJsonKey.CONTENT_LIST: [[{ + 'type': 'simple_table', + 'raw_content': "
Title: T.J. Byrne, Slide of floor plan, Poor Law Commission cottage, 1872.
Authors: T.J., Byrne
Fewer, Michael
Keywords: T.J. Byrne
Cottages
Poor Law Commission
Issue Date: 2011
2011
Description: T.J. Byrne's slide of a one storey cottage, labelled 'Mr Barney's Plan', recommended by the Poor Law Commission, 1872.
URI: https://hdl.handle.net/10599/5719
Appears in Collections:Published Items
T.J. Byrne Collection
", + 'content': { + 'html': "
Title:T.J. Byrne, Slide of floor plan, Poor Law Commission cottage, 1872.
Authors:T.J., Byrne Fewer, Michael
Keywords:T.J. Byrne Cottages Poor Law Commission
Issue Date:2011 2011
Description:T.J. Byrne's slide of a one storey cottage, labelled 'Mr Barney's Plan', recommended by the Poor Law Commission, 1872.
URI:https://hdl.handle.net/10599/5719
Appears in Collections:Published Items T.J. Byrne Collection
", + 'is_complex': False, + 'table_nest_level': '1' + } + }]] + } + datajson = DataJson(data) + md = datajson.get_content_list().to_nlp_md(exclude_nodes=DocElementType.COMPLEX_TABLE) + assert '' not in md + + +def test_datajson_exclude_nodes_to_mmd(): + data = { + DataJsonKey.DATASET_NAME: 'test_dataset', + DataJsonKey.FILE_FORMAT: 'html', + DataJsonKey.CONTENT_LIST: [[{ + 'type': 'simple_table', + 'raw_content': "
Title: T.J. Byrne, Slide of floor plan, Poor Law Commission cottage, 1872.
Authors: T.J., Byrne
Fewer, Michael
Keywords: T.J. Byrne
Cottages
Poor Law Commission
Issue Date: 2011
2011
Description: T.J. Byrne's slide of a one storey cottage, labelled 'Mr Barney's Plan', recommended by the Poor Law Commission, 1872.
URI: https://hdl.handle.net/10599/5719
Appears in Collections:Published Items
T.J. Byrne Collection
", + 'content': { + 'html': "
Title:T.J. Byrne, Slide of floor plan, Poor Law Commission cottage, 1872.
Authors:T.J., Byrne Fewer, Michael
Keywords:T.J. Byrne Cottages Poor Law Commission
Issue Date:2011 2011
Description:T.J. Byrne's slide of a one storey cottage, labelled 'Mr Barney's Plan', recommended by the Poor Law Commission, 1872.
URI:https://hdl.handle.net/10599/5719
Appears in Collections:Published Items T.J. Byrne Collection
", + 'is_complex': False, + 'table_nest_level': '1' + } + }, { + 'type': 'complex_table', + 'raw_content': "
Title: T.J. Byrne, Slide of floor plan, Poor Law Commission cottage, 1872.
Authors: T.J., Byrne
Fewer, Michael
Keywords: T.J. Byrne
Cottages
Poor Law Commission
Issue Date: 2011
2011
Description: T.J. Byrne's slide of a one storey cottage, labelled 'Mr Barney's Plan', recommended by the Poor Law Commission, 1872.
URI: https://hdl.handle.net/10599/5719
Appears in Collections:Published Items
T.J. Byrne Collection
", + 'content': { + 'html': "
Title:T.J. Byrne, Slide of floor plan, Poor Law Commission cottage, 1872.
Authors:T.J., Byrne Fewer, Michael
Keywords:T.J. Byrne Cottages Poor Law Commission
Issue Date:2011 2011
Description:T.J. Byrne's slide of a one storey cottage, labelled 'Mr Barney's Plan', recommended by the Poor Law Commission, 1872.
URI:https://hdl.handle.net/10599/5719
Appears in Collections:Published Items T.J. Byrne Collection
", + 'is_complex': True, + 'table_nest_level': '1' + } + }, { + 'type': 'image', + 'raw_content': "\"Curtindo", + 'content': { + 'url': 'https://naproadavida.com/wp-content/uploads/2020/11/20201024-Airbnb-SP-Consolacao_getaway_manha_Sony-1.jpg', + 'data': None, + 'alt': 'Curtindo o apartamento com piscina no centro de SP. ', + 'title': 'Curtindo o apartamento com piscina no centro de SP. ', + 'caption': None + } + }]] + } + datajson = DataJson(data) + md = datajson.get_content_list().to_mm_md(exclude_nodes=DocElementType.COMPLEX_TABLE) + assert '' not in md + assert 'Curtindo o apartamento com piscina no centro de SP.' in md + + def test_data_json_deepcopy(): """从一个外部dict构建datajson, 改变datajson,不改变外部dict.""" d = {'track_id': '32266dfa-c335-45c5-896e-56f057889d28', @@ -174,7 +233,7 @@ def test_data_json_to_nlp_md(): } }, { - 'type': 'table', + 'type': 'simple_table', 'raw_content': '
\n\t\t\t\tMaandag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tDinsdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tWoensdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tDonderdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tVrijdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tZaterdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tZondag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
', 'content': { 'html': '
Maandag-
Dinsdag-
Woensdag-
Donderdag-
Vrijdag-
Zaterdag-
Zondag-
', @@ -205,7 +264,7 @@ def test_default_exclude(): def test_custom_exclude(): datajson = DataJson(d) - md = datajson.get_content_list().to_nlp_md(MM_NODE_LIST=['table']) + md = datajson.get_content_list().to_nlp_md(exclude_nodes=[DocElementType.COMPLEX_TABLE, DocElementType.SIMPLE_TABLE]) assert 'Ziet u iets wat niet hoort of niet klopt?' in md assert 'Openingstijden' in md assert 'Maandag' not in md