From 913781a5fac109c15f8f243c122fe7d9ea82f215 Mon Sep 17 00:00:00 2001 From: ningwenchang Date: Fri, 22 Aug 2025 00:54:56 +0000 Subject: [PATCH 1/5] [fix]: Fix badcases in the simplify stage and improve its robustness. --- .../main_html_parser/parser/tag_simplifier.py | 4 +- .../simplify_html/simplify_html.py | 265 +- .../simplify_cases/abnormal_comment.html | 1 + .../simplify_cases/block_select.html | 70 + .../simplify_cases/header_tag.html | 842 +++ .../simplify_cases/inline_block.html | 422 ++ .../test_html_data/simplify_cases/list.html | 208 + .../simplify_cases/nav_class.html | 441 ++ .../simplify_cases/nested_table_caption.html | 217 + .../simplify_cases/nested_table_headers.html | 389 ++ .../simplify_cases/non_list_child.html | 5343 +++++++++++++++++ .../test_html_data/simplify_cases/table.html | 677 +++ .../parser/test_tag_simplifier.py | 233 +- 13 files changed, 9003 insertions(+), 109 deletions(-) create mode 100644 tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/abnormal_comment.html create mode 100644 tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/block_select.html create mode 100644 tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/header_tag.html create mode 100644 tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/inline_block.html create mode 100644 tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/list.html create mode 100644 tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/nav_class.html create mode 100644 tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/nested_table_caption.html create mode 100644 tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/nested_table_headers.html create mode 100644 tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/non_list_child.html create mode 100644 
tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/table.html diff --git a/llm_web_kit/main_html_parser/parser/tag_simplifier.py b/llm_web_kit/main_html_parser/parser/tag_simplifier.py index eede6cfb..493d46cc 100644 --- a/llm_web_kit/main_html_parser/parser/tag_simplifier.py +++ b/llm_web_kit/main_html_parser/parser/tag_simplifier.py @@ -19,12 +19,11 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson: """ # 获取输入数据 typical_raw_html = pre_data.get(PreDataJsonKey.TYPICAL_RAW_HTML, '') - is_xpath = pre_data.get(PreDataJsonKey.IS_XPATH, True) # layout_file_list = pre_data.get(PreDataJsonKey.LAYOUT_FILE_LIST, []) # 执行HTML标签简化逻辑 try: - simplified_html, original_html, _ = simplify_html(typical_raw_html, is_xpath=is_xpath) + simplified_html, original_html = simplify_html(typical_raw_html) except TagSimplifiedParserException as e1: raise e1 except Exception as e2: @@ -33,6 +32,5 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson: # 设置输出数据 pre_data[PreDataJsonKey.TYPICAL_RAW_TAG_HTML] = original_html # 保存原始标签HTML pre_data[PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML] = simplified_html # 保存简化后的HTML - pre_data[PreDataJsonKey.XPATH_MAPPING] = _ # 保存xpath return pre_data diff --git a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py index bfff8f29..45938f1c 100644 --- a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py +++ b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py @@ -10,12 +10,15 @@ inline_tags = { 'map', 'optgroup', 'span', 'br', 'input', 'time', 'u', 'strong', 'textarea', 'small', 'sub', 'samp', 'blink', 'b', 'code', 'nobr', 'strike', 'bdo', 'basefont', 'abbr', 'var', 'i', 'cccode-inline', - 'select', 's', 'pic', 'label', 'mark', 'object', 'dd', 'dt', 'ccmath-inline', 'svg', 'li', + 'select', 's', 'pic', 'label', 'mark', 'object', 'ccmath-inline', 'svg', 'button', 'a', 'font', 'dfn', 'sup', 'kbd', 'q', 'script', 'acronym', 'option', 'img', 
'big', 'cite', 'em', 'marked-tail', 'marked-text' - # 'td', 'th' + # 'td', 'th', 'dd', 'dt', 'li' } +# 表格内部可能包含的跟表格相关的标签 +table_tags_set = {"caption", "colgroup", "col", "thead", "tbody", "tfoot", "tr", "td", "th"} + # 需要删除的标签 tags_to_remove = { 'head', @@ -125,37 +128,77 @@ def get_relative_xpath(element): return f'//{"/".join(path_from_element)}' +def judge_table_parent(table_element, node_list): + for node in node_list: + ancestor = node.getparent() + while ancestor is not None: + if ancestor is table_element: + return True + elif ancestor.tag == 'table': + break + ancestor = ancestor.getparent() + return False + + def is_data_table(table_element: html.HtmlElement) -> bool: """判断表格是否是数据表格而非布局表格.""" - # 检查表格是否有 caption 标签 - if table_element.xpath('.//caption'): + # 检查当前表格(不包括内部嵌套表格)是否有 caption 标签 + caption_nodes = table_element.xpath('.//caption') + if judge_table_parent(table_element, caption_nodes): return True - # 检查是否有 th 标签 - if table_element.xpath('.//th'): + # 检查当前表格(不包括内部嵌套表格)是否有 colgroup 或 col 标签 + col_nodes = table_element.xpath('.//col') + colgroup_nodes = table_element.xpath('.//colgroup') + if judge_table_parent(table_element, col_nodes) or judge_table_parent(table_element, colgroup_nodes): return True - # 检查是否有 thead 或 tfoot 标签 - if table_element.xpath('.//thead') or table_element.xpath('.//tfoot'): + # 检查是否有 role="table" 或 data-table 属性 + if table_element.get('role') == 'table' or table_element.get('data-table'): return True - # 检查是否有 colgroup 或 col 标签 - if table_element.xpath('.//colgroup') or table_element.xpath('.//col'): + # 检查当前表格(不包括内部嵌套表格)单元格是否有 headers 属性 + cell_nodes = table_element.xpath(".//*[self::td or self::th][@headers]") + if judge_table_parent(table_element, cell_nodes): return True - # 检查是否有 summary 属性 - if table_element.get('summary'): - return True + for node in table_element.iterdescendants(): + if node.tag in table_tags_set: + continue + if node not in inline_tags: + return False - # 检查是否有 role="table" 或 data-table 属性 - if 
table_element.get('role') == 'table' or table_element.get('data-table'): - return True + return True - # 检查单元格是否有 headers 属性 - if table_element.xpath('.//*[@headers]'): - return True - return False +def has_non_listitem_children(list_element): + """检查列表元素是否包含非列表项的直接子节点. + + :param list_element: lxml元素对象 (ul, ol, dl) + :return: True 如果存在非列表项的直接子节点,否则 False + """ + # 获取所有直接子元素(不包括文本节点) + direct_children = list_element.xpath("./*") + + # 根据列表类型确定允许的子元素标签 + if list_element.tag in ['ul', 'ol']: + allowed_tags = {'li'} + elif list_element.tag == 'dl': + allowed_tags = {'dt', 'dd'} + else: + # 如果不是列表元素,返回False + return False + + # 检查是否存在不允许的元素 + for child in direct_children: + if child.tag not in allowed_tags: + return True + + # 检查是否存在非空白文本节点 + text_children = list_element.xpath("./text()") + non_whitespace_text = any(text.strip() for text in text_children) + + return non_whitespace_text def extract_paragraphs(processing_dom: html.HtmlElement, uid_map: Dict[str, html.HtmlElement], @@ -185,33 +228,68 @@ def extract_paragraphs(processing_dom: html.HtmlElement, uid_map: Dict[str, html for table in processing_dom.xpath('.//table'): table_types[table.get('data-uid')] = is_data_table(table) + # 创建列表类型映射,记录每个列表是内容列表还是布局列表 + list_types = {} + def is_block_element(node) -> bool: """判断是否为块级元素.""" - # 处理表格单元格特殊情况 - if node.tag in ('td', 'th'): - # 找到最近的祖先table元素 - table_ancestor = node - while table_ancestor is not None and table_ancestor.tag != 'table': - table_ancestor = table_ancestor.getparent() - - # 如果是表格单元格,根据表格类型决定是否为块级元素 - if table_ancestor is not None: - table_uid = table_ancestor.get('data-uid') - if table_types.get(table_uid, False): - # 数据表格的td/th不作为块级元素 + def judge_special_case(node, expected_tags, types_map): + ancestor = node + while ancestor is not None and ancestor.tag not in expected_tags: + ancestor = ancestor.getparent() + + if ancestor is not None: + ancestor_uid = ancestor.get('data-uid') + if types_map.get(ancestor_uid, False): + # 数据表格/内容列表的子元素不作为块级元素 
return False else: - # 布局表格的td/th作为块级元素 + # 布局表格/列表的子元素作为块级元素 return True + # 处理表格和列表的特殊情况 + if node.tag in ('td', 'th'): + return judge_special_case(node, ['table'], table_types) + + if node.tag == "li": + return judge_special_case(node, ['ul', 'ol'], list_types) + + if node.tag == "dt" or node.tag == "dd": + return judge_special_case(node, ['dl'], list_types) + # 默认处理其他元素 if node.tag in inline_tags: return False return isinstance(node, html.HtmlElement) - def has_block_children(node) -> bool: - """判断是否有块级子元素.""" - return any(is_block_element(child) for child in node.iterchildren()) + def has_block_descendants(node): + for child in node.iterdescendants(): + if is_block_element(child): + return True + return False + + def is_content_list(list_element): + # 获取列表项(支持多种列表类型) + items = list_element.xpath("li | dt | dd") + + # 空列表直接返回普通列表 + if len(items) == 0: + return True + # 列表包含非列表项子元素视为布局列表 + if has_non_listitem_children(list_element): + return False + + # 列表内任意子项存在块级元素,则视为布局列表 + for item in items: + if has_block_descendants(item): + return False + + # 默认视为普通列表 + return True + + # 先分析所有列表的类型 + for list_element in processing_dom.xpath('.//ul | .//ol | .//dl'): + list_types[list_element.get('data-uid')] = is_content_list(list_element) def clone_structure(path: List[html.HtmlElement]) -> Tuple[html.HtmlElement, html.HtmlElement]: """克隆节点结构.""" @@ -245,7 +323,7 @@ def process_node(node: html.HtmlElement, path: List[html.HtmlElement]): # 处理子节点 for child in node: - if is_block_element(child): + if is_block_element(child) or has_block_descendants(child): # 处理累积的内联内容 if inline_content: try: @@ -271,7 +349,7 @@ def process_node(node: html.HtmlElement, path: List[html.HtmlElement]): content_sources = [] # 处理块级元素 - if not has_block_children(child): + if table_types.get(child.get('data-uid')) or (not has_block_descendants(child)): try: root, last_node = clone_structure(current_path + [child]) last_node.text = child.text if child.text else None @@ -358,12 +436,26 @@ def 
merge_inline_content(parent: html.HtmlElement, content_list: List[Tuple[str, return unique_paragraphs +def safely_remove_comments(html_content): + # 创建解析器并设置为移除注释节点 + parser = html.HTMLParser(remove_comments=True) + doc = html.fromstring(html_content, parser=parser) + + # 重新序列化为字符串 + return etree.tostring( + doc, + encoding='unicode', + method='html', + with_tail=False + ) + + def remove_xml_declaration(html_string): # 正则表达式匹配 (没有问号结尾的情况) pattern = r'<\?xml\s+.*?\??>' html_content = re.sub(pattern, '', html_string, flags=re.DOTALL) - # 1. 删除HTML注释 - html_content = re.sub(r'', '', html_content, flags=re.DOTALL) + # 删除HTML注释 + html_content = safely_remove_comments(html_content) return html_content @@ -373,7 +465,7 @@ def post_process_html(html_content: str) -> str: return html_content # 1. 删除HTML注释 - html_content = re.sub(r'', '', html_content, flags=re.DOTALL) + html_content = safely_remove_comments(html_content) # 2. 处理标签外的空白(保留标签内文本的换行) def replace_outside_tag_space(match): @@ -401,7 +493,11 @@ def remove_tags(dom): for node in dom.xpath(f'.//{tag}'): parent = node.getparent() if parent is not None: - parent.remove(node) + if tag == "header" or tag == "footer": + if parent.tag == 'body': + parent.remove(node) + else: + parent.remove(node) def is_meaningful_content(element) -> bool: @@ -574,31 +670,14 @@ def simplify_list(element): def should_remove_element(element) -> bool: """判断元素的class或id属性是否匹配需要删除的模式.""" - # 检查class属性 - class_name = element.get('class', '') - if class_name: - class_parts = class_name.strip().split() - for part in class_parts: - # 检查是否完全匹配独立单词 - if part in ATTR_PATTERNS_TO_REMOVE: - return True - # 检查是否包含特定前缀/后缀 - # for pattern in ATTR_SUFFIX_TO_REMOVE: - # if part.endswith(pattern): - # return True - # 检查id属性 + class_name = element.get('class', '') id_name = element.get('id', '') - if id_name: - id_parts = id_name.strip().split('-') # id通常用连字符分隔 - for part in id_parts: - # 检查是否完全匹配独立单词 - if part in ATTR_PATTERNS_TO_REMOVE: - return True - # 
检查是否包含特定前缀/后缀 - # for pattern in ATTR_SUFFIX_TO_REMOVE: - # if part.endswith(pattern): - # return True + + if class_name in ATTR_PATTERNS_TO_REMOVE or id_name in ATTR_PATTERNS_TO_REMOVE: + parent = element.getparent() + if parent is not None and parent.tag == 'body': + return True # 检查style属性 style_attr = element.get('style', '') @@ -665,7 +744,7 @@ def truncate_text_content(element, max_length=500): remaining -= len(text) -def process_paragraphs(paragraphs: List[Dict[str, str]], uid_map: Dict[str, html.HtmlElement], is_xpath: bool = True) -> Tuple[str, html.HtmlElement]: +def process_paragraphs(paragraphs: List[Dict[str, str]], uid_map: Dict[str, html.HtmlElement]) -> Tuple[str, html.HtmlElement]: """处理段落并添加 _item_id,同时在原始DOM的对应元素上添加相同ID. Args: @@ -680,7 +759,7 @@ def process_paragraphs(paragraphs: List[Dict[str, str]], uid_map: Dict[str, html for para in paragraphs: try: - html_content = re.sub(r'', '', para['html'], flags=re.DOTALL) + html_content = safely_remove_comments(para['html']) # 解析段落HTML root = html.fromstring(html_content) root_for_xpath = copy.deepcopy(root) @@ -698,29 +777,6 @@ def process_paragraphs(paragraphs: List[Dict[str, str]], uid_map: Dict[str, html # 截断过长的文本内容 truncate_text_content(root, max_length=1000) - para_xpath = [] - if is_xpath: - if content_type in ('inline_elements', 'mixed'): - for child in root_for_xpath.iterchildren(): - original_element = uid_map.get(child.get('data-uid')) - try: - _xpath = get_relative_xpath(original_element) - except Exception: - _xpath = None - para_xpath.append(_xpath) - elif content_type == 'block_element': - try: - _xpath = get_relative_xpath(para['_original_element']) - except Exception: - _xpath = None - para_xpath.append(_xpath) - else: - try: - _xpath = get_relative_xpath(para['_original_element']) - except Exception: - _xpath = None - para_xpath.append(_xpath) - # 为当前段落和原始元素添加相同的 _item_id current_id = str(item_id) root.set('_item_id', current_id) @@ -767,6 +823,9 @@ def process_paragraphs(paragraphs: 
List[Dict[str, str]], uid_map: Dict[str, html # 创建wrapper元素 wrapper = etree.Element(tail_block_tag) wrapper.set('_item_id', current_id) + # 如果父元素包含cc-select,那么包裹的wrapper元素也应该包含cc-select,避免_item_id和cc-select不在同一层级中 + if original_parent.get("cc-select") is not None: + wrapper.set("cc-select", original_parent.get("cc-select")) # 设置前面的文本 if leading_text: @@ -804,7 +863,9 @@ def process_paragraphs(paragraphs: List[Dict[str, str]], uid_map: Dict[str, html wrapper = etree.Element(tail_block_tag) wrapper.set('_item_id', current_id) wrapper.text = original_parent.text - + # 如果父元素包含cc-select,那么包裹的wrapper元素也应该包含cc-select + if original_parent.get("cc-select") is not None: + wrapper.set("cc-select", original_parent.get("cc-select")) # 替换父节点的text original_parent.text = None @@ -824,7 +885,9 @@ def process_paragraphs(paragraphs: List[Dict[str, str]], uid_map: Dict[str, html wrapper = etree.Element(tail_block_tag) wrapper.set('_item_id', current_id) wrapper.text = child.tail - + # 如果父元素包含cc-select,那么包裹的wrapper元素也应该包含cc-select + if original_parent.get("cc-select") is not None: + wrapper.set("cc-select", original_parent.get("cc-select")) # 替换tail child.tail = None @@ -838,6 +901,10 @@ def process_paragraphs(paragraphs: List[Dict[str, str]], uid_map: Dict[str, html else: # 块级元素直接设置属性 original_parent.set('_item_id', current_id) + for child in original_parent.iterdescendants(): + if child.get("cc-select") is not None: + original_parent.set("cc-select", child.get("cc-select")) + break item_id += 1 @@ -846,7 +913,6 @@ def process_paragraphs(paragraphs: List[Dict[str, str]], uid_map: Dict[str, html result.append({ 'html': cleaned_html, '_item_id': current_id, - '_xpath': para_xpath, 'content_type': content_type }) @@ -859,10 +925,10 @@ def process_paragraphs(paragraphs: List[Dict[str, str]], uid_map: Dict[str, html simplified_html = '' + ''.join( p['html'] for p in result) + '' - return post_process_html(simplified_html), result + return post_process_html(simplified_html) -def 
simplify_html(html_str, is_xpath: bool = True) -> etree.Element: +def simplify_html(html_str) -> etree.Element: """ :return: simplified_html: 精简HTML @@ -891,14 +957,9 @@ def simplify_html(html_str, is_xpath: bool = True) -> etree.Element: paragraphs = extract_paragraphs(processing_dom, original_uid_map, include_parents=False) # 处理段落(同步添加ID) - simplified_html, result = process_paragraphs(paragraphs, original_uid_map, is_xpath) + simplified_html = process_paragraphs(paragraphs, original_uid_map) remove_all_uids(original_dom) original_html = etree.tostring(original_dom, pretty_print=True, method='html', encoding='unicode') - _xpath_mapping = {item['_item_id']: { - '_xpath': item['_xpath'], - 'content_type': item['content_type'] - } for item in result} - - return simplified_html, original_html, _xpath_mapping + return simplified_html, original_html diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/abnormal_comment.html b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/abnormal_comment.html new file mode 100644 index 00000000..840fc308 --- /dev/null +++ b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/abnormal_comment.html @@ -0,0 +1 @@ + Appell sequence

 

.

In mathematics, an Appell sequence, named after Paul Émile Appell, is any polynomial sequence {pn(x)}n = 0, 1, 2, ... satisfying the identity

\( {d \over dx} p_n(x) = np_{n-1}(x), \)

and in which p0(x) is a non-zero constant.

Among the most notable Appell sequences besides the trivial example { xn } are the Hermite polynomials, the Bernoulli polynomials, and the Euler polynomials. Every Appell sequence is a Sheffer sequence, but most Sheffer sequences are not Appell sequences.

Equivalent characterizations of Appell sequences

The following conditions on polynomial sequences can easily be seen to be equivalent:

For n = 1, 2, 3, ...,

\( {d \over dx} p_n(x) = np_{n-1}(x) \)

and p0(x) is a non-zero constant;

For some sequence {cn}n = 0, 1, 2, ... of scalars with c0 ≠ 0,

\( p_n(x) = \sum_{k=0}^n {n \choose k} c_k x^{n-k}; \)

For the same sequence of scalars,

\( p_n(x) = \left(\sum_{k=0}^\infty {c_k \over k!} D^k\right) x^n, \)

where

D = {d \over dx};

For n = 0, 1, 2, ...,

p_n(x+y) = \sum_{k=0}^n {n \choose k} p_k(x) y^{n-k}.

Recursion formula

Suppose

\( p_n(x) = \left(\sum_{k=0}^\infty {c_k \over k!} D^k\right) x^n = Sx^n, \)

where the last equality is taken to define the linear operator S on the space of polynomials in x. Let

\( T = S^{-1} = \left(\sum_{k=0}^\infty {c_k \over k!} D^k\right)^{-1} = \sum_{k=1}^\infty {a_k \over k!} D^k \)

be the inverse operator, the coefficients ak being those of the usual reciprocal of a formal power series, so that

\( Tp_n(x) = x^n.\, \)

In the conventions of the umbral calculus, one often treats this formal power series T as representing the Appell sequence {pn}. One can define

\( \log T = \log\left(\sum_{k=0}^\infty {a_k \over k!} D^k \right) \)

by using the usual power series expansion of the log(1 + x) and the usual definition of composition of formal power series. Then we have

\( p_{n+1}(x) = (x - (\log T)')p_n(x).\, \)

(This formal differentiation of a power series in the differential operator D is an instance of Pincherle differentiation.)

In the case of Hermite polynomials, this reduces to the conventional recursion formula for that sequence.
Subgroup of the Sheffer polynomials

The set of all Appell sequences is closed under the operation of umbral composition of polynomial sequences, defined as follows. Suppose { pn(x) : n = 0, 1, 2, 3, ... } and { qn(x) : n = 0, 1, 2, 3, ... } are polynomial sequences, given by

\( p_n(x)=\sum_{k=0}^n a_{n,k}x^k\ \mbox{and}\ q_n(x)=\sum_{k=0}^n b_{n,k}x^k. \)

Then the umbral composition p o q is the polynomial sequence whose nth term is

\( (p_n\circ q)(x)=\sum_{k=0}^n a_{n,k}q_k(x)=\sum_{0\le k \le \ell \le n} a_{n,k}b_{k,\ell}x^\ell \)

(the subscript n appears in pn, since this is the n term of that sequence, but not in q, since this refers to the sequence as a whole rather than one of its terms).

Under this operation, the set of all Sheffer sequences is a non-abelian group, but the set of all Appell sequences is an abelian subgroup. That it is abelian can be seen by considering the fact that every Appell sequence is of the form

\( p_n(x) = \left(\sum_{k=0}^\infty {c_k \over k!} D^k\right) x^n, \)

and that umbral composition of Appell sequences corresponds to multiplication of these formal power series in the operator D.
Different convention

Another convention followed by some authors (see Chihara) defines this concept in a different way, conflicting with Appell's original definition, by using the identity

\( {d \over dx} p_n(x) = p_{n-1}(x) \)

instead.
See also

Sheffer sequence
Umbral calculus
Generalized Appell polynomials
Wick product

References

Paul Appell, "Sur une classe de polynômes", Annales scientifiques de l'École Normale Supérieure 2e série, tome 9, 1880.
Steven Roman and Gian-Carlo Rota, "The Umbral Calculus", Advances in Mathematics, volume 27, pages 95 – 188, (1978).
G.-C. Rota, D. Kahaner, and A. Odlyzko, "Finite Operator Calculus", Journal of Mathematical Analysis and its Applications, vol. 42, no. 3, June 1973. Reprinted in the book with the same title, Academic Press, New York, 1975.
Steven Roman. The Umbral Calculus. Dover Publications.
Theodore Seio Chihara (1978). An Introduction to Orthogonal Polynomials. Gordon and Breach, New York. ISBN 0-677-04150-0.

External links

Appell Sequence at MathWorld

Mathematics Encyclopedia

Retrieved from "http://en.wikipedia.org/"
All text is available under the terms of the GNU Free Documentation License

Home - Hellenica World


\ No newline at end of file diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/block_select.html b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/block_select.html new file mode 100644 index 00000000..ac222e70 --- /dev/null +++ b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/block_select.html @@ -0,0 +1,70 @@ + + + +Ikea - Term Papers - Business + + + + + + + + + + + + + + + + +
+
+ +
+ + + +
+full version Ikea Essay +

Ikea

+

Category: Business

+

Autor: jessica85 01 June 2010

+

Words: 1191 | Pages: 5

+

A. What are the cultural factors which make expansion abroad in retailing difficult? What has made it possible in IKEA's case?

Retailing expansions can be difficult, because of differences in culture in the global market. When entering a new market, corporations tend to do considerable studies catered towards local tastes. There are many factors to consider when expanding into a new area or culture, because culture can have a great impact on merchandising, and promotion of products. (Hibbert, Edgar 2000;)
The retailing difficulties are not only limited to merchandising and promotion but the cross-over of store brands and brand images. The social systems and social behavior also affects the corporation as different management styles and company cultures may be difficult for employees to adjust to and their maybe clashes which can make the whole process less effective and also less efficient. If there are major differences in the existing culture and language difficulties it may establish greater cultural barriers.
Culture also affects the four P’s in marketing the new product abroad, there can be difficulties and adjustments if the new market is price sensitive, has a high context culture and the corporation came from a country with a low context culture or vise-versa which can affect the promotion of the product. In another case study presented it was highlighted, that some cultures associate the price of a product with the quality of the product. (Hibbert, Edgar 2000;)The culture of the country also dictates if the target markets will be living in urban, rural or suburban areas and research must be done to show if they would be willing to travel out of their area to come to a different place to obtain the product.
In expanding to global markets retaining corporations will have to take into consideration the import, duties and taxes, and also government rules and regulations. The cost and availability of land, for example in when Toys �R’ Us moved to Japan, land in city areas were scare, limited and very expensive. (Hibbert, Edgar 2000;)
IKEA is an European store, more specifically, a Swedish furniture store. Expanding throughout Europe brought about less challenging difficulties because being an European store, there were many cultural similarities and cost advantages due to economies of scale. However, in there early expansion to the United States they faced many hurdles as they failed to adapt there strategies to the American culture and instead of imposed their own.
IKEA ultimately recognized their mistakes, for example Ikea tried to impose their European standard bed, which were longer and thinner, while selling American standardize bed sheets to the American customers. IKEA soon redesigned its American product range which immediately increased there sales. They also reduced their dependence on outside suppliers and recruited American suppliers. They also had their own people working alongside the manufactures to give technical tips and to find the better quality or lower cost materials. IKEA also had to change the way they did promotions, because the United States did not have a homogenous culture so the traditional forms of promotion would not have been as effective as elsewhere.


B. How does the TV advertising campaign initiated by IKEA overcome the entry barrier of high advertising expenditures?

IKEA could no longer use their strategy since America has a very diverse population with a variety of sub cultures and the “word of mouth” strategy would have been less effective than it was Europe and other countries. Because of the culturally diversity that exist in the United States, social norms and interpersonal communication are less reliable, foreign companies coming to the United States often find that corporate advertising done here far outstrips what they have used elsewhere. ” ( Johansson, Johny K. 2006; )
Therefore, IKEA came up with a new slogan and advertising message that would have the same effect and be consistent with previous marketing strategies used in countries with a more homogenous population. To implement this strategy IKEA’s advertising company created eight, thirty second ads that showed people in the different stages in their life. This focus allowed them to capitalize on reaching diverse markets at a fairly low cost.
IKEA’s TV advertising campaign overcame the entry barrier of high advertising by studying the American advertising, where they realized “in Europe you advertise to fain business; in the United States you advertise to stay in business The role of advertising in the United States is much greater than in other countries, according to statistics in 2004, the United States spent 242.5 billion in advertising, Canada 5.2 billion and Sweden 2.7 billion. In order to be on an even playing field IKEA found a strategy that was at the same time low cost and very effective. (Johansson, Johny K. 2006; )



C. Should IKEA expand further in the United States or focus on other countries? Be specific. What should they do and why?

As a marketing manager for IKEA, the suggestion of expanding further in the United States would be the best recommendation. As stated in the opening of the case the United States is potentially a very large market for IKEA which has not been taken advantage of, seeing that IKEA has been in the United States for twenty years and only has twenty stores. IKEA has not recognized its full potential in the United States; they need to build many smaller stores shifting away from its current strategy of a few large stores. They need to target communities to who they can cater to specifically.
The United States has many urban areas where IKEA products would be in high demand. IKEA products are user friendly in design, which includes furniture -in -a -box model and other low cost varieties of furniture. However, people in urban areas would not necessarily want to travel to suburban areas many miles from home to choose their furniture. Even with delivery, the locations of most of IKEA stores are inconvenient to people who do not have cars to get to the stores.
IKEA needs to be better promoted in the United States; commercials that may have worked with their original entrance cannot compete with new advertising aimed at several of their target markets. Retailers often redo their advertising every year, or have commercials made for specific targets. With low cost, packaged kits, IKEA should for example, have advertising to target college kids during the months of August and September, and advertising targeting new graduates during the months of May and June, young adults who are now starting off with their first homes.
In conclusion, IKEA products which are well designed and low-cost, suit very many consumers, however the locations maybe too out of the way to attract costumers who value convenience and ease over price. In addition to their huge warehouse stores IKEA needs to open smaller retail outlets in key urban areas where their product are needed, only then will they have taken advantage of the markets in the United States and can focus on expanding into other countries.


References
Hibbert, E. (2000). Globalisation in retailing -The impact of culture on market entry. Retrieved 12/8, 2007, from http://www.mubs.mdx.ac.uk/Research/Discussion_Papers/Marketing/dpap_mkt_no14.pdf
Johansson, J. K. (2006). Global marketing (4th edition ed.). New York: McGraw Hill Irwin.

+
+
+
+ \ No newline at end of file diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/header_tag.html b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/header_tag.html new file mode 100644 index 00000000..613e7aaf --- /dev/null +++ b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/header_tag.html @@ -0,0 +1,842 @@ + + +2018 - How a Construction Contractor Improved My Home + + + + + + + + + + + + + + + + + + + +
+
+
+
+

2018

+
+
    +
  • +
    +

    Tips to Ensure Safe DIY Removal of Non-Friable Asbestos From Your Home

    +

    +

    + It is always best to you hire an asbestos removal professional when you need to remove asbestos from your abode. But if you only need to remove a small amount of firmly-bound asbestos that does not require you to obtain an asbestos removal permit, then you can take on the project yourself provided you take the necessary safety measures. Here are a number of effective safety practices to follow when removing non-friable asbestos from your house.
    +[Read More] +

    +
    +
  • +
+
    +
  • +
    +

    How to strengthen your home's foundation against flood damage

    +

    +

    + If you live in a flood-prone area, your foundation may need extra strength against floodwaters. The foundation should also be strengthened against storm surges, strong winds, and heavy flows of water. In this way, your home is likely to experience less damage when a hurricane or tornado strikes. +There are several ways through which you can strengthen your home's foundation. Some foundation work is more complex than others, so make sure you determine the level of strength that your property needs.
    +[Read More] +

    +
    +
  • +
+
    +
  • +
    +

    What are the Evident Signs that Your Building Needs Underpinning?

    +

    +

    + Problems with a building's foundation can affect its ability to endure the stress and weight of the entire structure. If you don't take immediate measures to stabilise the foundation, you will end up with even costlier repairs. Alternatively, you may choose to demolish the structure, especially if the amount of repair work is too costly and involved. That's why you need to look out for the following signs that indicate your foundation could be experiencing some damage.
    +[Read More] +

    +
    +
  • +
+
    +
  • +
    +

    Home Improvement: Simple Renovations Tips for a Small Budget

    +

    +

    + Renovating your home can be an expensive undertaking, and the costs can get out of control, causing unexpected complications. Therefore, if you are planning on improving your home, you should prepare a suitable budget. Financial planning will help you manage your spending and avoid exceeding your means. Unfortunately, your resources might be limited even if you create a good budget. If you are worried about the total cost for the renovation work, you should use the outlined guidelines to keep your expenses in check.
    +[Read More] +

    +
    +
  • +
+
    +
  • +
    +

    Survey Services - Why You Should Invest In Drone Technology

    +

    +

    + Before you can embark on construction on your lot, one of the most critical assessments to have done is a topographical land survey. This survey is essential to enlighten your contractors on the state of the land before they can begin developing it. It typically is conducted manually and is a long and arduous process to ensure precise mapping out of the property. However, as tech advances, there is an advanced method of doing this through drones.
    +[Read More] +

    +
    +
  • +
+
    +
  • +
    +

    Everything That's Great About Metal Roofs

    +

    +

    + Without a doubt, metal is one of the most beneficial roofing materials available. Compared to other alternatives, there is a lot that is cool about metal roofs. Firstly, the savings on air conditioning will be significant. That's right; a metal roof will make your home cooler and save you on AC costs. As metal roofing contractors will confirm, the secret to how energy efficient a metal roof will be lies in the finishing chosen.
    +[Read More] +

    +
    +
  • +
+
    +
  • +
    +

    Exploring Various Types of Tennis Court Surfaces

    +

    +

    + Since 1859, tennis has been a popular sport played in many different areas around the world. In current times, tennis features some of the world's most lucrative tournaments, such as the U.S Open, the French Open, and the Australian Open. Fans of tennis enjoy emulating top athletes such as Rodger Federer, Serena Williams and Rafael Nadal, just to name a few. +While the dimensions of a tennis court are typically standard, the playing surfaces differ significantly.
    +[Read More] +

    +
    +
  • +
+
    +
  • +
    +

    Grin And Bore It: The Many Ways Using Bore Water Can Lower Your Farm's Water Bills

    +

    +

    + Of the many concerns a farmer has to deal with as they go about the day-to-day business of running their farms, one of the most important is the amount of water their business consumes. All varieties of farm, from crop-growing operations to livestock ranches, use prodigious amounts of water for various tasks, and if all of that water is drawn from centralised, government-owned sources, the costs of using it can add up very quickly.
    +[Read More] +

    +
    +
  • +
+
    +
  • +
    +

    How to repair chipped areas of your kerb

    +

    +

    + Many people fear to deal with concrete kerbs because it seems like one of the most complicated building materials to operate. However, there are times when it becomes close to impossible to avoid making minor repairs on your concrete, especially when the kerb has served you for some years, and it is starting to disintegrate. One of the common damages that happen to kerbs as they age is that they start chipping.
    +[Read More] +

    +
    +
  • +
+
    +
  • +
    +

    How piling ensures a stable foundation for your building

    +

    +

    + A strong foundation is essential to support your building. Whether you want to create a new build property or just add a room to an existing building you need to be sure that what you build will stand the test of time. Before you start building work it is important that you begin with a detailed survey of your land to ensure it is stable enough to support your building. If any issues are highlighted by the survey you should discuss the results with a qualified building professional who will be able to guide you towards the most appropriate solutions.
    +[Read More] +

    +
    +
  • +
+ +
+
+
+ + + + + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/inline_block.html b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/inline_block.html new file mode 100644 index 00000000..40b59109 --- /dev/null +++ b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/inline_block.html @@ -0,0 +1,422 @@ + + + + + + + +WEBINAR Replay: Wellbeing in Schools: Discussion and Solutions + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

 In this webinar, recorded on Wednesday 28 July 2021, CEO and Founder Nikki Bonus invites Cheryl Edward, Senior Psychologist/Pastoral Care and Wellbeing Officer: Catholic Education (NT) and Lisa Franks, Assistant Principal at Kurrajong Public School (NSW), to discuss their whole-school approaches to wellbeing and its benefits so every child can be supported and connect, thrive and learn.

+

In this webinar, our experts will discuss:

+
    +
  • Their strategic and planned approaches to wellbeing
  • +
  • What data they used to inform their strategic direction and evaluate outcomes contributing to the success of their wellbeing practices
  • +
  • How to start measuring wellbeing effectively
  • +
  • How Life Skills GO supports the teaching and measurement of wellbeing
  • +
+

About our experts:

+

Cheryl Edward is a psychologist currently working in the Northern Territory, supporting Inclusion Support Services and Pastoral Care and Wellbeing in Catholic Education, NT.  She started as a School Counsellor with Katherine Group School, in the Department of Education, in 2010, working in 8 schools east of Katherine.  By the time Cheryl moved to Catholic Education, NT in Term 2, 2017, she was working across what is known as the Big Rivers region, covering 28 schools and 4,000 students across 400,000 square km’s. Throughout her work over the past 12 years, she has seen the most difference when schools work with a trauma-informed lens, incorporating neuroscience with social emotional learning. 

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+

Download the replay:

+
+
+
+
+
+
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/list.html b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/list.html new file mode 100644 index 00000000..b318814b --- /dev/null +++ b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/list.html @@ -0,0 +1,208 @@ + + + Who Is In Your Top 3 Mentalists Of All Time? • MAGICIANSANDMAGIC.COM

Who is in your Top 3 Mentalists of all time?

  • This topic has 4 replies, 2 voices, and was last updated 1 year ago by Kenny.
Login Register
Viewing 4 reply threads
  • Author
    Posts
    • #1515
      Kenny
      Keymaster

      If you want to – tell us who the top mentalist in your country is.

    • #7011
      Bernie Amler
      Participant

      Obviously top billing goes to Banachek (Steve Shaw)
      and in no particular order…
      Max Maven (Phil Goldstein)
      The Amazing Kreskin (yes, he’s still around)
      Richard Osterlind
      Marc Salem
      and many who has passed on.

    • #7018
      Kenny
      Keymaster

      Great answer Bernie.

    • #8617
      Bernie Amler
      Participant

      I just recently found out that Banacheck originally lived in Soth Africa. He is from Port Elizabeth.

    • #8624
      Kenny
      Keymaster

      Hi Bernie, yes he is from SA. Good observation.

Login Register
Viewing 4 reply threads
  • You must be logged in to reply to this topic.
\ No newline at end of file diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/nav_class.html b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/nav_class.html new file mode 100644 index 00000000..9481bf78 --- /dev/null +++ b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/nav_class.html @@ -0,0 +1,441 @@ + + +

 + + +

+ + +【澎湃新闻】谢贵安:张居正对万历皇帝的儒学教育-国学院 + + + + + + + + + + + +
+ + +
+
+ +
+
+
+
院内新闻
+ +
+
+
学院公告
+ +
+
+
媒体报道
+ +
+
+
+
您的位置: + 首页 > 国学培训>
+ +
+
+
+ + + + + + +
+ +
+
+
+
+
+
+
+
+
+
友情链接 + +
+

Copyright?2014 武汉大学国学院版权所有 All Rights Reserved.     地址:中国·武汉·珞珈山     邮编:430072

+
+ +
\ No newline at end of file diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/nested_table_caption.html b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/nested_table_caption.html new file mode 100644 index 00000000..10ee612b --- /dev/null +++ b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/nested_table_caption.html @@ -0,0 +1,217 @@ + + + + +"Электромеханика" | Products + + + + + + + +
+ + + + + + + + + + + + + + + + +
+
РУС / ENG
+
+ + + +
+

Thermo

+ +
 
+ + + +
+ + + + + + +
+
+ +

Aggregates of recirculation heating “ARN” type

+

Appointment

+


+
Aggregates of recirculation heating “ARN” type are made for the manufacture of various parts of composite materials, based on epoxy, polyester and other resins.
+Main products are large-sized body parts, including complex core models (punches, matrices), technical reservoirs and other.

+

+Назад
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Specifications
Dimensions of the unit, mm
+  - length
+  - width
+  - height
“ARN-2”
+
12600
+ 5800
+ 3700
“ARN-3А”
+
16 600
+ 6200
+ 3700
“ARN-3B”
+
16600
+ 4800
+ 3700
“ARN-4”
+
15600
+ 10400
+ 6600
Dimensions of operating space, mm:
+  - length
+  - width
+  - height
4130
+ 3500
+ 2500
6490
+ 3100
+ 2500
6490
+ 2000
+ 2500
5830
+ 8000
+ 4510
Heat-transfer agent temperature, °C300250250250
Temperature drop in operating space, °C±2±2±2±2
Heat-transfer temperature controlManual, automatic
Temperature
+ regulation accuracy, °C
±1,5±2±2±1,5
Heating methodAerodynamic
Number of heaters2214
Power of rotary heater, kW555575110
Weight of charge, kg1000300030008000
Speed of heating and cooling
+ to 100°C, min

+ Not controlled
Speed of heating and cooling
+ above 100°C, min
1-21-21-21-2
Regulation of the heating rate,
+ above 100°C
Software
+

 

+ +
+ + + +
+ +
+
+ + + + \ No newline at end of file diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/nested_table_headers.html b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/nested_table_headers.html new file mode 100644 index 00000000..61408cb0 --- /dev/null +++ b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/nested_table_headers.html @@ -0,0 +1,389 @@ + + + + + AIT Associated Repository of Academic Resources: Browsing DSpace + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ DSpace + + + + DSpace + + + + 日本語
+ +
 
+ + + + + + + + + + + + + + + + + + + +
+ + +

+AIT Associated Repository of Academic Resources > +

+ +

+ Browsing by Author 中村, 栄治 +

+ + +
+
+ + + + + + + + + + + +
+ + + + + + + +
+ Jump to: + 0-9 + A + B + C + D + E + F + G + H + I + J + K + L + M + N + O + P + Q + R + S + T + U + V + W + X + Y + Z +
+ or enter first few letters:  +   +
+
+
+
+ + +
+
+ + + Sort by: + + In order: + + Results/Page + + Authors/Record: + + +
+
+ +
+ Showing results 5 to 24 of 37 +
+ +
+ < previous  +  next > +
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
Issue DateTitleAuthor(s)
31-Mar-2012DCTおよびVQを用いた画像電子透かし柴田, 且崇; 沢田, 克敏; 中村, 栄治; SHIBATA, Katsutaka; SAWADA, Katsutoshi; NAKAMURA, Eiji
30-Sep-2015ESLによるハンズフリーセキュリティシステム中村, 栄治; 森, 雅斗; 伊藤, 朔太; NAKAMURA, Eiji; MORI, Masato; ITO, Shota
31-Aug-2021FDSシミュレーション結果を取り入れた避難シミュレーションの試み中村, 栄治; NAKAMURA, Eiji
31-Mar-2003サブバンドおよびベクトル量子化と組み合わせたフラクタル画像符号化石川, 敬介; 中村, 栄治; 沢田, 克敏; ISHIKAWA, Keisuke; NAKAMURA, Eiji; SAWADA, Katsutoshi
31-Mar-2013サブバンド処理を用いた画像電子透かし栗本, 裕巳; 沢田, 克敏; 中村, 栄治; KURIMOTO, Hiromi; SAWADA, Katsutoshi; NAKAMURA, Eiji
31-Mar-2001サブバンド分割と組み合わせたフラクタル画像符号化永井, 進也; 中村, 栄治; 沢田, 克敏; NAGAI, Shinya; NAKAMURA, Eiji; SAWADA, Katsutoshi
31-Mar-2000サブブロック輝度シフトを用いたフラクタル画像符号化平岩, 裕樹; 中村, 栄治; 沢田, 克敏; HIRAIWA, Yuuki; NAKAMURA, Eiji; SAWADA, Katsutoshi
30-Sep-2019サリエンシーと人工知能による景観に配慮した防災サインの設置検討山本, 義幸; 中村, 栄治; 倉橋, 奨; YAMAMOTO, Yoshiyuki; NAKAMURA, Eiji; KURAHASHI, Susumu
31-Oct-2016トンネル災害調査を想定した調査ロボットシステム奥川, 雅之; 中村, 栄治; 山本, 義幸; 倉橋, 奨; 落合, 鋭充; OKUGAWA, Masayuki; NAKAMURA, Eiji; YAMAMOTO, Yoshiyuki; KURAHASHI, Susumu; OCHIAI, Toshimichi
31-Mar-2005ブロックの分散・平均値を用いた画像電子透かし桃井, 秀人; 中村, 栄治; 沢田, 克敏; MOMOI, Hideto; NAKAMURA, Eiji; SAWADA, Katsutoshi
31-Jul-2015衛星動画による波の挙動解析ー津波モニタリングに向けてー山本, 義幸; 田中, 純; 中村, 栄治; YAMAMOTO, Yoshiyuki; TANAKA, Jyun; NAKAMURA, Eiji
23-Sep-2016下水マンホールデータビューワの開発中村, 栄治; 蟹江, 秀俊; NAKAMURA, Eiji; KANIE, Hidetoshi
31-Aug-2022火災時における階段施設が主避難路となる場合の避難シミュレーション中村, 栄治; NAKAMURA, Eiji
31-Jul-2014海水浴場における津波避難行動に関する研究森田, 匡俊; 小池, 則満; 小林, 哲郎; 山本, 義幸; 中村, 栄治; 正木, 和明; MORITA, Masatoshi; KOIKE, Norimitsu; KOBAYASHI, Tetsurou; YAMAMOTO, Yoshiyuki; NAKAMURA, Eiji; MASAKI, Kazuaki
31-Jul-2015環境情報取得におけるUAV活用の検討中村, 栄治; 山本, 義幸; NAKAMURA, Eiji; YAMAMOTO, Yoshiyuki
31-Mar-2003輝度・色差成分間の相関を利用したカラー画像のサブバンド・フラクタル符号化中根, 勇樹; 中村, 栄治; 沢田, 克敏; NAKANE, Yuki; NAKAMURA, Eiji; SAWADA, Katsutoshi
31-Oct-2016巨大津波を想定した海上ハザードマップの作成に関する研究 : 三重県南伊勢町を事例として小池, 則満; 服部, 亜由未; 森田, 匡俊; 岩見, 麻子; 江見, 友作; 中村, 栄治; KOIKE, Norimitsu; HATTORI, Ayumi; MORITA, Masatoshi; IWAMI, Asako; EMI, Yusaku; NAKAMURA, Eiji
31-Aug-2023交通信号機の機能停止時における車両渋滞の回避対策中村, 栄治; 中井, 俊; NAKAMURA, Eiji; NAKAI, Shun
31-Jul-2013自然災害に対する意思決定支援システムの構築正木, 和明; 小池, 則満; 森田, 匡俊; 中村, 栄治; 奥川, 雅之; 山本, 義幸; 倉橋, 奨; 落合, 鋭充; MASAKI, Kazuaki; KOIKE, Norimitsu; MORITA, Masatoshi; NAKAMURA, Eiji; OKUGAWA, Masayuki; YAMAMOTO, Yoshiyuki; KURAHASHI, Susumu; OCHIAI, Toshimichi
30-Sep-2020大規模屋内施設からの避難シミュレーション中村, 栄治; 小池, 則満; NAKAMURA, Eiji; KOIKE, Norimitsu
+ + +
+ Showing results 5 to 24 of 37 +
+ +
+ < previous  +  next > +
+ + + +

 

+
+ + + + + + +
+ Valid XHTML 1.0! + + DSpace Software Copyright © 2002-2010  Duraspace - + Feedback + + +
+
+ + diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/non_list_child.html b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/non_list_child.html new file mode 100644 index 00000000..27ecb944 --- /dev/null +++ b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/non_list_child.html @@ -0,0 +1,5343 @@ + + + + + + + +泡泡详情 | 泡泡 - 一个清新文艺的微社区 + + + + + + + + + + + + + + + + + + + + + + + + +
  • henryspace @henryspace
    #Crypto + +发现个端到端加密的聊天和文件共享工具,挺有意思~ +
    0
    19
    0
    0
  • 评论
    默认
    最新
  • henryspace @henryspace
    浙江省杭州市
    发展大了的电报毕竟也要受监管,凡是中心化的东西都逃不脱要被控制的命运;但是太自由了也很容易玩脱了,成为作恶和犯罪的温床,所以区块链这种东西注定也只是小众的,只要有人群在的地方,人们不允许有不能被控制的东西任意发展。
  • 北野 @alimy
    上海市
    好熟悉的app啊,看👀其logo,就好像哪见过,之前好像有开源版本来着?后来被zoom给收购了? 忘了,可能搞混了吧~
  • 北野 @alimy
    上海市
    @henryspace 端到端加密 说实话也并不是大众群体的刚性需求,这玩意说以开发商的角度思考,就TM的一个好噱头,方便讲故事拉投资顺便拉点用户。 +站在普通大众群体,端密聊天,可有可无,老子又不是名人,也没人说我是帅哥美女,除了对面的聊天对象,谁tm在乎老子啊,都不在乎了,还怕你吖的看老子聊啥啊,不存在的~ 所以端密聊天,大众群体是概念模糊的,有选择当然更好,没得选也不太在乎~ +
  • yanjun @yanjun
    广东省深圳市
    新的练手项目get
搜一搜...
+ + \ No newline at end of file diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/table.html b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/table.html new file mode 100644 index 00000000..b310658e --- /dev/null +++ b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/table.html @@ -0,0 +1,677 @@ + + +Bias class + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+
+ + +
+ + +
London Power ad
+ + + +
+ +
+
+
+
+ + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + +
+ +
+[-] +
+ +Search the Forum +
+
+ + + + +
+ + + +

+
+ + + +

+
+ + + + +

+ + (Advanced Search) + +

+ + + + + +
+ + + + + + + + + +
+ +
+Bias class +
+
+
+ + +
+ +
+
+ +
+#1 +
+ + +
+
+ Hi Guys
+
+The idle condition of a power amplifier output stage is referred to as its "operating class". It is important to note that the bias condition is signal-dependent and has nothing to do with how the output stage devices are controlled. Again, bias condition is a universally-applicable concept.
+
+Class-A: All of the output devices contribute to the signal over the full audio cycle (360-degrees)
+
+Class-A2: Tubes only, the "2" indicates that grid conduction occurs in the output devices. This is simply class-AB below with a low-impedance drive  circuit and very high idle current.
+
+Limiting Class-A: The peak signal current never exceeds the total idle current. This term was common in tube days, but still applies universally although it may be considered redundant to the class-A definition above.
+
+Sliding Class-A: Solid-state, a method of varying the idle condition so that neither half of the circuit ever turns off even though transference of signal control may occur.
+
+Class-B: In push-pull, each half of the output stage contributes exactly half the output signal (180-degrees)
+
+Class-B2: Tubes only, the "2" indicates grid conduction made possible by a low-impedance drive circuit.
+
+Class-AB: In push-pull, each half of the output stage contributes to slightly more than half of the signal output. Most "class-B" output stages are actually biased this way, with a slight overlap of conduction between circuit halves.
+
+Class-C: The output device conducts for only half the signal cycle (180-degrees) with a tuned load providing the remainder. Used in RF.
+
+Class-D: Solid-state only, a method of using a nonlinear output stage where the devices switch 'on' and 'off' in a pulse-width-modulated (PWM) format, and the output signal is integrated using LC filters. This approach is highly load noncompliant inasmuch as the load should be of fixed value versus frequency (resistive rather than inductive or capacitive). Class-D allows cold operation of the output devices but is only suitable for driving subwoofers in audio.
+
+Class-E: Solid-state, where parallel-driven output stages supported by different supply values contribute to the signal. The low-voltage stage amplifies the signal up to its limits with the high-voltage stage contributing higher amplitude signals as required. The low-voltage output stage can be biased class-A,-B or -AB.
+
+Class-G: Solid-state, a multi-tier output stage uses multiple supply voltages,switching between them as the signal requires. The transition shifts the burden of output heat from the low-tier device to the next higher-tier device. Overall dissipation is generally reduced by the number of tiers.
+
+Class-H: Solid-state, a multi-tier output stage supported by multiple supply voltages, switching between them as the signal requires. The supply switches turn 'on' hard and the burden of heat dissipation remains with the lowest-tier devices. Overall dissipation is reduced by the number of tiers.
+
+Class-I: Similar to sliding class-A.
+
+Class-T: A variation of class-D, with all of the same inherent issues.
+
+Class-Z: A method of power transfer using saturable coils "steered" by tubes with output stage power provided by a switching supply. designed by Lundahl (SE) in the 1960s, then revised and patented by Berning in the 1990s. +
+ +
+
+
+ +Find + +
+
+ +Reply + +
+
+
+ +
+
+ +
+
+ + + + +
+
+ + +
+
+
+
+
+
+
+ +
+ +
+Forum Jump: + + + + +
+ + +
+
+ +
+ + + + + + + + + + + + + + + + + + + +
+ +
+[-] +
+ +Come in where it's warm! +
A warm welcome to tube amp modding fans and those interested in hi-fi audio! Readers of Kevin O'Connor's The Ultimate Tone (TUT) book series form a part of our population. Kevin O'Connor is the creator of the popular Power Scaling methodology for amplifiers.
Please remember these three principles: respect, sharing, community.
Not familiar with The Ultimate Tone book series? See discussion topics, or click here to visit London Power/Power Press Publishing.

+ + + + + + + + + + + + + + + +
+ +
+[-] +
+ +Tube Amp Forum Hosted by London Power +
London Power logo

+ + + + + +
+ +
+
+ + + + +
+ + + \ No newline at end of file diff --git a/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py b/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py index ea5d57e9..83f007c2 100644 --- a/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py +++ b/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py @@ -1,6 +1,9 @@ +import re import unittest from pathlib import Path +from lxml import html + from llm_web_kit.input.pre_data_json import PreDataJson, PreDataJsonKey from llm_web_kit.main_html_parser.parser.tag_simplifier import \ HtmlTagSimplifierParser @@ -29,7 +32,7 @@ def test_tag_simplifier1(self): pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 18) + self.assertEqual(_item_id_count, 31) def test_tag_simplifier2(self): file_path = base_dir / 'assets/test_html_data/normal_table.html' @@ -40,7 +43,7 @@ def test_tag_simplifier2(self): pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 30) + self.assertEqual(_item_id_count, 60) def test_tag_simplifier3(self): file_path = base_dir / 'assets/test_html_data/special_table_1.html' @@ -51,7 +54,7 @@ def test_tag_simplifier3(self): pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 66) + self.assertEqual(_item_id_count, 69) def test_tag_simplifier4(self): file_path = base_dir / 'assets/test_html_data/1.html' @@ -62,7 +65,229 @@ def test_tag_simplifier4(self): pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) 
simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 51) + self.assertEqual(_item_id_count, 114) + + def test_tag_simplifier_table(self): + file_path = base_dir / 'assets/test_html_data/simplify_cases/table.html' + with open(file_path, 'r', encoding='utf-8') as file: + raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} + pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') + _item_id_count = simplifier_raw_html.count('_item_id') + self.assertEqual(_item_id_count, 35) + + id_dom = html.fromstring(raw_tag_html) + # 用xpath定位data-anno-uid="anno-uid-3vtzg9uxee4"的table元素,该table用于布局 + table_element = id_dom.xpath('//table[@data-anno-uid="anno-uid-3vtzg9uxee4"]')[0] + # 确认该table元素没有_item_id属性 + self.assertEqual(table_element.get('_item_id'), None) + # 确认该table的3个td元素的内部都包含若干个存在_item_id属性的元素 + for td_element in table_element.xpath('./tbody/tr/td'): + td_item_count = 0 + for child in td_element.iter(): + if child.get('_item_id') is not None: + td_item_count += 1 + self.assertNotEqual(td_item_count, 0) + + def test_tag_simplifier_nested_table_headers(self): + file_path = base_dir / 'assets/test_html_data/simplify_cases/nested_table_headers.html' + with open(file_path, 'r', encoding='utf-8') as file: + raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} + pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') + _item_id_count = simplifier_raw_html.count('_item_id') + 
self.assertEqual(_item_id_count, 37) + + id_dom = html.fromstring(raw_tag_html) + # 用xpath定位外层table元素,该table用于布局 + table_element = id_dom.xpath('//table[@class="centralPane"]')[0] + # 确认该table元素没有_item_id属性 + self.assertIsNone(table_element.get('_item_id')) + + # 用xpath定位内层table元素,该table是数据表格,其单元格包含headers属性 + table_element = id_dom.xpath('//table[@class="miscTable"]')[0] + # 确认该table元素有_item_id属性 + self.assertIsNotNone(table_element.get('_item_id')) + + def test_tag_simplifier_nested_table_caption(self): + file_path = base_dir / 'assets/test_html_data/simplify_cases/nested_table_caption.html' + with open(file_path, 'r', encoding='utf-8') as file: + raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} + pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') + _item_id_count = simplifier_raw_html.count('_item_id') + self.assertEqual(_item_id_count, 14) + + id_dom = html.fromstring(raw_tag_html) + # 用xpath定位外层table元素,该table用于布局 + table_element = id_dom.xpath('//table[@data-anno-uid="anno-uid-xgzpvn8fnqk"]')[0] + # 确认该table元素没有_item_id属性 + self.assertIsNone(table_element.get('_item_id')) + + # 用xpath定位内层table元素,该table是数据表格,其包含caption元素 + table_element = id_dom.xpath('//table[@data-anno-uid="anno-uid-olo3onur84"]')[0] + # 确认该table元素有_item_id属性 + self.assertIsNotNone(table_element.get('_item_id')) + + def test_tag_simplifier_list(self): + file_path = base_dir / 'assets/test_html_data/simplify_cases/list.html' + with open(file_path, 'r', encoding='utf-8') as file: + raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} + pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') + 
raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') + _item_id_count = simplifier_raw_html.count('_item_id') + self.assertEqual(_item_id_count, 45) + + id_dom = html.fromstring(raw_tag_html) + # 用xpath定位ul元素,该ul用于布局 + list_element = id_dom.xpath('//ul[@data-anno-uid="anno-uid-7s58m3hrcz5"]')[0] + # 确认该ul元素没有_item_id属性 + self.assertIsNone(list_element.get('_item_id')) + # 确认该ul元素下的li元素内均包含有_item_id属性的元素 + for li_element in list_element.xpath('./li'): + li_item_count = 0 + for child in li_element.iter(): + if child.get('_item_id') is not None: + li_item_count += 1 + self.assertNotEqual(li_item_count, 0) + + def test_tag_simplifier_non_list_child(self): + file_path = base_dir / 'assets/test_html_data/simplify_cases/non_list_child.html' + with open(file_path, 'r', encoding='utf-8') as file: + raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} + pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') + _item_id_count = simplifier_raw_html.count('_item_id') + self.assertEqual(_item_id_count, 151) + + id_dom = html.fromstring(raw_tag_html) + # 用xpath定位ul元素,该ul用于布局 + list_element = id_dom.xpath('//ul[@data-anno-uid="anno-uid-myobddy8ord"]')[0] + # 确认该ul元素没有_item_id属性 + self.assertIsNone(list_element.get('_item_id')) + # 用xpath定位上述ul内部的一个li,该li内部结构复杂,应该包含多个_item_id + li_element = id_dom.xpath('//li[@data-anno-uid="anno-uid-7wux77fqc7t"]')[0] + li_item_count = 0 + for child in li_element.iter(): + if child.get('_item_id') is not None: + li_item_count += 1 + self.assertNotEqual(li_item_count, 0) + + def test_tag_simplifier_inline_block(self): + file_path = base_dir / 'assets/test_html_data/simplify_cases/inline_block.html' + with open(file_path, 'r', encoding='utf-8') as file: + raw_html = file.read() + data_dict = 
{PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} + pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') + _item_id_count = simplifier_raw_html.count('_item_id') + self.assertEqual(_item_id_count, 12) + + id_dom = html.fromstring(raw_tag_html) + # 用xpath定位span元素,该span内部包含了块级元素 + span_element = id_dom.xpath('//span[@data-anno-uid="anno-uid-yrlyp4ay0l"]')[0] + # 确认该span元素没有_item_id属性 + self.assertIsNone(span_element.get('_item_id')) + # 该span元素内部包含多个块级元素,每个块级元素都包含_item_id属性 + for child in span_element.iterchildren(): + self.assertIsNotNone(child.get("_item_id")) + + def test_tag_simplifier_abnormal_comment(self): + file_path = base_dir / 'assets/test_html_data/simplify_cases/abnormal_comment.html' + with open(file_path, 'r', encoding='utf-8') as file: + raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} + pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') + _item_id_count = simplifier_raw_html.count('_item_id') + self.assertEqual(_item_id_count, 53) + # 验证不规范的注释内包含的有效内容没有被删除 + self.assertIn('', raw_tag_html) + # 验证规范的注释都已被删除 + comment_res = re.search(r'', raw_tag_html, flags=re.DOTALL) + self.assertIsNone(comment_res) + + def test_tag_simplifier_header_tag(self): + file_path = base_dir / 'assets/test_html_data/simplify_cases/header_tag.html' + with open(file_path, 'r', encoding='utf-8') as file: + raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} + pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = 
pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') + _item_id_count = simplifier_raw_html.count('_item_id') + self.assertEqual(_item_id_count, 32) + + id_dom = html.fromstring(raw_tag_html) + # 用xpath定位元素,该元素的id是header,且是body的直接子元素 + header_element = id_dom.xpath('//section[@data-anno-uid="anno-uid-a5n4leb0qxv"]')[0] + # 确认该元素没有_item_id属性,也就是被删掉了 + self.assertIsNone(header_element.get('_item_id')) + # 用xpath定位元素,该元素位于header标签内部,但这个header不是body的直接子元素 + header_element = id_dom.xpath('//h2[@data-anno-uid="anno-uid-g8cyd0j0kn6"]')[0] + # 确认该元素有_item_id属性,也就是被保留了 + self.assertIsNotNone(header_element.get('_item_id')) + + def test_tag_simplifier_nav_class(self): + file_path = base_dir / 'assets/test_html_data/simplify_cases/nav_class.html' + with open(file_path, 'r', encoding='utf-8') as file: + raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} + pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') + _item_id_count = simplifier_raw_html.count('_item_id') + self.assertEqual(_item_id_count, 100) + + id_dom = html.fromstring(raw_tag_html) + # 用xpath定位元素,该元素的class是nav,但不是body的直接子元素,应该保留 + nav_element = id_dom.xpath('//div[@data-anno-uid="anno-uid-v6mugwj7iv"]')[0] + # 验证nav内部有_item_id属性的元素,证明nav没有被删除 + nav_item_count = 0 + for child in nav_element.iter(): + if child.get('_item_id') is not None: + nav_item_count += 1 + self.assertNotEqual(nav_item_count, 0) + + def test_tag_simplifier_block_select(self): + file_path = base_dir / 'assets/test_html_data/simplify_cases/block_select.html' + with open(file_path, 'r', encoding='utf-8') as file: + raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} + pre_data = 
PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') + _item_id_count = simplifier_raw_html.count('_item_id') + self.assertEqual(_item_id_count, 7) + + id_dom = html.fromstring(raw_tag_html) + # 用xpath定位元素,该元素是块级元素且内部不包含块级元素,但该元素本身没有cc-select,只是其内部元素有cc-select + p_element = id_dom.xpath('//p[@data-anno-uid="anno-uid-tnbktgx26s"]')[0] + # 验证该元素被加上了_item_id和cc-select + self.assertIsNotNone(p_element.get("_item_id")) + self.assertIsNotNone(p_element.get("cc-select")) if __name__ == '__main__': From 939336d9881c1d90807daca3809d6a9e40927366 Mon Sep 17 00:00:00 2001 From: ningwenchang Date: Tue, 26 Aug 2025 13:26:49 +0000 Subject: [PATCH 2/5] [fix]: Fix test failures in test_layout_parser.py due to simplify_html's return value signature change. --- .../template_www.wdi.it_llm.json | 11 +++-------- .../main_html_parser/parser/test_layout_parser.py | 12 ++++++------ 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/template_www.wdi.it_llm.json b/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/template_www.wdi.it_llm.json index abb40deb..917c500b 100644 --- a/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/template_www.wdi.it_llm.json +++ b/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/template_www.wdi.it_llm.json @@ -7,8 +7,8 @@ "item_id 6": "No", "item_id 7": "No", "item_id 8": "No", - "item_id 9": "Yes", - "item_id 10": "No", + "item_id 9": "No", + "item_id 10": "Yes", "item_id 11": "No", "item_id 12": "No", "item_id 13": "No", @@ -30,10 +30,5 @@ "item_id 29": "No", "item_id 30": "No", "item_id 31": "No", - "item_id 32": "No", - "item_id 33": "No", - "item_id 34": "No", - "item_id 35": "No", - "item_id 36": 
"No", - "item_id 37": "No" + "item_id 32": "No" } \ No newline at end of file diff --git a/tests/llm_web_kit/main_html_parser/parser/test_layout_parser.py b/tests/llm_web_kit/main_html_parser/parser/test_layout_parser.py index d936fe89..8ca08f59 100644 --- a/tests/llm_web_kit/main_html_parser/parser/test_layout_parser.py +++ b/tests/llm_web_kit/main_html_parser/parser/test_layout_parser.py @@ -163,7 +163,7 @@ def test_dynamic_id(self): template_source = re.sub('post-37041', 'test-37041', template_source) expand_source = re.sub('test-37041', 'test-25031', template_source) # 简化网页 - simplified_html, typical_raw_tag_html, _ = simplify_html(template_source) + simplified_html, typical_raw_tag_html = simplify_html(template_source) # 模型结果格式改写 llm_path = base_dir.joinpath(TEST_CASES[0]['input'][2][0]) llm_response = json.loads(llm_path.read_text(encoding='utf-8')) @@ -203,7 +203,7 @@ def test_dynamic_classid(self): expand_source2 = re.sub('testid-25031', '', expand_source) template_source2 = re.sub('testid-37041', '', template_source) # 简化网页 - simplified_html, typical_raw_tag_html, _ = simplify_html(template_source) + simplified_html, typical_raw_tag_html = simplify_html(template_source) # 模型结果格式改写 llm_path = base_dir.joinpath(TEST_CASES[0]['input'][2][0]) llm_response = json.loads(llm_path.read_text(encoding='utf-8')) @@ -226,7 +226,7 @@ def test_dynamic_classid(self): assert 'Permalink link a questo articolo' not in main_html_body and 'Con la stesura di un' in main_html_body # 简化网页 - simplified_html, typical_raw_tag_html, _ = simplify_html(template_source2) + simplified_html, typical_raw_tag_html = simplify_html(template_source2) # 模型结果格式改写 llm_path = base_dir.joinpath(TEST_CASES[0]['input'][2][0]) llm_response = json.loads(llm_path.read_text(encoding='utf-8')) @@ -268,7 +268,7 @@ def test_more_noise_enable(self): new_p.text = 'test more noise' expand_source = html.tostring(tree, encoding='utf-8').decode() # 简化网页 - simplified_html, typical_raw_tag_html, _ = 
simplify_html(template_source) + simplified_html, typical_raw_tag_html = simplify_html(template_source) # 模型结果格式改写 llm_path = base_dir.joinpath(TEST_CASES[0]['input'][2][0]) llm_response = json.loads(llm_path.read_text(encoding='utf-8')) @@ -305,7 +305,7 @@ def test_classid_with_first_class(self): template_source = re.sub('post-37041', '', template_source) expand_source = re.sub('post-classid', 'post-classid template-classid', template_source) # 简化网页 - simplified_html, typical_raw_tag_html, _ = simplify_html(template_source) + simplified_html, typical_raw_tag_html = simplify_html(template_source) # 模型结果格式改写 llm_path = base_dir.joinpath(TEST_CASES[0]['input'][2][0]) llm_response = json.loads(llm_path.read_text(encoding='utf-8')) @@ -385,7 +385,7 @@ def test_incomplete_tag(self): # 模型结果格式改写 llm_path = 'assets/input_layout_batch_parser/test_incomplete_tag.json' llm_response = json.loads(base_dir.joinpath(llm_path).read_text(encoding='utf-8')) - simplified_html, typical_raw_tag_html, _ = simplify_html(html_source) + simplified_html, typical_raw_tag_html = simplify_html(html_source) pre_data = {'typical_raw_tag_html': typical_raw_tag_html, 'typical_raw_html': html_source, 'llm_response': llm_response} pre_data = PreDataJson(pre_data) From 94216b8b4a3c4b19a947092f268905c15a1bea04 Mon Sep 17 00:00:00 2001 From: ningwenchang Date: Fri, 5 Sep 2025 03:48:54 +0000 Subject: [PATCH 3/5] [fix]: Replace beautifulsoup with selectolax, retaining headers/footers --- .../simplify_html/simplify_html.py | 144 ++++-------------- .../parser/test_tag_simplifier.py | 24 +-- 2 files changed, 44 insertions(+), 124 deletions(-) diff --git a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py index 45938f1c..7ae842ba 100644 --- a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py +++ b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py @@ -3,8 +3,8 @@ import uuid from typing import Dict, List, Tuple -from 
bs4 import BeautifulSoup from lxml import etree, html +from selectolax.parser import HTMLParser # 行内标签 inline_tags = { @@ -21,18 +21,16 @@ # 需要删除的标签 tags_to_remove = { + 'title', 'head', - 'header', - 'footer', 'nav', - 'aside', 'style', 'script', 'noscript', 'link', 'meta', 'iframe', - 'frame' + 'frame', } # 需要保留的特殊标签(即使它们是行内标签) @@ -40,7 +38,7 @@ # 需要删除的属性名模式(独立单词) ATTR_PATTERNS_TO_REMOVE = { - 'nav', 'footer', 'header', # 独立单词 + 'nav', # 'footer', 'header', # 独立单词 } # 需要删除的属性名模式(特定前缀/后缀) @@ -75,59 +73,6 @@ def build_uid_map(dom: html.HtmlElement) -> Dict[str, html.HtmlElement]: return {node.get('data-uid'): node for node in dom.iter() if node.get('data-uid')} -def is_unique_attribute(tree, attr_name, attr_value): - """检查给定的属性名和值组合是否在文档中唯一。""" - elements = tree.xpath(f"//*[@{attr_name}='{attr_value}']") - return len(elements) == 1 - - -def get_relative_xpath(element): - root_tree = element.getroottree() - current_element = element - path_from_element = [] - found_unique_ancestor = False - - # 从当前元素开始向上查找 - while current_element is not None and current_element.getparent() is not None: - siblings = [sib for sib in current_element.getparent() if sib.tag == current_element.tag] - - # 检查当前元素是否有唯一属性 - unique_attr = None - candidate_attrs = [ - attr for attr in current_element.attrib - if not (attr.startswith('data-') or attr == 'style' or - attr == '_item_id' or - (current_element.attrib[attr].startswith('{') and current_element.attrib[attr].endswith('}'))) - ] - - for attr in candidate_attrs: - if is_unique_attribute(root_tree, attr, current_element.attrib[attr]): - unique_attr = attr - break - - # 如果有唯一属性,构建相对路径并停止向上查找 - if unique_attr is not None: - path_from_element.insert(0, f'*[@{unique_attr}="{current_element.attrib[unique_attr]}"]') - found_unique_ancestor = True - break - else: - # 没有唯一属性,使用常规方式 - if len(siblings) > 1: - index = siblings.index(current_element) + 1 - path_from_element.insert(0, f'{current_element.tag}[{index}]') - else: - 
path_from_element.insert(0, current_element.tag) - - current_element = current_element.getparent() - - # 构建最终的XPath - if found_unique_ancestor: - return f'//{"/".join(path_from_element)}' - else: - # 如果没有找到唯一属性祖先,返回完整路径 - return f'//{"/".join(path_from_element)}' - - def judge_table_parent(table_element, node_list): for node in node_list: ancestor = node.getparent() @@ -153,19 +98,19 @@ def is_data_table(table_element: html.HtmlElement) -> bool: if judge_table_parent(table_element, col_nodes) or judge_table_parent(table_element, colgroup_nodes): return True - # 检查是否有 role="table" 或 data-table 属性 - if table_element.get('role') == 'table' or table_element.get('data-table'): - return True - # 检查当前表格(不包括内部嵌套表格)单元格是否有 headers 属性 cell_nodes = table_element.xpath(".//*[self::td or self::th][@headers]") if judge_table_parent(table_element, cell_nodes): return True + # 检查是否有 role="table" 或 data-table 属性 + if table_element.get('role') == 'table' or table_element.get('data-table'): + return True + for node in table_element.iterdescendants(): if node.tag in table_tags_set: continue - if node not in inline_tags: + if node.tag not in inline_tags: return False return True @@ -177,22 +122,21 @@ def has_non_listitem_children(list_element): :param list_element: lxml元素对象 (ul, ol, dl) :return: True 如果存在非列表项的直接子节点,否则 False """ - # 获取所有直接子元素(不包括文本节点) - direct_children = list_element.xpath("./*") # 根据列表类型确定允许的子元素标签 if list_element.tag in ['ul', 'ol']: allowed_tags = {'li'} elif list_element.tag == 'dl': allowed_tags = {'dt', 'dd'} - else: - # 如果不是列表元素,返回False - return False - # 检查是否存在不允许的元素 - for child in direct_children: - if child.tag not in allowed_tags: - return True + # 使用XPath直接查找是否存在不允许的直接子元素 + # 例如,对于
<ul>,查找所有不是
<li>的直接子元素 + # 对于
<dl>,查找所有不是<dt><dd>
    的直接子元素 + exclude_conditions = " and ".join([f"name()!='{tag}'" for tag in allowed_tags]) + disallowed_children_xpath = f"./*[{exclude_conditions}]" + + if list_element.xpath(disallowed_children_xpath): + return True # 检查是否存在非空白文本节点 text_children = list_element.xpath("./text()") @@ -272,9 +216,9 @@ def is_content_list(list_element): # 获取列表项(支持多种列表类型) items = list_element.xpath("li | dt | dd") - # 空列表直接返回普通列表 + # 不包含列表项,则不是内容列表 if len(items) == 0: - return True + return False # 列表包含非列表项子元素视为布局列表 if has_non_listitem_children(list_element): return False @@ -284,7 +228,7 @@ def is_content_list(list_element): if has_block_descendants(item): return False - # 默认视为普通列表 + # 默认视为内容列表 return True # 先分析所有列表的类型 @@ -436,26 +380,11 @@ def merge_inline_content(parent: html.HtmlElement, content_list: List[Tuple[str, return unique_paragraphs -def safely_remove_comments(html_content): - # 创建解析器并设置为移除注释节点 - parser = html.HTMLParser(remove_comments=True) - doc = html.fromstring(html_content, parser=parser) - - # 重新序列化为字符串 - return etree.tostring( - doc, - encoding='unicode', - method='html', - with_tail=False - ) - - def remove_xml_declaration(html_string): # 正则表达式匹配 (没有问号结尾的情况) pattern = r'<\?xml\s+.*?\??>' html_content = re.sub(pattern, '', html_string, flags=re.DOTALL) - # 删除HTML注释 - html_content = safely_remove_comments(html_content) + return html_content @@ -464,10 +393,7 @@ def post_process_html(html_content: str) -> str: if not html_content: return html_content - # 1. 删除HTML注释 - html_content = safely_remove_comments(html_content) - - # 2. 
处理标签外的空白(保留标签内文本的换行) + # 处理标签外的空白(保留标签内文本的换行) def replace_outside_tag_space(match): """只替换标签外的连续空白.""" if match.group(1): # 如果是标签内容 @@ -493,11 +419,7 @@ def remove_tags(dom): for node in dom.xpath(f'.//{tag}'): parent = node.getparent() if parent is not None: - if tag == "header" or tag == "footer": - if parent.tag == 'body': - parent.remove(node) - else: - parent.remove(node) + parent.remove(node) def is_meaningful_content(element) -> bool: @@ -759,9 +681,7 @@ def process_paragraphs(paragraphs: List[Dict[str, str]], uid_map: Dict[str, html for para in paragraphs: try: - html_content = safely_remove_comments(para['html']) - # 解析段落HTML - root = html.fromstring(html_content) + root = html.fragment_fromstring(para['html'], create_parent=False) root_for_xpath = copy.deepcopy(root) content_type = para.get('content_type', 'block_element') @@ -935,15 +855,15 @@ def simplify_html(html_str) -> etree.Element: original_html: 添加_item_id的原始HTML _xpath_mapping: xpath映射 """ - # 预处理 - preprocessed_html = remove_xml_declaration(html_str) + # 使用selectolax的HTMLParser来修复html + soup = HTMLParser(html_str) + fixed_html = soup.html - # 用 BeautifulSoup 修复未闭合标签,lxml 无法完全修复 - soup = BeautifulSoup(preprocessed_html, 'html.parser') - fixed_html = str(soup) - - # 解析原始DOM - original_dom = html.fromstring(fixed_html) + preprocessed_html = remove_xml_declaration(fixed_html) + # 注释通过lxml的HTMLParser的remove_comments参数处理 + parser = html.HTMLParser(remove_comments=True) + original_dom = html.fromstring(preprocessed_html, parser=parser) + # 添加data_uid add_data_uids(original_dom) original_uid_map = build_uid_map(original_dom) diff --git a/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py b/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py index 83f007c2..215eb49b 100644 --- a/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py +++ b/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py @@ -21,7 +21,7 @@ def test_tag_simplifier(self): pre_data_result 
= HtmlTagSimplifierParser({}).parse(pre_data) simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 32) + self.assertEqual(_item_id_count, 34) def test_tag_simplifier1(self): file_path = base_dir / 'assets/test_html_data/normal_dl.html' @@ -32,7 +32,7 @@ def test_tag_simplifier1(self): pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 31) + self.assertEqual(_item_id_count, 48) def test_tag_simplifier2(self): file_path = base_dir / 'assets/test_html_data/normal_table.html' @@ -43,7 +43,7 @@ def test_tag_simplifier2(self): pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 60) + self.assertEqual(_item_id_count, 11) def test_tag_simplifier3(self): file_path = base_dir / 'assets/test_html_data/special_table_1.html' @@ -54,7 +54,7 @@ def test_tag_simplifier3(self): pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 69) + self.assertEqual(_item_id_count, 41) def test_tag_simplifier4(self): file_path = base_dir / 'assets/test_html_data/1.html' @@ -65,7 +65,7 @@ def test_tag_simplifier4(self): pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 114) + self.assertEqual(_item_id_count, 113) def 
test_tag_simplifier_table(self): file_path = base_dir / 'assets/test_html_data/simplify_cases/table.html' @@ -102,7 +102,7 @@ def test_tag_simplifier_nested_table_headers(self): simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 37) + self.assertEqual(_item_id_count, 13) id_dom = html.fromstring(raw_tag_html) # 用xpath定位外层table元素,该table用于布局 @@ -148,7 +148,7 @@ def test_tag_simplifier_list(self): simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 45) + self.assertEqual(_item_id_count, 118) id_dom = html.fromstring(raw_tag_html) # 用xpath定位ul元素,该ul用于布局 @@ -236,16 +236,16 @@ def test_tag_simplifier_header_tag(self): simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 32) + self.assertEqual(_item_id_count, 35) id_dom = html.fromstring(raw_tag_html) # 用xpath定位元素,该元素的id是header,且是body的直接子元素 header_element = id_dom.xpath('//section[@data-anno-uid="anno-uid-a5n4leb0qxv"]')[0] - # 确认该元素没有_item_id属性,也就是被删掉了 - self.assertIsNone(header_element.get('_item_id')) + # 确认该元素有_item_id属性,也就是被保留了 + self.assertIsNotNone(header_element.get('_item_id')) # 用xpath定位元素,该元素位于header标签内部,但这个header不是body的直接子元素 header_element = id_dom.xpath('//h2[@data-anno-uid="anno-uid-g8cyd0j0kn6"]')[0] - # 确认该元素有_item_id属性,也就是被保留了 + # 确认该元素有_item_id属性,也被保留了(目前的simplify是所有的header都保留) self.assertIsNotNone(header_element.get('_item_id')) def test_tag_simplifier_nav_class(self): @@ -258,7 +258,7 @@ def 
test_tag_simplifier_nav_class(self): simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 100) + self.assertEqual(_item_id_count, 58) id_dom = html.fromstring(raw_tag_html) # 用xpath定位元素,该元素的class是nav,但不是body的直接子元素,应该保留 From 8aea7c838218a566b57cb9d19f45b8974274bd5f Mon Sep 17 00:00:00 2001 From: ningwenchang Date: Mon, 8 Sep 2025 06:47:12 +0000 Subject: [PATCH 4/5] fix unittest issue --- .../parser/test_tag_simplifier.py | 73 ++++++++++++++++++- 1 file changed, 71 insertions(+), 2 deletions(-) diff --git a/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py b/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py index 215eb49b..3e193079 100644 --- a/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py +++ b/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py @@ -23,6 +23,11 @@ def test_tag_simplifier(self): _item_id_count = simplifier_raw_html.count('_item_id') self.assertEqual(_item_id_count, 34) + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') + out_path = base_dir / 'assets/test_html_data/simplify_output/test_tah_simplifier_output.html' + with open(out_path, "w") as fp: + fp.write(raw_tag_html) + def test_tag_simplifier1(self): file_path = base_dir / 'assets/test_html_data/normal_dl.html' with open(file_path, 'r', encoding='utf-8') as file: @@ -34,6 +39,11 @@ def test_tag_simplifier1(self): _item_id_count = simplifier_raw_html.count('_item_id') self.assertEqual(_item_id_count, 48) + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') + out_path = base_dir / 'assets/test_html_data/simplify_output/normal_dl_output.html' + with open(out_path, "w") as fp: + fp.write(raw_tag_html) + def test_tag_simplifier2(self): file_path = base_dir / 
'assets/test_html_data/normal_table.html' with open(file_path, 'r', encoding='utf-8') as file: @@ -45,6 +55,11 @@ def test_tag_simplifier2(self): _item_id_count = simplifier_raw_html.count('_item_id') self.assertEqual(_item_id_count, 11) + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') + out_path = base_dir / 'assets/test_html_data/simplify_output/normal_table_output.html' + with open(out_path, "w") as fp: + fp.write(raw_tag_html) + def test_tag_simplifier3(self): file_path = base_dir / 'assets/test_html_data/special_table_1.html' with open(file_path, 'r', encoding='utf-8') as file: @@ -56,6 +71,11 @@ def test_tag_simplifier3(self): _item_id_count = simplifier_raw_html.count('_item_id') self.assertEqual(_item_id_count, 41) + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') + out_path = base_dir / 'assets/test_html_data/simplify_output/special_table_1_output.html' + with open(out_path, "w") as fp: + fp.write(raw_tag_html) + def test_tag_simplifier4(self): file_path = base_dir / 'assets/test_html_data/1.html' with open(file_path, 'r', encoding='utf-8') as file: @@ -67,6 +87,15 @@ def test_tag_simplifier4(self): _item_id_count = simplifier_raw_html.count('_item_id') self.assertEqual(_item_id_count, 113) + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') + out_path = base_dir / 'assets/test_html_data/simplify_output/1_output.html' + with open(out_path, "w") as fp: + fp.write(raw_tag_html) + + out_path = base_dir / 'assets/test_html_data/simplify_output/1_sim_output.html' + with open(out_path, "w") as fp: + fp.write(simplifier_raw_html) + def test_tag_simplifier_table(self): file_path = base_dir / 'assets/test_html_data/simplify_cases/table.html' with open(file_path, 'r', encoding='utf-8') as file: @@ -92,6 +121,10 @@ def test_tag_simplifier_table(self): td_item_count += 1 self.assertNotEqual(td_item_count, 0) + out_path = base_dir / 
'assets/test_html_data/simplify_output/table_output.html' + with open(out_path, "w") as fp: + fp.write(raw_tag_html) + def test_tag_simplifier_nested_table_headers(self): file_path = base_dir / 'assets/test_html_data/simplify_cases/nested_table_headers.html' with open(file_path, 'r', encoding='utf-8') as file: @@ -115,6 +148,10 @@ def test_tag_simplifier_nested_table_headers(self): # 确认该table元素有_item_id属性 self.assertIsNotNone(table_element.get('_item_id')) + out_path = base_dir / 'assets/test_html_data/simplify_output/nested_table_headers_output.html' + with open(out_path, "w") as fp: + fp.write(raw_tag_html) + def test_tag_simplifier_nested_table_caption(self): file_path = base_dir / 'assets/test_html_data/simplify_cases/nested_table_caption.html' with open(file_path, 'r', encoding='utf-8') as file: @@ -138,6 +175,10 @@ def test_tag_simplifier_nested_table_caption(self): # 确认该table元素有_item_id属性 self.assertIsNotNone(table_element.get('_item_id')) + out_path = base_dir / 'assets/test_html_data/simplify_output/nested_table_caption_output.html' + with open(out_path, "w") as fp: + fp.write(raw_tag_html) + def test_tag_simplifier_list(self): file_path = base_dir / 'assets/test_html_data/simplify_cases/list.html' with open(file_path, 'r', encoding='utf-8') as file: @@ -163,6 +204,10 @@ def test_tag_simplifier_list(self): li_item_count += 1 self.assertNotEqual(li_item_count, 0) + out_path = base_dir / 'assets/test_html_data/simplify_output/list_output.html' + with open(out_path, "w") as fp: + fp.write(raw_tag_html) + def test_tag_simplifier_non_list_child(self): file_path = base_dir / 'assets/test_html_data/simplify_cases/non_list_child.html' with open(file_path, 'r', encoding='utf-8') as file: @@ -188,6 +233,10 @@ def test_tag_simplifier_non_list_child(self): li_item_count += 1 self.assertNotEqual(li_item_count, 0) + out_path = base_dir / 'assets/test_html_data/simplify_output/non_list_child_output.html' + with open(out_path, "w") as fp: + fp.write(raw_tag_html) + def 
test_tag_simplifier_inline_block(self): file_path = base_dir / 'assets/test_html_data/simplify_cases/inline_block.html' with open(file_path, 'r', encoding='utf-8') as file: @@ -209,6 +258,10 @@ def test_tag_simplifier_inline_block(self): for child in span_element.iterchildren(): self.assertIsNotNone(child.get("_item_id")) + out_path = base_dir / 'assets/test_html_data/simplify_output/inline_block_output.html' + with open(out_path, "w") as fp: + fp.write(raw_tag_html) + def test_tag_simplifier_abnormal_comment(self): file_path = base_dir / 'assets/test_html_data/simplify_cases/abnormal_comment.html' with open(file_path, 'r', encoding='utf-8') as file: @@ -226,6 +279,10 @@ def test_tag_simplifier_abnormal_comment(self): comment_res = re.search(r'', raw_tag_html, flags=re.DOTALL) self.assertIsNone(comment_res) + out_path = base_dir / 'assets/test_html_data/simplify_output/abnormal_comment_output.html' + with open(out_path, "w") as fp: + fp.write(raw_tag_html) + def test_tag_simplifier_header_tag(self): file_path = base_dir / 'assets/test_html_data/simplify_cases/header_tag.html' with open(file_path, 'r', encoding='utf-8') as file: @@ -239,8 +296,8 @@ def test_tag_simplifier_header_tag(self): self.assertEqual(_item_id_count, 35) id_dom = html.fromstring(raw_tag_html) - # 用xpath定位元素,该元素的id是header,且是body的直接子元素 - header_element = id_dom.xpath('//section[@data-anno-uid="anno-uid-a5n4leb0qxv"]')[0] + # 用xpath定位元素,该元素位于id名为header的元素内部,且这个'header'是body的直接子元素 + header_element = id_dom.xpath('//h1[@data-anno-uid="anno-uid-g513k4pfha8"]')[0] # 确认该元素有_item_id属性,也就是被保留了 self.assertIsNotNone(header_element.get('_item_id')) # 用xpath定位元素,该元素位于header标签内部,但这个header不是body的直接子元素 @@ -248,6 +305,10 @@ def test_tag_simplifier_header_tag(self): # 确认该元素有_item_id属性,也被保留了(目前的simplify是所有的header都保留) self.assertIsNotNone(header_element.get('_item_id')) + out_path = base_dir / 'assets/test_html_data/simplify_output/header_tag_output.html' + with open(out_path, "w") as fp: + fp.write(raw_tag_html) + 
def test_tag_simplifier_nav_class(self): file_path = base_dir / 'assets/test_html_data/simplify_cases/nav_class.html' with open(file_path, 'r', encoding='utf-8') as file: @@ -270,6 +331,10 @@ def test_tag_simplifier_nav_class(self): nav_item_count += 1 self.assertNotEqual(nav_item_count, 0) + out_path = base_dir / 'assets/test_html_data/simplify_output/nav_class_output.html' + with open(out_path, "w") as fp: + fp.write(raw_tag_html) + def test_tag_simplifier_block_select(self): file_path = base_dir / 'assets/test_html_data/simplify_cases/block_select.html' with open(file_path, 'r', encoding='utf-8') as file: @@ -289,6 +354,10 @@ def test_tag_simplifier_block_select(self): self.assertIsNotNone(p_element.get("_item_id")) self.assertIsNotNone(p_element.get("cc-select")) + out_path = base_dir / 'assets/test_html_data/simplify_output/block_select_output.html' + with open(out_path, "w") as fp: + fp.write(raw_tag_html) + if __name__ == '__main__': unittest.main() From acfde1e7450f48a06fc6e401de98d0117facdf1b Mon Sep 17 00:00:00 2001 From: ningwenchang Date: Tue, 9 Sep 2025 06:55:55 +0000 Subject: [PATCH 5/5] [fix]: Fix the issue where inline elements were referenced multiple times when they contained block-level elements. 
--- .../simplify_html/simplify_html.py | 8 +- .../parser/test_tag_simplifier.py | 235 +++++++++++------- 2 files changed, 151 insertions(+), 92 deletions(-) diff --git a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py index 7ae842ba..3c37da8e 100644 --- a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py +++ b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py @@ -209,6 +209,9 @@ def judge_special_case(node, expected_tags, types_map): def has_block_descendants(node): for child in node.iterdescendants(): if is_block_element(child): + if node.tag in inline_tags: + original_element = uid_map.get(node.get('data-uid')) + original_element.set('cc-block-type', "true") return True return False @@ -706,9 +709,9 @@ def process_paragraphs(paragraphs: List[Dict[str, str]], uid_map: Dict[str, html if content_type != 'block_element': if original_parent is not None: # root_for_xpath有子元素 + original_element = uid_map.get(root_for_xpath.get('data-uid')) if len(root_for_xpath) > 0: - if root_for_xpath.tag in inline_tags and uid_map.get(root_for_xpath.get('data-uid')).tag != 'body': - original_element = uid_map.get(root_for_xpath.get('data-uid')) + if root_for_xpath.tag in inline_tags and original_element.tag != 'body' and original_element.get('cc-block-type') != "true": original_element.set('_item_id', current_id) else: # 收集需要包裹的子元素 @@ -769,7 +772,6 @@ def process_paragraphs(paragraphs: List[Dict[str, str]], uid_map: Dict[str, html # last_child.tail = None else: if content_type == 'inline_elements': - original_element = uid_map.get(root_for_xpath.get('data-uid')) original_element.set('_item_id', current_id) else: # root_for_xpath只有文本内容 diff --git a/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py b/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py index 3e193079..a7dd3099 100644 --- a/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py +++ 
b/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py @@ -12,101 +12,143 @@ class MyTestCase(unittest.TestCase): + def check_and_find_max_item_id(self, input_str: str) -> int: + # 匹配所有 _item_id="XXX" 的模式,提取XXX部分 + pattern = "_item_id" + r'="(\d+)"' + matches = re.findall(pattern, input_str) + + # 至少匹配一个 + if len(matches) == 0: + return 0 + + # 匹配到的对象全部转化为int + int_list = [] + for match in matches: + try: + int_list.append(int(match)) + except Exception: + raise ValueError(f'error while convert match {match} to int') + + # 检查是否为从1开始的连续整数 + target_value = 1 + for int_id in int_list: + if int_id == target_value: + target_value += 1 + else: + raise ValueError( + f'mistake find in int list, current target value is {target_value}, but find {int_id}' + '\n' + input_str + ) + + # 都没有问题的情况下,返回最大的数 + return int_list[-1] + def test_tag_simplifier(self): file_path = base_dir / 'assets/test_html_data/test_tah_simplifier.html' with open(file_path, 'r', encoding='utf-8') as file: raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') - _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 34) + simple_id_count = self.check_and_find_max_item_id(simplifier_raw_html) + self.assertEqual(simple_id_count, 34) raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') - out_path = base_dir / 'assets/test_html_data/simplify_output/test_tah_simplifier_output.html' - with open(out_path, "w") as fp: - fp.write(raw_tag_html) + tag_id_count = self.check_and_find_max_item_id(raw_tag_html) + self.assertEqual(tag_id_count, simple_id_count) def test_tag_simplifier1(self): file_path = base_dir / 'assets/test_html_data/normal_dl.html' with open(file_path, 'r', encoding='utf-8') as file: raw_html = file.read() + data_dict = 
{PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') - _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 48) + simple_id_count = self.check_and_find_max_item_id(simplifier_raw_html) + self.assertEqual(simple_id_count, 48) raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') - out_path = base_dir / 'assets/test_html_data/simplify_output/normal_dl_output.html' - with open(out_path, "w") as fp: - fp.write(raw_tag_html) + tag_id_count = self.check_and_find_max_item_id(raw_tag_html) + self.assertEqual(tag_id_count, simple_id_count) def test_tag_simplifier2(self): file_path = base_dir / 'assets/test_html_data/normal_table.html' with open(file_path, 'r', encoding='utf-8') as file: raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') - _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 11) + simple_id_count = self.check_and_find_max_item_id(simplifier_raw_html) + self.assertEqual(simple_id_count, 11) raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') - out_path = base_dir / 'assets/test_html_data/simplify_output/normal_table_output.html' - with open(out_path, "w") as fp: - fp.write(raw_tag_html) + tag_id_count = self.check_and_find_max_item_id(raw_tag_html) + self.assertEqual(tag_id_count, simple_id_count) def test_tag_simplifier3(self): file_path = base_dir / 'assets/test_html_data/special_table_1.html' with open(file_path, 'r', encoding='utf-8') as file: raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} pre_data = 
PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') - _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 41) + simple_id_count = self.check_and_find_max_item_id(simplifier_raw_html) + self.assertEqual(simple_id_count, 41) raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') - out_path = base_dir / 'assets/test_html_data/simplify_output/special_table_1_output.html' - with open(out_path, "w") as fp: - fp.write(raw_tag_html) + tag_id_count = self.check_and_find_max_item_id(raw_tag_html) + self.assertEqual(tag_id_count, simple_id_count) def test_tag_simplifier4(self): file_path = base_dir / 'assets/test_html_data/1.html' with open(file_path, 'r', encoding='utf-8') as file: raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html, PreDataJsonKey.IS_XPATH: False} pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') - _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 113) + simple_id_count = self.check_and_find_max_item_id(simplifier_raw_html) + self.assertEqual(simple_id_count, 113) raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') - out_path = base_dir / 'assets/test_html_data/simplify_output/1_output.html' - with open(out_path, "w") as fp: - fp.write(raw_tag_html) - - out_path = base_dir / 'assets/test_html_data/simplify_output/1_sim_output.html' - with open(out_path, "w") as fp: - fp.write(simplifier_raw_html) + tag_id_count = self.check_and_find_max_item_id(raw_tag_html) + self.assertEqual(tag_id_count, simple_id_count) def test_tag_simplifier_table(self): file_path = base_dir / 'assets/test_html_data/simplify_cases/table.html' with open(file_path, 'r', 
encoding='utf-8') as file: raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') + simple_id_count = self.check_and_find_max_item_id(simplifier_raw_html) + self.assertEqual(simple_id_count, 35) + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') - _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 35) + tag_id_count = self.check_and_find_max_item_id(raw_tag_html) + self.assertEqual(tag_id_count, simple_id_count) id_dom = html.fromstring(raw_tag_html) # 用xpath定位data-anno-uid="anno-uid-3vtzg9uxee4"的table元素,该table用于布局 @@ -121,21 +163,23 @@ def test_tag_simplifier_table(self): td_item_count += 1 self.assertNotEqual(td_item_count, 0) - out_path = base_dir / 'assets/test_html_data/simplify_output/table_output.html' - with open(out_path, "w") as fp: - fp.write(raw_tag_html) - def test_tag_simplifier_nested_table_headers(self): file_path = base_dir / 'assets/test_html_data/simplify_cases/nested_table_headers.html' with open(file_path, 'r', encoding='utf-8') as file: raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') + simple_id_count = self.check_and_find_max_item_id(simplifier_raw_html) + self.assertEqual(simple_id_count, 13) + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') - _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 13) + tag_id_count = self.check_and_find_max_item_id(raw_tag_html) + self.assertEqual(tag_id_count, simple_id_count) id_dom = html.fromstring(raw_tag_html) # 用xpath定位外层table元素,该table用于布局 @@ -148,21 
+192,23 @@ def test_tag_simplifier_nested_table_headers(self): # 确认该table元素有_item_id属性 self.assertIsNotNone(table_element.get('_item_id')) - out_path = base_dir / 'assets/test_html_data/simplify_output/nested_table_headers_output.html' - with open(out_path, "w") as fp: - fp.write(raw_tag_html) - def test_tag_simplifier_nested_table_caption(self): file_path = base_dir / 'assets/test_html_data/simplify_cases/nested_table_caption.html' with open(file_path, 'r', encoding='utf-8') as file: raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') + simple_id_count = self.check_and_find_max_item_id(simplifier_raw_html) + self.assertEqual(simple_id_count, 14) + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') - _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 14) + tag_id_count = self.check_and_find_max_item_id(raw_tag_html) + self.assertEqual(tag_id_count, simple_id_count) id_dom = html.fromstring(raw_tag_html) # 用xpath定位外层table元素,该table用于布局 @@ -175,21 +221,23 @@ def test_tag_simplifier_nested_table_caption(self): # 确认该table元素有_item_id属性 self.assertIsNotNone(table_element.get('_item_id')) - out_path = base_dir / 'assets/test_html_data/simplify_output/nested_table_caption_output.html' - with open(out_path, "w") as fp: - fp.write(raw_tag_html) - def test_tag_simplifier_list(self): file_path = base_dir / 'assets/test_html_data/simplify_cases/list.html' with open(file_path, 'r', encoding='utf-8') as file: raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') + simple_id_count = 
self.check_and_find_max_item_id(simplifier_raw_html) + self.assertEqual(simple_id_count, 118) + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') - _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 118) + tag_id_count = self.check_and_find_max_item_id(raw_tag_html) + self.assertEqual(tag_id_count, simple_id_count) id_dom = html.fromstring(raw_tag_html) # 用xpath定位ul元素,该ul用于布局 @@ -204,21 +252,23 @@ def test_tag_simplifier_list(self): li_item_count += 1 self.assertNotEqual(li_item_count, 0) - out_path = base_dir / 'assets/test_html_data/simplify_output/list_output.html' - with open(out_path, "w") as fp: - fp.write(raw_tag_html) - def test_tag_simplifier_non_list_child(self): file_path = base_dir / 'assets/test_html_data/simplify_cases/non_list_child.html' with open(file_path, 'r', encoding='utf-8') as file: raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') + simple_id_count = self.check_and_find_max_item_id(simplifier_raw_html) + self.assertEqual(simple_id_count, 151) + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') - _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 151) + tag_id_count = self.check_and_find_max_item_id(raw_tag_html) + self.assertEqual(tag_id_count, simple_id_count) id_dom = html.fromstring(raw_tag_html) # 用xpath定位ul元素,该ul用于布局 @@ -233,21 +283,23 @@ def test_tag_simplifier_non_list_child(self): li_item_count += 1 self.assertNotEqual(li_item_count, 0) - out_path = base_dir / 'assets/test_html_data/simplify_output/non_list_child_output.html' - with open(out_path, "w") as fp: - fp.write(raw_tag_html) - def test_tag_simplifier_inline_block(self): file_path = base_dir / 
'assets/test_html_data/simplify_cases/inline_block.html' with open(file_path, 'r', encoding='utf-8') as file: raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') + simple_id_count = self.check_and_find_max_item_id(simplifier_raw_html) + self.assertEqual(simple_id_count, 12) + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') - _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 12) + tag_id_count = self.check_and_find_max_item_id(raw_tag_html) + self.assertEqual(tag_id_count, simple_id_count) id_dom = html.fromstring(raw_tag_html) # 用xpath定位span元素,该span内部包含了块级元素 @@ -258,42 +310,47 @@ def test_tag_simplifier_inline_block(self): for child in span_element.iterchildren(): self.assertIsNotNone(child.get("_item_id")) - out_path = base_dir / 'assets/test_html_data/simplify_output/inline_block_output.html' - with open(out_path, "w") as fp: - fp.write(raw_tag_html) - def test_tag_simplifier_abnormal_comment(self): file_path = base_dir / 'assets/test_html_data/simplify_cases/abnormal_comment.html' with open(file_path, 'r', encoding='utf-8') as file: raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') + simple_id_count = self.check_and_find_max_item_id(simplifier_raw_html) + self.assertEqual(simple_id_count, 53) + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') - _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 53) + tag_id_count = self.check_and_find_max_item_id(raw_tag_html) + self.assertEqual(tag_id_count, 
simple_id_count) + # 验证不规范的注释内包含的有效内容没有被删除 self.assertIn('', raw_tag_html) # 验证规范的注释都已被删除 comment_res = re.search(r'', raw_tag_html, flags=re.DOTALL) self.assertIsNone(comment_res) - out_path = base_dir / 'assets/test_html_data/simplify_output/abnormal_comment_output.html' - with open(out_path, "w") as fp: - fp.write(raw_tag_html) - def test_tag_simplifier_header_tag(self): file_path = base_dir / 'assets/test_html_data/simplify_cases/header_tag.html' with open(file_path, 'r', encoding='utf-8') as file: raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') + simple_id_count = self.check_and_find_max_item_id(simplifier_raw_html) + self.assertEqual(simple_id_count, 35) + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') - _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 35) + tag_id_count = self.check_and_find_max_item_id(raw_tag_html) + self.assertEqual(tag_id_count, simple_id_count) id_dom = html.fromstring(raw_tag_html) # 用xpath定位元素,该元素位于id名为header的元素内部,且这个'header'是body的直接子元素 @@ -305,21 +362,23 @@ def test_tag_simplifier_header_tag(self): # 确认该元素有_item_id属性,也被保留了(目前的simplify是所有的header都保留) self.assertIsNotNone(header_element.get('_item_id')) - out_path = base_dir / 'assets/test_html_data/simplify_output/header_tag_output.html' - with open(out_path, "w") as fp: - fp.write(raw_tag_html) - def test_tag_simplifier_nav_class(self): file_path = base_dir / 'assets/test_html_data/simplify_cases/nav_class.html' with open(file_path, 'r', encoding='utf-8') as file: raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = 
pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') + simple_id_count = self.check_and_find_max_item_id(simplifier_raw_html) + self.assertEqual(simple_id_count, 58) + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') - _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 58) + tag_id_count = self.check_and_find_max_item_id(raw_tag_html) + self.assertEqual(tag_id_count, simple_id_count) id_dom = html.fromstring(raw_tag_html) # 用xpath定位元素,该元素的class是nav,但不是body的直接子元素,应该保留 @@ -331,21 +390,23 @@ def test_tag_simplifier_nav_class(self): nav_item_count += 1 self.assertNotEqual(nav_item_count, 0) - out_path = base_dir / 'assets/test_html_data/simplify_output/nav_class_output.html' - with open(out_path, "w") as fp: - fp.write(raw_tag_html) - def test_tag_simplifier_block_select(self): file_path = base_dir / 'assets/test_html_data/simplify_cases/block_select.html' with open(file_path, 'r', encoding='utf-8') as file: raw_html = file.read() + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} pre_data = PreDataJson(data_dict) + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') + simple_id_count = self.check_and_find_max_item_id(simplifier_raw_html) + self.assertEqual(simple_id_count, 7) + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') - _item_id_count = simplifier_raw_html.count('_item_id') - self.assertEqual(_item_id_count, 7) + tag_id_count = self.check_and_find_max_item_id(raw_tag_html) + self.assertEqual(tag_id_count, simple_id_count) id_dom = html.fromstring(raw_tag_html) # 用xpath定位元素,该元素是块级元素且内部不包含块级元素,但该元素本身没有cc-select,只是其内部元素有cc-select @@ -354,10 +415,6 @@ def test_tag_simplifier_block_select(self): self.assertIsNotNone(p_element.get("_item_id")) self.assertIsNotNone(p_element.get("cc-select")) - out_path = base_dir / 
'assets/test_html_data/simplify_output/block_select_output.html' - with open(out_path, "w") as fp: - fp.write(raw_tag_html) - if __name__ == '__main__': unittest.main()