Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions llm_web_kit/main_html_parser/parser/layout_batch_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
from hashlib import sha256

import nltk
from bs4 import BeautifulSoup
from lxml import html
from lxml.html import etree
from nltk.tokenize import word_tokenize
from selectolax.parser import HTMLParser

from llm_web_kit.html_layout.html_layout_cosin import get_feature, similarity
from llm_web_kit.input.pre_data_json import PreDataJson, PreDataJsonKey
Expand Down Expand Up @@ -43,8 +43,8 @@ def parse_tuple_key(self, key_str):
def parse(self, pre_data: PreDataJson) -> PreDataJson:
# Supports input given either as a raw string or as the dict object produced by tag mapping
html_source = pre_data[PreDataJsonKey.HTML_SOURCE]
soup = BeautifulSoup(html_source, 'html.parser')
html_source = str(soup)
selectolax_tree = HTMLParser(html_source)
html_source = selectolax_tree.html
template_dict_html = pre_data.get(PreDataJsonKey.TYPICAL_DICT_HTML, '<html></html>')
self.dynamic_id_enable = pre_data.get(PreDataJsonKey.DYNAMIC_ID_ENABLE, False)
self.dynamic_classid_enable = pre_data.get(PreDataJsonKey.DYNAMIC_CLASSID_ENABLE, False)
Expand Down Expand Up @@ -146,7 +146,7 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab
length = len(self.get_tokens(element.text_content().strip()))
length_tail = 0
text = element.xpath('string()').strip()
is_natural_language = self.__is_natural_language(text)
is_natural_language = self.__is_natural_language(text) or length_tail >= 10
if element.tail:
length_tail = len(element.tail.strip())
idd = element.get('id')
Expand Down Expand Up @@ -276,7 +276,7 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab
# Determine whether the current node is a "red" node
if keyy in layer_nodes_dict:
if 'red' not in layer_nodes_dict[keyy]:
if self.more_noise_enable and tag in ['p', 'ul'] and not idd and not class_tag and is_natural_language:
if self.more_noise_enable and tag in ['p', 'ul', 'br'] and not idd and is_natural_language:
label = 'red'
else:
parent = element.getparent()
Expand Down Expand Up @@ -306,7 +306,7 @@ def drop_node_element(self, html_source, element_dict, template_dict_html):

def htmll_to_content2(self, body_str):
body = html.fromstring(body_str)
tags_to_remove = ['header', 'footer', 'nav', 'aside', 'script', 'style']
tags_to_remove = ['footer', 'nav', 'aside', 'script', 'style']
for tag in tags_to_remove:
for element in list(body.xpath(f'//{tag}')):
prev = element.getprevious()
Expand Down
16 changes: 11 additions & 5 deletions llm_web_kit/main_html_parser/parser/tag_mapping.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from bs4 import BeautifulSoup
from lxml import etree, html
from selectolax.parser import HTMLParser

from llm_web_kit.exception.exception import TagMappingParserException
from llm_web_kit.html_layout.html_layout_cosin import get_feature, similarity
Expand Down Expand Up @@ -32,8 +32,8 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson:
# Tag mapping logic
try:
template_raw_html = pre_data[PreDataJsonKey.TYPICAL_RAW_HTML]
soup = BeautifulSoup(template_raw_html, 'html.parser')
template_raw_html = str(soup)
selectolax_tree = HTMLParser(template_raw_html)
template_raw_html = selectolax_tree.html
template_tag_html = pre_data[PreDataJsonKey.TYPICAL_RAW_TAG_HTML]
response_json = pre_data[PreDataJsonKey.LLM_RESPONSE]
root = html.fromstring(template_tag_html)
Expand Down Expand Up @@ -149,7 +149,10 @@ def deal_element_direct(self, item_id, test_root):
deal_element = elements[0]
deal_element.set('magic_main_html', 'True')
for ele in deal_element:
ele.set('magic_main_html', 'True')
try:
ele.set('magic_main_html', 'True')
except Exception:
continue

def find_affected_element_after_drop(self, element):
prev_sibling = element.getprevious()
Expand All @@ -159,7 +162,10 @@ def find_affected_element_after_drop(self, element):
if len(element) > 0:
if is_main:
for ele in element:
ele.set('magic_main_html', 'True')
try:
ele.set('magic_main_html', 'True')
except Exception:
continue

element.drop_tag()
# If the element contains child tags and also has text, that text may be the tail of a sibling node
Expand Down
1 change: 1 addition & 0 deletions requirements/runtime.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ overrides==7.7.0
py-asciimath==0.3.0
pyahocorasick==2.0.0
scikit-learn>=1.6.1
selectolax==0.3.33
torch>=2.3.0
tqdm==4.67.1
transformers==4.40.2
Expand Down
Loading