diff --git a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py index ce76ef70..616d72e9 100644 --- a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py +++ b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py @@ -3,10 +3,10 @@ from hashlib import sha256 import nltk -from bs4 import BeautifulSoup from lxml import html from lxml.html import etree from nltk.tokenize import word_tokenize +from selectolax.parser import HTMLParser from llm_web_kit.html_layout.html_layout_cosin import get_feature, similarity from llm_web_kit.input.pre_data_json import PreDataJson, PreDataJsonKey @@ -43,8 +43,8 @@ def parse_tuple_key(self, key_str): def parse(self, pre_data: PreDataJson) -> PreDataJson: # 支持输入字符串和tag mapping后的dict对象 html_source = pre_data[PreDataJsonKey.HTML_SOURCE] - soup = BeautifulSoup(html_source, 'html.parser') - html_source = str(soup) + selectolax_tree = HTMLParser(html_source) + html_source = selectolax_tree.html template_dict_html = pre_data.get(PreDataJsonKey.TYPICAL_DICT_HTML, '') self.dynamic_id_enable = pre_data.get(PreDataJsonKey.DYNAMIC_ID_ENABLE, False) self.dynamic_classid_enable = pre_data.get(PreDataJsonKey.DYNAMIC_CLASSID_ENABLE, False) @@ -146,7 +146,7 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab length = len(self.get_tokens(element.text_content().strip())) length_tail = 0 text = element.xpath('string()').strip() - is_natural_language = self.__is_natural_language(text) + is_natural_language = self.__is_natural_language(text) or length_tail >= 10 if element.tail: length_tail = len(element.tail.strip()) idd = element.get('id') @@ -276,7 +276,7 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab # 判断当前节点是否是红色节点 if keyy in layer_nodes_dict: if 'red' not in layer_nodes_dict[keyy]: - if self.more_noise_enable and tag in ['p', 'ul'] and not idd and not class_tag and is_natural_language: + if 
self.more_noise_enable and tag in ['p', 'ul', 'br'] and not idd and is_natural_language: label = 'red' else: parent = element.getparent() @@ -306,7 +306,7 @@ def drop_node_element(self, html_source, element_dict, template_dict_html): def htmll_to_content2(self, body_str): body = html.fromstring(body_str) - tags_to_remove = ['header', 'footer', 'nav', 'aside', 'script', 'style'] + tags_to_remove = ['footer', 'nav', 'aside', 'script', 'style'] for tag in tags_to_remove: for element in list(body.xpath(f'//{tag}')): prev = element.getprevious() diff --git a/llm_web_kit/main_html_parser/parser/tag_mapping.py b/llm_web_kit/main_html_parser/parser/tag_mapping.py index c635615d..b05a4baf 100644 --- a/llm_web_kit/main_html_parser/parser/tag_mapping.py +++ b/llm_web_kit/main_html_parser/parser/tag_mapping.py @@ -1,5 +1,5 @@ -from bs4 import BeautifulSoup from lxml import etree, html +from selectolax.parser import HTMLParser from llm_web_kit.exception.exception import TagMappingParserException from llm_web_kit.html_layout.html_layout_cosin import get_feature, similarity @@ -32,8 +32,8 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson: # tag映射逻辑 try: template_raw_html = pre_data[PreDataJsonKey.TYPICAL_RAW_HTML] - soup = BeautifulSoup(template_raw_html, 'html.parser') - template_raw_html = str(soup) + selectolax_tree = HTMLParser(template_raw_html) + template_raw_html = selectolax_tree.html template_tag_html = pre_data[PreDataJsonKey.TYPICAL_RAW_TAG_HTML] response_json = pre_data[PreDataJsonKey.LLM_RESPONSE] root = html.fromstring(template_tag_html) @@ -149,7 +149,10 @@ def deal_element_direct(self, item_id, test_root): deal_element = elements[0] deal_element.set('magic_main_html', 'True') for ele in deal_element: - ele.set('magic_main_html', 'True') + try: + ele.set('magic_main_html', 'True') + except Exception: + continue def find_affected_element_after_drop(self, element): prev_sibling = element.getprevious() @@ -159,7 +162,10 @@ def 
find_affected_element_after_drop(self, element): if len(element) > 0: if is_main: for ele in element: - ele.set('magic_main_html', 'True') + try: + ele.set('magic_main_html', 'True') + except Exception: + continue element.drop_tag() # 如果包含子tag并且还有text,text有可能是兄弟节点的tail diff --git a/requirements/runtime.txt b/requirements/runtime.txt index c54ad7f4..f6ddacfe 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -23,6 +23,7 @@ overrides==7.7.0 py-asciimath==0.3.0 pyahocorasick==2.0.0 scikit-learn>=1.6.1 +selectolax==0.3.33 torch>=2.3.0 tqdm==4.67.1 transformers==4.40.2 diff --git a/tests/llm_web_kit/main_html_parser/assets/test_illegal_tag.html b/tests/llm_web_kit/main_html_parser/assets/test_illegal_tag.html new file mode 100644 index 00000000..41f20d17 --- /dev/null +++ b/tests/llm_web_kit/main_html_parser/assets/test_illegal_tag.html @@ -0,0 +1,177 @@ + +
+ + + + + + +
845 Sullivan Ave
+South Windsor, CT 06074-2047
+(860) 664-1001 Other
+Activities: Ice cream, batting cages
2007 Ratings 2011 +Ratings
+Difficulty: 4 Difficulty: +7
+Creativity: 2 Creativity: 8
+Atmosphere: 8 Atmosphere: 8
++
Perfect Swing must be named for
+its batting cages considering no one’s really taking swings in mini
+golf. The Putting Penguin played this course on a beautiful sunny
+Sunday in early May, and we were the only people there! Really it
+was a shame because it was a good course. We hope they get a website
+running or do some more advertising.
This course looks like many other
+tournament-style courses. Most of the holes were a variation of a
+curve and some type of bump in the course. It looks pretty
+generic. So did the score card. It has no course name on
+it or the par for each hole. The cups were a little too shallow on the front
+nine, and the ball hopped out on a number of occasions. There are
+flags in the cup at each hole, which is a minor annoyance because you have to
+remove and replace them throughout play. The water wasn’t running,
+but it was early in the season. Hole 13 was a 3
+tiered hole and was neat except for the pvc pipe that was a little beat
+up and didn’t allow you to get the ball in the hole at the
+bottom. There were some small lumps in the mats on the back nine
+that we weren’t sure if they were purposeful or a problem with the mats, but they
+didn’t seem to interfere too much with the movement of the ball.
Despite these criticisms we
+enjoyed this course. The layout and vegetation looked great, and the
+course was well maintained. Considering how prices have been going
+up for miniature golf we thought the $5.50 was quite reasonable. The
+location is great; it’s just down the street from the Buckland Hills Mall and
+Evergreen Walk. We would recommend this course to
+everyone. They even have Gifford’s ice cream!
2011 Update
It has been four years since we
+have reviewed Perfect Swing, and right off the bat we have to say it is well
+worth the visit if you live in the South Windsor area. It hasn’t even changed its price!
One noticeable difference between
+our review in 2007 and our update is that we have changed the Creativity score
+quite a bit. After playing over 100
+courses, we have to admit that there are hole designs at Perfect Swing that we
+have never seen at any other courses. We
+also noticed this time that the “variation of a curve and some type of bump in
+the course” appear to be purposeful. If
+someone were to play this course a few times it would be possible to putt along
+these curves and bumps and make an easy 2 or even an ace. We’re thinking we must not have had that kind
+of appreciation for the course the first time we played there.
We still don’t like the rubber
+starting mats, but the course no longer has the flags in the cups, which we’re
+happy about. Unfortunately, now there
+are no hole numbers on the course so a player needs to pay attention while
+keeping score on the score card. The
+scorecards themselves are still generic with no pars listed, some of the cups
+are still too shallow, some of the carpets need a little work, and the pvc pipe
+on hole 13 is still beat up. But on a
+positive note, the landscaping is still excellently maintained, and there’s a
+lot of space between the holes.
The Putting Penguin still highly
+recommends this course!
Reviewed by Pat, Mandy and Putt
+Reviewed in 2007 & 2011
++ +
Course Pictures +(click to enlarge)
+You can also see pictures of all 18 holes on our Flickr +Page
+Map
Price: +$5
+Par:
+41
Difficulty: +3
+Creativity:
+3
Atmosphere: +2
++
This +course was wonderfully reviewed in the past but seems to have fallen into +disrepair & neglect. It was early in the season but water features were not +on, holes were not numbered & carpeting was not well maintained.
+Reviewed +by Glenda R
+Reviewed +in 2018
+