From e46bbfab9c5f26a57de0c9f21fb2e65b1b564785 Mon Sep 17 00:00:00 2001 From: liukaiwen Date: Fri, 22 Aug 2025 16:49:52 +0800 Subject: [PATCH 1/2] : fix main html loss due to br tail and p tag --- .../parser/layout_batch_parser.py | 12 ++++++------ .../main_html_parser/parser/tag_mapping.py | 16 +++++++++++----- requirements/runtime.txt | 1 + .../wdi_main_html.html | 4 ++-- .../parser/test_layout_parser.py | 3 ++- 5 files changed, 22 insertions(+), 14 deletions(-) diff --git a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py index ce76ef70..616d72e9 100644 --- a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py +++ b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py @@ -3,10 +3,10 @@ from hashlib import sha256 import nltk -from bs4 import BeautifulSoup from lxml import html from lxml.html import etree from nltk.tokenize import word_tokenize +from selectolax.parser import HTMLParser from llm_web_kit.html_layout.html_layout_cosin import get_feature, similarity from llm_web_kit.input.pre_data_json import PreDataJson, PreDataJsonKey @@ -43,8 +43,8 @@ def parse_tuple_key(self, key_str): def parse(self, pre_data: PreDataJson) -> PreDataJson: # 支持输入字符串和tag mapping后的dict对象 html_source = pre_data[PreDataJsonKey.HTML_SOURCE] - soup = BeautifulSoup(html_source, 'html.parser') - html_source = str(soup) + selectolax_tree = HTMLParser(html_source) + html_source = selectolax_tree.html template_dict_html = pre_data.get(PreDataJsonKey.TYPICAL_DICT_HTML, '') self.dynamic_id_enable = pre_data.get(PreDataJsonKey.DYNAMIC_ID_ENABLE, False) self.dynamic_classid_enable = pre_data.get(PreDataJsonKey.DYNAMIC_CLASSID_ENABLE, False) @@ -146,7 +146,7 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab length = len(self.get_tokens(element.text_content().strip())) length_tail = 0 text = element.xpath('string()').strip() - is_natural_language = self.__is_natural_language(text) + is_natural_language = self.__is_natural_language(text) or length_tail >= 10 if element.tail: length_tail = len(element.tail.strip()) idd = element.get('id') @@ -276,7 +276,7 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab # 判断当前节点是否是红色节点 if keyy in layer_nodes_dict: if 'red' not in layer_nodes_dict[keyy]: - if self.more_noise_enable and tag in ['p', 'ul'] and not idd and not class_tag and is_natural_language: + if self.more_noise_enable and tag in ['p', 'ul', 'br'] and not idd and is_natural_language: label = 'red' else: parent = element.getparent() @@ -306,7 +306,7 @@ def drop_node_element(self, html_source, element_dict, template_dict_html): def htmll_to_content2(self, body_str): body = html.fromstring(body_str) - tags_to_remove = ['header', 'footer', 'nav', 'aside', 'script', 'style'] + tags_to_remove = ['footer', 'nav', 'aside', 'script', 'style'] for tag in tags_to_remove: for element in list(body.xpath(f'//{tag}')): prev = element.getprevious() diff --git a/llm_web_kit/main_html_parser/parser/tag_mapping.py b/llm_web_kit/main_html_parser/parser/tag_mapping.py index c635615d..b05a4baf 100644 --- a/llm_web_kit/main_html_parser/parser/tag_mapping.py +++ b/llm_web_kit/main_html_parser/parser/tag_mapping.py @@ -1,5 +1,5 @@ -from bs4 import BeautifulSoup from lxml import etree, html +from selectolax.parser import HTMLParser from llm_web_kit.exception.exception import TagMappingParserException from llm_web_kit.html_layout.html_layout_cosin import get_feature, similarity @@ -32,8 +32,8 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson: # tag映射逻辑 try: template_raw_html = pre_data[PreDataJsonKey.TYPICAL_RAW_HTML] - soup = BeautifulSoup(template_raw_html, 'html.parser') - template_raw_html = str(soup) + selectolax_tree = HTMLParser(template_raw_html) + template_raw_html = selectolax_tree.html template_tag_html = pre_data[PreDataJsonKey.TYPICAL_RAW_TAG_HTML] response_json = pre_data[PreDataJsonKey.LLM_RESPONSE] root = html.fromstring(template_tag_html) @@ -149,7 +149,10 @@ def deal_element_direct(self, item_id, test_root): deal_element = elements[0] deal_element.set('magic_main_html', 'True') for ele in deal_element: - ele.set('magic_main_html', 'True') + try: + ele.set('magic_main_html', 'True') + except Exception: + continue def find_affected_element_after_drop(self, element): prev_sibling = element.getprevious() @@ -159,7 +162,10 @@ def find_affected_element_after_drop(self, element): if len(element) > 0: if is_main: for ele in element: - ele.set('magic_main_html', 'True') + try: + ele.set('magic_main_html', 'True') + except Exception: + continue element.drop_tag() # 如果包含子tag并且还有text,text有可能是兄弟节点的tail diff --git a/requirements/runtime.txt b/requirements/runtime.txt index c54ad7f4..f6ddacfe 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -23,6 +23,7 @@ overrides==7.7.0 py-asciimath==0.3.0 pyahocorasick==2.0.0 scikit-learn>=1.6.1 +selectolax==0.3.33 torch>=2.3.0 tqdm==4.67.1 transformers==4.40.2 diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/output_layout_batch_parser/wdi_main_html.html b/tests/llm_web_kit/main_html_parser/parser/assets/output_layout_batch_parser/wdi_main_html.html index 74a3c3fa..582fc102 100644 --- a/tests/llm_web_kit/main_html_parser/parser/assets/output_layout_batch_parser/wdi_main_html.html +++ b/tests/llm_web_kit/main_html_parser/parser/assets/output_layout_batch_parser/wdi_main_html.html @@ -1,4 +1,4 @@ -('\n' +'\n' '\n' ' + + + + + + + +Perfect Swing + + + + + +
+

*Course Closed – Latest Update 1/2/10*

+

+Perfect Swing                                                                        Return +to Connecticut +pageCity Mini Golf +

+

 

+

845 Sullivan Ave
+
South Windsor, CT 06074-2047

+
(860) 664-1001                                                                       Other +Activities: Ice cream, batting cages

+

 

+

Cost: $5.50

+

Par: None listed

+

 

+

2007 Ratings                                       2011 +Ratings 

+

Difficulty: 4                                       Difficulty: +7

+

Creativity: 2                                      Creativity: 8

+

Atmosphere: 8                                   Atmosphere: 8

+

 

+

Perfect Swing must be named for +its batting cages considering no one’s really taking swings in mini +golf.  The Putting Penguin played this course on a beautiful sunny +Sunday in early May, and we were the only people there!  Really it +was a shame because it was a good course.  We hope they get a website +running or do some more advertising.

+

 

+

This course looks like many other +tournament-style courses.  Most of the holes were a variation of a +curve and some type of bump in the course.  It looks pretty +generic.  So did the score card.  It has no course name on +it or the par for each hole. The cups were a little too shallow on the front +nine, and the ball hopped out on a number of occasions.  There are +flags in the cup at each hole, which is a minor annoyance because you have to +remove and replace them throughout play.  The water wasn’t running, +but it was early in the season.  Hole 13 was a 3 +tiered hole and was neat except for the pvc pipe that was a little beat +up and didn’t allow you to get the ball in the hole at the +bottom.  There were some small lumps in the mats on the back nine +that we weren’t sure if they were purposeful or a problem with the mats, but they +didn’t seem to interfere too much with the movement of the ball.

+

 

+

Despite these criticisms we +enjoyed this course.  The layout and vegetation looked great, and the +course was well maintained.  Considering how prices have been going +up for miniature golf we thought the $5.50 was quite reasonable.  The +location is great; it’s just down the street from the Buckland Hills Mall and +Evergreen Walk.  We would recommend this course to +everyone.  They even have Gifford’s ice cream!

+

 

+

2011 Update

+

It has been four years since we +have reviewed Perfect Swing, and right off the bat we have to say it is well +worth the visit if you live in the South Windsor area.  It hasn’t even changed its price!

+

 

+

One noticeable difference between +our review in 2007 and our update is that we have changed the Creativity score +quite a bit.  After playing over 100 +courses, we have to admit that there are hole designs at Perfect Swing that we +have never seen any other courses.  We +also noticed this time that the “variation of a curve and some type of bump in +the course” appear to be purposeful.  If +someone were to play this course a few times it would be possible to putt along +these curves and bumps and make an easy 2 or even an ace.  We’re thinking we must not have had that kind +of appreciation for the course the first time we played there.

+

 

+

We still don’t like the rubber +starting mats, but the course no longer has the flags in the cups, which we’re +happy about.  Unfortunately, now there +are no hole numbers on the course so a player needs to pay attention while +keeping score on the score card.  The +scorecards themselves are still generic with no pars listed, some of the cups +are still too shallow, some of the carpets need a little work, and the pvc pipe +on hole 13 is still beat up.  But on a +positive note, the landscaping is still excellently maintained, and there’s a +lot of space between the holes.

+

 

+

The Putting Penguin still highly +recommends this course!

+

 

+

 

+

Reviewed by Pat, Mandy and Putt

+

Reviewed in 2007 & 2011

+

 

+

Visitor Review #1

+

 

+

REVIEW IT

+

 

+

Course Pictures +(click to enlarge)

+

 

+

Perfect Swing Miniature Golf                Perfect Swing Miniature Golf                Perfect Swing Miniature Golf

+

 

+

Perfect Swing Miniature Golf                Perfect Swing Miniature Golf

+

 

+

You can also see pictures of all 18 holes on our Flickr +Page

+

 

+

Map

+

 

+ +


+View +Larger Map

+

 

+

 

+

Visitor Review #1 +(2018)

+

 

+

Price: +$5        

+

Par: +41

+

 

+

Difficulty: +3

+

Creativity: +3

+

Atmosphere: +2

+

 

+

This +course was wonderfully reviewed in the past but seems to have fallen into +disrepair & neglect. It was early in the season but water features were not +on, holes were not numbered & carpeting was not well maintained.

+

 

+

Reviewed +by Glenda R

+

Reviewed +in 2018

+

 

+
+ + \ No newline at end of file diff --git a/tests/llm_web_kit/main_html_parser/assets/test_illegal_tag.json b/tests/llm_web_kit/main_html_parser/assets/test_illegal_tag.json new file mode 100644 index 00000000..ca2abf11 --- /dev/null +++ b/tests/llm_web_kit/main_html_parser/assets/test_illegal_tag.json @@ -0,0 +1,38 @@ +{ + "item_id 1": 0, + "item_id 2": 1, + "item_id 3": 1, + "item_id 4": 1, + "item_id 5": 1, + "item_id 6": 1, + "item_id 7": 1, + "item_id 8": 1, + "item_id 9": 1, + "item_id 10": 1, + "item_id 11": 1, + "item_id 12": 1, + "item_id 13": 1, + "item_id 14": 1, + "item_id 15": 1, + "item_id 16": 1, + "item_id 17": 1, + "item_id 18": 0, + "item_id 19": 0, + "item_id 20": 0, + "item_id 21": 0, + "item_id 22": 0, + "item_id 23": 0, + "item_id 24": 0, + "item_id 25": 0, + "item_id 26": 0, + "item_id 27": 0, + "item_id 28": 0, + "item_id 29": 0, + "item_id 30": 0, + "item_id 31": 0, + "item_id 32": 0, + "item_id 33": 0, + "item_id 34": 0, + "item_id 35": 0, + "item_id 36": 0 +} \ No newline at end of file diff --git a/tests/llm_web_kit/main_html_parser/processor/test_tag_mapping.py b/tests/llm_web_kit/main_html_parser/processor/test_tag_mapping.py index 2ae8341c..5d3d9f62 100644 --- a/tests/llm_web_kit/main_html_parser/processor/test_tag_mapping.py +++ b/tests/llm_web_kit/main_html_parser/processor/test_tag_mapping.py @@ -95,3 +95,19 @@ def test_parse_single_empty(self): parser = MapItemToHtmlTagsParser({}) pre_data = parser.parse_single(pre_data) self.assertEqual(pre_data['typical_main_html'], '') + + def test_illegal_tag(self): + # 构造测试html + typical_raw_tag_html = base_dir.joinpath('assets/test_illegal_tag.html').read_text( + encoding='utf-8') + # 简化网页 + # 模型结果格式改写 + llm_path = 'assets/test_illegal_tag.json' + llm_response = json.loads(base_dir.joinpath(llm_path).read_text(encoding='utf-8')) + pre_data = {'typical_raw_tag_html': typical_raw_tag_html, 'typical_raw_html': typical_raw_tag_html, + 'llm_response': llm_response} + pre_data = PreDataJson(pre_data) + # 映射 + parser = MapItemToHtmlTagsParser({}) + pre_data = parser.parse(pre_data) + assert 'This course looks' in pre_data[PreDataJsonKey.TYPICAL_MAIN_HTML]