Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 36 additions & 11 deletions llm_web_kit/main_html_parser/parser/layout_batch_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from hashlib import sha256

import nltk
from bs4 import BeautifulSoup
from lxml import html
from lxml.html import etree
from nltk.tokenize import word_tokenize
Expand All @@ -29,6 +30,7 @@ def __init__(self, template_data: str | dict):
self.dynamic_classid_enable = False
self.more_noise_enable = False
self.dynamic_classid_similarity_threshold = 0.85
self.ids = dict()

def parse_tuple_key(self, key_str):
if key_str.startswith('(') and key_str.endswith(')'):
Expand All @@ -41,6 +43,8 @@ def parse_tuple_key(self, key_str):
def parse(self, pre_data: PreDataJson) -> PreDataJson:
# 支持输入字符串和tag mapping后的dict对象
html_source = pre_data[PreDataJsonKey.HTML_SOURCE]
soup = BeautifulSoup(html_source, 'html.parser')
html_source = str(soup)
template_dict_html = pre_data.get(PreDataJsonKey.TYPICAL_DICT_HTML, '<html></html>')
self.dynamic_id_enable = pre_data.get(PreDataJsonKey.DYNAMIC_ID_ENABLE, False)
self.dynamic_classid_enable = pre_data.get(PreDataJsonKey.DYNAMIC_CLASSID_ENABLE, False)
Expand Down Expand Up @@ -112,13 +116,19 @@ def normalize_key(self, tup):
tag, class_id, idd = tup
if class_id:
class_id = re.sub(r' +', ' ', class_id)

if idd:
valid_id = self.ids.get(idd, True)
idd = re.sub(r' +', ' ', idd)

# 如果有id,则无需判断class,因为有的网页和模版id相同,但是class不同
if tag in ['body', 'html']:
return (tag, None, None)
if idd:
return (tag, None, self.replace_post_number(idd))

if idd and valid_id:
idd_norm = self.replace_post_number(idd)
return (tag, None, idd_norm)

return (tag, self.replace_post_number(class_id), self.replace_post_number(idd))

def replace_post_number(self, text):
Expand All @@ -129,7 +139,7 @@ def replace_post_number(self, text):
# 使用 \1 保留前面的 "post" 或 "postid",但替换数字部分
return re.sub(pattern, lambda m: f'{m.group(1)}-', text, flags=re.IGNORECASE).strip()

def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_label, template_doc):
def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_label, template_doc, tree):
# 判断这个tag是否有id
if isinstance(element, etree._Comment):
return
Expand All @@ -144,8 +154,16 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab
layer_nodes = element_dict[depth]
class_tag = element.get('class')
ori_keyy = (tag, class_tag, idd)
if idd and idd.strip():
try:
idd_ele = tree.xpath(f'//*[@id="{idd}"]')
if len(idd_ele) > 3:
self.ids[idd] = False
else:
self.ids[idd] = True
except Exception:
self.ids[idd] = True
keyy = self.normalize_key(ori_keyy)

# 获取element的当前层的所有节点
element_parent = element.getparent()
current_layer_keys = {}
Expand All @@ -167,6 +185,16 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab
layer_norm_eles = {}
# 构造当前层的候选映射字典
for ele_keyy, ele_value in layer_nodes.items():
layer_node_idd = ele_keyy[2]
if layer_node_idd and layer_node_idd.strip() and layer_node_idd not in self.ids:
try:
idd_ele = template_doc.xpath(f'//*[@id="{layer_node_idd}"]')
if len(idd_ele) > 3:
self.ids[layer_node_idd] = False
else:
self.ids[layer_node_idd] = self.ids.get(layer_node_idd, True)
except Exception:
self.ids[layer_node_idd] = self.ids.get(layer_node_idd, True)
ele_parent_keyy = self.normalize_key(ele_value[1])
if ele_parent_keyy is not None:
ele_parent_keyy = tuple(ele_parent_keyy)
Expand Down Expand Up @@ -267,13 +295,13 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab
return

for child in element:
self.find_blocks_drop(child, depth + 1, element_dict, keyy, label, template_doc)
self.find_blocks_drop(child, depth + 1, element_dict, keyy, label, template_doc, tree)

def drop_node_element(self, html_source, element_dict, template_dict_html):
# 解析 HTML 内容
tree = html_to_element(html_source)
doc = html_to_element(template_dict_html)
self.find_blocks_drop(tree, 0, element_dict, None, '', doc)
self.find_blocks_drop(tree, 0, element_dict, None, '', doc, tree)
return element_to_html(tree)

def htmll_to_content2(self, body_str):
Expand Down Expand Up @@ -408,7 +436,7 @@ def __match_tag(self, layer_nodes, current_layer_key, parent_key, node_html, tem

return None, None, None

def __is_natural_language(self, text, min_words=3):
def __is_natural_language(self, text, min_words=10):
"""判断文本是否像自然语言.

:param text: 输入文本
Expand All @@ -417,7 +445,4 @@ def __is_natural_language(self, text, min_words=3):
"""
# 移除标点符号和多余空格
cleaned_text = re.sub(r'[^\w\s]', '', text.strip())
words = cleaned_text.split()
if len(words) <= min_words:
return False
return True
return len(cleaned_text) >= min_words
3 changes: 3 additions & 0 deletions llm_web_kit/main_html_parser/parser/tag_mapping.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from bs4 import BeautifulSoup
from lxml import etree, html

from llm_web_kit.exception.exception import TagMappingParserException
Expand Down Expand Up @@ -31,6 +32,8 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson:
# tag映射逻辑
try:
template_raw_html = pre_data[PreDataJsonKey.TYPICAL_RAW_HTML]
soup = BeautifulSoup(template_raw_html, 'html.parser')
template_raw_html = str(soup)
template_tag_html = pre_data[PreDataJsonKey.TYPICAL_RAW_TAG_HTML]
response_json = pre_data[PreDataJsonKey.LLM_RESPONSE]
root = html.fromstring(template_tag_html)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>&#72;&#22411;&#38050;&#30340;&#21033;&#29992;&#29575;&#26159;&#20160;&#20040;&#95;&#20844;&#21496;&#26032;&#38395;&#95;&#72;&#22411;&#38050;&#95;&#72;&#22411;&#38050;&#20215;&#26684;&#95;&#28909;&#36711;&#72;&#22411;&#38050;&#95;&#28938;&#25509;&#72;&#22411;&#38050;&#95;&#39640;&#39057;&#28938;&#25509;&#72;&#22411;&#38050;&#45;&#22825;&#27941;&#37070;&#20016;&#36798;&#37329;&#23646;&#21046;&#21697;&#26377;&#38480;&#20844;&#21496;&#32;</title><script language="javascript" type="text/javascript" src="/wwwroot/js/1cd84495-89f6-4669-9ab3-9ac38fbf4386-head.js" charset="utf-8"></script>
<meta http-equiv="Cache-Control" content="no-transform" />
<meta http-equiv="Cache-Control" content="no-siteapp" /><script>var V_PATH="/";window.onerror=function(){ return true; };</script>
<meta name="Keywords" content="" />
<meta name="Description" content="" />
<link type="text/css" rel="stylesheet" href="/theme/tianjinjinshu/css/style.css"/>
<link type="text/css" rel="stylesheet" href="/theme/tianjinjinshu/css/nei.css"/>



<script>var webroot="/",infoid="316",murl="show.asp?id=316",contenturl="http://www.ww787000.com/show.asp?id=316";</script>



</head>
<h1><a href="http://ww787000.com/">缅甸银河国际网址</a></h1>

<div id="lcpn00y8fh49" class="top">
<div id="lcpn00y8fh49" class="header">
<p>您好!歡迎進入天津郎豐達金屬制品有限公司網站 !</p>
<div id="lcpn00y8fh49" class="header_r">
<a href="javascript:;" title="設為首頁" onclick="SetHome(this,'http://www.ww787000.com');">設為首頁</a>|<a href="javascript:;" title="收藏本站" onClick="AddFavorite(document.title,window.location)">收藏本站</a>
</div>
</div>
</div>
<div id="lcpn00y8fh49" class="logo">
<a href="http://www.ww787000.com" title="H型鋼_H型鋼價格_熱軋H型鋼_焊接H型鋼_高頻焊接H型鋼-天津郎豐達金屬制品有限公司"><img src="/theme/tianjinjinshu/images/hx_01.jpg" alt="H型鋼_H型鋼價格_熱軋H型鋼_焊接H型鋼_高頻焊接H型鋼-天津郎豐達金屬制品有限公司"/> </a>
</div>
<div id="lcpn00y8fh49" class="nav">
<ul class="menu">
<li><a href="http://www.ww787000.com" title="網站首頁">網站首頁</a></li>

<li ><a href="/list.asp?classid=1" title="公司簡介">公司簡介</a></li>

<li ><a href="/list.asp?classid=2" title="產品中心">產品中心</a></li>

<li class="libh"><a href="/list.asp?classid=3" title="新聞中心">新聞中心</a></li>

<li ><a href="/list.asp?classid=4" title="庫房現貨">庫房現貨</a></li>

<li ><a href="/list.asp?classid=5" title="應用案例">應用案例</a></li>

<li ><a href="/list.asp?classid=6" title="聯系我們">聯系我們</a></li>

<li class="linob"><a href="/plug/book.asp" title="在線留言">在線留言</a></li>
</ul>
</div>
<div id="lcpn00y8fh49" class="banner">
<div id="lcpn00y8fh49" class="slide-box">
<div id="lcpn00y8fh49" class="slide"><li style="background:url(/theme/tianjinjinshu/images/banner1.jpg) center center no-repeat; width:100%; height:390px;list-style:none;" alt="幻燈1" title="幻燈1"></li></div>
<div id="lcpn00y8fh49" class="slide"><li style="background:url(/theme/tianjinjinshu/images/banner2.jpg) center center no-repeat; width:100%; height:390px;list-style:none;" alt="幻燈2" title="幻燈2"></li></div>
</div>
<div id="lcpn00y8fh49" class="item">
<a class="cur"></a><a></a>
</div>
</div>
<div id="lcpn00y8fh49" class="ss">
<div id="lcpn00y8fh49" class="sscon">
<div id="lcpn00y8fh49" class="ssH">
<span>熱門關鍵詞:</span>  

<a href="/list.asp?classid=22" title="H型鋼">H型鋼</a>

<a href="/list.asp?classid=30" title="鍍鋅H型鋼">鍍鋅H型鋼</a>

<a href="/list.asp?classid=32" title="焊接H型鋼">焊接H型鋼</a>

<a href="/list.asp?classid=33" title="高頻焊接H型鋼">高頻焊接H型鋼</a>

<a href="/list.asp?classid=29" title="其他型鋼">其他型鋼</a>

</div>
<div id="lcpn00y8fh49" class="search">
<form action="/plug/search.asp" method="get" onsubmit="return checksearch(this)">
<input type="text" name="key" value="請輸入關鍵字" onfocus="if(this.value==defaultValue)this.value=''" onblur="if(this.value=='')this.value=defaultValue" class="txt"/>
<input type="submit" value="搜索" class="btn"/>
</form>
</div>
</div>
</div>

<div id="lcpn00y8fh49" class="main">
<div id="lcpn00y8fh49" class="main_left">
<div id="lcpn00y8fh49" class="leftitem">
<div id="lcpn00y8fh49" class="tit">欄目導航</div>
<div id="lcpn00y8fh49" class="cates">
<ul>


<li ><a href="/list.asp?classid=8" title="行業資訊">行業資訊</a></li>

<li class="cur"><a href="/list.asp?classid=7" title="公司新聞">公司新聞</a></li>


</ul>
</div>
</div>
<div id="lcpn00y8fh49" class="leftitem">
<div id="lcpn00y8fh49" class="tit"> 聯系我們 </div>
<div id="lcpn00y8fh49" class="lianxi">
<dl>
<dt>
<div>服務熱線</div>
<div id="lcpn00y8fh49" class="telx"><p>022-85103518<br/>13612183033</p></div>
</dt>
<p>聯系人:李經理</p><p>電話:022-85103518</p><p>手機:13612183033<br/></p><p><br/></p>
</dl>
</div>
</div>
</div>
<div id="lcpn00y8fh49" class="main_right">
<div id="lcpn00y8fh49" class="sitemap">當前位置:<a href="/" title="首頁">首頁</a> > <a href="/list.asp?classid=3" title="新聞中心">新聞中心</a> > <a href="/list.asp?classid=7" title="公司新聞">公司新聞</a></div>
<div id="lcpn00y8fh49" class="content">
<div id="lcpn00y8fh49" class="news_xaingxi">H型鋼的利用率是什么</div>
<div id="lcpn00y8fh49" class="news_author">
瀏覽:<span id="hits">167</span> 發布日期:2019-04-25 11:42:00
</div>
<div id="lcpn00y8fh49" class="news_content"><p style="text-align: center;">  <a href="http://www.ww787000.com/show.asp?id=170" title="H型鋼" target="_blank" class="sitelink">H型鋼</a>在進行操作時主要是指以熱軋或者是冷軋帶鋼為原料,在常溫的狀態下經壓力加工制成的各種復雜斷面型材,亦稱薄壁型鋼,是輕型建筑結構鋼材的一種。<a href="http://www.ww787000.com/list.asp?classid=22" title="H型鋼" target="_blank" class="sitelink">H型鋼</a>是以熱軋或冷軋帶鋼為坯料經彎曲成型制成的各種截面形狀尺寸的型鋼。</p><p>  <a href="http://www.ww787000.com/" title="H型鋼" target="_blank" class="sitelink">H型鋼</a>具有以下特點</p><p>缅甸银河国际网址  1。
截面經濟合理,節省材料,其H型鋼的截面形狀是可以根據需要設計,結構合理,單位重量的截面系數高于熱軋型鋼。在同樣負荷下,可減輕構件重量,節約材料。H型鋼用于建筑結構可比熱軋型鋼節約金屬38%~50%,用于農業機械和車輛可節約金屬15%~60%。方便施工,降低綜合費用。</p><p>  2. H型鋼的品種繁多,在進行操作時可以生產用一般熱軋方法難以生產的壁厚均勻、截面形狀復雜的各種型材和各種不同材質的H型鋼。</p><p style="text-align: center;"><img src="http://www.ww787000.com/upfile/201904/2019041535569517.jpg"/></p><p>  3。產品表面光潔,外觀好,尺寸精確,而且長度也可以根據需要靈活調整,全部按定尺或倍尺供應,提高材料的利用率。</p><p>缅甸银河国际网址  4.生產中還可與沖孔等工序相配合,以滿足不同的需要。</p><p>  H型鋼主要是采用其普通的碳素結構鋼以及優質的碳素結構鋼、低合金結構鋼板或鋼帶冷彎制成。H型鋼是屬于經濟斷面鋼材,也是高效節能材料,是一種具有強大生命力的新型鋼材品種,它廣泛應用于國家經濟的各個領域,其用途大約可以分為公路護欄板、鋼結構、汽車、集裝箱、鋼模板和腳手架、鐵道車輛、船舶和橋梁、鋼板樁、輸電鐵塔、其他10大類。</p><p><br/></p></div>
<div id="lcpn00y8fh49" class="pagebar">
<p class="pl">上一篇:


<a href="/show.asp?id=308" title="H型鋼的特點分析">H型鋼的特點分析</a>

</p>
<p class="pr">下一篇:


<a href="/show.asp?id=318" title="H型鋼的結構及除銹問題">H型鋼的結構及除銹問題</a>

</p>
</div>
</div>
</div>
</div>

<div id="lcpn00y8fh49" class="footer">
<div id="lcpn00y8fh49" class="fnav">
<a href="/" title="網站首頁" >網站首頁</a>-

<a href="/list.asp?classid=1" title="公司簡介">公司簡介</a>-

<a href="/list.asp?classid=2" title="產品中心">產品中心</a>-

<a href="/list.asp?classid=3" title="新聞中心">新聞中心</a>-

<a href="/list.asp?classid=4" title="庫房現貨">庫房現貨</a>-

<a href="/list.asp?classid=5" title="應用案例">應用案例</a>-

<a href="/list.asp?classid=6" title="聯系我們">聯系我們</a>-

<a href="/plug/book.asp" title="在線留言">在線留言</a>
</div>
<div id="lcpn00y8fh49" class="fb">
<p>聯系人:李經理 &nbsp; 電話:022-85103518 &nbsp; 手機:13612183033&nbsp; 傳真:022-65670668</p><p><br/></p>
<p>缅甸银河国际网址Copyright @ 2017 天津郎豐達金屬制品有限公司 All Rights Reserved。 &nbsp;&nbsp; </p>
<p>天津郎豐達金屬材料有限公司批發<a href="http://www.ww787000.com">H型鋼</a>、<a href="http://www.ww787000.com">H型鋼價格</a>和<a href="http://www.ww787000.com">熱軋H型鋼</a>,<a href="http://www.ww787000.com">焊接H型鋼</a>、<a href="http://www.ww787000.com">高頻焊H型鋼</a>批發兼零售,批發電話022-85103518<br></p>
</div>
</div>

<script language="javascript" type="text/javascript" src="/wwwroot/js/1cd84495-89f6-4669-9ab3-9ac38fbf4386-tj.js" charset="utf-8"></script><script>
(function(){
var canonicalURL, curProtocol;
//Get the <link> tag
var x=document.getElementsByTagName("link");
//Find the last canonical URL
if(x.length > 0){
for (i=0;i<x.length;i++){
if(x[i].rel.toLowerCase() == 'canonical' && x[i].href){
canonicalURL=x[i].href;
}
}
}
//Get protocol
if (!canonicalURL){
curProtocol = window.location.protocol.split(':')[0];
}
else{
curProtocol = canonicalURL.split(':')[0];
}
//Get current URL if the canonical URL does not exist
if (!canonicalURL) canonicalURL = window.location.href;
//Assign script content. Replace current URL with the canonical URL
!function(){var e=/([http|https]:\/\/[a-zA-Z0-9\_\.]+\.baidu\.com)/gi,r=canonicalURL,t=document.referrer;if(!e.test(r)){var n=(String(curProtocol).toLowerCase() === 'https')?"https://sp0.baidu.com/9_Q4simg2RQJ8t7jm9iCKT-xh_/s.gif":"//api.share.baidu.com/s.gif";t?(n+="?r="+encodeURIComponent(document.referrer),r&&(n+="&l="+r)):r&&(n+="?l="+r);var i=new Image;i.src=n}}(window);})();
</script><style>#pl_css_ganrao{ display:none }</style></body>
</html>
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
{
"item_id 1": 0,
"item_id 2": 0,
"item_id 3": 0,
"item_id 4": 0,
"item_id 5": 0,
"item_id 6": 0,
"item_id 7": 0,
"item_id 8": 0,
"item_id 9": 0,
"item_id 10": 0,
"item_id 11": 0,
"item_id 12": 0,
"item_id 13": 0,
"item_id 14": 0,
"item_id 15": 0,
"item_id 16": 0,
"item_id 17": 0,
"item_id 18": 0,
"item_id 19": 0,
"item_id 20": 0,
"item_id 21": 1,
"item_id 22": 0,
"item_id 23": 1,
"item_id 24": 1,
"item_id 25": 1,
"item_id 26": 1,
"item_id 27": 1,
"item_id 28": 1,
"item_id 29": 1,
"item_id 30": 1,
"item_id 31": 1,
"item_id 32": 1,
"item_id 33": 1,
"item_id 34": 1,
"item_id 35": 1,
"item_id 36": 1,
"item_id 37": 1,
"item_id 38": 1,
"item_id 39": 1,
"item_id 40": 1,
"item_id 41": 1,
"item_id 42": 1,
"item_id 43": 1,
"item_id 44": 1,
"item_id 45": 1,
"item_id 46": 0,
"item_id 47": 0
}
Loading