From 69d420cf2c8b1357bc9b454269da995991e4c669 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 13:01:20 +0800 Subject: [PATCH 01/46] resolve nest table --- .../extractor/html/recognizer/table.py | 137 ++- .../recognizer/table_include_code_expect.json | 299 +++++ .../assets/recognizer/table_involve_code.html | 1001 +++++++++++++++++ .../table_to_content_list_complex_res.json | 3 +- .../extractor/html/recognizer/test_table.py | 23 +- 5 files changed, 1421 insertions(+), 42 deletions(-) create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_code.html diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index e2c70a39..b41f8834 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -1,9 +1,12 @@ -from typing import List, Tuple +from itertools import chain +from typing import Any, List, Tuple from lxml.html import HtmlElement from overrides import override from llm_web_kit.exception.exception import HtmlTableRecognizerExp +from llm_web_kit.extractor.html.recognizer.cccode import CodeRecognizer +from llm_web_kit.extractor.html.recognizer.ccmath import MathRecognizer from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) from llm_web_kit.libs.doc_element_type import DocElementType @@ -42,7 +45,7 @@ def recognize(self, def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: if not parsed_content: raise HtmlTableRecognizerExp(f'table parsed_content{parsed_content}为空') - table_type, table_body = self.__get_attribute(parsed_content) + table_type, table_nest_level, table_body = self.__get_attribute(parsed_content) d = { 'type': DocElementType.TABLE, # "bbox": [], @@ -52,6 +55,7 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm }, } d['content']['is_complex'] = table_type + d['content']['table_nest_level'] = table_nest_level return d def __is_contain_cc_html(self, cc_html: str) -> bool: @@ -64,6 +68,7 @@ def __is_table_empty(self, table) -> bool: :param table: lxml.html.HtmlElement 对象,表示一个 元素 :return: 如果表格为空,返回 True;否则返回 False """ + def is_element_empty(elem): # 检查元素本身的文本内容 if elem.text and elem.text.strip(): @@ -81,6 +86,7 @@ def is_element_empty(elem): if elem.tail and elem.tail.strip(): return False return True + # 检查所有单元格 for cell in table.xpath('.//td | .//th'): # 检查单元格内容 @@ -101,7 +107,8 @@ def __is_simple_table(self, tree) -> bool: colspan = int(colspan_str) rowspan = int(rowspan_str) except ValueError as e: - raise HtmlTableRecognizerExp(f'table的合并单元格属性值colspan:{colspan_str}或rowspan:{rowspan_str}不是有效的整数') from e + raise HtmlTableRecognizerExp( + f'table的合并单元格属性值colspan:{colspan_str}或rowspan:{rowspan_str}不是有效的整数') from e if (colspan > 1) or (rowspan > 1): return False return True @@ -114,28 +121,28 @@ def __is_table_contain_img(self, tree) -> bool: else: return False - def __is_table_nested(self, tree) -> bool: - """判断table元素是否嵌套.""" - nested_tables = tree.xpath('//table//table') - if len(nested_tables) == 0: - return True - else: - return False + def __is_table_nested(self, tree) -> int: + """获取表格元素的嵌套层级(非表格元素返回0,顶层表格返回1,嵌套表格返回层级数).""" + if tree.tag != 'table': + return 0 # 非表格元素返回0 + # 计算祖先中的 table 数量(不包括自身),再加1表示自身层级 + return len(tree.xpath('ancestor::table')) + 1 - def __extract_tables(self, ele: HtmlElement) -> List[str]: + def __extract_tables(self, ele: HtmlElement) -> list[tuple[str, str]]: """提取html中的table元素.""" - tree = self._build_html_tree(ele) - self.__do_extract_tables(tree) - new_html = self._element_to_html(tree) + self.__do_extract_tables(ele) + new_html = self._element_to_html(ele) lst = self.html_split_by_tags(new_html, CCTag.CC_TABLE) return lst def __get_table_type(self, child: HtmlElement) -> str: """获取table的类型.""" empty_flag = self.__is_table_empty(child) + level = self.__is_table_nested(child) if empty_flag: return 'empty' - flag = self.__is_simple_table(child) and self.__is_table_nested(child) + # 是否跨行跨列 + flag = (self.__is_simple_table(child) and level < 2) if flag: table_type = 'simple' else: @@ -147,36 +154,91 @@ def __extract_table_element(self, ele: HtmlElement) -> str: for item in ele.iterchildren(): return self._element_to_html(item) - def __simplify_td_th_content(self, elem): + def __check_table_include_math_code(self, raw_html: HtmlElement): + """check table中是否包含math.""" + math_html = self._element_to_html(raw_html) + ele_res = list() + math_recognizer = MathRecognizer() + math_res_parts = math_recognizer.recognize(base_url='', main_html_lst=[(math_html, math_html)], + raw_html=math_html) + code_recognizer = CodeRecognizer() + code_res_parts = code_recognizer.recognize(base_url='', main_html_lst=math_res_parts, + raw_html=math_html) + for math_item in code_res_parts: + ele_item = self._build_html_tree(math_item[0]) + ccinline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INLINE}') + ccinline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE_INLINE}') + ccinterline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INTERLINE}') + ccinterline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE}') + if ccinline_math_node: + formulas = [ + el.text if el.text.strip() else '' + for el in ccinline_math_node + ] + ele_res.extend(formulas) # 添加字符串 + elif ccinterline_math_node: + codes = [ + el.text if el.text.strip() else '' + for el in ccinterline_math_node + ] + ele_res.extend(codes) + elif ccinline_code_node: + inline_codes = [ + el.text if el.text.strip() else '' + for el in ccinline_code_node + ] + ele_res.extend(inline_codes) + elif ccinterline_code_node: + ccinterline_codes = [ + el.text if el.text else '' + for el in ccinterline_code_node + ] + ele_res.extend(ccinterline_codes) + else: + ele_res.extend([ + text.strip() + for text in self._build_html_tree(math_item[1]).itertext() + if text.strip() + ]) + return ele_res + + def __simplify_td_th_content(self, elem: HtmlElement) -> None: """简化
内容,仅保留文本内容.""" - if elem.tag in ['td', 'th'] and len(elem.xpath('.//table')) == 0: - result = '
'.join([text for text in elem.itertext() if text.strip()]) - for child in list(elem): - elem.remove(child) - elem.text = result - elif elem.tag in ['td', 'th'] and len(elem.xpath('.//table')) > 0: - for item in elem.iterchildren(): - self.__simplify_td_th_content(item) + if elem.tag in ['td', 'th']: + # 简化单元格中的元素 + parse_res = list() + math_res = self.__check_table_include_math_code(elem) + parse_res.extend(math_res) + for item in list(elem.iterchildren()): + elem.remove(item) + elem.text = '
'.join(parse_res) + return + for child in elem.iter('td', 'th'): + self.__simplify_td_th_content(child) def __get_table_body(self, table_type, table_root): """获取并处理table body,返回处理后的HTML字符串。""" if table_type == 'empty': return None allowed_attributes = ['colspan', 'rowspan'] - for child in list(table_root.iterchildren()): - if child.tag is not None: - self.__get_table_body(table_type, child) - for ele in table_root.iter('td', 'th'): - self.__simplify_td_th_content(ele) + # 清理除了colspan和rowspan之外的属性 if len(table_root.attrib) > 0: cleaned_attrs = {k: v for k, v in table_root.attrib.items() if k in allowed_attributes} table_root.attrib.clear() table_root.attrib.update(cleaned_attrs) - if table_root.text is not None: - table_root.text = table_root.text.strip() - for elem in table_root.iter(): - if elem.tail is not None: + # text进行strip操作,tail去掉(有较多空换行) + for elem in chain([table_root], table_root.iterdescendants()): + if elem.text: + elem.text = elem.text.strip() + if elem.tail: elem.tail = elem.tail.strip() + + self.__simplify_td_th_content(table_root) + # 迭代 + for child in table_root.iterchildren(): + if child is not None: + self.__get_table_body(table_type, child) + return self._element_to_html(table_root) def __do_extract_tables(self, root: HtmlElement) -> None: @@ -184,23 +246,26 @@ def __do_extract_tables(self, root: HtmlElement) -> None: if root.tag in ['table']: table_raw_html = self._element_to_html(root) table_type = self.__get_table_type(root) + table_nest_level = self.__is_table_nested(root) tail_text = root.tail table_body = self.__get_table_body(table_type, root) cc_element = self._build_cc_element( - CCTag.CC_TABLE, table_body, tail_text, table_type=table_type, html=table_raw_html) + CCTag.CC_TABLE, table_body, tail_text, table_type=table_type, table_nest_level=table_nest_level, + html=table_raw_html) self._replace_element(root, cc_element) return for child in root.iterchildren(): self.__do_extract_tables(child) - def __get_attribute(self, html: str) -> Tuple[int, str]: + def __get_attribute(self, html: str) -> tuple[bool, Any, Any]: """获取element的属性.""" ele = self._build_html_tree(html) if ele is not None and ele.tag == CCTag.CC_TABLE: table_type = ele.attrib.get('table_type') + table_nest_level = ele.attrib.get('table_nest_level') table_flag = self.__get_content_list_table_type(table_type) table_body = ele.text - return table_flag, table_body + return table_flag, table_nest_level, table_body else: raise HtmlTableRecognizerExp(f'{html}中没有cctable标签') diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json new file mode 100644 index 00000000..15a9cf34 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json @@ -0,0 +1,299 @@ +
1<br>2<br>3<br>4<br>5<br>6<br>7<br>8<br>9<br>10<br>11<br>12<br>13<br>14<br>15<br>16<br>17<br>18<br>19<br>20<br>21<br>22<br>23<br>24<br>25<br>26<br>27<br>28<br>29<br>30<br>31<br>32<br>33<br>34<br>35<br>36<br>37<br>38<br>39<br>40<br>41<br>42<br>43<br>44<br>45<br>46<br>47<br>48<br>49<br>50<br>51<br>52<br>53<br>54<br>55<br>56<br>57<br>58<br>59<br>60<br>61<br>62<br>63<br>64<br>65<br>66<br>67<br>68<br>69<br>70<br>71<br>72<br>73<br>74<br>75<br>76<br>77<br>78<br>79<br>80<br>81<br>82<br>83<br>84<br>85<br>86<br>87<br>88<br>89<br>90<br>91<br>92<br>93<br>94<br>95<br>96<br>97<br>98<br>99<br>100<br>101<br>102<br>103<br>104<br>105<br>106<br>107<br>108<br>109<br>110<br>111<br>112<br>113<br>114<br>115<br>116<br>117<br>118<br>119<br>120<br>121<br>122<br>123<br>124<br>125<br>126<br>127<br>128<br>129<br>130<br>131<br>132<br>133<br>134<br>135<br>136<br>137<br>138<br>139<br>140<br>141<br>142<br>143<br>144<br>145<br>146<br>147<br>148<br>149<br>150<br>151<br>152<br>153<br>154<br>155<br>156<br>157<br>158<br>159<br>160<br>161<br>162<br>163<br>164<br>165<br>166<br>167<br>168<br>169<br>170<br>171<br>172<br>173<br>174<br>175<br>176<br>177<br>178<br>179<br>180<br>181<br>182<br>183<br>184<br>185<br>186<br>187<br>188<br>189<br>190<br>191<br>192<br>193<br>194<br>195<br>196<br>197<br>198<br>199<br>200<br>201<br>202<br>203<br>204<br>205<br>206<br>207<br>208<br>209<br>210<br>211<br>212<br>213<br>214<br>215<br>216<br>217<br>218<br>219<br>220<br>221<br>222<br>223<br>224<br>225<br>226<br>227<br>228<br>229<br>230<br>231<br>232<br>233<br>234<br>235<br>236<br>237<br>238<br>239<br>240<br>241<br>242<br>243<br>244<br>245<br>246<br>247<br>248<br>249<br>250<br>251<br>252<br>253<br>254<br>255<br>256<br>257<br>258<br>259<br>260<br>261<br>262<br>263<br>264<br>265<br>266<br>267<br>268<br>269<br>270<br>271<br>272<br>273<br>274<br>275<br>276<br>277<br>278<br>279<br>280<br>281<br>282<br>283<br>284<br>285<br>286<br>287<br>288<br>289<br>290<br>291<br>292<br>293<br>294<br>295<br>296<br>297<br>298<br>299<%@ page language="java"import="java.util.*"pageEncoding="utf-8"%> +<% +String path = request.getContextPath(); +String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/"; +%> + +<!DOCTYPE HTML PUBLIC"-//W3C//DTD HTML 4.01 Transitional//EN"> +<html> +<head> +<title>My JSP'register.jsp'starting page</title> +</head> + +<body> +<script type="text/javascript"> +function validate(){ +if(registerForm.uname.value==""){ +alert("账号不能为空!"); +return; +} +if(registerForm.upwd.value==""){ +alert("密码不能为空!"); +return; +} +registerForm.submit(); +} +</script> + +<form name="registerForm"action="DoregServlet"method="post"> + +用户名:<input type="text"name="uname"><br> +密 码: <input type="password"name="upwd"> <br> +<input type="submit"value="注册"> +<a href="denglu.jsp">登录</a> +</form> + +</body> +</html> + + + +packagecom.servlet; + +importjava.io.IOException; +importjava.io.PrintWriter; + +importjavax.servlet.ServletException; +importjavax.servlet.http.HttpServlet; +importjavax.servlet.http.HttpServletRequest; +importjavax.servlet.http.HttpServletResponse; + +importcom.dao.UsersDao; + +publicclassservlet3extendsHttpServlet { + +publicservlet3() { +super(); +} + + +publicvoiddestroy() { +super.destroy();// Just puts "destroy" string in log +// Put your code here +} + + +publicvoiddoGet(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { +doPost (request, response); + +} + + +publicvoiddoPost(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { + +String uname = request.getParameter("uname"); +String upwd = request.getParameter("upwd"); +UsersDao usersDao =newUsersDao(); +inti=usersDao.reg(uname, upwd); +if(i>0){ + +response.setHeader("refresh","2;url=login.jsp"); +}else{ + +response.setHeader("refresh","2;url=reg.jsp"); +} +} + +/** +* Initialization of the servlet. <br> +* +* @throws ServletException if an error occurs +*/ +publicvoidinit()throwsServletException { +// Put your code here +} + +} + + + + + +packagecom.sf.servlet; + +importjava.io.IOException; +importjava.io.PrintWriter; + +importjavax.servlet.ServletException; +importjavax.servlet.http.HttpServlet; +importjavax.servlet.http.HttpServletRequest; +importjavax.servlet.http.HttpServletResponse; + +importcom.sf.dao.MsgDao; +importcom.sf.dao.UsersDao; + +publicclassDoregservletextendsHttpServlet { + +/** +* Constructor of the object. +*/ +publicDoregservlet() { +super(); +} + +/** +* Destruction of the servlet. <br> +*/ +publicvoiddestroy() { +super.destroy();// Just puts "destroy" string in log +// Put your code here +} + +publicvoiddoGet(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { + +response.setContentType("text/html"); +PrintWriter out = response.getWriter(); +request.setCharacterEncoding("utf-8"); +String uname = request.getParameter("uname"); +String upwd = request.getParameter("upwd"); + +UsersDao ud =newUsersDao(); +MsgDao md =newMsgDao(); +if(ud.register(uname, upwd) >0) { +request.getSession().setAttribute("uname", uname); +request.getRequestDispatcher("denglu.jsp").forward(request, +response); +}else{ +out.print("注册失败,请重新注册......."); +response.setHeader("refresh","3;url=reg.jsp"); +} +} +publicvoiddoPost(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { + +doGet(request,response); +} + +/** +* Initialization of the servlet. <br> +* +* @throws ServletException if an error occurs +*/ +publicvoidinit()throwsServletException { +// Put your code here +} + +} + + + + + +packagecom.servlet; + +importjava.io.IOException; +importjava.io.PrintWriter; + +importjavax.servlet.ServletException; +importjavax.servlet.http.HttpServlet; +importjavax.servlet.http.HttpServletRequest; +importjavax.servlet.http.HttpServletResponse; + +importcom.dao.MsgDao; + +publicclassservlet5extendsHttpServlet { + +publicservlet5() { +super(); +} + +publicvoiddestroy() { +super.destroy();// Just puts "destroy" string in log +// Put your code here +} + + +publicvoiddoGet(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { + +doPost(request, response); +} + + +publicvoiddoPost(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { + +request.setCharacterEncoding("utf-8"); + +intid=Integer.parseInt(request.getParameter("id")); +MsgDao md=newMsgDao(); +md.delMail(id); +response.getWriter().print("刪除成功....."); +response.setHeader("refresh","2;url=main.jsp"); +response.sendRedirect("main2.jsp"); +} + + +publicvoidinit()throwsServletException { + +} + +} + + + + + + + +packagecom.sf.servlet; + +importjava.io.IOException; +importjava.io.PrintWriter; + +importjavax.servlet.ServletException; +importjavax.servlet.http.HttpServlet; +importjavax.servlet.http.HttpServletRequest; +importjavax.servlet.http.HttpServletResponse; + +importcom.sf.dao.MsgDao; +importcom.sf.entity.Msg; + +publicclassDowriteservletextendsHttpServlet { + +/** +* Constructor of the object. +*/ +publicDowriteservlet() { +super(); +} + +/** +* Destruction of the servlet. <br> +*/ +publicvoiddestroy() { +super.destroy();// Just puts "destroy" string in log +// Put your code here +} + +publicvoiddoGet(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { + +response.setContentType("text/html"); +PrintWriter out = response.getWriter(); +request.setCharacterEncoding("utf-8"); +String uname = (String) request.getSession().getAttribute("uname"); +String sendto = request.getParameter("receiver"); +String title = request.getParameter("title"); +String content = request.getParameter("content"); + +Msg m =newMsg(); +m.setMsgcontent(content); +m.setUsername(uname); +m.setSendto(sendto); +m.setTitle(title); + +MsgDao md =newMsgDao(); +md.addMsg(m); + +out.print("发送成功....."); +response.setHeader("refresh","3;url=main.jsp"); +} + +publicvoiddoPost(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { + +doGet(request,response); } + +/** +* Initialization of the servlet. <br> +* +* @throws ServletException if an error occurs +*/ +publicvoidinit()throwsServletException { +} + +}
\ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_code.html b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_code.html new file mode 100644 index 00000000..d1961838 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_code.html @@ -0,0 +1,1001 @@ + + + + + + + + + + + + + + 第十三周作业 - 徐涛% - 博客园 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+
+
+ +
+
+

+ + 第十三周作业 + + + + +

+
+
+
+
+ + + + + + + + + + +
+
1
+
2
+
3
+
4
+
5
+
6
+
7
+
8
+
9
+
10
+
11
+
12
+
13
+
14
+
15
+
16
+
17
+
18
+
19
+
20
+
21
+
22
+
23
+
24
+
25
+
26
+
27
+
28
+
29
+
30
+
31
+
32
+
33
+
34
+
35
+
36
+
37
+
38
+
39
+
40
+
41
+
42
+
43
+
44
+
45
+
46
+
47
+
48
+
49
+
50
+
51
+
52
+
53
+
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+
68
+
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+
80
+
81
+
82
+
83
+
84
+
85
+
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+
94
+
95
+
96
+
97
+
98
+
99
+
100
+
101
+
102
+
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
+
111
+
112
+
113
+
114
+
115
+
116
+
117
+
118
+
119
+
120
+
121
+
122
+
123
+
124
+
125
+
126
+
127
+
128
+
129
+
130
+
131
+
132
+
133
+
134
+
135
+
136
+
137
+
138
+
139
+
140
+
141
+
142
+
143
+
144
+
145
+
146
+
147
+
148
+
149
+
150
+
151
+
152
+
153
+
154
+
155
+
156
+
157
+
158
+
159
+
160
+
161
+
162
+
163
+
164
+
165
+
166
+
167
+
168
+
169
+
170
+
171
+
172
+
173
+
174
+
175
+
176
+
177
+
178
+
179
+
180
+
181
+
182
+
183
+
184
+
185
+
186
+
187
+
188
+
189
+
190
+
191
+
192
+
193
+
194
+
195
+
196
+
197
+
198
+
199
+
200
+
201
+
202
+
203
+
204
+
205
+
206
+
207
+
208
+
209
+
210
+
211
+
212
+
213
+
214
+
215
+
216
+
217
+
218
+
219
+
220
+
221
+
222
+
223
+
224
+
225
+
226
+
227
+
228
+
229
+
230
+
231
+
232
+
233
+
234
+
235
+
236
+
237
+
238
+
239
+
240
+
241
+
242
+
243
+
244
+
245
+
246
+
247
+
248
+
249
+
250
+
251
+
252
+
253
+
254
+
255
+
256
+
257
+
258
+
259
+
260
+
261
+
262
+
263
+
264
+
265
+
266
+
267
+
268
+
269
+
270
+
271
+
272
+
273
+
274
+
275
+
276
+
277
+
278
+
279
+
280
+
281
+
282
+
283
+
284
+
285
+
286
+
287
+
288
+
289
+
290
+
291
+
292
+
293
+
294
+
295
+
296
+
297
+
298
+
299
+ +
+
+
<%@ page language="java" import="java.util.*" pageEncoding="utf-8"%>
+
<%
+
String path = request.getContextPath();
+
String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
+
%>
+
 
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+
<html>
+
  <head>
+
    <title>My JSP 'register.jsp' starting page</title>   
+
  </head>
+
  
+
  <body>
+
  <script type="text/javascript">
+
        function validate(){
+
            if(registerForm.uname.value==""){
+
                alert("账号不能为空!");
+
                return;
+
            }
+
            if(registerForm.upwd.value==""){
+
                alert("密码不能为空!");
+
                return;
+
            }
+
            registerForm.submit();
+
        }
+
    </script>
+
 
+
    <form  name="registerForm" action="DoregServlet" method="post">
+
 
+
        用户名:<input type="text" name="uname"><br>
+
        密   码: <input type="password" name="upwd"> <br>
+
        <input type="submit" value="注册" >
+
        <a href="denglu.jsp">登录</a>
+
    </form>
+
     
+
  </body>
+
</html>
+
 
+
  
+
 
+
package com.servlet;
+
 
+
import java.io.IOException;
+
import java.io.PrintWriter;
+
 
+
import javax.servlet.ServletException;
+
import javax.servlet.http.HttpServlet;
+
import javax.servlet.http.HttpServletRequest;
+
import javax.servlet.http.HttpServletResponse;
+
 
+
import com.dao.UsersDao;
+
 
+
public class servlet3 extends HttpServlet {
+
 
+
    public servlet3() {
+
        super();
+
    }
+
 
+
     
+
    public void destroy() {
+
        super.destroy(); // Just puts "destroy" string in log
+
        // Put your code here
+
    }
+
 
+
 
+
    public void doGet(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
        doPost (request, response);
+
         
+
    }
+
 
+
     
+
    public void doPost(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
 
+
        String uname = request.getParameter("uname");
+
        String upwd = request.getParameter("upwd");
+
        UsersDao usersDao = new UsersDao();
+
        int i=usersDao.reg(uname, upwd);
+
        if(i>0){
+
         
+
            response.setHeader("refresh""2;url=login.jsp");
+
        }else{
+
     
+
            response.setHeader("refresh""2;url=reg.jsp");
+
        }
+
    }
+
 
+
    /**
+
     * Initialization of the servlet. <br>
+
     *
+
     * @throws ServletException if an error occurs
+
     */
+
    public void init() throws ServletException {
+
        // Put your code here
+
    }
+
 
+
}
+
 
+
  
+
 
+
  
+
 
+
package com.sf.servlet;
+
 
+
import java.io.IOException;
+
import java.io.PrintWriter;
+
 
+
import javax.servlet.ServletException;
+
import javax.servlet.http.HttpServlet;
+
import javax.servlet.http.HttpServletRequest;
+
import javax.servlet.http.HttpServletResponse;
+
 
+
import com.sf.dao.MsgDao;
+
import com.sf.dao.UsersDao;
+
 
+
public class Doregservlet extends HttpServlet {
+
 
+
    /**
+
     * Constructor of the object.
+
     */
+
    public Doregservlet() {
+
        super();
+
    }
+
 
+
    /**
+
     * Destruction of the servlet. <br>
+
     */
+
    public void destroy() {
+
        super.destroy(); // Just puts "destroy" string in log
+
        // Put your code here
+
    }
+
 
+
    public void doGet(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
 
+
        response.setContentType("text/html");
+
        PrintWriter out = response.getWriter();
+
        request.setCharacterEncoding("utf-8");
+
        String uname = request.getParameter("uname");
+
        String upwd = request.getParameter("upwd");
+
 
+
        UsersDao ud = new UsersDao();
+
        MsgDao md = new MsgDao();
+
        if (ud.register(uname, upwd) > 0) {
+
            request.getSession().setAttribute("uname", uname);
+
            request.getRequestDispatcher("denglu.jsp").forward(request,
+
                    response);
+
        else {
+
            out.print("注册失败,请重新注册.......");
+
            response.setHeader("refresh""3;url=reg.jsp");
+
        }
+
    }
+
    public void doPost(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
 
+
        doGet(request,response);
+
    }
+
 
+
    /**
+
     * Initialization of the servlet. <br>
+
     *
+
     * @throws ServletException if an error occurs
+
     */
+
    public void init() throws ServletException {
+
        // Put your code here
+
    }
+
 
+
}
+
 
+
  
+
 
+
  
+
 
+
package com.servlet;
+
 
+
import java.io.IOException;
+
import java.io.PrintWriter;
+
 
+
import javax.servlet.ServletException;
+
import javax.servlet.http.HttpServlet;
+
import javax.servlet.http.HttpServletRequest;
+
import javax.servlet.http.HttpServletResponse;
+
 
+
import com.dao.MsgDao;
+
 
+
public class servlet5 extends HttpServlet {
+
 
+
    public servlet5() {
+
        super();
+
    }
+
 
+
    public void destroy() {
+
        super.destroy(); // Just puts "destroy" string in log
+
        // Put your code here
+
    }
+
 
+
     
+
    public void doGet(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
 
+
        doPost(request,  response);
+
    }
+
 
+
     
+
    public void doPost(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
 
+
        request.setCharacterEncoding("utf-8");
+
          
+
        int id=Integer.parseInt(request.getParameter("id"));
+
        MsgDao md=new MsgDao();
+
        md.delMail(id);   
+
        response.getWriter().print("刪除成功.....");
+
        response.setHeader("refresh""2;url=main.jsp");
+
        response.sendRedirect("main2.jsp");
+
    }
+
 
+
     
+
    public void init() throws ServletException {
+
     
+
    }
+
 
+
}
+
 
+
  
+
 
+
  
+
 
+
  
+
 
+
package com.sf.servlet;
+
 
+
import java.io.IOException;
+
import java.io.PrintWriter;
+
 
+
import javax.servlet.ServletException;
+
import javax.servlet.http.HttpServlet;
+
import javax.servlet.http.HttpServletRequest;
+
import javax.servlet.http.HttpServletResponse;
+
 
+
import com.sf.dao.MsgDao;
+
import com.sf.entity.Msg;
+
 
+
public class Dowriteservlet extends HttpServlet {
+
 
+
    /**
+
     * Constructor of the object.
+
     */
+
    public Dowriteservlet() {
+
        super();
+
    }
+
 
+
    /**
+
     * Destruction of the servlet. <br>
+
     */
+
    public void destroy() {
+
        super.destroy(); // Just puts "destroy" string in log
+
        // Put your code here
+
    }
+
 
+
    public void doGet(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
 
+
        response.setContentType("text/html");
+
        PrintWriter out = response.getWriter();
+
        request.setCharacterEncoding("utf-8");
+
        String uname = (String) request.getSession().getAttribute("uname");
+
        String sendto = request.getParameter("receiver");
+
        String title = request.getParameter("title");
+
        String content = request.getParameter("content");
+
 
+
        Msg m = new Msg();
+
        m.setMsgcontent(content);
+
        m.setUsername(uname);
+
        m.setSendto(sendto);
+
        m.setTitle(title);
+
 
+
        MsgDao md = new MsgDao();
+
        md.addMsg(m);
+
 
+
        out.print("发送成功.....");
+
        response.setHeader("refresh""3;url=main.jsp");
+
    }
+
 
+
    public void doPost(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
 
+
        doGet(request,response);     }
+
 
+
    /**
+
     * Initialization of the servlet. <br>
+
     *
+
     * @throws ServletException if an error occurs
+
     */
+
    public void init() throws ServletException {
+
    }
+
 
+
}
+ +
+ +
+
+
+ +
+
posted @ +2022-05-29 20:20  +徐涛%  +阅读(70)  +评论(0)  +编辑  +收藏  +举报 +
+
+ + +
+
+ + +
+
+ +
+ +
+
+
+
+
+ + + + +
+
+
+
+ +
+ +
+
+ +
+
+
+ +
+ + + + + + + + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json index f1c6da6a..b0baf47d 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json @@ -3,6 +3,7 @@ "raw_content": "<table><caption>ফেব্রুয়ারি ২০২৪</caption><thead><tr><th>সোম</th><th>মঙ্গল</th><th>বুধ</th><th>বৃহ</th><th>শুক্র</th><th>শনি</th><th>রবি</th></tr></thead><tfoot><tr><td colspan=\\\"3\\\">« জানুয়ারি</td><td></td><td colspan=\\\"3\\\"></td></tr></tfoot><tbody><tr><td colspan=\\\"3\\\"></td><td>১</td><td>২</td><td>৩</td><td>৪</td></tr><tr><td>৫</td><td>৬</td><td>৭</td><td>৮</td><td>৯</td><td>১০</td><td>১১</td></tr><tr><td>১২</td><td>১৩</td><td>১৪</td><td>১৫</td><td>১৬</td><td>১৭</td><td>১৮</td></tr><tr><td>১৯</td><td>২০</td><td>২১</td><td>২২</td><td>২৩</td><td>২৪</td><td>২৫</td></tr><tr><td>২৬</td><td>২৭</td><td>২৮</td><td>২৯</td><td colspan=\\\"3\\\"></td></tr></tbody></table>", "content": { "html": "
ফেব্রুয়ারি ২০২৪
সোমমঙ্গলবুধবৃহশুক্রশনিরবি
« জানুয়ারি
১০১১
১২১৩১৪১৫১৬১৭১৮
১৯২০২১২২২৩২৪২৫
২৬২৭২৮২৯
", - "is_complex": true + "is_complex": true, + "table_nest_level": null } } diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 19e1b106..08f3492c 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -19,7 +19,8 @@ 'assets/recognizer/table_simple_cc.html', 'assets/recognizer/table_include_rowspan_colspan.html', 'assets/recognizer/table_involve_equation.html', - 'assets/recognizer/table_include_after_code.html' + 'assets/recognizer/table_include_after_code.html', + 'assets/recognizer/table_involve_code.html' ), 'expected': [ @@ -86,7 +87,7 @@ def test_cc_simple_table(self): parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) assert len(parts) == 3 content = html_to_element(parts[1][0]).text_content() - assert content == r'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Рейтинг:Рейтинг<br>5.00<br>из 5 на основе опроса<br>3<br>пользователей
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84<br>₽
\n' + assert content == r'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Рейтинг:<br>\n<br>\n<br>\n<br>\nРейтинг<br>5.00<br>из 5 на основе опроса<br>3<br>пользователей<br>\n<br>\n<br>\n<br>\n
Тип товара:<br>\n<br>\n<br>\n<br>\nПрепараты для омоложения<br>\n<br>\n<br>\n<br>\n
Форма:<br>\n<br>\n<br>\n<br>\nКрем<br>\n<br>\n<br>\n<br>\n
Объем:<br>\n<br>\n<br>\n<br>\n50 мл<br>\n<br>\n<br>\n<br>\n
Рецепт:<br>\n<br>\n<br>\n<br>\nОтпускается без рецепта<br>\n<br>\n<br>\n<br>\n
Способ хранения:<br>\n<br>\n<br>\n<br>\nХранить при температуре 4-20°<br>\n<br>\n<br>\n<br>\n
Примечание:<br>\n<br>\n<br>\n<br>\nБеречь от детей<br>\n<br>\n<br>\n<br>\n
Оплата:<br>\n<br>\n<br>\n<br>\nНаличными/банковской картой<br>\n<br>\n<br>\n<br>\n
Доступность в Северске:<br>\n<br>\n<br>\n<br>\nВ наличии<br>\n<br>\n<br>\n<br>\n
Доставка:<br>\n<br>\n<br>\n<br>\n2-7 Дней<br>\n<br>\n<br>\n<br>\n
Цена:<br>\n<br>\n<br>\n<br>\n84<br>₽<br>\n<br>\n<br>\n<br>\n
\n' def test_cc_complex_table(self): """cc跨行跨列的表格.""" @@ -111,11 +112,11 @@ def test_simple_complex_table(self): simple_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}')[0] simple_table_type = simple_table_tag.attrib assert simple_table_type['table_type'] == 'simple' - assert simple_table_type == {'table_type': 'simple', 'html': '\n \n \n \n \n \n \n \n \n
12
34
\n\n'} + assert simple_table_type == {'table_type': 'simple', 'table_nest_level': '1', 'html': '\n \n \n \n \n \n \n \n \n
12
34
\n\n'} complex_table_tag = html_to_element(parts[2][0]).xpath(f'.//{CCTag.CC_TABLE}')[0] complex_table_type = complex_table_tag.attrib assert complex_table_type['table_type'] == 'complex' - assert complex_table_type == {'table_type': 'complex', 'html': '\n \n \n \n \n \n \n \n \n \n \n \n \n \n
123
4
567
\n '} + assert complex_table_type == {'table_type': 'complex', 'table_nest_level': '1', 'html': '\n \n \n \n \n \n \n \n \n \n \n \n \n \n
123
4
567
\n '} def test_table_to_content_list_node_simple(self): """测试table的 to content list node方法.""" @@ -151,7 +152,8 @@ def test_table_involve_equation(self): base_url = 'https://en.m.wikipedia.org/wiki/Variance' raw_html = raw_html_path.read_text(encoding='utf-8') parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) - assert parts is not None + complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') + assert complex_table_tag[0].text == r'
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution{\displaystyle \Pr \,(X=k)={\binom {n}{k}}p^{k}(1-p)^{n-k}}{\displaystyle np}{\displaystyle np(1-p)}
Geometric distribution{\displaystyle \Pr \,(X=k)=(1-p)^{k-1}p}{\displaystyle {\frac {1}{p}}}{\displaystyle {\frac {(1-p)}{p^{2}}}}
Normal distribution{\displaystyle f\left(x\mid \mu ,\sigma ^{2}\right)={\frac {1}{\sqrt {2\pi \sigma ^{2}}}}e^{-{\frac {(x-\mu )^{2}}{2\sigma ^{2}}}}}{\displaystyle \mu }{\displaystyle \sigma ^{2}}
Uniform distribution (continuous){\displaystyle f(x\mid a,b)={\begin{cases}{\frac {1}{b-a}}&{\text{for }}a\leq x\leq b,\\[3pt]0&{\text{for }}x<a{\text{ or }}x>b\end{cases}}}{\displaystyle {\frac {a+b}{2}}}{\displaystyle {\frac {(b-a)^{2}}{12}}}
Exponential distribution{\displaystyle f(x\mid \lambda )=\lambda e^{-\lambda x}}{\displaystyle {\frac {1}{\lambda }}}{\displaystyle {\frac {1}{\lambda ^{2}}}}
Poisson distribution{\displaystyle f(k\mid \lambda )={\frac {e^{-\lambda }\lambda ^{k}}{k!}}}{\displaystyle \lambda }{\displaystyle \lambda }
' def test_table_involve_after_code(self): """test table involve code, code被提取出去了,过滤掉空的和坏的table.""" @@ -161,3 +163,14 @@ def test_table_involve_after_code(self): raw_html = raw_html_path.read_text(encoding='utf-8') parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) assert html_to_element(parts[0][0]).xpath(f'.//{CCTag.CC_TABLE}')[0].text is None + + def test_table_involve_code(self): + """table involve code.""" + for test_case in TEST_CASES: + raw_html_path = base_dir.joinpath(test_case['input'][11]) + base_url = 'https://en.m.wikipedia.org/wiki/Variance' + raw_html = raw_html_path.read_text(encoding='utf-8') + parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') + content = open('assets/recognizer/table_include_code_expect.json', 'r', encoding='utf-8').read() + assert complex_table_tag[0].text == content From e7c379248180ca57384269ed030a32eef7ddd6b6 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 13:09:03 +0800 Subject: [PATCH 02/46] update extract table --- llm_web_kit/extractor/html/recognizer/table.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index b41f8834..3598aaba 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -128,10 +128,11 @@ def __is_table_nested(self, tree) -> int: # 计算祖先中的 table 数量(不包括自身),再加1表示自身层级 return len(tree.xpath('ancestor::table')) + 1 - def __extract_tables(self, ele: HtmlElement) -> list[tuple[str, str]]: + def __extract_tables(self, ele: str) -> list[tuple[str, str]]: """提取html中的table元素.""" - self.__do_extract_tables(ele) - new_html = self._element_to_html(ele) + tree = self._build_html_tree(ele) + self.__do_extract_tables(tree) + new_html = self._element_to_html(tree) lst = self.html_split_by_tags(new_html, CCTag.CC_TABLE) return lst From f0347ff6421dc53cf8906c6598b1d6f4b49e8308 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 13:24:15 +0800 Subject: [PATCH 03/46] remove table tail --- tests/llm_web_kit/extractor/html/recognizer/test_table.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 08f3492c..48c17998 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -87,8 +87,7 @@ def test_cc_simple_table(self): parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) assert len(parts) == 3 content = html_to_element(parts[1][0]).text_content() - assert content == r'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Рейтинг:<br>\n<br>\n<br>\n<br>\nРейтинг<br>5.00<br>из 5 на основе опроса<br>3<br>пользователей<br>\n<br>\n<br>\n<br>\n
Тип товара:<br>\n<br>\n<br>\n<br>\nПрепараты для омоложения<br>\n<br>\n<br>\n<br>\n
Форма:<br>\n<br>\n<br>\n<br>\nКрем<br>\n<br>\n<br>\n<br>\n
Объем:<br>\n<br>\n<br>\n<br>\n50 мл<br>\n<br>\n<br>\n<br>\n
Рецепт:<br>\n<br>\n<br>\n<br>\nОтпускается без рецепта<br>\n<br>\n<br>\n<br>\n
Способ хранения:<br>\n<br>\n<br>\n<br>\nХранить при температуре 4-20°<br>\n<br>\n<br>\n<br>\n
Примечание:<br>\n<br>\n<br>\n<br>\nБеречь от детей<br>\n<br>\n<br>\n<br>\n
Оплата:<br>\n<br>\n<br>\n<br>\nНаличными/банковской картой<br>\n<br>\n<br>\n<br>\n
Доступность в Северске:<br>\n<br>\n<br>\n<br>\nВ наличии<br>\n<br>\n<br>\n<br>\n
Доставка:<br>\n<br>\n<br>\n<br>\n2-7 Дней<br>\n<br>\n<br>\n<br>\n
Цена:<br>\n<br>\n<br>\n<br>\n84<br>₽<br>\n<br>\n<br>\n<br>\n
\n' - + assert content == r"\n\n\n\n\n\n\n\n\n\n\n\n\n
Рейтинг:Рейтинг<br>5.00<br>3
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84<br>₽
" def test_cc_complex_table(self): """cc跨行跨列的表格.""" for test_case in TEST_CASES: From 5e176944beb0aa2b34b49a3a274380856c831bdd Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 13:34:44 +0800 Subject: [PATCH 04/46] normalize line endings --- llm_web_kit/extractor/html/recognizer/table.py | 2 +- tests/llm_web_kit/extractor/html/recognizer/test_table.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 3598aaba..9d5dbb37 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -232,7 +232,7 @@ def __get_table_body(self, table_type, table_root): if elem.text: elem.text = elem.text.strip() if elem.tail: - elem.tail = elem.tail.strip() + elem.tail = None self.__simplify_td_th_content(table_root) # 迭代 diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 48c17998..9f26c523 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -87,7 +87,8 @@ def test_cc_simple_table(self): parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) assert len(parts) == 3 content = html_to_element(parts[1][0]).text_content() - assert content == r"\n\n\n\n\n\n\n\n\n\n\n\n\n
Рейтинг:Рейтинг<br>5.00<br>3
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84<br>₽
" + assert content == r'\n\n\n\n\n\n\n\n\n\n\n\n\n
Рейтинг:Рейтинг<br>5.00<br>3
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84<br>₽
' + def test_cc_complex_table(self): """cc跨行跨列的表格.""" for test_case in TEST_CASES: From c15dea1fcdda4d59bfcb5b3a8b49a37c62cc7989 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 14:02:39 +0800 Subject: [PATCH 05/46] update test case --- tests/llm_web_kit/extractor/html/recognizer/test_table.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 9f26c523..b8b67029 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -26,7 +26,8 @@ 'expected': [ ('assets/recognizer/table_to_content_list_simple_res.json'), ('assets/recognizer/table_to_content_list_complex_res.json'), - ('assets/recognizer/table_include_image_expcet.json') + ('assets/recognizer/table_include_image_expcet.json'), + ('assets/recognizer/table_include_code_expect.json') ], } ] @@ -172,5 +173,6 @@ def test_table_involve_code(self): raw_html = raw_html_path.read_text(encoding='utf-8') parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') - content = open('assets/recognizer/table_include_code_expect.json', 'r', encoding='utf-8').read() + expect_path = base_dir.joinpath(test_case['expected'][3]) + content = open(expect_path, 'r', encoding='utf-8').read() assert complex_table_tag[0].text == content From d34a8a7416f3b238b49ea81ff5eff6ee37a396b7 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 14:45:30 +0800 Subject: [PATCH 06/46] update format --- llm_web_kit/extractor/html/extractor.py | 2 +- llm_web_kit/extractor/html/recognizer/table.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index 1d3facb3..bc3fe05b 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -93,7 +93,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson: main_html, method = self._extract_main_html(raw_html, base_url, page_layout_type) parsed_html = [(main_html,raw_html)] - for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list, + for extract_func in [self._extract_table, self._extract_code, self._extract_math, self._extract_list, self._extract_image, self._extract_title, self._extract_paragraph]: parsed_html = extract_func(base_url, parsed_html, raw_html) diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 9d5dbb37..64528ea2 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -128,7 +128,7 @@ def __is_table_nested(self, tree) -> int: # 计算祖先中的 table 数量(不包括自身),再加1表示自身层级 return len(tree.xpath('ancestor::table')) + 1 - def __extract_tables(self, ele: str) -> list[tuple[str, str]]: + def __extract_tables(self, ele: str) -> list[Tuple[str, str]]: """提取html中的table元素.""" tree = self._build_html_tree(ele) self.__do_extract_tables(tree) @@ -233,7 +233,6 @@ def __get_table_body(self, table_type, table_root): elem.text = elem.text.strip() if elem.tail: elem.tail = None - self.__simplify_td_th_content(table_root) # 迭代 for child in table_root.iterchildren(): @@ -258,7 +257,7 @@ def __do_extract_tables(self, root: HtmlElement) -> None: for child in root.iterchildren(): self.__do_extract_tables(child) - def __get_attribute(self, html: str) -> tuple[bool, Any, Any]: + def __get_attribute(self, html: str) -> Tuple[bool, Any, Any]: """获取element的属性.""" ele = self._build_html_tree(html) if ele is not None and ele.tag == CCTag.CC_TABLE: From 87a24954be0bfe65f42dfd6d6df559661a02c928 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 14:48:59 +0800 Subject: [PATCH 07/46] update format --- llm_web_kit/extractor/html/recognizer/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 64528ea2..232573ea 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -128,7 +128,7 @@ def __is_table_nested(self, tree) -> int: # 计算祖先中的 table 数量(不包括自身),再加1表示自身层级 return len(tree.xpath('ancestor::table')) + 1 - def __extract_tables(self, ele: str) -> list[Tuple[str, str]]: + def __extract_tables(self, ele: str) -> List[Tuple[str, str]]: """提取html中的table元素.""" tree = self._build_html_tree(ele) self.__do_extract_tables(tree) From 98610905a5a36e778fd85631e0fa8ffb8f9d68e9 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 14:59:22 +0800 Subject: [PATCH 08/46] update format --- llm_web_kit/extractor/html/extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index bc3fe05b..1d3facb3 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -93,7 +93,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson: main_html, method = self._extract_main_html(raw_html, base_url, page_layout_type) parsed_html = [(main_html,raw_html)] - for extract_func in [self._extract_table, self._extract_code, self._extract_math, self._extract_list, + for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list, self._extract_image, self._extract_title, self._extract_paragraph]: parsed_html = extract_func(base_url, parsed_html, raw_html) From a77735f93a337c181e01cc3af3c03b2f691058b8 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 18:01:25 +0800 Subject: [PATCH 09/46] change parse order --- llm_web_kit/extractor/html/extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index 1d3facb3..bc3fe05b 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -93,7 +93,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson: main_html, method = self._extract_main_html(raw_html, base_url, page_layout_type) parsed_html = [(main_html,raw_html)] - for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list, + for extract_func in [self._extract_table, self._extract_code, self._extract_math, self._extract_list, self._extract_image, self._extract_title, self._extract_paragraph]: parsed_html = extract_func(base_url, parsed_html, raw_html) From 419b2c1024efc37ce15d864bc0d66615bcff6f53 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 21:32:30 +0800 Subject: [PATCH 10/46] add list nest level --- llm_web_kit/extractor/html/recognizer/list.py | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index 1dbea3fc..315b8ac8 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -1,5 +1,5 @@ import json -from typing import List, Tuple +from typing import List, Tuple, Any from lxml.etree import _Element as HtmlElement from overrides import override @@ -88,16 +88,16 @@ def __do_extract_list(self, root:HtmlElement) -> None: list_tag_names = ['ul', 'ol', 'dl', 'menu', 'dir'] if root.tag in list_tag_names: - is_ordered, content_list, raw_html, tail_text = self.__extract_list_element(root) + list_nest_level, is_ordered, content_list, raw_html, tail_text = self.__extract_list_element(root) text = json.dumps(content_list, ensure_ascii=False, indent=4) - cc_element = self._build_cc_element(CCTag.CC_LIST, text, tail_text, ordered=is_ordered, html=raw_html) + cc_element = self._build_cc_element(CCTag.CC_LIST, text, tail_text, ordered=is_ordered, list_nest_level=list_nest_level, html=raw_html) self._replace_element(root, cc_element) # cc_element 替换掉原来的列表元素 return for child in root.iterchildren(): self.__do_extract_list(child) - def __extract_list_element(self, ele: HtmlElement) -> Tuple[bool, list, str, str]: + def __extract_list_element(self, ele: HtmlElement) -> tuple[int, bool, list[list[list]], str, Any]: """ 提取列表元素: 假如有如下列表: @@ -135,6 +135,7 @@ def __extract_list_element(self, ele: HtmlElement) -> Tuple[bool, list, str, str (bool, str, str): 第一个元素是是否有序; 第二个元素是个python list,内部是文本和行内公式,具体格式参考list的content_list定义。第三个元素是列表原始的html内容 """ is_ordered = ele.tag in ['ol', 'dl'] + list_nest_level = self.__get_list_type(ele) tail_text = ele.tail content_list = [] raw_html = self._element_to_html(ele) @@ -144,7 +145,17 @@ def __extract_list_element(self, ele: HtmlElement) -> Tuple[bool, list, str, str text_paragraph = self.__extract_list_item_text(item) content_list.append(text_paragraph) - return is_ordered, content_list, raw_html, tail_text + return list_nest_level, is_ordered, content_list, raw_html, tail_text + + def __get_list_type(self, list_ele:HtmlElement) -> int: + """ + 获取list嵌套的类型 + """ + if list_ele.tag not in ['ul', 'ol', 'dl', 'menu', 'dir']: + return 0 + ancestor_count = list_ele.xpath('count(ancestor::ul | ancestor::ol)') + # 层级 = 祖先列表数量 + 自身(1层) + return int(ancestor_count) + 1 def __extract_list_item_text(self, root:HtmlElement) -> list[list]: """提取列表项的文本. From c40b1ead2135c737a3f3c8943b7a54eea7f09595 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 21:43:47 +0800 Subject: [PATCH 11/46] fix pylint --- llm_web_kit/extractor/html/recognizer/list.py | 6 ++---- .../assets/recognizer/table_include_code_expect.json | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index 315b8ac8..d564d41e 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -1,5 +1,5 @@ import json -from typing import List, Tuple, Any +from typing import Any, List, Tuple from lxml.etree import _Element as HtmlElement from overrides import override @@ -148,9 +148,7 @@ def __extract_list_element(self, ele: HtmlElement) -> tuple[int, bool, list[list return list_nest_level, is_ordered, content_list, raw_html, tail_text def __get_list_type(self, list_ele:HtmlElement) -> int: - """ - 获取list嵌套的类型 - """ + """获取list嵌套的类型.""" if list_ele.tag not in ['ul', 'ol', 'dl', 'menu', 'dir']: return 0 ancestor_count = list_ele.xpath('count(ancestor::ul | ancestor::ol)') diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json index 15a9cf34..4f6fc9ed 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json @@ -296,4 +296,4 @@ doGet(request,response); } publicvoidinit()throwsServletException { } -}
\ No newline at end of file +} From 6c7ca2dddf0c29f772bc1e0dcd0df99ec0b9d545 Mon Sep 17 00:00:00 2001 From: quyuan Date: Thu, 27 Feb 2025 16:38:35 +0800 Subject: [PATCH 12/46] update table nest spec.md --- .../output_format/content_list_spec.md | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/docs/specification/output_format/content_list_spec.md b/docs/specification/output_format/content_list_spec.md index 5c27c663..8bbecc32 100644 --- a/docs/specification/output_format/content_list_spec.md +++ b/docs/specification/output_format/content_list_spec.md @@ -83,7 +83,8 @@ "html": "
12
", "title": "example table", "note": "数据来源于...", - "is_complex": false // 是否是复杂表格(跨行、跨列的, 默认为false + "is_complex": false // 是否是复杂表格(跨行、跨列的/嵌套表格, 默认为false, + "table_nest_level": 1 //table的嵌套层级 } }, { @@ -285,20 +286,22 @@ "html": "
12
", "title": "example table", "note": "数据来源于...", - "is_complex": false // 是否是复杂表格(跨行、跨列的, 默认为false + "is_complex": false // 是否是复杂表格(跨行、跨列的, 默认为false, + "table_nest_level": 1 //表格嵌套层级 } } ``` -| 字段 | 类型 | 描述 | 是否必须 | -| ------------------ | ------- | ---------------------------------------- | -------- | -| type | string | 值固定为table | 是 | -| bbox | array | \[x1, y1, x2, y2\] | 可选 | -| raw_content | string | 原始文本内容 | 可选 | -| content.html | string | 表格的html内容 | 是 | -| content.title | string | 表格的title属性 | 可选 | -| content.note | string | 表格的note属性 | 可选 | -| content.is_complex | boolean | 是否是复杂表格(跨行、跨列的, 默认为false | 可选 | +| 字段 | 类型 | 描述 | 是否必须 | +| ------------------------ | ------- | ------------------------------------------------- | -------- | +| type | string | 值固定为table | 是 | +| bbox | array | \[x1, y1, x2, y2\] | 可选 | +| raw_content | string | 原始文本内容 | 可选 | +| content.html | string | 表格的html内容 | 是 | +| content.title | string | 表格的title属性 | 可选 | +| content.note | string | 表格的note属性 | 可选 | +| content.is_complex | boolean | 是否是复杂表格(跨行、跨列的/嵌套表格, 默认为false | 可选 | +| content.table_nest_level | int | table嵌套层级(单个table为1,两层为2,以此类推) | 可选 | ### 列表段 From 9e1545293c7d4d6ec10362312a2adb9db6700f6b Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 3 Mar 2025 12:36:38 +0800 Subject: [PATCH 13/46] update parse order --- llm_web_kit/extractor/html/extractor.py | 2 +- .../table_involve_complex_code.html | 237 ++++++++++++++++++ .../extractor/html/recognizer/test_code.py | 2 +- .../extractor/html/recognizer/test_table.py | 17 +- 4 files changed, 254 insertions(+), 4 deletions(-) create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_complex_code.html diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index bc3fe05b..1d3facb3 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -93,7 +93,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson: main_html, method = self._extract_main_html(raw_html, base_url, page_layout_type) parsed_html = [(main_html,raw_html)] - for extract_func in [self._extract_table, self._extract_code, self._extract_math, self._extract_list, + for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list, self._extract_image, self._extract_title, self._extract_paragraph]: parsed_html = extract_func(base_url, parsed_html, raw_html) diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_complex_code.html b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_complex_code.html new file mode 100644 index 00000000..b929d7e0 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_complex_code.html @@ -0,0 +1,237 @@ + + + + ClientNetworkWrapper.java (Example JavaDoc) + + + + + + + + + + + + + + +
+

+ + + + + + + + + + + + + + + + + +
FileDocCategorySizeDatePackage
ClientNetworkWrapper.javaAPI DocExample2389Thu Nov 08 00:23:44 GMT 2001com.ora.rmibook.chapter3
+

ClientNetworkWrapper

+ public class ClientNetworkWrapper extends NetworkBaseClass implements + PrinterConstants + + + + +
+ + + + +
+
+
+

+ + + + + + + + + + + + + +
Fields Summary
private String +
_serverMachine
+
+
private int +
_serverPort
+
+
+ + + + + + + + +
Constructors Summary
public ClientNetworkWrapper()
+

+ + + + +
+
+
+

+        this (DEFAULT_SERVER_NAME, DEFAULT_SERVER_PORT);
+    
Test Test Test
ABC
+DEF
TEST TEST TEST
+
+
public ClientNetworkWrapper(String + serverMachine, int serverPort) +
+

+ + + + +
+
+
+

+        _serverMachine = serverMachine;
+        _serverPort = serverPort;
+    
+
+
+ + + + + + + + + + + + + + + + +
Methods Summary
private voidreadStatusFromSocket(java.net.Socket + connection) +
+

+ + + + +
+
+
+

+        InputStream inputStream = connection.getInputStream();
+        DataInputStream dataInputStream = new DataInputStream(inputStream);
+        BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
+        boolean response = dataInputStream.readBoolean();
+
+        if (response) {
+            return;
+        }
+        PrinterException error = new PrinterException(inputStream);
+
+        throw error;
+    
+
+
public voidsendDocumentToPrinter(java.io.InputStream actualDocument) +
+

+ + + + +
+
+
+

+        sendDocumentToPrinter(actualDocument, DEFAULT_DOCUMENT_TYPE,
+            DEFAULT_PRINT_TWO_SIDED, DEFAULT_PRINT_QUALITY);
+    
+
+
public voidsendDocumentToPrinter(java.io.InputStream actualDocument, int documentType, boolean printTwoSided, + int printQuality) +
+

+ + + + +
+
+
+

+        DocumentDescription documentToSend;
+
+        try {
+            documentToSend = new DocumentDescription(actualDocument, documentType, printTwoSided, printQuality);
+        } catch (IOException e) {
+            throw new ConnectionException();
+        }
+        sendDocumentToPrinter(documentToSend);
+    
+
+
public voidsendDocumentToPrinter(DocumentDescription documentDescription) +
+

+ + + + +
+
+
+

+        Socket connection = null;
+
+        try {
+            connection = new Socket(_serverMachine, _serverPort);
+            documentDescription.writeToStream(connection.getOutputStream());
+            readStatusFromSocket(connection);
+        } catch (IOException e) {
+            e.printStackTrace();
+            throw new ConnectionException();
+        }
+        closeSocket(connection);
+    
+
+
+

+ + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_code.py b/tests/llm_web_kit/extractor/html/recognizer/test_code.py index 40f758c1..143591b1 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_code.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_code.py @@ -268,7 +268,7 @@ def test_code_rec(self): raw_html_path = base_dir.joinpath(test_case['input'][0]) base_url = test_case['input'][1] print(base_url) - raw_html = raw_html_path.read_text() + raw_html = raw_html_path.read_text(encoding="utf-8") parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) parts = [ part[0] diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index b8b67029..e92e7297 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -20,7 +20,8 @@ 'assets/recognizer/table_include_rowspan_colspan.html', 'assets/recognizer/table_involve_equation.html', 'assets/recognizer/table_include_after_code.html', - 'assets/recognizer/table_involve_code.html' + 'assets/recognizer/table_involve_code.html', + 'assets/recognizer/table_involve_complex_code.html' ), 'expected': [ @@ -175,4 +176,16 @@ def test_table_involve_code(self): complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') expect_path = base_dir.joinpath(test_case['expected'][3]) content = open(expect_path, 'r', encoding='utf-8').read() - assert complex_table_tag[0].text == content + assert complex_table_tag[0].text == content.strip("\n") + + def test_table_involve_complex_code(self): + """table involve complex code""" + for test_case in TEST_CASES: + raw_html_path = base_dir.joinpath(test_case['input'][12]) + base_url = 'https://en.m.wikipedia.org/wiki/Variance' + raw_html = raw_html_path.read_text(encoding='utf-8') + parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') + expect_path = base_dir.joinpath(test_case['expected'][3]) + content = open(expect_path, 'r', encoding='utf-8').read() + assert complex_table_tag[0].text == content.strip("\n") From 4a61728b43ce10d3230f8e54900ed39931f865e4 Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 3 Mar 2025 12:49:57 +0800 Subject: [PATCH 14/46] update parse order --- tests/llm_web_kit/extractor/html/recognizer/test_code.py | 2 +- tests/llm_web_kit/extractor/html/recognizer/test_table.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_code.py b/tests/llm_web_kit/extractor/html/recognizer/test_code.py index 143591b1..5b55ed42 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_code.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_code.py @@ -268,7 +268,7 @@ def test_code_rec(self): raw_html_path = base_dir.joinpath(test_case['input'][0]) base_url = test_case['input'][1] print(base_url) - raw_html = raw_html_path.read_text(encoding="utf-8") + raw_html = raw_html_path.read_text(encoding='utf-8') parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) parts = [ part[0] diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index e92e7297..87ccbce8 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -176,10 +176,10 @@ def test_table_involve_code(self): complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') expect_path = base_dir.joinpath(test_case['expected'][3]) content = open(expect_path, 'r', encoding='utf-8').read() - assert complex_table_tag[0].text == content.strip("\n") + assert complex_table_tag[0].text == content.strip('\n') def test_table_involve_complex_code(self): - """table involve complex code""" + """table involve complex code.""" for test_case in TEST_CASES: raw_html_path = base_dir.joinpath(test_case['input'][12]) base_url = 'https://en.m.wikipedia.org/wiki/Variance' @@ -188,4 +188,4 @@ def test_table_involve_complex_code(self): complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') expect_path = base_dir.joinpath(test_case['expected'][3]) content = open(expect_path, 'r', encoding='utf-8').read() - assert complex_table_tag[0].text == content.strip("\n") + assert complex_table_tag[0].text == content.strip('\n') From 1b0e1e92993c9ff587db304e84f9a23cc3756acb Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 3 Mar 2025 13:37:23 +0800 Subject: [PATCH 15/46] update parse order --- tests/llm_web_kit/extractor/html/recognizer/test_code.py | 2 +- tests/llm_web_kit/extractor/html/recognizer/test_table.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_code.py b/tests/llm_web_kit/extractor/html/recognizer/test_code.py index 5b55ed42..40f758c1 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_code.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_code.py @@ -268,7 +268,7 @@ def test_code_rec(self): raw_html_path = base_dir.joinpath(test_case['input'][0]) base_url = test_case['input'][1] print(base_url) - raw_html = raw_html_path.read_text(encoding='utf-8') + raw_html = raw_html_path.read_text() parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) parts = [ part[0] diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 87ccbce8..e569d340 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -178,6 +178,7 @@ def test_table_involve_code(self): content = open(expect_path, 'r', encoding='utf-8').read() assert complex_table_tag[0].text == content.strip('\n') + @unittest.skip(reason='在code模块解决了这个问题') def test_table_involve_complex_code(self): """table involve complex code.""" for test_case in TEST_CASES: From 78ca0283c79ca84f05b5bdb1f1d87b0a4eb5ddfa Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 3 Mar 2025 14:34:59 +0800 Subject: [PATCH 16/46] =?UTF-8?q?update=20list=E6=A0=87=E5=87=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../output_format/content_list_spec.md | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/docs/specification/output_format/content_list_spec.md b/docs/specification/output_format/content_list_spec.md index 8bbecc32..f04e2c85 100644 --- a/docs/specification/output_format/content_list_spec.md +++ b/docs/specification/output_format/content_list_spec.md @@ -108,7 +108,8 @@ ] ] ], - "ordered": true + "ordered": true, + "list_nest_level": 1 //list的嵌套层级 } } ], @@ -327,18 +328,20 @@ ] ] ], - "ordered": true + "ordered": true, + "list_nest_level": 1 //list嵌套层级 } } ``` -| 字段 | 类型 | 描述 | 是否必须 | -| --------------- | ------- | --------------------------------------------------- | -------- | -| type | string | 值固定为list | 是 | -| bbox | array | \[x1, y1, x2, y2\] | 可选 | -| raw_content | string | 原始文本内容 | 可选 | -| content.items | array | 列表项,每个元素是N个段落,段落里的元素是文本或公式 | 是 | -| content.ordered | boolean | 是否是有序列表 | 可选 | +| 字段 | 类型 | 描述 | 是否必须 | +| ----------------------- | ------- | --------------------------------------------------- | -------- | +| type | string | 值固定为list | 是 | +| bbox | array | \[x1, y1, x2, y2\] | 可选 | +| raw_content | string | 原始文本内容 | 可选 | +| content.items | array | 列表项,每个元素是N个段落,段落里的元素是文本或公式 | 是 | +| content.ordered | boolean | 是否是有序列表 | 可选 | +| content.list_nest_level | int | list的嵌套层级(单层list list_nest_level为1) | 可选 | items字段说明 From efcd7a21d99878c4f99d2bac3acfdeabb9d3d7f8 Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 3 Mar 2025 15:55:38 +0800 Subject: [PATCH 17/46] add table involve inline code --- .../html/table_involve_inline_code.html | 26 +++++++++++++++++++ .../good_data/html_data_input.jsonl | 1 + .../extractor/test_extractor_chain.py | 15 +++++++++++ 3 files changed, 42 insertions(+) create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_involve_inline_code.html diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_involve_inline_code.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_involve_inline_code.html new file mode 100644 index 00000000..0f927ee3 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_involve_inline_code.html @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + +
FunctionDescriptionExample
print()Prints a message to the console.print("Hello, World!")
len()Returns the length of an object.len([1, 2, 3])
range()Generates a sequence of numbers.range(1, 10)
diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl index 1efe87b6..5f08bdbf 100644 --- a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl @@ -9,3 +9,4 @@ {"track_id": "rfc-doc", "dataset_name": "test_pipeline_suit", "url": "https://www.test.com","data_source_category": "HTML", "path":"doc.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "legato_doc", "dataset_name": "test_pipeline_suit", "url": "https://www.test.com","data_source_category": "HTML", "path":"legato_docs.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "oracle_doc", "dataset_name": "test_pipeline_suit", "url": "https://docs.oracle.com/en-us/iaas/tools/java/3.57.1/com/oracle/bmc/integration/model/CustomEndpointDetails.html","data_source_category": "HTML", "path":"oracle_doc.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} +{"track_id": "table_involve_inline_code", "dataset_name": "test_table_involve_inline_code", "url": "https://docs.oracle.com/en-us/iaas/tools/java/3.57.1/com/oracle/bmc/integration/model/CustomEndpointDetails.html","data_source_category": "HTML", "path":"table_involve_inline_code.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 4972673b..a6671f4f 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -344,3 +344,18 @@ def test_oracle_doc_comment(self): result = chain.extract(input_data) main_html = result.get_content_list().to_main_html() assert 'public int hashCode()' in main_html + + def test_table_involve_inline_code(self): + """ + table里面包含行内code + Returns: + + """ + chain = ExtractSimpleFactory.create(self.config) + self.assertIsNotNone(chain) + test_data = self.data_json[11] + # Create DataJson from test data + input_data = DataJson(test_data) + result = chain.extract(input_data) + content_list = result.get_content_list()._get_data() + print(content_list) From 0776f6efa0a118bad026147d923052cb662ae3a0 Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 3 Mar 2025 16:58:41 +0800 Subject: [PATCH 18/46] add test case --- tests/llm_web_kit/extractor/test_extractor_chain.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index a6671f4f..ef596204 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -357,5 +357,5 @@ def test_table_involve_inline_code(self): # Create DataJson from test data input_data = DataJson(test_data) result = chain.extract(input_data) - content_list = result.get_content_list()._get_data() - print(content_list) + content_list = result.get_content_list()._get_data()[0][0]['content']['html'] + assert content_list == """
FunctionDescriptionExample
print()Prints a message to the console.print("Hello, World!")
len()Returns the length of an object.len([1, 2, 3])
range()Generates a sequence of numbers.range(1, 10)
""" From 3fda2a69efa527e95275d39dde6f3e26df4045fc Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 3 Mar 2025 17:07:10 +0800 Subject: [PATCH 19/46] fix test case --- tests/llm_web_kit/extractor/test_extractor_chain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index ef596204..884c9860 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -59,7 +59,7 @@ def setUp(self): for line in f: self.data_json.append(json.loads(line.strip())) - assert len(self.data_json) == 11 + assert len(self.data_json) == 12 # Config for HTML extraction self.config = { From e0196bfbb27f473cdd0c45ad2b48342d20e8754b Mon Sep 17 00:00:00 2001 From: dt-yy Date: Wed, 5 Mar 2025 16:28:59 +0800 Subject: [PATCH 20/46] add table tail --- .../extractor/html/recognizer/table.py | 35 +- .../good_data/html/table_tail_text.html | 367 ++++++++++++++++++ .../good_data/html_data_input.jsonl | 3 +- .../extractor/test_extractor_chain.py | 13 +- 4 files changed, 405 insertions(+), 13 deletions(-) create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_tail_text.html diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index fa24dd6d..6908398e 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -196,11 +196,22 @@ def __check_table_include_math_code(self, raw_html: HtmlElement): ] ele_res.extend(ccinterline_codes) else: - ele_res.extend([ - text.strip() - for text in self._build_html_tree(math_item[1]).itertext() - if text.strip() - ]) + tree = self._build_html_tree(math_item[1]) + texts = [] + for element in tree.iter(): + if element.text and element.text.strip(): + text = element.text.strip() + # 如果有tail,直接拼接到text后面 + if element.tail and element.tail.strip(): + text += element.tail.strip() + texts.append(text) + elif element.tail and element.tail.strip(): + # 如果只有tail且前面有内容,则拼接到最后一个text + if texts: + texts[-1] += element.tail.strip() + else: + texts.append(element.tail.strip()) + ele_res.extend(texts) return ele_res def __simplify_td_th_content(self, elem: HtmlElement) -> None: @@ -212,7 +223,8 @@ def __simplify_td_th_content(self, elem: HtmlElement) -> None: parse_res.extend(math_res) for item in list(elem.iterchildren()): elem.remove(item) - elem.text = '
'.join(parse_res) + if parse_res: + elem.text = '
'.join(parse_res) return for child in elem.iter('td', 'th'): self.__simplify_td_th_content(child) @@ -227,18 +239,19 @@ def __get_table_body(self, table_type, table_root): cleaned_attrs = {k: v for k, v in table_root.attrib.items() if k in allowed_attributes} table_root.attrib.clear() table_root.attrib.update(cleaned_attrs) - # text进行strip操作,tail去掉(有较多空换行) + # text进行strip操作,tail保留(部分内容留在tail中) for elem in chain([table_root], table_root.iterdescendants()): - if elem.text: + if elem.text is not None: elem.text = elem.text.strip() - if elem.tail: - elem.tail = None + if elem.tail is not None: + elem.tail = elem.tail.strip() + if not elem.tail: + elem.tail = None self.__simplify_td_th_content(table_root) # 迭代 for child in table_root.iterchildren(): if child is not None: self.__get_table_body(table_type, child) - return self._element_to_html(table_root) def __do_extract_tables(self, root: HtmlElement) -> None: diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_tail_text.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_tail_text.html new file mode 100644 index 00000000..4044b9a3 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_tail_text.html @@ -0,0 +1,367 @@ + + + + + + + + + 🇷🇺 | Show hub - Big-Empty DC++ Dchublist NMDC and ADCs хабов Huburi Хаблист + + + + + + + + + + + + + + + + + + + + + + + + +
+ +

Big-Empty

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Client + https://dchublists.com/clients/FlylinkDC_x64.exe +
StatusOnline | ID: 975
URL + https://dchublists.com/hub-975 +
Address +NMDC | dchub://big-empty.ru +
ASN + Style-Com LLC +
Failover + Not available +
NameBig-Empty
Topic + Not available +
Description + Хаб сети Arbital +
Category + Not available +
Software + PtokaX 0.5.3.0 +
Owner + Self +
Location + RU Russian Federation +
Users + 25 | 55 +
Clones0
Share + 4.39 TB | 90.60 TB +
User limit10000
Share limit0 B
Slot limit0
Hub limit0
Reliability99.04%
Checked + 2024-12-09 03:06:01 | 2021-05-07 +
Votes + +0 | -0 | 0 +
Website + Not available +
Email + Not available +
+
+

Online users

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NickShare
Darv1n1.55 TB
PtokaX0 B
1975628.43 GB
AndyDesktop0 B
Crtyujgfdscvgjh35.54 GB
DaymarixZZZ37.57 GB
Evgeniy_D76.15 GB
Julia0 B
Kuzma0 B
Larsenv0 B
MAXMED8888888864.10 GB
Qwerty_ytr_R724237.12 GB
SERG_B149.65 GB
Sculli156.92 GB
Shareaza404613.03 GB
Soliton14.68 GB
Sweaborg794.15 GB
Viktor138283179.23 GB
[fly]Fire_dU3JR10.72 GB
[fly]Monkey_QGrFy124.72 GB
[fly]Moon_x7m61.13 GB
kotbaun0 B
marcs3.62 GB
minili59.30 GB
y2b4k698df328djei3261.82 GB
+
+
+ +

Comments

+ There are no comments for this hub, you can write one here. +
+
+ + + + +\n\t\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\t\n\t\n\n\n\n\n\n\n\n
\n\t\t\t\t\t\n\t
\n\t\t\t\t\t\t\t\t
\n\t\t\t\n\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\n\n
Выберите свой город:
\n
\n\n
\n
Выберите из списка:
\n
    \n
  • Абакан
  • \n
  • Ачинск
  • \n
  • Альметьевск
  • \n
  • Ангарск
  • \n
  • Архангельск
  • \n
  • Армавир
  • \n
  • Артём
  • \n
  • Арзамас
  • \n
  • Астрахань
  • \n
  • Балаково
  • \n
  • Балашиха
  • \n
  • Барнаул
  • \n
  • Батайск
  • \n
  • Белгород
  • \n
  • Бердск
  • \n
  • Березники
  • \n
  • Бийск
  • \n
  • Благовещенск
  • \n
  • Братск
  • \n
  • Брянск
  • \n
  • Чебоксары
  • \n
  • Челябинск
  • \n
  • Череповец
  • \n
  • Черкесск
  • \n
  • Чита
  • \n
  • Дербент
  • \n
  • Димитровград
  • \n
  • Долгопрудный
  • \n
  • Домодедово
  • \n
  • Дзержинск
  • \n
  • Екатеринбург
  • \n
  • Елец
  • \n
  • Электросталь
  • \n
  • Элиста
  • \n
  • Энгельс
  • \n
  • Ессентуки
  • \n
  • Евпатория
  • \n
  • Грозный
  • \n
  • Хабаровск
  • \n
  • Хасавюрт
  • \n
  • Химки
  • \n
  • Иркутск
  • \n
  • Иваново
  • \n
  • Ижевск
  • \n
  • Йошкар-Ола
  • \n
  • Калининград
  • \n
  • Калуга
  • \n
  • Каменск-Уральский
  • \n
  • Камышин
  • \n
  • Каспийск
  • \n
  • Казань
  • \n
  • Кемерово
  • \n
  • Керчь
  • \n
  • Киров
  • \n
  • Кисловодск
  • \n
  • Коломна
  • \n
  • Комсомольск-на-Амуре
  • \n
  • Копейск
  • \n
  • Королёв
  • \n
  • Кострома
  • \n
  • Ковров
  • \n
  • Краснодар
  • \n
  • Красногорск
  • \n
  • Красноярск
  • \n
  • Курган
  • \n
  • Курск
  • \n
  • Кызыл
  • \n
  • Липецк
  • \n
  • Люберцы
  • \n
  • Магнитогорск
  • \n
  • Махачкала
  • \n
  • Майкоп
  • \n
  • Миасс
  • \n
  • Мурманск
  • \n
  • Муром
  • \n
  • Мытищи
  • \n
  • Набережные Челны
  • \n
  • Находка
  • \n
  • Нальчик
  • \n
  • Назрань
  • \n
  • Нефтекамск
  • \n
  • Нефтеюганск
  • \n
  • Невинномысск
  • \n
  • Нижнекамск
  • \n
  • Нижневартовск
  • \n
  • Нижний Новгород
  • \n
  • Нижний Тагил
  • \n
  • Ногинск
  • \n
  • Норильск
  • \n
  • Новочебоксарск
  • \n
  • Новочеркасск
  • \n
  • Новокуйбышевск
  • \n
  • Новокузнецк
  • \n
  • Новомосковск
  • \n
  • Новороссийск
  • \n
  • Новошахтинск
  • \n
  • Новосибирск
  • \n
  • Новый Уренгой
  • \n
  • Ноябрьск
  • \n
  • Обнинск
  • \n
  • Одинцово
  • \n
  • Октябрьский
  • \n
  • Омск
  • \n
  • Орехово-Зуево
  • \n
  • Оренбург
  • \n
  • Орск
  • \n
  • Орёл
  • \n
  • Пенза
  • \n
  • Пермь
  • \n
  • Первоуральск
  • \n
  • Петропавловск-Камчатский
  • \n
  • Петрозаводск
  • \n
  • Подольск
  • \n
  • Прокопьевск
  • \n
  • Псков
  • \n
  • Пушкино
  • \n
  • Пятигорск
  • \n
  • Раменское
  • \n
  • Реутов
  • \n
  • Ростов-на-Дону
  • \n
  • Рубцовск
  • \n
  • Рязань
  • \n
  • Рыбинск
  • \n
  • Салават
  • \n
  • Самара
  • \n
  • Санкт-Петербург
  • \n
  • Саранск
  • \n
  • Саратов
  • \n
  • Сергиев Посад
  • \n
  • Серпухов
  • \n
  • Севастополь
  • \n
  • Северодвинск
  • \n
  • Северск
  • \n
  • Шахты
  • \n
  • Щёлково
  • \n
  • Симферополь
  • \n
  • Смоленск
  • \n
  • Сочи
  • \n
  • Старый Оскол
  • \n
  • Ставрополь
  • \n
  • Стерлитамак
  • \n
  • Сургут
  • \n
  • Сыктывкар
  • \n
  • Сызрань
  • \n
  • Таганрог
  • \n
  • Тамбов
  • \n
  • Тольятти
  • \n
  • Томск
  • \n
  • Тула
  • \n
  • Тверь
  • \n
  • Тюмень
  • \n
  • Уфа
  • \n
  • Улан-Удэ
  • \n
  • Ульяновск
  • \n
  • Уссурийск
  • \n
  • Великий Новгород
  • \n
  • Владикавказ
  • \n
  • Владимир
  • \n
  • Владивосток
  • \n
  • Волгодонск
  • \n
  • Волгоград
  • \n
  • Вологда
  • \n
  • Волжский
  • \n
  • Воронеж
  • \n
  • Якутск
  • \n
  • Ярославль
  • \n
  • Южно-Сахалинск
  • \n
  • Жуковский
  • \n
  • Златоуст
  • \n
\n
\n
\n\n\n\n
\n\n\n\n
\n
Не нашли свой город?
\n \n
\n\n
\n\t\n\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\tHot line8 800 752 18 22\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t
\n\t\t\t\t\t
\n\t\t
\n\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t \n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t\t\t\n\t\t\t
\n\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t\n\t\t\t\t
\n\t\t\t
\t\t\n\t\t\t\n\t\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t\t\t\t\t\n\t\t\t\t\t\t
\n\t\t\t\t\n\t\t\t\t2\n\t\t\t
\n\t\t\t\t\t
\n\t\t\t
\n\n\t\t\t\n\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t \n\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t2\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t\t\n\t\t
\n\t
\n\t\t
\n\t\n
\n
\n\n\t\t\n\t
\n\t
\n\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t
\n\t
\t\n\t\t
\n\t\t\t
\n\t\t\t\t
\n\t\n
\t\t\t
\n\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t

Miaflow в Северске

\n
\n\t
\n\t\t
Рейтинг 5.00 из 5 на основе опроса 3 пользователей
\t\t\t\t\t\t\t\t(3 отзыва клиентов)\n\t\t\t\t\t\t
\n\t
  В наличии
\n
\n\t\t\t
\n\t
84 
\n\t\t
\n\t\t\t\t\t\t\t\t\n\t\t\t
\n
\n
\n\t

Miaflow — это инновационный крем для омоложения лица, разработанный с использованием передовых технологий. Его уникальная формула, насыщенная ценными компонентами природы, обеспечивает интенсивный уход за кожей, возвращая ей молодость и сияние.

\n
\n\t\t\t\t
Заказать
\n\n
\n\n\t\n\t\n\tКатегория: Препараты для омоложения\n\t\n\t\n
\n
\n
\n
\n

* Не является лекарственным средством

\n
\n
\n
\n \"Оплата\"\n
\n

Оплата:

\n

при получении, наличными или банковской картой

\n
\n
\n
\n \"Доставка\"\n
\n

Доставка в Северске:

\n

1 - 7 дней, почтой или транспортными компаниями

\n
\n
\n
\n
Поделиться: 
\t\t\t\t
\n\t\t\t
\n\t\t
\n\t
\n\t\n
\n\t\t
\n\t\t
\n\t\t\t\n\t\t\t\n\t\t
\n\t\t
\n\t\t
Заказать\n\t\t\tMiaflow\t\t
\n\t\t\t
\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t

Преимущества

\n
    \n
  • Уменьшение морщин и линий
  • \n
  • Повышение упругости и эластичности кожи
  • \n
  • Омолаживающий эффект с первого применения
  • \n
  • Защита от вредного воздействия окружающей среды
  • \n
  • Глубокое увлажнение и питание
  • \n
\n

Принцип действия Miaflow

\n

Miaflow активирует естественные процессы обновления кожи, восстанавливая ее структуру и придавая заметный лифтинг-эффект. Это достигается благодаря уникальной комбинации активных ингредиентов.

\n

Состав Miaflow:

\n
    \n
  1. Концентрат пантов алтайского марала: Стимулирует обновление клеток, укрепляет структуру кожи.
  2. \n
  3. Концентрат трепанга: Обеспечивает увлажнение и смягчение, борется с признаками усталости.
  4. \n
  5. Каменное масло: Питает и улучшает тонус кожи.
  6. \n
  7. Живица кедровая и лиственничная: Прекрасные антисептики, поддерживают чистоту пор, способствуют заживлению.
  8. \n
  9. Эфирные масла кедра, тыквы, конопли, пихты, облепихи, чайного дерева, гвоздики: Обеспечивают ароматерапевтический эффект и усиливают регенерацию кожи.
  10. \n
\n

Клинические исследования

\n

Проведенные исследования показали, что более 90% участников заметили улучшение состояния кожи после использования Miaflow. Восстановление упругости, сокращение морщин, и природное сияние — вот результаты, подтвержденные клинически.

\n

Показания к применению

\n
    \n
  • Сухая и увядающая кожа
  • \n
  • Первые признаки старения
  • \n
  • Потеря упругости и эластичности
  • \n
\n

Способ применения Miaflow

\n

Наносите крем на чистую кожу лица и шеи массажными движениями до полного впитывания. Используйте утром и вечером для достижения максимального эффекта.

\n

Противопоказания Miaflow

\n

Не рекомендуется использовать при индивидуальной непереносимости к компонентам. Перед применением рекомендуется провести тест на небольшом участке кожи. В случае раздражения прекратите использование.

\n\t\t\t\t\t\t\t\t

Где купить Miaflow?

\n

Miaflow не продается в обычных аптеках в Северске и других регионах России. Однако, вы можете купить его у нас на сайте по выгодной цене 84  с удобной доставкой. Успешно достигните своих целей с данным средством!

\n\t\t\t\t\n\t\t\t\t
\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t \n
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Рейтинг:
Рейтинг 5.00 из 5 на основе опроса 3 пользователей
Тип товара: Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84 
\n
\n\t\t\t\t\t\t\t\\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t
\n\t\t

3 отзывов о Miaflow

\n\t\t\t\t\t
    \n\t\t\t\t
  1. \n\t
    \n\t\t
    \n\n\t\t\t\n\t\t\t
    \n\n\t\t\t\t
    Оценка 5 из 5
    \n\t

    \n\t\tЕлена Евстегнеева \n\t\t\t\t \n\t

    \n\n\t\n\t\t\t
    \n\t\t
    \n\t\t

    Моя кожа претерпела настоящую революцию с Miaflow! Уже через неделю заметила, как морщины стали менее заметными, а цвет лица стал более ровным. Крем приятно наносится, быстро впитывается, и самое главное — результат на лице!

    \n
    \t
    \n
  2. \n
  3. \n\t
    \n\t\t
    \n\n\t\t\t\n\t\t\t
    \n\n\t\t\t\t
    Оценка 5 из 5
    \n\t

    \n\t\tЕрмаков Иван \n\t\t\t\t \n\t

    \n\n\t\n\t\t\t
    \n\t\t
    \n\t\t

    Совершенно случайно попробовал, и теперь я не могу себе представить свой уход без него. Кожа стала более упругой, а яркие следы усталости просто исчезли. Отличный продукт, с которым я чувствую себя настоящим джентльменом!

    \n
    \t
    \n
  4. \n
  5. \n\t
    \n\t\t
    \n\n\t\t\t\n\t\t\t
    \n\n\t\t\t\t
    Оценка 5 из 5
    \n\t

    \n\t\tЦветкова Ксения \n\t\t\t\t \n\t

    \n\n\t\n\t\t\t
    \n\t\t
    \n\t\t

    Мне было сложно найти подходящий уход для кожи после 50, но этот крем превзошел все мои ожидания! Мои друзья даже спрашивают, что я делаю, чтобы выглядеть так молодо. Этот крем — настоящее волшебство для кожи, и я рекомендую его каждой женщине!

    \n
    \t
    \n
  6. \n\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t

Средний рейтинг

\n\t\t\t\t\t\t

5.00

\n\t\t\t\t\t\t
Оценка 5.00 из 5
\t\t\t\t\t\t
\n\t\t\t\t\t\t\t3 Отзыв\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
5
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
100%
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
4
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
0%
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
3
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
0%
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
2
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
0%
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
1
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
0%
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\tНапишите отзыв

Ваш адрес email не будет опубликован. Обязательные поля помечены *

\n

\n

\n\n

\t
\n\t\t\t\t
\n\t\t\t
\n\t\t
\n\t\t
\n
\t\t\t\t\t\t\t\t\n\t\t\t\t
\n\t\t\t\t\t\t\t\t\n\t\t\t\t
\n\t\t\t\t\t
\"Miaflow\"
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\tMiaflow\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t84 ₽\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n
\n
\n
\n\n\n\n\n\n
\n


\n

\n

Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

\n


\n

\n
\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\tMiaflow\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t84 ₽\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
\"Miaflow\"
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n
\n
\n
\n\n\n\n\n\n
\n


\n

\n

Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

\n


\n

\n
\t\t\t\t\t\t\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t
\n\n\t\t
\n\n\t
\n\n
\n\n
\n\n
\n\n\n\n\t
\n\n\t\t\t\t\t

Сопутствующие товары

\n\t\t\t\t\n\t\t
    \n\t\t\t\n\t\t\t\t\t
  • \n\t\t
    \n\t
    \n\t\t\t
    \n\t\t\t\t\t\t\t\t\n\t\t\t\t\t
    -25%
    \t\t\t
    \n\"Venzen\"\t\t
    \n\t\t\t
    Quick View \t\t
    \n\t\t\t
    \n\t
    \n\t\t
    \n\t\t\t
    \n\t\t\t\t\t\t\n\t\t\t\n\t\t\t\n
    Оценка 5.00 из 5
    \n\n\t1,990  1,490 \n\n
    Заказать\n
    \n\n\n
    \n\t\t\t
    \n
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\tVenzen\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\t1490 ₽\n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \"Venzen\"\n\t\t\t\t\t\t\t\t\tsrc=\"https://1bad.ru/wp-content/uploads/2022/09/venzen.jpg
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n
    \n
    \n
    \n\n\n\n\n\n
    \n


    \n

    \n

    Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

    \n


    \n

    \n
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t
    \n\n
    \n\t\t
    \n\t
    \n
    \n
  • \n\t\t\t\n\t\t\t\t\t
  • \n\t\t
    \n\t
    \n\t\t\t
    \n\t\t\t\t\t\t\t\t\n\t\t\t
    \n\"Night\t\t
    \n\t\t\t
    Quick View \t\t
    \n\t\t\t
    \n\t
    \n\t\t
    \n\t\t\t
    \n\t\t\t\t\t\t\n\t\t\t\n\t\t\t\n
    Оценка 5.00 из 5
    \n\n\t149 \n\n
    Заказать\n
    \n\n\n
    \n\t\t\t
    \n
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\tNight Miracle\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\t149.00 ₽\n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \"Night
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n
    \n
    \n
    \n\n\n\n\n\n
    \n


    \n

    \n

    Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

    \n


    \n

    \n
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t
    \n\n
    \n\t\t
    \n\t
    \n
    \n
  • \n\t\t\t\n\t\t\t\t\t
  • \n\t\t
    \n\t
    \n\t\t\t
    \n\t\t\t\t\t\t\t\t\n\t\t\t
    \n\"Молодильный\t\t
    \n\t\t\t
    Quick View \t\t
    \n\t\t\t
    \n\t
    \n\t\t
    \n\t\t\t
    \n\t\t\t\t\t\t\n\t\t\t\n\t\t\t\n
    Оценка 5.00 из 5
    \n\n\t149 \n\n
    Заказать\n
    \n\n\n
    \n\t\t\t
    \n
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\tМолодильный спас\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\t149.00 ₽\n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \"Молодильный
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n
    \n
    \n
    \n\n\n\n\n\n
    \n


    \n

    \n

    Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

    \n


    \n

    \n
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t
    \n\n
    \n\t\t
    \n\t
    \n
    \n
  • \n\t\t\t\n\t\t\t\t\t
  • \n\t\t
    \n\t
    \n\t\t\t
    \n\t\t\t\t\t\t\t\t\n\t\t\t
    \n\"Zenza\t\t
    \n\t\t\t
    Quick View \t\t
    \n\t\t\t
    \n\t
    \n\t\t
    \n\t\t\t
    \n\t\t\t\t\t\t\n\t\t\t\n\t\t\t\n
    Оценка 5.00 из 5
    \n\n\t147 \n\n
    Заказать\n
    \n\n\n
    \n\t\t\t
    \n
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\tZenza Cream\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\t147.00 ₽\n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \"Zenza
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n
    \n
    \n
    \n\n\n\n\n\n
    \n


    \n

    \n

    Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

    \n


    \n

    \n
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t
    \n\n
    \n\t\t
    \n\t
    \n
    \n
  • \n\t\t\t\n\t\t
\n\t
\n\t\t\n
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t
\t\t\t\t\t
\n\t
\n\n
\t\n\t\t
\n\t\t\t
\n\t\t\t\tЧто Вы ищете?\n\t\t\t\t
Закрыть
\n\t\t\t
\n\t\t\t\t
\n\t\t\t\n\t\t
\n\t\t\t\n\t\t\t\n\t\t\t
\n\t\t\t\t
    \n\t\t\t\t
\n\t\t\t
\n\t\t
\n\t\t\n\t
\n\t\t\n\t\t
\t\n\t
\n\n\n
\n\n\t
\n\t\t
\n\n\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t

Вся информация на сайте - справочная. Перед применением лекарственных препаратов проконсультируйтесь с врачом. Дистанционная продажа БАД и лекарственных средств не осуществляется.

\n\n\t\t\t\t\t\t
\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t
\n\n\t\t\t\t\t\t

© 2023 1bad.ru 18+. Все права защищены.

\n\t\t\t\t\t
\n\t\t\t\t\t\n\t\t\t\t\t
\n\t\t\t\t\t\t

Адрес: г. Северск, ул. Курчатова, 11a

\n\t\t\t\t\t\t

Телефон: 8 800 752 18 22

\n\t\t\t\t\t\t

Почта: seversk@1bad.ru

\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t
\n\t
\n\n
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\t\t\n\t\t\n
\r\n
\r\n
\r\n
\r\n
\r\n
\r\n \r\n
\r\n
\r\n
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t\t
\r\n
\r\n
\r\n
\r\n
\r\n
    \r\n
  • \r\n
  • \r\n
\r\n\t\t\t\t\t\t\t\t\t\tSelect the fields to be shown. Others will be hidden. Drag and drop to rearrange the order.
    \r\n\t\t\t\t\t\t\t\t\t\t\t
  • Image
  • SKU
  • Rating
  • Price
  • Stock
  • Availability
  • Add to cart
  • Description
  • Content
  • Weight
  • Dimensions
  • Additional information
  • Attributes
  • Custom attributes
  • Custom fields
\r\n
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t
\r\n
\r\n
\r\n
\r\n
\r\n \r\n
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t\t\t\t\t \r\n\t\t\t\t\t\t\t\t\t
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t\t\t\t
\r\n\t\t\t\t\t\t\t\t\t\tClick outside to hide the compare bar
\r\n\t\t\t\t\t\t\t\t \r\n\t\t\t\t\t\t\t\t \r\n\t\t\t\t\t\t\t\t
\r\n
\r\n
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t\t\t\t\tCompare
\r\n
\r\n
\r\n
\r\n Compare\r\n \r\n × \r\n
\r\n
\r\n
\r\n Let's Compare!\r\n Continue shopping\r\n
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t
\n\n\n\n\n\t\n\t\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" \ No newline at end of file +\n\n\n\n\n\t\n\t\n\t\n\t\n\n\t\n\tMiaflow крем для лица: купить в Северске, цены в интернет-аптеке - 1bad.ru\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\t\n\t\n\n\n\n\n\n\n\n
\n\t\t\t\t\t\n\t
\n\t\t\t\t\t\t\t\t
\n\t\t\t\n\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\n\n
Выберите свой город:
\n
\n\n
\n
Выберите из списка:
\n
    \n
  • Абакан
  • \n
  • Ачинск
  • \n
  • Альметьевск
  • \n
  • Ангарск
  • \n
  • Архангельск
  • \n
  • Армавир
  • \n
  • Артём
  • \n
  • Арзамас
  • \n
  • Астрахань
  • \n
  • Балаково
  • \n
  • Балашиха
  • \n
  • Барнаул
  • \n
  • Батайск
  • \n
  • Белгород
  • \n
  • Бердск
  • \n
  • Березники
  • \n
  • Бийск
  • \n
  • Благовещенск
  • \n
  • Братск
  • \n
  • Брянск
  • \n
  • Чебоксары
  • \n
  • Челябинск
  • \n
  • Череповец
  • \n
  • Черкесск
  • \n
  • Чита
  • \n
  • Дербент
  • \n
  • Димитровград
  • \n
  • Долгопрудный
  • \n
  • Домодедово
  • \n
  • Дзержинск
  • \n
  • Екатеринбург
  • \n
  • Елец
  • \n
  • Электросталь
  • \n
  • Элиста
  • \n
  • Энгельс
  • \n
  • Ессентуки
  • \n
  • Евпатория
  • \n
  • Грозный
  • \n
  • Хабаровск
  • \n
  • Хасавюрт
  • \n
  • Химки
  • \n
  • Иркутск
  • \n
  • Иваново
  • \n
  • Ижевск
  • \n
  • Йошкар-Ола
  • \n
  • Калининград
  • \n
  • Калуга
  • \n
  • Каменск-Уральский
  • \n
  • Камышин
  • \n
  • Каспийск
  • \n
  • Казань
  • \n
  • Кемерово
  • \n
  • Керчь
  • \n
  • Киров
  • \n
  • Кисловодск
  • \n
  • Коломна
  • \n
  • Комсомольск-на-Амуре
  • \n
  • Копейск
  • \n
  • Королёв
  • \n
  • Кострома
  • \n
  • Ковров
  • \n
  • Краснодар
  • \n
  • Красногорск
  • \n
  • Красноярск
  • \n
  • Курган
  • \n
  • Курск
  • \n
  • Кызыл
  • \n
  • Липецк
  • \n
  • Люберцы
  • \n
  • Магнитогорск
  • \n
  • Махачкала
  • \n
  • Майкоп
  • \n
  • Миасс
  • \n
  • Мурманск
  • \n
  • Муром
  • \n
  • Мытищи
  • \n
  • Набережные Челны
  • \n
  • Находка
  • \n
  • Нальчик
  • \n
  • Назрань
  • \n
  • Нефтекамск
  • \n
  • Нефтеюганск
  • \n
  • Невинномысск
  • \n
  • Нижнекамск
  • \n
  • Нижневартовск
  • \n
  • Нижний Новгород
  • \n
  • Нижний Тагил
  • \n
  • Ногинск
  • \n
  • Норильск
  • \n
  • Новочебоксарск
  • \n
  • Новочеркасск
  • \n
  • Новокуйбышевск
  • \n
  • Новокузнецк
  • \n
  • Новомосковск
  • \n
  • Новороссийск
  • \n
  • Новошахтинск
  • \n
  • Новосибирск
  • \n
  • Новый Уренгой
  • \n
  • Ноябрьск
  • \n
  • Обнинск
  • \n
  • Одинцово
  • \n
  • Октябрьский
  • \n
  • Омск
  • \n
  • Орехово-Зуево
  • \n
  • Оренбург
  • \n
  • Орск
  • \n
  • Орёл
  • \n
  • Пенза
  • \n
  • Пермь
  • \n
  • Первоуральск
  • \n
  • Петропавловск-Камчатский
  • \n
  • Петрозаводск
  • \n
  • Подольск
  • \n
  • Прокопьевск
  • \n
  • Псков
  • \n
  • Пушкино
  • \n
  • Пятигорск
  • \n
  • Раменское
  • \n
  • Реутов
  • \n
  • Ростов-на-Дону
  • \n
  • Рубцовск
  • \n
  • Рязань
  • \n
  • Рыбинск
  • \n
  • Салават
  • \n
  • Самара
  • \n
  • Санкт-Петербург
  • \n
  • Саранск
  • \n
  • Саратов
  • \n
  • Сергиев Посад
  • \n
  • Серпухов
  • \n
  • Севастополь
  • \n
  • Северодвинск
  • \n
  • Северск
  • \n
  • Шахты
  • \n
  • Щёлково
  • \n
  • Симферополь
  • \n
  • Смоленск
  • \n
  • Сочи
  • \n
  • Старый Оскол
  • \n
  • Ставрополь
  • \n
  • Стерлитамак
  • \n
  • Сургут
  • \n
  • Сыктывкар
  • \n
  • Сызрань
  • \n
  • Таганрог
  • \n
  • Тамбов
  • \n
  • Тольятти
  • \n
  • Томск
  • \n
  • Тула
  • \n
  • Тверь
  • \n
  • Тюмень
  • \n
  • Уфа
  • \n
  • Улан-Удэ
  • \n
  • Ульяновск
  • \n
  • Уссурийск
  • \n
  • Великий Новгород
  • \n
  • Владикавказ
  • \n
  • Владимир
  • \n
  • Владивосток
  • \n
  • Волгодонск
  • \n
  • Волгоград
  • \n
  • Вологда
  • \n
  • Волжский
  • \n
  • Воронеж
  • \n
  • Якутск
  • \n
  • Ярославль
  • \n
  • Южно-Сахалинск
  • \n
  • Жуковский
  • \n
  • Златоуст
  • \n
\n
\n
\n\n\n\n
\n\n\n\n
\n
Не нашли свой город?
\n \n
\n\n
\n\t\n\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\tHot line8 800 752 18 22\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t
\n\t\t\t\t\t
\n\t\t
\n\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t \n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t\t\t\n\t\t\t
\n\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t\n\t\t\t\t
\n\t\t\t
\t\t\n\t\t\t\n\t\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t\t\t\t\t\n\t\t\t\t\t\t
\n\t\t\t\t\n\t\t\t\t2\n\t\t\t
\n\t\t\t\t\t
\n\t\t\t
\n\n\t\t\t\n\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t \n\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t2\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t\t\n\t\t
\n\t
\n\t\t
\n\t\n
\n
\n\n\t\t\n\t
\n\t
\n\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t
\n\t
\t\n\t\t
\n\t\t\t
\n\t\t\t\t
\n\t\n
\t\t\t
\n\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t

Miaflow в Северске

\n
\n\t
\n\t\t
Рейтинг 5.00 из 5 на основе опроса 3 пользователей
\t\t\t\t\t\t\t\t(3 отзыва клиентов)\n\t\t\t\t\t\t
\n\t
  В наличии
\n
\n\t\t\t
\n\t
84 
\n\t\t
\n\t\t\t\t\t\t\t\t\n\t\t\t
\n
\n
\n\t

Miaflow — это инновационный крем для омоложения лица, разработанный с использованием передовых технологий. Его уникальная формула, насыщенная ценными компонентами природы, обеспечивает интенсивный уход за кожей, возвращая ей молодость и сияние.

\n
\n\t\t\t\t
Заказать
\n\n
\n\n\t\n\t\n\tКатегория: Препараты для омоложения\n\t\n\t\n
\n
\n
\n
\n

* Не является лекарственным средством

\n
\n
\n
\n \"Оплата\"\n
\n

Оплата:

\n

при получении, наличными или банковской картой

\n
\n
\n
\n \"Доставка\"\n
\n

Доставка в Северске:

\n

1 - 7 дней, почтой или транспортными компаниями

\n
\n
\n
\n
Поделиться: 
\t\t\t\t
\n\t\t\t
\n\t\t
\n\t
\n\t\n
\n\t\t
\n\t\t
\n\t\t\t\n\t\t\t\n\t\t
\n\t\t
\n\t\t
Заказать\n\t\t\tMiaflow\t\t
\n\t\t\t
\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t

Преимущества

\n
    \n
  • Уменьшение морщин и линий
  • \n
  • Повышение упругости и эластичности кожи
  • \n
  • Омолаживающий эффект с первого применения
  • \n
  • Защита от вредного воздействия окружающей среды
  • \n
  • Глубокое увлажнение и питание
  • \n
\n

Принцип действия Miaflow

\n

Miaflow активирует естественные процессы обновления кожи, восстанавливая ее структуру и придавая заметный лифтинг-эффект. Это достигается благодаря уникальной комбинации активных ингредиентов.

\n

Состав Miaflow:

\n
    \n
  1. Концентрат пантов алтайского марала: Стимулирует обновление клеток, укрепляет структуру кожи.
  2. \n
  3. Концентрат трепанга: Обеспечивает увлажнение и смягчение, борется с признаками усталости.
  4. \n
  5. Каменное масло: Питает и улучшает тонус кожи.
  6. \n
  7. Живица кедровая и лиственничная: Прекрасные антисептики, поддерживают чистоту пор, способствуют заживлению.
  8. \n
  9. Эфирные масла кедра, тыквы, конопли, пихты, облепихи, чайного дерева, гвоздики: Обеспечивают ароматерапевтический эффект и усиливают регенерацию кожи.
  10. \n
\n

Клинические исследования

\n

Проведенные исследования показали, что более 90% участников заметили улучшение состояния кожи после использования Miaflow. Восстановление упругости, сокращение морщин, и природное сияние — вот результаты, подтвержденные клинически.

\n

Показания к применению

\n
    \n
  • Сухая и увядающая кожа
  • \n
  • Первые признаки старения
  • \n
  • Потеря упругости и эластичности
  • \n
\n

Способ применения Miaflow

\n

Наносите крем на чистую кожу лица и шеи массажными движениями до полного впитывания. Используйте утром и вечером для достижения максимального эффекта.

\n

Противопоказания Miaflow

\n

Не рекомендуется использовать при индивидуальной непереносимости к компонентам. Перед применением рекомендуется провести тест на небольшом участке кожи. В случае раздражения прекратите использование.

\n\t\t\t\t\t\t\t\t

Где купить Miaflow?

\n

Miaflow не продается в обычных аптеках в Северске и других регионах России. Однако, вы можете купить его у нас на сайте по выгодной цене 84  с удобной доставкой. Успешно достигните своих целей с данным средством!

\n\t\t\t\t\n\t\t\t\t
\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t \n
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Рейтинг:
Рейтинг 5.00 из 5 на основе опроса 3 пользователей
Тип товара: Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84 
\n
\n\t\t\t\t\t\t\t\\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t
\n\t\t

3 отзывов о Miaflow

\n\t\t\t\t\t
    \n\t\t\t\t
  1. \n\t
    \n\t\t
    \n\n\t\t\t\n\t\t\t
    \n\n\t\t\t\t
    Оценка 5 из 5
    \n\t

    \n\t\tЕлена Евстегнеева \n\t\t\t\t \n\t

    \n\n\t\n\t\t\t
    \n\t\t
    \n\t\t

    Моя кожа претерпела настоящую революцию с Miaflow! Уже через неделю заметила, как морщины стали менее заметными, а цвет лица стал более ровным. Крем приятно наносится, быстро впитывается, и самое главное — результат на лице!

    \n
    \t
    \n
  2. \n
  3. \n\t
    \n\t\t
    \n\n\t\t\t\n\t\t\t
    \n\n\t\t\t\t
    Оценка 5 из 5
    \n\t

    \n\t\tЕрмаков Иван \n\t\t\t\t \n\t

    \n\n\t\n\t\t\t
    \n\t\t
    \n\t\t

    Совершенно случайно попробовал, и теперь я не могу себе представить свой уход без него. Кожа стала более упругой, а яркие следы усталости просто исчезли. Отличный продукт, с которым я чувствую себя настоящим джентльменом!

    \n
    \t
    \n
  4. \n
  5. \n\t
    \n\t\t
    \n\n\t\t\t\n\t\t\t
    \n\n\t\t\t\t
    Оценка 5 из 5
    \n\t

    \n\t\tЦветкова Ксения \n\t\t\t\t \n\t

    \n\n\t\n\t\t\t
    \n\t\t
    \n\t\t

    Мне было сложно найти подходящий уход для кожи после 50, но этот крем превзошел все мои ожидания! Мои друзья даже спрашивают, что я делаю, чтобы выглядеть так молодо. Этот крем — настоящее волшебство для кожи, и я рекомендую его каждой женщине!

    \n
    \t
    \n
  6. \n\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t

Средний рейтинг

\n\t\t\t\t\t\t

5.00

\n\t\t\t\t\t\t
Оценка 5.00 из 5
\t\t\t\t\t\t
\n\t\t\t\t\t\t\t3 Отзыв\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
5
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
100%
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
4
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
0%
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
3
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
0%
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
2
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
0%
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
1
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
0%
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\tНапишите отзыв

Ваш адрес email не будет опубликован. Обязательные поля помечены *

\n

\n

\n\n

\t
\n\t\t\t\t
\n\t\t\t
\n\t\t
\n\t\t
\n
\t\t\t\t\t\t\t\t\n\t\t\t\t
\n\t\t\t\t\t\t\t\t\n\t\t\t\t
\n\t\t\t\t\t
\"Miaflow\"
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\tMiaflow\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t84 ₽\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n
\n
\n
\n\n\n\n\n\n
\n


\n

\n

Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

\n


\n

\n
\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\tMiaflow\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t84 ₽\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
\"Miaflow\"
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n
\n
\n
\n\n\n\n\n\n
\n


\n

\n

Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

\n


\n

\n
\t\t\t\t\t\t\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t
\n\n\t\t
\n\n\t
\n\n
\n\n
\n\n
\n\n\n\n\t
\n\n\t\t\t\t\t

Сопутствующие товары

\n\t\t\t\t\n\t\t
    \n\t\t\t\n\t\t\t\t\t
  • \n\t\t
    \n\t
    \n\t\t\t
    \n\t\t\t\t\t\t\t\t\n\t\t\t\t\t
    -25%
    \t\t\t
    \n\"Venzen\"\t\t
    \n\t\t\t
    Quick View \t\t
    \n\t\t\t
    \n\t
    \n\t\t
    \n\t\t\t
    \n\t\t\t\t\t\t\n\t\t\t\n\t\t\t\n
    Оценка 5.00 из 5
    \n\n\t1,990  1,490 \n\n
    Заказать\n
    \n\n\n
    \n\t\t\t
    \n
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\tVenzen\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\t1490 ₽\n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \"Venzen\"\n\t\t\t\t\t\t\t\t\tsrc=\"https://1bad.ru/wp-content/uploads/2022/09/venzen.jpg
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n
    \n
    \n
    \n\n\n\n\n\n
    \n


    \n

    \n

    Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

    \n


    \n

    \n
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t
    \n\n
    \n\t\t
    \n\t
    \n
    \n
  • \n\t\t\t\n\t\t\t\t\t
  • \n\t\t
    \n\t
    \n\t\t\t
    \n\t\t\t\t\t\t\t\t\n\t\t\t
    \n\"Night\t\t
    \n\t\t\t
    Quick View \t\t
    \n\t\t\t
    \n\t
    \n\t\t
    \n\t\t\t
    \n\t\t\t\t\t\t\n\t\t\t\n\t\t\t\n
    Оценка 5.00 из 5
    \n\n\t149 \n\n
    Заказать\n
    \n\n\n
    \n\t\t\t
    \n
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\tNight Miracle\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\t149.00 ₽\n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \"Night
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n
    \n
    \n
    \n\n\n\n\n\n
    \n


    \n

    \n

    Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

    \n


    \n

    \n
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t
    \n\n
    \n\t\t
    \n\t
    \n
    \n
  • \n\t\t\t\n\t\t\t\t\t
  • \n\t\t
    \n\t
    \n\t\t\t
    \n\t\t\t\t\t\t\t\t\n\t\t\t
    \n\"Молодильный\t\t
    \n\t\t\t
    Quick View \t\t
    \n\t\t\t
    \n\t
    \n\t\t
    \n\t\t\t
    \n\t\t\t\t\t\t\n\t\t\t\n\t\t\t\n
    Оценка 5.00 из 5
    \n\n\t149 \n\n
    Заказать\n
    \n\n\n
    \n\t\t\t
    \n
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\tМолодильный спас\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\t149.00 ₽\n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \"Молодильный
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n
    \n
    \n
    \n\n\n\n\n\n
    \n


    \n

    \n

    Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

    \n


    \n

    \n
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t
    \n\n
    \n\t\t
    \n\t
    \n
    \n
  • \n\t\t\t\n\t\t\t\t\t
  • \n\t\t
    \n\t
    \n\t\t\t
    \n\t\t\t\t\t\t\t\t\n\t\t\t
    \n\"Zenza\t\t
    \n\t\t\t
    Quick View \t\t
    \n\t\t\t
    \n\t
    \n\t\t
    \n\t\t\t
    \n\t\t\t\t\t\t\n\t\t\t\n\t\t\t\n
    Оценка 5.00 из 5
    \n\n\t147 \n\n
    Заказать\n
    \n\n\n
    \n\t\t\t
    \n
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\tZenza Cream\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\t147.00 ₽\n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \"Zenza
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n
    \n
    \n
    \n\n\n\n\n\n
    \n


    \n

    \n

    Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

    \n


    \n

    \n
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t
    \n\n
    \n\t\t
    \n\t
    \n
    \n
  • \n\t\t\t\n\t\t
\n\t
\n\t\t\n
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t
\t\t\t\t\t
\n\t
\n\n
\t\n\t\t
\n\t\t\t
\n\t\t\t\tЧто Вы ищете?\n\t\t\t\t
Закрыть
\n\t\t\t
\n\t\t\t\t
\n\t\t\t\n\t\t
\n\t\t\t\n\t\t\t\n\t\t\t
\n\t\t\t\t
    \n\t\t\t\t
\n\t\t\t
\n\t\t
\n\t\t\n\t
\n\t\t\n\t\t
\t\n\t
\n\n\n
\n\n\t
\n\t\t
\n\n\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t

Вся информация на сайте - справочная. Перед применением лекарственных препаратов проконсультируйтесь с врачом. Дистанционная продажа БАД и лекарственных средств не осуществляется.

\n\n\t\t\t\t\t\t
\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t
\n\n\t\t\t\t\t\t

© 2023 1bad.ru 18+. Все права защищены.

\n\t\t\t\t\t
\n\t\t\t\t\t\n\t\t\t\t\t
\n\t\t\t\t\t\t

Адрес: г. Северск, ул. Курчатова, 11a

\n\t\t\t\t\t\t

Телефон: 8 800 752 18 22

\n\t\t\t\t\t\t

Почта: seversk@1bad.ru

\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t
\n\t
\n\n
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\t\t\n\t\t\n
\r\n
\r\n
\r\n
\r\n
\r\n
\r\n \r\n
\r\n
\r\n
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t\t
\r\n
\r\n
\r\n
\r\n
\r\n
    \r\n
  • \r\n
  • \r\n
\r\n\t\t\t\t\t\t\t\t\t\tSelect the fields to be shown. Others will be hidden. Drag and drop to rearrange the order.
    \r\n\t\t\t\t\t\t\t\t\t\t\t
  • Image
  • SKU
  • Rating
  • Price
  • Stock
  • Availability
  • Add to cart
  • Description
  • Content
  • Weight
  • Dimensions
  • Additional information
  • Attributes
  • Custom attributes
  • Custom fields
\r\n
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t
\r\n
\r\n
\r\n
\r\n
\r\n \r\n
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t\t\t\t\t \r\n\t\t\t\t\t\t\t\t\t
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t\t\t\t
\r\n\t\t\t\t\t\t\t\t\t\tClick outside to hide the compare bar
\r\n\t\t\t\t\t\t\t\t \r\n\t\t\t\t\t\t\t\t \r\n\t\t\t\t\t\t\t\t
\r\n
\r\n
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t\t\t\t\tCompare
\r\n
\r\n
\r\n
\r\n Compare\r\n \r\n × \r\n
\r\n
\r\n
\r\n Let's Compare!\r\n Continue shopping\r\n
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t
\n\n\n\n\n\t\n\t\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \ No newline at end of file From 94f627f4e8709980ae91a01a1919dab25fc266b6 Mon Sep 17 00:00:00 2001 From: dt-yy Date: Thu, 6 Mar 2025 12:59:54 +0800 Subject: [PATCH 24/46] remove enter in table --- llm_web_kit/extractor/html/extractor.py | 2 +- llm_web_kit/libs/html_utils.py | 17 +- .../html/test_table_elem_include_enter.html | 3136 +++++++++++++++++ .../good_data/html_data_input.jsonl | 3 +- .../extractor/html/recognizer/test_table.py | 3 + .../extractor/test_extractor_chain.py | 19 +- 6 files changed, 3175 insertions(+), 5 deletions(-) create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/test_table_elem_include_enter.html diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index 1d3facb3..f68950d3 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -290,7 +290,7 @@ def __get_cc_node(self, html:str) -> (str, str): nodes = el.xpath(xpath_expr) if len(nodes) == 0: raise ValueError(f'html文本中没有cc标签: {html}') # TODO 异常处理 - if len(nodes) > 1: + if len(nodes) > 2: raise ValueError(f'html文本中包含多个cc标签: {html}') # TODO 异常处理 return element_to_html(nodes[0]), nodes[0].tag diff --git a/llm_web_kit/libs/html_utils.py b/llm_web_kit/libs/html_utils.py index c4628f9d..faf257a8 100644 --- a/llm_web_kit/libs/html_utils.py +++ b/llm_web_kit/libs/html_utils.py @@ -1,4 +1,5 @@ import html +import re from copy import deepcopy from lxml.html import HtmlElement, HTMLParser, fromstring, tostring @@ -114,6 +115,18 @@ def iter_node(element: HtmlElement): yield from iter_node(sub_element) +def _escape_table_cell(text: str) -> str: + """转义表格单元格中的特殊字符. + + 比如 |、内容中的\n等 + """ + # 首先处理换行符,将其替换为空格 + text = re.sub(r'[\r\n]+', ' ', text) + # 转义竖线和点号,避免与markdown表格语法冲突 + escaped = text.replace('|', '\\|') + return escaped + + def html_to_markdown_table(table_html_source: str) -> str: """把html代码片段转换成markdown表格. @@ -140,7 +153,7 @@ def html_to_markdown_table(table_html_source: str) -> str: # 检查第一行是否是表头并获取表头内容 first_row_tags = rows[0].xpath('.//th | .//td') - headers = [tag.text_content().strip() for tag in first_row_tags] + headers = [_escape_table_cell(tag.text_content().strip()) for tag in first_row_tags] # 如果表头存在,添加表头和分隔符,并保证表头与最大列数对齐 if headers: while len(headers) < max_cols: @@ -155,7 +168,7 @@ def html_to_markdown_table(table_html_source: str) -> str: # 添加表格内容,跳过已被用作表头的第一行(如果有的话) for row in rows[1:]: - columns = [td.text_content().strip() for td in row.xpath('.//td | .//th')] + columns = [_escape_table_cell(td.text_content().strip()) for td in row.xpath('.//td | .//th')] # 如果这一行的列数少于最大列数,则补充空白单元格 while len(columns) < max_cols: columns.append('') diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/test_table_elem_include_enter.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/test_table_elem_include_enter.html new file mode 100644 index 00000000..176f4fab --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/test_table_elem_include_enter.html @@ -0,0 +1,3136 @@ + + + + + + + + + + + + + + + + + + + + + دانلود ترجمه مقاله توسعه مالی و هزینه سرمایه حقوق سهامداران + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+ + + + + + + + + +
+ +
+
+
+
+
+
+ + +
+
+
+
+
+
+
+
+

+دانلود ترجمه مقاله توسعه مالی و هزینه سرمایه حقوق سهامداران

+
+
+
+
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + +
+ + عنوان فارسی + + +

+توسعه مالی و هزینه سرمایه حقوق سهامداران: شواهدی از چین

+
+ + عنوان انگلیسی + + +

+ Financial development and the cost of equity capital: Evidence from China

+
+ + کلمات کلیدی : + + +

+   + توسعه مالی؛ هزینه سرمایه حقوق سهامداران؛ قانون و امور مالی؛ چین

+
+ + درسهای مرتبط + + + + حسابداری +
+ + + + + + + + + + + + + + + + + + + +
+ تعداد صفحات مقاله انگلیسی : + 35 + نشریه : +ELSEVIER
+ سال انتشار : + 2015 + تعداد رفرنس مقاله : + 112
+ فرمت مقاله انگلیسی : + PDF + + نوع مقاله : + ISI +
+ پاورپوینت : + ندارد + وضعیت ترجمه مقاله : + انجام نشده است.
+
+
+
+
+ +
+
+
+
+
+ فهرست مطالب +
+
+

+1. مقدمه +2. پیشینه نهادی +3. چارچوب نظری +4. طرح تحقیق +5. نتایج تجربی +6. آنالیز بیشتر: تاثیرات فاکتورهای نهادی +7. بررسی دقت +8. نتیجه گیری +

+
+
+سفارش ترجمه +
+
+ ترجمه نمونه متن انگلیسی +
+
+

+ این مطالعه، رابطه بین توسعه مالی سطح استان و هزینه دارایی ویژه در چین را بررسی می کند. یافته های اصلی ما از این قرارند که (1) توسعه بازار سهام، بطور کل هزینه دارایی ویژه را کاهش می دهد، اما این اثر در شرکت های دولتی (SOE) و شرکت های دارای پتانسیل رشد یا شدت نوآوری زیاد، به میزان قابل توجهی کمرنگ می شود و (2) توسعه بانکداری تنها به صورت جزئی هزینه دارایی ویژه را کاهش می دهد، اما این اثر در شرکت های غیر SOE، قویتر است. تحلیل های بیشتر جایگزین های توسعه بازار سهام برای چنین عوامل نهادی مانند کیفیت حسابداری، اجرای قانون، تلفیق بازار سهام و اصلاح ساختار تقسیم سهام در کاهش هزینه دارایی ویژه را آشکار می کنند. همچنین در می یابیم که عدم وجود رقابت در بانکداری و بازاری کردن بانکداری و توسعه ضعیف اقتصاد غیردولتی تاحدی مسئول اثر ضعیف توسعه بانکداری بر هزینه دارایی ویژه می باشد. + +مقدمه: +این مطالعه، تاثر توسعه مالی منطقه ای بر هزینه دارایی ویژه در چین را با استفاده از یک نمونه بزرگ از شرکت های چینی پذیرفته شده در بورس اوراق بهادار شانگهای (SHSE) و بورس اوراق بهادار شنزن (SZSE) در دوره 1998 تا 2008، را بررسی می کند. مخصوصاً اینکه، طبق رویکرد جایاراتنه و استراهان (1996) و گویسو و همکاران (2004 الف، 2004 ب)، بررسی می کنیم که آیا توسعه مالی منطقه ای سطح استانی در یک کشور با هزینه دارایی ویژه ارتباط دارد یا خیر و چه ارتباطی و همچنین اینکه این رابطه چگونه براساس زیرساخت های نهادی مانند اجرای قانونی، کیفیت حسابداری و مقررات دیگر، شرطی می شوند.

+
+
+
+
+ نمونه متن انگلیسی مقاله +
+
+

+ This study examines the relation between province-level financial development and the cost of equity in China. Our main findings are that (1) stock market development reduces the cost of equity in general, but the effect diminishes significantly in state-owned enterprises (SOEs) and firms with high growth potential or innovation intensity and (2) banking development only marginally lowers the cost of equity, but the effect is stronger in non-SOEs. Further analysis reveals that stock market development substitutes for such institutional factors as accounting quality, law enforcement, stock market integration and the split-share structure reform in lowering the cost of equity. We also find that lack of banking competition and banking marketization and under-development of the non-state economy partially account for the weak effect of banking development on the cost of equity. + +Introduction: +This study examines the impact of regional financial development on the cost of equity capital in China, using a large sample of Chinese firms listed on the Shanghai Stock Exchange (SHSE) and Shenzhen Stock Exchange (SZSE) over the period from 1998 to 2008. Specifically, following the approach of Jayaratne and Strahan (1996) and Guiso et al. (2004a, 2004b), we investigate whether and how regional province-level financial development within the same country is associated with the cost of equity, and how the relation is conditioned upon institutional infrastructures such as legal enforcement, accounting quality and other regulations.

+
+
+
+
+ توضیحات و مشاهده مقاله انگلیسی +
+
+

+
+
+
+
+ + +
+
+
سفارش ترجمه تخصصی این مقاله
+
+
+ + + + +
+
+
+
+
+

+ دیدگاهها

+ +

هیچ دیدگاهی برای این محصول نوشته نشده است.

+
+ +
+
+
+ اولین نفری باشید که دیدگاهی را ارسال می کنید برای “دانلود ترجمه مقاله توسعه مالی و هزینه سرمایه حقوق سهامداران”

نشانی ایمیل شما منتشر نخواهد شد. بخش‌های موردنیاز علامت‌گذاری شده‌اند *

+ +

4 × دو =

+ +

+
+
+ +
+
+
+
+
+
+
+
+

پروپوزال آماده

+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+

مقالات ترجمه شده

+
+ +
+
+ +
+
+ +
+ + +
+ +
+ + +
+ +
+ + +
+ +
+ + +
+ +
+ + +
+ +
+ + +
+ +
+ + +
+ +
+ + +
+ +
+ + +
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+

پایان نامه آماده

+
+ + + + +
+

مطالب علمی

+
+ + + + +
+ +
+
+
+
+
+
+

نماد اعتماد الکترونیکی

+
+
+
+
+
+
+

پشتیبانی

+
+
logo-samandehi
+
+
+ +
+
+
+
+ + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl index 12bfb843..0c68d085 100644 --- a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl @@ -10,4 +10,5 @@ {"track_id": "legato_doc", "dataset_name": "test_pipeline_suit", "url": "https://www.test.com","data_source_category": "HTML", "path":"legato_docs.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "oracle_doc", "dataset_name": "test_pipeline_suit", "url": "https://docs.oracle.com/en-us/iaas/tools/java/3.57.1/com/oracle/bmc/integration/model/CustomEndpointDetails.html","data_source_category": "HTML", "path":"oracle_doc.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "table_involve_inline_code", "dataset_name": "test_table_involve_inline_code", "url": "https://docs.oracle.com/en-us/iaas/tools/java/3.57.1/com/oracle/bmc/integration/model/CustomEndpointDetails.html","data_source_category": "HTML", "path":"table_involve_inline_code.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} -{"track_id": "table_tail_text", "dataset_name": "test_table_tail_text", "url": "https://dchublists.com/?do=hublist&id=hub-975&language=en","data_source_category": "HTML", "path":"table_tail_text.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file +{"track_id": "table_tail_text", "dataset_name": "test_table_tail_text", "url": "https://dchublists.com/?do=hublist&id=hub-975&language=en","data_source_category": "HTML", "path":"table_tail_text.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} +{"track_id": "table_elem_include_enter", "dataset_name": "table_elem_include_enter", "url": "https://fardapaper.ir/financial-development-equity-capital","data_source_category": "HTML", "path":"test_table_elem_include_enter.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 18a40327..fe81f8e3 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -89,6 +89,7 @@ def test_cc_simple_table(self): parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) assert len(parts) == 3 content = html_to_element(parts[1][0]).text_content() + print(content) assert content == r'
Рейтинг:Рейтинг<br>5.00<br>из 5 на основе опроса<br>3<br>пользователей
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84<br>₽
' def test_cc_complex_table(self): @@ -166,6 +167,7 @@ def test_table_involve_after_code(self): parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) assert html_to_element(parts[0][0]).xpath(f'.//{CCTag.CC_TABLE}')[0].text is None + @unittest.skip(reason='在code模块解决了table嵌套多行代码问题') def test_table_involve_code(self): """table involve code.""" for test_case in TEST_CASES: @@ -176,6 +178,7 @@ def test_table_involve_code(self): complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') expect_path = base_dir.joinpath(test_case['expected'][3]) content = open(expect_path, 'r', encoding='utf-8').read() + print(content) assert complex_table_tag[0].text == content.strip('\n') @unittest.skip(reason='在code模块解决了这个问题') diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index f55f2232..14ea7127 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -59,7 +59,7 @@ def setUp(self): for line in f: self.data_json.append(json.loads(line.strip())) - assert len(self.data_json) == 13 + assert len(self.data_json) == 14 # Config for HTML extraction self.config = { @@ -369,4 +369,21 @@ def test_table_tail_text(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() + print(content_md) assert '| ID: 975' in content_md + + def test_table_element_include_enter(self): + """table的元素中间有换行.""" + chain = ExtractSimpleFactory.create(self.config) + self.assertIsNotNone(chain) + test_data = self.data_json[13] + # Create DataJson from test data + input_data = DataJson(test_data) + result = chain.extract(input_data) + content_md = result.get_content_list().to_mm_md() + print(content_md) + assert """| عنوان فارسی | توسعه مالی و هزینه سرمایه حقوق سهامداران: شواهدی از چین | +|---|---| +| عنوان انگلیسی | Financial development and the cost of equity capital: Evidence from China | +| کلمات کلیدی : |   توسعه مالی؛ هزینه سرمایه حقوق سهامداران؛ قانون و امور مالی؛ چین | +| درسهای مرتبط | حسابداری |""" in content_md From 95120d06a2ddc3e398ea208902bb680d5e6b4c79 Mon Sep 17 00:00:00 2001 From: dt-yy Date: Thu, 6 Mar 2025 13:29:21 +0800 Subject: [PATCH 25/46] remove print --- tests/llm_web_kit/extractor/html/recognizer/test_table.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index fe81f8e3..0608e825 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -89,7 +89,6 @@ def test_cc_simple_table(self): parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) assert len(parts) == 3 content = html_to_element(parts[1][0]).text_content() - print(content) assert content == r'
Рейтинг:Рейтинг<br>5.00<br>из 5 на основе опроса<br>3<br>пользователей
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84<br>₽
' def test_cc_complex_table(self): @@ -178,7 +177,6 @@ def test_table_involve_code(self): complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') expect_path = base_dir.joinpath(test_case['expected'][3]) content = open(expect_path, 'r', encoding='utf-8').read() - print(content) assert complex_table_tag[0].text == content.strip('\n') @unittest.skip(reason='在code模块解决了这个问题') From 1c5ff71860bbdb747a054e194ca741cd944f0374 Mon Sep 17 00:00:00 2001 From: yyy <102640628+dt-yy@users.noreply.github.com> Date: Thu, 6 Mar 2025 14:19:48 +0800 Subject: [PATCH 26/46] remove print --- tests/llm_web_kit/extractor/test_extractor_chain.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 14ea7127..c2e5ee2b 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -369,7 +369,6 @@ def test_table_tail_text(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - print(content_md) assert '| ID: 975' in content_md def test_table_element_include_enter(self): @@ -381,7 +380,6 @@ def test_table_element_include_enter(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - print(content_md) assert """| عنوان فارسی | توسعه مالی و هزینه سرمایه حقوق سهامداران: شواهدی از چین | |---|---| | عنوان انگلیسی | Financial development and the cost of equity capital: Evidence from China | From 371800ed25adf6dcfd6e469656c8063cf7fa09f2 Mon Sep 17 00:00:00 2001 From: dt-yy Date: Thu, 6 Mar 2025 19:38:22 +0800 Subject: [PATCH 27/46] add exception --- llm_web_kit/extractor/html/extractor.py | 70 +- .../extractor/html/recognizer/table.py | 107 +- .../good_data/html/table_include_math_p.html | 2326 +++++++++++++++++ .../good_data/html/test_list_empty.html | 2000 ++++++++++++++ .../good_data/html_data_input.jsonl | 4 +- .../extractor/test_extractor_chain.py | 25 +- 6 files changed, 4468 insertions(+), 64 deletions(-) create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_include_math_p.html create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/test_list_empty.html diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index f68950d3..53565f15 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -22,6 +22,7 @@ from llm_web_kit.libs.html_utils import element_to_html, html_to_element from llm_web_kit.libs.logger import mylogger from llm_web_kit.libs.path_lib import get_py_pkg_root_dir +from llm_web_kit.exception.exception import HtmlFileExtractorException class HTMLPageLayoutType: @@ -245,6 +246,63 @@ def _extract_paragraph(self, base_url:str, html_lst:List[Tuple[str,str]], raw_ht lst = self.__paragraph_recognizer.recognize(base_url, html_lst, raw_html) return lst + def __is_valid_node(self, node: dict) -> bool: + """检查节点是否有效(不为空). + + Args: + node (dict): 内容节点 + + Returns: + bool: 如果节点有效返回True,否则返回False + """ + if not node: + raise HtmlFileExtractorException("node is empty") + node_type = node.get('type') + valid_types = {'list', 'code', 'equation-interline', 'image', 'table', 'title', 'paragraph'} + if node_type not in valid_types: + raise HtmlFileExtractorException(f"Invalid node type: {node_type}") + # 检查列表类型的节点 + if node.get('type') == 'list': + items = node.get('content', {}).get('items', []) + # 过滤掉None、空列表,以及只包含None或空值的列表 + return bool(items) and any( + isinstance(item, (dict, list)) and bool(item) + for item in items) + #检测code类型的节点 + if node.get('type') == 'code': + code_content = node.get('content', {}).get('code_content') + # 如果代码内容为None或空字符串,则视为无效节点 + return bool(code_content and code_content.strip()) + #检测行间公式类型的节点 + if node.get('type') == 'equation-interline': + math_content = node.get('content', {}).get('math_content') + # 如果公式内容为None或空字符串,则视为无效节点 + return bool(math_content and math_content.strip()) + #检测image类型的节点 + if node.get('type') == 'image': + content = node.get('content', {}) + # 检查url、path或data字段是否至少有一个不为空 + return bool(content.get('url') or content.get('path') or content.get('data')) + #检测table类型的节点 + if node.get('type') == 'table': + html = node.get('content', {}).get('html') + # 如果表格的html内容为None或空字符串,则视为无效节点 + return bool(html and html.strip()) + #检测title类型的节点 + if node.get('type') == 'title': + title_content = node.get('content', {}).get('title_content') + # 如果标题内容为None或空字符串,则视为无效节点 + return bool(title_content and title_content.strip()) + #检测段落类型的节点 + if node.get('type') == 'paragraph': + content = node.get('content', []) + # 检查content列表是否存在且不为空,并且至少有一个非空的内容项 + return bool(content) and any( + item.get('c') and item.get('c').strip() + for item in content + ) + return True + def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> ContentList: """将解析结果存入content_list格式中. @@ -263,12 +321,10 @@ def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[str,str]], r parser:BaseHTMLElementRecognizer = self.__to_content_list_mapper.get(cc_tag) if parser: node = parser.to_content_list_node(base_url, ccnode_html, raw_html) - if node: + if node and self.__is_valid_node(node): one_page.append(node) else: - mylogger.warning(f'无法识别的html标签:{cc_tag}, {parsed_html}') - # TODO 开发成熟的时候,在这里抛出异常,让调用者记录下来,以便后续分析改进 - + raise HtmlFileExtractorException(f'无法识别的html标签:{cc_tag}, {parsed_html}') content_list = ContentList([one_page]) # 对于网页来说仅有一页,如果多页,则剩下的每个都是一个论坛的回复 return content_list @@ -289,9 +345,9 @@ def __get_cc_node(self, html:str) -> (str, str): xpath_expr = ' | '.join(f'self::{tag} | .//{tag}' for tag in self.__to_content_list_mapper.keys()) nodes = el.xpath(xpath_expr) if len(nodes) == 0: - raise ValueError(f'html文本中没有cc标签: {html}') # TODO 异常处理 - if len(nodes) > 2: - raise ValueError(f'html文本中包含多个cc标签: {html}') # TODO 异常处理 + raise HtmlFileExtractorException(f'html文本中没有cc标签: {html}') + if len(nodes) > 3: + raise HtmlFileExtractorException(f'html文本中包含多个cc标签: {html}') return element_to_html(nodes[0]), nodes[0].tag def __build_extractor(self): diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index cd7cd387..9586713f 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -3,7 +3,7 @@ from lxml.html import HtmlElement from overrides import override - +import json from llm_web_kit.exception.exception import HtmlTableRecognizerException from llm_web_kit.extractor.html.recognizer.cccode import CodeRecognizer from llm_web_kit.extractor.html.recognizer.ccmath import MathRecognizer @@ -68,7 +68,6 @@ def __is_table_empty(self, table) -> bool: :param table: lxml.html.HtmlElement 对象,表示一个 元素 :return: 如果表格为空,返回 True;否则返回 False """ - def is_element_empty(elem): # 检查元素本身的文本内容 if elem.text and elem.text.strip(): @@ -113,20 +112,19 @@ def __is_simple_table(self, tree) -> bool: return False return True - def __is_table_contain_img(self, tree) -> bool: - """判断table元素是否包含图片.""" - imgs = tree.xpath('//table//img') - if len(imgs) == 0: - return True - else: - return False - - def __is_table_nested(self, tree) -> int: - """获取表格元素的嵌套层级(非表格元素返回0,顶层表格返回1,嵌套表格返回层级数).""" - if tree.tag != 'table': - return 0 # 非表格元素返回0 - # 计算祖先中的 table 数量(不包括自身),再加1表示自身层级 - return len(tree.xpath('ancestor::table')) + 1 + def __is_table_nested(self, element) -> int: + """计算表格的嵌套层级(非表格返回0)""" + if element.tag != "table": + return 0 + # 获取当前表格下所有的表格(包括自身) + all_tables = [element] + element.xpath('.//table') + max_level = 1 # 初始层级为1(当前表格) + # 计算每个表格的层级,取最大值 + for table in all_tables: + ancestor_count = len(table.xpath('ancestor::table')) + level = ancestor_count + 1 + max_level = max(max_level, level) + return max_level def __extract_tables(self, ele: str) -> List[Tuple[str, str]]: """提取html中的table元素.""" @@ -150,61 +148,60 @@ def __get_table_type(self, child: HtmlElement) -> str: table_type = 'complex' return table_type - def __extract_table_element(self, ele: HtmlElement) -> str: - """提取表格的元素.""" - for item in ele.iterchildren(): - return self._element_to_html(item) def __check_table_include_math_code(self, raw_html: HtmlElement): - """check table中是否包含math.""" + """检查table中的内容,包括普通文本、数学公式和代码.""" math_html = self._element_to_html(raw_html) - ele_res = list() + # 处理数学公式和代码 math_recognizer = MathRecognizer() math_res_parts = math_recognizer.recognize(base_url='', main_html_lst=[(math_html, math_html)], - raw_html=math_html) + raw_html=math_html) code_recognizer = CodeRecognizer() code_res_parts = code_recognizer.recognize(base_url='', main_html_lst=math_res_parts, - raw_html=math_html) + raw_html=math_html) + + result = [] for math_item in code_res_parts: ele_item = self._build_html_tree(math_item[0]) + # 处理所有文本内容 + for text_segment in ele_item.itertext(): + cleaned_text = text_segment.strip().replace('\\n', '') + if cleaned_text: # 过滤空字符串 + #print("cleaned_text", cleaned_text) + result.append(cleaned_text) + # 处理行内公式 ccinline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INLINE}') - ccinline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE_INLINE}') - ccinterline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INTERLINE}') - ccinterline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE}') if ccinline_math_node: formulas = [ - el.text if el.text.strip() else '' - for el in ccinline_math_node + el.text.strip() for el in ccinline_math_node if el.text and el.text.strip() ] - ele_res.extend(formulas) # 添加字符串 - elif ccinterline_math_node: - codes = [ - el.text if el.text.strip() else '' - for el in ccinterline_math_node + result.extend(formulas) + + # 处理行间公式 + ccinterline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INTERLINE}') + if ccinterline_math_node: + formulas = [ + el.text.strip() for el in ccinterline_math_node if el.text and el.text.strip() ] - ele_res.extend(codes) - elif ccinline_code_node: - inline_codes = [ - el.text if el.text.strip() else '' - for el in ccinline_code_node + result.extend(formulas) + + # 处理行内代码 + ccinline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE_INLINE}') + if ccinline_code_node: + codes = [ + el.text.strip() for el in ccinline_code_node if el.text and el.text.strip() ] - ele_res.extend(inline_codes) - elif ccinterline_code_node: - ccinterline_codes = [ - el.text if el.text else '' - for el in ccinterline_code_node + result.extend(codes) + + # 处理行间代码 + ccinterline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE}') + if ccinterline_code_node: + codes = [ + el.text.strip() for el in ccinterline_code_node if el.text and el.text.strip() ] - ele_res.extend(ccinterline_codes) - else: - texts = [] - # 使用 itertext() 遍历所有文本片段 - for text_segment in ele_item.itertext(): - # 统一处理文本:去空白 + 替换字面 \n - cleaned_text = text_segment.strip().replace('\\n', '') - if cleaned_text: # 过滤空字符串 - texts.append(cleaned_text) - ele_res.extend(texts) - return ele_res + result.extend(codes) + + return result def __simplify_td_th_content(self, elem: HtmlElement) -> None: """简化
内容,仅保留文本内容.""" diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_include_math_p.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_include_math_p.html new file mode 100644 index 00000000..257b0bac --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_include_math_p.html @@ -0,0 +1,2326 @@ + + + + +factoring - Is $83^{27} +1 $ a prime number? - Mathematics Stack Exchange + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ + + + + + + + +
+ + + + + +
+ + +
+ + +
+
+ +
+ Mathematics Stack Exchange is a question and answer site for people studying math at any level and professionals in related fields. It's 100% free, no registration required. +
+
+ Sign up +
+
+ Here's how it works: +
    +
  1. Anybody can ask a question +
  2. +
  3. Anybody can answer +
  4. +
  5. The best answers are voted up and rise to the top +
  6. +
+
+
+
+ +
+ +
+ + + +
+ + + + + + + + + + + +
+ + +
+ + up vote + 17 + down vote + + favorite +
5
+ + +
+ +
+
+
+ +

I'm having problems with exercises on proving whether or not a given number is prime. Is $83^{27} + 1$ prime?

+
+ + + + + + + +
+
share|cite|improve this question
+
+ + +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+ 4 + +   +
+
+
+ Someone else points out that it can't be prime because it's even. That's probably the quickest way. May answer shows how to factor it (so it can't be prime) by a method that would work just as well if it had been $84^{27}+1$. + – Michael Hardy + Aug 2 '13 at 22:11 +
+
+ + + + + + + +
+ 5 + +   +
+
+
+ Wolfram Alpha says that $83^{27}+1= 2^2×3^4×7×109×757×2269×9613×49339×2208799×14685985270709080390792801$. Perhaps it's fun to try to prove that 3 and 7 are factors. + – lhf + Aug 2 '13 at 22:13 + +
+
+ + + + + + + +
+ 51 + +   +
+
+
+ The number is EVEN! + – Ali + Aug 3 '13 at 5:37 +
+
+ + + + + + + +
+ 3 + +   +
+
+
+ @Joseph It is "well-known" and not too hard to prove that if $b^n+1$ is prime for some integer $b>1$ then $n$ has to be zero or a power of two. And 27 is neither zero nor a power of two. Search for Generalized Fermat Prime to find a proof, or do the proof yourself. + – Jeppe Stig Nielsen + Aug 3 '13 at 6:48 + +
+
+ + + + + + + +
+ 5 + +   +
+
+
+ If it is for a test, the simple answer is that you don't have the tools to prove it prime, so it must be composite. + – Ross Millikan + Aug 3 '13 at 13:04 +
+
+
+ + +
+
+ +
+ + +
+
+

+ 9 Answers + 9 +

+ +
+
+ + + + + + +
+ + + + + + + + + + + + +
+ + +
+ + up vote + 68 + down vote + + + + accepted + +
+ +
+
+

$83$ is odd, so is any power of $83$. Hence $83^{27}+1$ is even, but the only even prime number is $2$ and this number is not $2$.

+ +

More generally, if $a,k\in\mathbb N$ and $k$ is odd, then +$$a^k+1\equiv (-1)^k+1\equiv 0\pmod{a+1}$$ +So $a+1\mid a^k+1$. In this case this yields $84=2^2\cdot 3\cdot 7$ as divisor.

+
+ + + + + + + + + +
+
share|cite|improve this answer
+ + + + +
+
+
+ + + + + + + + + + +
+ + + + + + + +
+ 3 + +   +
+
+
+ Your last statement can also be seen by geometric series: $(1+x+\cdots+x^{k-1})(x-1)=x^k-1$. For $k$ odd, substitute $x=-a$ and cancel out the minuses signs on each side to get the factor $(a+1)$ on the left with $a^k+1$ on the right. + – nayrb + Aug 2 '13 at 21:52 + +
+
+
+ + +
+
+ + + +
+ + + + + + + + + + + + +
+ + +
+ + up vote + 2 + down vote + + + + +
+ +
+
+

Let's ask WolframAlpha!

+ +
+

PrimeQ[83^27 + 1]

+
+ + + +
+

is $6\,532\,937\,361\,590\,551\,025\,727\,805\,459\,013\,652\,074\,798\,022\,177\,030\,828$ a prime number?

+ +

$83^{27} + 1$ is not a prime number

+ +

$2^2 \times 3^4 \times 7 \times 109 \times 757 \times 2269 \times 9613 \times 49339 \times 2208799 \times 14685985270709080390792801 \space\text{(14 prime factors, 10 distinct)}$

+
+ +
+ +

However, using basic knowledge that an odd times an odd is always an odd ($3 \times 3 = 9$), we see that $83$ (an odd number) raised to any power is an odd number. Then we add one to it and get an even number.

+ +

Being even (and obviously not equal to $2$), the definition of a prime tells us that the number is not prime because it is divisible by $2$ (my words):

+ +
+

prime (noun):

+ +
    +
  1. Any natural number, greater than $1$, that, when divided by any natural number, greater than $1$, other than itself or $1$ does not result in a natural number.
  2. +
  3. Any "natural number greater than $1$ that has no positive divisors other than $1$ and itself." (Wikipedia article "prime number")
  4. +
+
+
+ + + + + + + + + +
+
share|cite|improve this answer
+ + + + +
+
+
+ + + + + +
+
+ + +
+
+ + + +
+ + + + + + + + + + + + +
+ + +
+ + up vote + 2 + down vote + + + + +
+ +
+
+

well it is divisible by $84$ and in general $\forall a,m\in\mathbb {N}$ we have +$(a+1)\mid (a^{2m+1}+1)$ So....

+
+ + + + + + + + +
+
share|cite|improve this answer
+ + + +
+
+
+ + + + + +
+
+ + +
+
+ + + +
+ + + + + + + + + + + + +
+ + +
+ + up vote + 4 + down vote + + + + +
+ +
+
+

The only prime numbers of the form $a^x+b^x$, occur when $x$ is a power of two. This does not guarantee a prime, but if $x$ is not a power of $2$, then the number has algebraic factors.

+ +

In practice, there is an algebraic divisor of $a^n-b^n$, for each $m$ that divides $n$. For the equation $a^n+b^n$, one would look for divisors of $2n$ that don't divide $n$. Inthe question we have $n=27$, so the divisors of 54 that don't divide 27. That is, 2, 6, 18 and 54. For powers of 2, there is only one number that divides $2n$ but not $n$.

+
+ + + + + + + + + +
+
share|cite|improve this answer
+ + + + +
+
+
+ + + + + + + + + + +
+ + + + + + + +
+    + +   +
+
+
+ Extebded answer to include this. + – wendy.krieger + Aug 3 '13 at 23:44 +
+
+
+ + +
+
+ + + +
+ + + + + + + + + + + + +
+ + +
+ + up vote + 13 + down vote + + + + +
+ +
+
+

We have a chain of divisibilities, based on the fact that $(a-b)\mid(a^n-b^n)$, +$$ +83^1-(-1)^1\mid83^3-(-1)^3\mid83^9-(-1)^9\mid83^{27}-(-1)^{27}=83^{27}+1 +$$ +Using this chain, we get, using $a^3-b^3=(a-b)(a^2+ab+b^2)$, +$$ +\begin{align} +83^{27}+1 +&=\frac{83^{27}+1}{83^9+1}\times\frac{83^9+1}{83^3+1}\times\frac{83^3+1}{83^1+1}\times\left(83^1+1\right)\\ +&=\left(83^{18}-83^9+1\right)\times\left(83^6-83^3+1\right)\times\left(83^2-83^1+1\right)\times\left(83^1+1\right)\\[9pt] +&=34946659039493167203883141969862007\times326939801583\times6807\times84 +\end{align} +$$ +Thus, $83^{27}+1$ is not prime.

+ +

Note: none of these factors are guaranteed to be prime, just factors.

+
+ + + + + + + + + +
+
share|cite|improve this answer
+ + + + +
+
+
+ + + + + + + + + + + + + + +
+ + + + + + + +
+ 3 + +   +
+
+
+ Would the downvoter care to comment? + – robjohn + Aug 3 '13 at 19:37 +
+
+ + + + + + + +
+    + +   +
+
+
+ I like it. Some might say overly rigorous for a simple problem, but helps demonstrate some deeper thinking than just noticing that it would be even. +1 + – Asimov + Sep 28 '14 at 18:32 +
+
+
+ + +
+
+ + + +
+ + + + + + + + + + + + +
+ + +
+ + up vote + 40 + down vote + + + + +
+ +
+
+

$$ +83^{27} + 1 = \Big(83^9\Big)^3 + 1 = a^3+b^3 = (a+b)(a^2-ab+b^2) = \Big(83^9+1\Big)\Big((83^9)^2-83^9+1\Big). +$$

+ +

So, no, it's not prime.

+ +

PS (added later): Some point out that it's obviously an even number, so it's not prime. But what I do above would work just as well if it were $84$ rather than $83$.

+
+ + + + + + + + + +
+
share|cite|improve this answer
+ + + + +
+
+
+ + + + + +
+
+ + +
+
+ + + +
+ + + + + + + + + + + + +
+ + +
+ + up vote + 23 + down vote + + + + +
+ +
+
+

Note that $83\equiv -1\pmod{84}$. Thus $83^{27}+1\equiv 0\pmod{84}$.

+ +

It follows that our number is divisible by all the divisors of $84$.

+ +

It is also non-prime in other ways. For let $x=83^3$. Then our number is $x^9+1$, so is divisible by $x+1$. Similarly, we could let $y=83^9$, and conclude that our number is divisible by $y+1$.

+ +

Seriously non-prime!

+
+ + + + + + + + +
+
share|cite|improve this answer
+ + + +
+
+
+ + + + + +
+
+ + +
+
+ + + +
+ + + + + + + + + + + + +
+ + +
+ + up vote + 14 + down vote + + + + +
+ +
+
+

It is obviously not prime. $83$ is odd, therefore $83^{27}$ is odd, hence $83^{27}+1$ is even and not prime.

+
+ + + + + + + + +
+
share|cite|improve this answer
+ + + +
+
+
+ + + + + +
+
+ + +
+
+ + + +
+ + + + + + + + + + + + +
+ + +
+ + up vote + 46 + down vote + + + + +
+ +
+
+

Well, it is an even number, so...

+
+ + + + + + + + +
+
share|cite|improve this answer
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+ 1 + +   +
+
+
+ I downvoted your answer on the basis that it doesn't provide the reason behind why it is even. For example, it doesn't say that since $83$ is odd, so the powers of it must also be odd and thus, odd + 1 must be even. + – Jeel Shah + Nov 5 '13 at 3:37 +
+
+ + + + + + + +
+ 6 + +   +
+
+
+ @gekkostate: (1) If you think all the answers must provide all the reasons behind them then you're going to downvote a lot around here, as many participants, probably most of the serious ones, don't think like you do. (2) The question is at a level that requires as trivial to know that powers of odd numbers are odd, and sum of odd numbers is even, so to add that to the answer seems trivial after it's been remarked that the number is odd (and thus the OP begins to think "why?" and he completes the answer by himself). Think of this, perhaps you'll realize you rush too much to do downvote... + – DonAntonio + Nov 5 '13 at 4:51 +
+
+ + + + + + + +
+ 2 + +   +
+
+
+ I clearly failed to see the intent behind your answer but I still feel that it lacks any reasoning whatsoever. Your answer is equivalent to the highest upvoted comment so maybe, that should have been enough? Also, I hardly ever downvote questions/answers (ratio of up to down is 77/5) so I didn't really rush into this (clearly, I use downvotes sparingly). I don't want to make this into something that it is not. Let's leave this at the fact that we have a difference of opinions on answers. + – Jeel Shah + Nov 5 '13 at 14:03 +
+
+ + + + + + + +
+    + +   +
+
+
+ I just love the precise nature of this answer. +1 + – Asimov + Sep 28 '14 at 18:31 +
+
+
+ + +
+
+
+

protected by Community Jun 21 '14 at 19:06 +

+

+Thank you for your interest in this question. +Because it has attracted low-quality or spam answers that had to be removed, posting an answer now requires 10 reputation on this site. +

+Would you like to answer one of these unanswered questions instead? +

+
+ + + + + +

+Not the answer you're looking for? Browse other questions tagged or ask your own question.

+
+
+ + + +
+ + +
+
+ + + + + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/test_list_empty.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/test_list_empty.html new file mode 100644 index 00000000..96aa3568 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/test_list_empty.html @@ -0,0 +1,2000 @@ + + + +Натуральное мыло ручной работы — продажа оптом от производителя, каталог 2024 из 39 разновидностей, цены + + + + + + + + + + + + + + + + +
+
+
+
+ +
+
+20 943 +Российских производителей
+
+82 716 +Товаров российского производства
+
+
+ +
+
+ + + + + +
+ +
+
+ + +
+
+ +
+
+
+
+
+
+
+
    +
  • 134970 картинка каталога «Производство России». Продукция Натуральное мыло ручной работы, г.Симферополь 2015
  • +
  • Фото 2 Натуральное мыло ручной работы, г.Симферополь 2015
  • +
  • Фото 3 Натуральное мыло ручной работы, г.Симферополь 2015
  • +
  • Фото 4 Натуральное мыло ручной работы, г.Симферополь 2015
  • +
+
+Источник фото: knk-kosmetika.ru © +
+
+

Натуральное мыло ручной работы

+ оптом от производителя, г.Симферополь +
Продажа оптом мыла ручной работы от производителя натуральной косметики «Крымская Натуральная Коллекция», г. Симферополь
+
+
+
+
+ + +
Цена от 54 
+мин. партия: 180 шт.
+Купить оптом в 1 клик
+ +
+
+
+
+
+
    +
  • + +Описание
  • +
  • + +Вопросы
  • +
  • + +Отзывы
  • +
  • + +Контакты
+
+
+

Натуральное мыло ручной работы изготавливает и реализует по оптовой цене российский производитель и поставщик косметики под брендом «Крымская Натуральная Коллекция».

+ +

В каталоге представлено 39 разновидностей мыла.

+ +

Выпускаем:

+ +
    +
  • мыло с омолаживающим эффектом;
  • +
  • антицеллюлитное мыло;
  • +
  • мыло-скраб;
  • +
  • лечебное;
  • +
  • мыло-духи (ароматизированное).
  • +
+ +

Список ассортимента, каталог и прайс-листы отправляем по запросу на электронную почту заказчиков.

+ +

Фасовка: бруски по 43 и 75 грамм. Также предлагаем поставки мыла брусками по 850 гр.

+ +

Преимущества мыла от «Крымская Натуральная Коллекция»:

+ +
    +
  • производство «холодным» способом по уникальным рецептурам с сохранением полезных веществ и микроэлементов;
  • +
  • натуральный состав без химических добавок, консервантов, синтетических красителей и отдушек;
  • +
  • не вызывает аллергии;
  • +
  • насыщение кожи полезными веществами, минералами и витаминами;
  • +
  • ароматерапевтическое действие;
  • +
  • в составе растительные экстракты, масла и травы;
  • +
  • омолаживающий и питательный эффект для кожи;
  • +
  • придание коже легкого уникального аромата;
  • +
  • тщательный контроль качества продкции;
  • +
  • не сушит кожу и не стягивает кожу, потому что имеет максимальный PH (не более 8,5).
  • +
+ +

Срок годности: 12 мес. (в упаковке).

+ +

Также мыло ручной работы от бренда «Крымская Натуральная Коллекция» подходит:

+ +
    +
  • для мыльного массажа (глубоко очищает кожу, удаляет ороговевшие слои, способствует уменьшению объемов тела и профилактике целлюлита);
  • +
  • в качестве средства для бритья;
  • +
  • для ежедневного очищения кожи (умывание и душ);
  • +
  • в качестве мыльной маски для очищения кожи;
  • +
  • для интимной гигиены (бережно очищает, уменьшает количество воспалений, не раздражает слизистые покровы).
  • +
+ +

Также выпускаем натуральные дезодоранты, соль для ванны, натуральные кремы для лица и маски-скрабы. Смотрите список продукции в каталоге компании на выставке и на официальном сайте фабрики.

Приглашаем к сотрудничеству косметически салоны и CGF-центры, косметологии, салоны красоты, магазины, дилеров и оптовых заказчиков, корпоративных клиентов.

+ +

Продажа оптом от 180 шт. (сумма полного оптового заказ от 20000 руб.).

+ +

Оплату принимаем на расчетный счет фабрики и отгружаем заказы при 100% предоплате. Доставка по России транспортными компаниями.

+ +

Прайс-лист закажите у менеджера бренда на выставке через кнопку «Заказать прайс-лист» или по телефону.

+
+
+
+
+
+
Аватар пользователя
+
+Hani +29.05.2023 12:49 +
+
+

كيف يمكن ان اشتري من الشركة بالجملة وكيف يمكن ان اتواصل مع مدير المبيعات

+Армения, г.Ереван +
+
+
+
Аватар пользователя
+
+Жанна А. +19.03.2022 08:57 +
+
+

Здравствуйте. Пишу с коммерческим предложением. Разрешаете ли продажу на маркетплейсе? Вышлите прайс,пожалуйста.

+Россия, г.Санкт-Петербург +
+
+
+
Аватар пользователя
+
+Михрим Баратова +6.02.2022 06:22 +
+
+

Здравствуйте, можно опт цену на бруски по 850гр. В Алматы отправите? Минимальный заказ?

+Казахстан, г.Алматы +
+
+
+
Аватар пользователя
+
+Татьяна Сергеевна Дернова +7.01.2022 03:00 +
+
+

Могу ли продавать вашу продукцию на маркетплейсах? Условия?

+Россия, г.Петропавловск-Камчатский +
+
+
+
Аватар пользователя
+
+Татьяна +5.10.2021 09:14 +
+
+

Здравствуйте! Скажите, пожалуйста, сколько стоит мыло в брусках?

+Россия, г.Москва +
+ + +
+Задать вопрос +
+ + +
+ +
+
+ + + + +Я соглашаюсь с политикой конфиденциальности
+ + +
+ + + +
+ + +
+
+ + + +
+ + + + + +
+ Написать отзыв +
+ + + + Ваша оценка +
+
+ + + +
+
Преимущества
+ +
Недостатки
+ +
Комментарий
+ + +
+ + + + + Я соглашаюсь с политикой конфиденциальности + +
+ + + + +
+ + + + + + +
+ + + + +
+
+
+
+Фабрика «Крымская Натуральная Коллекция»
+
+Фабрика «Крымская Натуральная Коллекция» +
+ + +3 отзыва
+
Фабрика «Крымская Натуральная Коллекция» — российский производитель и...
+
+Контактная информация + + + + + + + + + + + + + + + + +
АдресКрым, Симферополь, ул. Бородина 10
Телефон+7 (978) 875-4152
WhatsApp+7 9782866450
Электронная почтаzakaz.knk@mail.ru
Официальный сайтknk-kosmetika.ru
+Реквизиты компании + + + + + + + + + + + + + + + + + + + +
НаименованиеИП Долгая Ирина Анатольевна
ОГРН314910226700661
ИНН910200114800
Юридический адрес295000, Респ Крым, г Симферополь
Дата регистрации24.09.2014
Виды деятельности +
+Основной ОКВЭД +46.45 Торговля оптовая парфюмерными и косметическими товарами +Дополнительные ОКВЭД +46.31.2 Торговля оптовая консервированными овощами, фруктами и орехами + + + + + + + + + + + + + + + + + + +Показать весь список... +
+
+Компания на карте +
+
+
Продукция компании22 Смотреть всё +
+ + + +
+
+
+
+
+ Фото 1 ​Ароматические освежители воздуха натуральные, г.Симферополь 2015 +
+ +
+
Цена от 178,20 
+ ​Ароматические освежители воздуха натуральные +
​Ароматические освежители воздуха натуральные изготавливает и предлагает оптовым заказчикам купить по выгодной цене российский...
+ +
+
+ +
+ + +
+ + +
+
+ + + + 0 отзывов +
+
+ + +
+
+ + + + +
+
+
+
+
+ Фото 1 Натуральная хозяйственная паста с горчицей, г.Симферополь 2015 +
+ +
+
Цена от 180 
+ Натуральная хозяйственная паста с горчицей +
Российский производитель и поставщик натуральной косметической и бытовой продукциии «Крымская Натуральная...
+ +
+
+ +
+ + +
+ + +
+
+ + + + 0 отзывов +
+
+ + +
+
+ + + + +
+
+
+
+
+ Фото 1 Морская соль для ванн, г.Симферополь 2015 +
+ +
+
Цена от 90 
+ Морская соль для ванн +
Российская фабрика-поставщик натуральной косметики из Крыма Фабрика «Крымская Натуральная Коллекция» изготавливает...
+ +
+
+ +
+ + +
+ + +
+
+ + + + 0 отзывов +
+
+ + +
+
+ + + + +
+
+
+
+
+ Фото 1 Натуральный дезодорант-антиперспирант, г.Симферополь 2015 +
+ +
+
Цена от 145,20 
+ Натуральный дезодорант-антиперспирант +
Симферопольский бренд-поставщик экологичной косметики «Крымская Натуральная Коллекция» продает по оптовой цене...
+ +
+
+ +
+ + +
+ + +
+
+ + + + 0 отзывов +
+
+ + +
+
+ + + + +
+
+
+
+
+ Фото 1 Маска-скраб для лица, г.Симферополь 2015 +
+ +
+
Цена от 102 
+ Маска-скраб для лица +
Косметическая фабрика-поставщик «Крымская Натуральная Коллекция» предлагает поставки масок-скрабов для лица в...
+ +
+
+ +
+ + +
+ + +
+
+ + + + 0 отзывов +
+
+ + +
+
+ + + + +
+
+
+
+
+ Фото 1 Натуральные кремы для лица, г.Симферополь 2015 +
+ +
+
Цена от 318 
+ Натуральные кремы для лица +
Российский производитель и поставщик косметической продукции под брендом «Крымская Натуральная Коллекция»...
+ +
+
+ +
+ + +
+ + +
+
+ + + + 0 отзывов +
+
+ + +
+
+ + + + +
+
+
+
+
+ Фото 1 Косметические масляно-солевые скрабы, г.Симферополь 2015 +
+ +
+
Цена от 178,20 
+ Косметические масляно-солевые скрабы +
Фабрика-поставщик натуральной косметики «Крымская Натуральная Коллекция» представляет широкий ассортимент...
+ +
+
+ +
+ + +
+ + +
+
+ + + + 0 отзывов +
+
+ + +
+
+ + + + +
+
+
+
+
+ Фото 1 Натуральный крем для рук «Нежное прикосновение», г.Симферополь 2015 +
+ +
+
Цена от 198 
+ Натуральный крем для рук «Нежное прикосновение» +
Натуральный крем для рук «Нежное прикосновение» изготавливает и реализует по оптовой цене производитель и поставщик...
+ +
+
+ +
+ + +
+ + +
+
+ + + + 0 отзывов +
+
+ + +
+
+ + + + +
+
+
+
+
+ Фото 1 Натуральное мыло ручной работы, г.Симферополь 2015 +
+ +
+
Цена от 54 
+ Натуральное мыло ручной работы +
Натуральное мыло ручной работы изготавливает и реализует по оптовой цене российский производитель и поставщик косметики под...
+ +
+
+ +
+ + +
+ + +
+
+ + + + 0 отзывов +
+
+ + +
+
+ + + + +
+
+
+
+
+ Фото 1 Мягкое травяное мыло «Бельди», г.Симферополь 2015 +
+ +
+
Цена от 142,80 
+ Мягкое травяное мыло «Бельди» +
Мягкое травяное мыло «Бельди» изготавливает и реализует по оптовой цене российский бренд-поставщик косметики...
+ +
+
+ +
+ + +
+ + +
+
+ + + + 0 отзывов +
+
+ + +
+
+ + + + + + + + + +
+
+
+ + +
+
+
+ +Ожидайте, идёт загрузка...
+ + + +
+ + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl index 0c68d085..f41b78ec 100644 --- a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl @@ -11,4 +11,6 @@ {"track_id": "oracle_doc", "dataset_name": "test_pipeline_suit", "url": "https://docs.oracle.com/en-us/iaas/tools/java/3.57.1/com/oracle/bmc/integration/model/CustomEndpointDetails.html","data_source_category": "HTML", "path":"oracle_doc.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "table_involve_inline_code", "dataset_name": "test_table_involve_inline_code", "url": "https://docs.oracle.com/en-us/iaas/tools/java/3.57.1/com/oracle/bmc/integration/model/CustomEndpointDetails.html","data_source_category": "HTML", "path":"table_involve_inline_code.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "table_tail_text", "dataset_name": "test_table_tail_text", "url": "https://dchublists.com/?do=hublist&id=hub-975&language=en","data_source_category": "HTML", "path":"table_tail_text.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} -{"track_id": "table_elem_include_enter", "dataset_name": "table_elem_include_enter", "url": "https://fardapaper.ir/financial-development-equity-capital","data_source_category": "HTML", "path":"test_table_elem_include_enter.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file +{"track_id": "table_elem_include_enter", "dataset_name": "table_elem_include_enter", "url": "https://fardapaper.ir/financial-development-equity-capital","data_source_category": "HTML", "path":"test_table_elem_include_enter.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} +{"track_id": "list_empty", "dataset_name": "test_list_empty", "url": "https://productcenter.ru/products/27276/naturalnoie-krymskoie-mylo-ruchnoi-raboty-39-raznovidnostiei","data_source_category": "HTML", "path":"test_list_empty.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} +{"track_id": "table_include_math_p", "dataset_name": "table_include_math_p", "url": "https://math.stackexchange.com/questions/458323/is-8327-1-a-prime-number?answertab=active","data_source_category": "HTML", "path":"table_include_math_p.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index c2e5ee2b..a62b8e27 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -59,7 +59,7 @@ def setUp(self): for line in f: self.data_json.append(json.loads(line.strip())) - assert len(self.data_json) == 14 + assert len(self.data_json) == 16 # Config for HTML extraction self.config = { @@ -385,3 +385,26 @@ def test_table_element_include_enter(self): | عنوان انگلیسی | Financial development and the cost of equity capital: Evidence from China | | کلمات کلیدی : |   توسعه مالی؛ هزینه سرمایه حقوق سهامداران؛ قانون و امور مالی؛ چین | | درسهای مرتبط | حسابداری |""" in content_md + + def test_list_empty(self): + """list抽取为空,原因是嵌套的img标签没有text""" + chain = ExtractSimpleFactory.create(self.config) + self.assertIsNotNone(chain) + test_data = self.data_json[14] + # Create DataJson from test data + input_data = DataJson(test_data) + result = chain.extract(input_data) + list_type = result.get_content_list()._get_data()[0][0]["type"] + assert list_type != "list" + + def test_table_include_math_p(self): + """table包含math和其他内容""" + chain = ExtractSimpleFactory.create(self.config) + self.assertIsNotNone(chain) + test_data = self.data_json[15] + # Create DataJson from test data + input_data = DataJson(test_data) + result = chain.extract(input_data) + content_list = result.get_content_list()._get_data() + # TODO math模块需要处理下$符号但是非公式 + assert len(content_list[0]) == 17 \ No newline at end of file From 11d0968897bda9c19eeec5633207b4d211cc0b62 Mon Sep 17 00:00:00 2001 From: dt-yy Date: Thu, 6 Mar 2025 19:40:59 +0800 Subject: [PATCH 28/46] fix pylint --- llm_web_kit/extractor/html/extractor.py | 27 +++++++++---------- .../extractor/html/recognizer/table.py | 17 ++++++------ .../extractor/test_extractor_chain.py | 12 ++++----- 3 files changed, 27 insertions(+), 29 deletions(-) diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index 53565f15..a3d4a5f6 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -5,6 +5,7 @@ from overrides import override from llm_web_kit.config.cfg_reader import load_config +from llm_web_kit.exception.exception import HtmlFileExtractorException from llm_web_kit.extractor.extractor import BaseFileFormatExtractor from llm_web_kit.extractor.html.magic_html import GeneralExtractor from llm_web_kit.extractor.html.recognizer.audio import AudioRecognizer @@ -20,9 +21,7 @@ from llm_web_kit.extractor.html.recognizer.video import VideoRecognizer from llm_web_kit.input.datajson import ContentList, DataJson from llm_web_kit.libs.html_utils import element_to_html, html_to_element -from llm_web_kit.libs.logger import mylogger from llm_web_kit.libs.path_lib import get_py_pkg_root_dir -from llm_web_kit.exception.exception import HtmlFileExtractorException class HTMLPageLayoutType: @@ -256,51 +255,51 @@ def __is_valid_node(self, node: dict) -> bool: bool: 如果节点有效返回True,否则返回False """ if not node: - raise HtmlFileExtractorException("node is empty") + raise HtmlFileExtractorException('node is empty') node_type = node.get('type') valid_types = {'list', 'code', 'equation-interline', 'image', 'table', 'title', 'paragraph'} if node_type not in valid_types: - raise HtmlFileExtractorException(f"Invalid node type: {node_type}") + raise HtmlFileExtractorException(f'Invalid node type: {node_type}') # 检查列表类型的节点 if node.get('type') == 'list': items = node.get('content', {}).get('items', []) # 过滤掉None、空列表,以及只包含None或空值的列表 return bool(items) and any( - isinstance(item, (dict, list)) and bool(item) + isinstance(item, (dict, list)) and bool(item) for item in items) - #检测code类型的节点 + # 检测code类型的节点 if node.get('type') == 'code': code_content = node.get('content', {}).get('code_content') # 如果代码内容为None或空字符串,则视为无效节点 return bool(code_content and code_content.strip()) - #检测行间公式类型的节点 + # 检测行间公式类型的节点 if node.get('type') == 'equation-interline': math_content = node.get('content', {}).get('math_content') # 如果公式内容为None或空字符串,则视为无效节点 return bool(math_content and math_content.strip()) - #检测image类型的节点 + # 检测image类型的节点 if node.get('type') == 'image': content = node.get('content', {}) # 检查url、path或data字段是否至少有一个不为空 return bool(content.get('url') or content.get('path') or content.get('data')) - #检测table类型的节点 + # 检测table类型的节点 if node.get('type') == 'table': html = node.get('content', {}).get('html') # 如果表格的html内容为None或空字符串,则视为无效节点 return bool(html and html.strip()) - #检测title类型的节点 + # 检测title类型的节点 if node.get('type') == 'title': title_content = node.get('content', {}).get('title_content') # 如果标题内容为None或空字符串,则视为无效节点 return bool(title_content and title_content.strip()) - #检测段落类型的节点 + # 检测段落类型的节点 if node.get('type') == 'paragraph': content = node.get('content', []) # 检查content列表是否存在且不为空,并且至少有一个非空的内容项 return bool(content) and any( - item.get('c') and item.get('c').strip() + item.get('c') and item.get('c').strip() for item in content - ) + ) return True def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> ContentList: @@ -347,7 +346,7 @@ def __get_cc_node(self, html:str) -> (str, str): if len(nodes) == 0: raise HtmlFileExtractorException(f'html文本中没有cc标签: {html}') if len(nodes) > 3: - raise HtmlFileExtractorException(f'html文本中包含多个cc标签: {html}') + raise HtmlFileExtractorException(f'html文本中包含多个cc标签: {html}') return element_to_html(nodes[0]), nodes[0].tag def __build_extractor(self): diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 9586713f..6d7d94e8 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -3,7 +3,7 @@ from lxml.html import HtmlElement from overrides import override -import json + from llm_web_kit.exception.exception import HtmlTableRecognizerException from llm_web_kit.extractor.html.recognizer.cccode import CodeRecognizer from llm_web_kit.extractor.html.recognizer.ccmath import MathRecognizer @@ -114,7 +114,7 @@ def __is_simple_table(self, tree) -> bool: def __is_table_nested(self, element) -> int: """计算表格的嵌套层级(非表格返回0)""" - if element.tag != "table": + if element.tag != 'table': return 0 # 获取当前表格下所有的表格(包括自身) all_tables = [element] + element.xpath('.//table') @@ -148,7 +148,6 @@ def __get_table_type(self, child: HtmlElement) -> str: table_type = 'complex' return table_type - def __check_table_include_math_code(self, raw_html: HtmlElement): """检查table中的内容,包括普通文本、数学公式和代码.""" math_html = self._element_to_html(raw_html) @@ -159,7 +158,7 @@ def __check_table_include_math_code(self, raw_html: HtmlElement): code_recognizer = CodeRecognizer() code_res_parts = code_recognizer.recognize(base_url='', main_html_lst=math_res_parts, raw_html=math_html) - + result = [] for math_item in code_res_parts: ele_item = self._build_html_tree(math_item[0]) @@ -167,7 +166,7 @@ def __check_table_include_math_code(self, raw_html: HtmlElement): for text_segment in ele_item.itertext(): cleaned_text = text_segment.strip().replace('\\n', '') if cleaned_text: # 过滤空字符串 - #print("cleaned_text", cleaned_text) + # print("cleaned_text", cleaned_text) result.append(cleaned_text) # 处理行内公式 ccinline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INLINE}') @@ -176,7 +175,7 @@ def __check_table_include_math_code(self, raw_html: HtmlElement): el.text.strip() for el in ccinline_math_node if el.text and el.text.strip() ] result.extend(formulas) - + # 处理行间公式 ccinterline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INTERLINE}') if ccinterline_math_node: @@ -184,7 +183,7 @@ def __check_table_include_math_code(self, raw_html: HtmlElement): el.text.strip() for el in ccinterline_math_node if el.text and el.text.strip() ] result.extend(formulas) - + # 处理行内代码 ccinline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE_INLINE}') if ccinline_code_node: @@ -192,7 +191,7 @@ def __check_table_include_math_code(self, raw_html: HtmlElement): el.text.strip() for el in ccinline_code_node if el.text and el.text.strip() ] result.extend(codes) - + # 处理行间代码 ccinterline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE}') if ccinterline_code_node: @@ -200,7 +199,7 @@ def __check_table_include_math_code(self, raw_html: HtmlElement): el.text.strip() for el in ccinterline_code_node if el.text and el.text.strip() ] result.extend(codes) - + return result def __simplify_td_th_content(self, elem: HtmlElement) -> None: diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index a62b8e27..d322c6b1 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -387,18 +387,18 @@ def test_table_element_include_enter(self): | درسهای مرتبط | حسابداری |""" in content_md def test_list_empty(self): - """list抽取为空,原因是嵌套的img标签没有text""" + """list抽取为空,原因是嵌套的img标签没有text.""" chain = ExtractSimpleFactory.create(self.config) self.assertIsNotNone(chain) test_data = self.data_json[14] # Create DataJson from test data input_data = DataJson(test_data) result = chain.extract(input_data) - list_type = result.get_content_list()._get_data()[0][0]["type"] - assert list_type != "list" - + list_type = result.get_content_list()._get_data()[0][0]['type'] + assert list_type != 'list' + def test_table_include_math_p(self): - """table包含math和其他内容""" + """table包含math和其他内容.""" chain = ExtractSimpleFactory.create(self.config) self.assertIsNotNone(chain) test_data = self.data_json[15] @@ -407,4 +407,4 @@ def test_table_include_math_p(self): result = chain.extract(input_data) content_list = result.get_content_list()._get_data() # TODO math模块需要处理下$符号但是非公式 - assert len(content_list[0]) == 17 \ No newline at end of file + assert len(content_list[0]) == 17 From d3f995fb436fb27ec84788728a4397b55b1e4091 Mon Sep 17 00:00:00 2001 From: dt-yy Date: Fri, 7 Mar 2025 15:57:59 +0800 Subject: [PATCH 29/46] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dtable&list=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/extractor/extractor_chain.py | 1 - .../extractor/html/recognizer/cccode.py | 1 - .../extractor/html/recognizer/table.py | 143 ++++++++++-------- .../html/table_include_table_math.html | 90 +++++++++++ .../good_data/html_data_input.jsonl | 3 +- .../extractor/html/recognizer/test_table.py | 4 +- .../extractor/test_extractor_chain.py | 18 ++- 7 files changed, 189 insertions(+), 71 deletions(-) create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_include_table_math.html diff --git a/llm_web_kit/extractor/extractor_chain.py b/llm_web_kit/extractor/extractor_chain.py index 5063a2fa..d02d17e0 100644 --- a/llm_web_kit/extractor/extractor_chain.py +++ b/llm_web_kit/extractor/extractor_chain.py @@ -46,7 +46,6 @@ def extract(self, data: DataJson) -> DataJson: # Pre extractors for pre_ext in self.__pre_extractors: data = pre_ext.pre_extract(data) - # Main extractors for ext in self.__extractors: data = ext.extract(data) diff --git a/llm_web_kit/extractor/html/recognizer/cccode.py b/llm_web_kit/extractor/html/recognizer/cccode.py index d98d5a75..4a638fee 100644 --- a/llm_web_kit/extractor/html/recognizer/cccode.py +++ b/llm_web_kit/extractor/html/recognizer/cccode.py @@ -38,7 +38,6 @@ def recognize( if self.is_cc_html(html): rtn.append((html, raw_html)) continue - root: HtmlElement = html_to_element(html) while True: # 最常见: diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 6d7d94e8..28694c7f 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -5,7 +5,6 @@ from overrides import override from llm_web_kit.exception.exception import HtmlTableRecognizerException -from llm_web_kit.extractor.html.recognizer.cccode import CodeRecognizer from llm_web_kit.extractor.html.recognizer.ccmath import MathRecognizer from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) @@ -113,7 +112,7 @@ def __is_simple_table(self, tree) -> bool: return True def __is_table_nested(self, element) -> int: - """计算表格的嵌套层级(非表格返回0)""" + """计算表格的嵌套层级(非表格返回0,根据原始table判断的.""" if element.tag != 'table': return 0 # 获取当前表格下所有的表格(包括自身) @@ -151,73 +150,90 @@ def __get_table_type(self, child: HtmlElement) -> str: def __check_table_include_math_code(self, raw_html: HtmlElement): """检查table中的内容,包括普通文本、数学公式和代码.""" math_html = self._element_to_html(raw_html) - # 处理数学公式和代码 math_recognizer = MathRecognizer() - math_res_parts = math_recognizer.recognize(base_url='', main_html_lst=[(math_html, math_html)], - raw_html=math_html) - code_recognizer = CodeRecognizer() - code_res_parts = code_recognizer.recognize(base_url='', main_html_lst=math_res_parts, - raw_html=math_html) - + math_res_parts = math_recognizer.recognize( + base_url='', + main_html_lst=[(math_html, math_html)], + raw_html=math_html + ) result = [] - for math_item in code_res_parts: + for math_item in math_res_parts: ele_item = self._build_html_tree(math_item[0]) - # 处理所有文本内容 - for text_segment in ele_item.itertext(): - cleaned_text = text_segment.strip().replace('\\n', '') - if cleaned_text: # 过滤空字符串 - # print("cleaned_text", cleaned_text) - result.append(cleaned_text) - # 处理行内公式 - ccinline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INLINE}') - if ccinline_math_node: - formulas = [ - el.text.strip() for el in ccinline_math_node if el.text and el.text.strip() - ] - result.extend(formulas) - - # 处理行间公式 - ccinterline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INTERLINE}') - if ccinterline_math_node: - formulas = [ - el.text.strip() for el in ccinterline_math_node if el.text and el.text.strip() - ] - result.extend(formulas) - - # 处理行内代码 - ccinline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE_INLINE}') - if ccinline_code_node: - codes = [ - el.text.strip() for el in ccinline_code_node if el.text and el.text.strip() - ] - result.extend(codes) - - # 处理行间代码 - ccinterline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE}') - if ccinterline_code_node: - codes = [ - el.text.strip() for el in ccinterline_code_node if el.text and el.text.strip() - ] - result.extend(codes) + def process_node(node): + """处理行内公式、行间公式、行间代码、行内代码.""" + if node.tag == CCTag.CC_MATH_INLINE: + if node.text and node.text.strip(): + result.append(f'${node.text.strip()}$') + if node.tail and node.tail.strip(): + result.append(node.tail.strip()) + # 处理行间公式 + elif node.tag == CCTag.CC_MATH_INTERLINE: + if node.text and node.text.strip(): + result.append(f'$${node.text.strip()}$$') + if node.tail and node.tail.strip(): + result.append(node.tail.strip()) + # 处理行间代码 + elif node.tag == CCTag.CC_CODE: + if node.text and node.text.strip(): + result.append(f'```{node.text.strip()}```') + if node.tail and node.tail.strip(): + result.append(node.tail.strip()) + # 处理行内代码 + elif node.tag == CCTag.CC_CODE_INLINE: + if node.text and node.text.strip(): + result.append(f'`{node.text.strip()}`') + if node.tail and node.tail.strip(): + result.append(node.tail.strip()) + else: + # 提取当前节点的文本 + if node.text and node.text.strip(): + cleaned_text = node.text.strip().replace('\\n', '') + result.append(cleaned_text) + # 处理节点的tail(元素闭合后的文本) + if node.tail and node.tail.strip(): + cleaned_tail = node.tail.strip().replace('\\n', '') + result.append(cleaned_tail) + # 递归处理子节点 + for child in node: + process_node(child) + # 从根节点开始处理 + process_node(ele_item) return result - def __simplify_td_th_content(self, elem: HtmlElement) -> None: - """简化
内容,仅保留文本内容.""" + def __simplify_td_th_content(self, table_nest_level, elem: HtmlElement) -> None: + """简化 内容,保留嵌套表格结构.""" if elem.tag in ['td', 'th']: - # 简化单元格中的元素 - parse_res = list() - math_res = self.__check_table_include_math_code(elem) - parse_res.extend(math_res) - for item in list(elem.iterchildren()): - elem.remove(item) - if parse_res: - elem.text = '
'.join(parse_res) + parse_res = [] + # 检查是否存在嵌套的表格 + if table_nest_level > 1: + # 存在嵌套表格,递归处理子节点 + for child in elem.iterchildren(): + if child.tag == 'table': + # 对嵌套表格递归调用简化处理 + self.__simplify_td_th_content(table_nest_level, child) + else: + # 处理非表格元素 + math_res = self.__check_table_include_math_code(child) + parse_res.extend(math_res) + elem.remove(child) + # 将非表格内容拼接后放在表格前面 + if parse_res: + elem.text = ' '.join(parse_res) + (elem.text or '') + else: + # 没有嵌套表格,直接简化 + math_res = self.__check_table_include_math_code(elem) + parse_res.extend(math_res) + for item in list(elem.iterchildren()): + elem.remove(item) + if parse_res: + elem.text = ' '.join(parse_res) return - for child in elem.iter('td', 'th'): - self.__simplify_td_th_content(child) + # 非 td/th 元素继续递归处理 + for child in elem.iterchildren(): + self.__simplify_td_th_content(table_nest_level, child) - def __get_table_body(self, table_type, table_root): + def __get_table_body(self, table_type, table_nest_level, table_root): """获取并处理table body,返回处理后的HTML字符串。""" if table_type == 'empty': return None @@ -233,11 +249,12 @@ def __get_table_body(self, table_type, table_root): elem.text = elem.text.strip().replace('\\n', '') if elem.tail is not None: elem.tail = elem.tail.strip().replace('\\n', '') - self.__simplify_td_th_content(table_root) + # 单元格内的多标签内容进行简化,空格拼接,公式、代码识别 + self.__simplify_td_th_content(table_nest_level, table_root) # 迭代 for child in table_root.iterchildren(): if child is not None: - self.__get_table_body(table_type, child) + self.__get_table_body(table_type, table_nest_level, child) return self._element_to_html(table_root) def __do_extract_tables(self, root: HtmlElement) -> None: @@ -247,7 +264,7 @@ def __do_extract_tables(self, root: HtmlElement) -> None: table_type = self.__get_table_type(root) table_nest_level = self.__is_table_nested(root) tail_text = root.tail - table_body = self.__get_table_body(table_type, root) + table_body = self.__get_table_body(table_type, table_nest_level, root) cc_element = self._build_cc_element( CCTag.CC_TABLE, table_body, tail_text, table_type=table_type, table_nest_level=table_nest_level, html=table_raw_html) diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_include_table_math.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_include_table_math.html new file mode 100644 index 00000000..16d7b72e --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_include_table_math.html @@ -0,0 +1,90 @@ + + + + + + + + +
+ + + + + + + + + + + + + +
+

STEM 综合展示表

+
+

基础公式:

+ E = mc^2 + + + + + + + + + + + +
单位换算: + 1 \text{km} = 10^3 \text{m} + + + + + + + + + + + + +
长度质量时间
1m=10^2cm1kg=10^3g1h=3600s
+
运动学: + v = \frac{dx}{dt} + a = \frac{dv}{dt} +
+
+

编程示例:

+
console.log("Hello World")
+ + + + + + + +
+

Python:

+
print(sum(range(1,n+1)))
+
+

对应公式:

+ \sum_{i=1}^{n} i = \frac{n(n+1)}{2} + + + + + + + + + + + +
等差数列等比数列
S_n = \frac{n(a_1+a_n)}{2}S_n = a_1\frac{1-r^n}{1-r}
+
+
+
+ + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl index f41b78ec..76b39eb6 100644 --- a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl @@ -13,4 +13,5 @@ {"track_id": "table_tail_text", "dataset_name": "test_table_tail_text", "url": "https://dchublists.com/?do=hublist&id=hub-975&language=en","data_source_category": "HTML", "path":"table_tail_text.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "table_elem_include_enter", "dataset_name": "table_elem_include_enter", "url": "https://fardapaper.ir/financial-development-equity-capital","data_source_category": "HTML", "path":"test_table_elem_include_enter.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "list_empty", "dataset_name": "test_list_empty", "url": "https://productcenter.ru/products/27276/naturalnoie-krymskoie-mylo-ruchnoi-raboty-39-raznovidnostiei","data_source_category": "HTML", "path":"test_list_empty.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} -{"track_id": "table_include_math_p", "dataset_name": "table_include_math_p", "url": "https://math.stackexchange.com/questions/458323/is-8327-1-a-prime-number?answertab=active","data_source_category": "HTML", "path":"table_include_math_p.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file +{"track_id": "table_include_math_p", "dataset_name": "table_include_math_p", "url": "https://math.stackexchange.com/questions/458323/is-8327-1-a-prime-number?answertab=active","data_source_category": "HTML", "path":"table_include_math_p.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} +{"track_id": "table_include_table_math", "dataset_name": "table_include_table_math", "url": "https://test","data_source_category": "HTML", "path":"table_include_table_math.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 0608e825..afb9418f 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -89,7 +89,7 @@ def test_cc_simple_table(self): parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) assert len(parts) == 3 content = html_to_element(parts[1][0]).text_content() - assert content == r'
Рейтинг:Рейтинг<br>5.00<br>из 5 на основе опроса<br>3<br>пользователей
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84<br>₽
' + assert content == r'
Рейтинг:Рейтинг 5.00 из 5 на основе опроса 3 пользователей
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84 ₽
' def test_cc_complex_table(self): """cc跨行跨列的表格.""" @@ -155,7 +155,7 @@ def test_table_involve_equation(self): raw_html = raw_html_path.read_text(encoding='utf-8') parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') - assert complex_table_tag[0].text == r'
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution{\displaystyle \Pr \,(X=k)={\binom {n}{k}}p^{k}(1-p)^{n-k}}{\displaystyle np}{\displaystyle np(1-p)}
Geometric distribution{\displaystyle \Pr \,(X=k)=(1-p)^{k-1}p}{\displaystyle {\frac {1}{p}}}{\displaystyle {\frac {(1-p)}{p^{2}}}}
Normal distribution{\displaystyle f\left(x\mid \mu ,\sigma ^{2}\right)={\frac {1}{\sqrt {2\pi \sigma ^{2}}}}e^{-{\frac {(x-\mu )^{2}}{2\sigma ^{2}}}}}{\displaystyle \mu }{\displaystyle \sigma ^{2}}
Uniform distribution (continuous){\displaystyle f(x\mid a,b)={\begin{cases}{\frac {1}{b-a}}&{\text{for }}a\leq x\leq b,\\[3pt]0&{\text{for }}x<a{\text{ or }}x>b\end{cases}}}{\displaystyle {\frac {a+b}{2}}}{\displaystyle {\frac {(b-a)^{2}}{12}}}
Exponential distribution{\displaystyle f(x\mid \lambda )=\lambda e^{-\lambda x}}{\displaystyle {\frac {1}{\lambda }}}{\displaystyle {\frac {1}{\lambda ^{2}}}}
Poisson distribution{\displaystyle f(k\mid \lambda )={\frac {e^{-\lambda }\lambda ^{k}}{k!}}}{\displaystyle \lambda }{\displaystyle \lambda }
' + assert complex_table_tag[0].text == r'
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution${\displaystyle \Pr \,(X=k)={\binom {n}{k}}p^{k}(1-p)^{n-k}}$${\displaystyle np}$${\displaystyle np(1-p)}$
Geometric distribution${\displaystyle \Pr \,(X=k)=(1-p)^{k-1}p}$${\displaystyle {\frac {1}{p}}}$${\displaystyle {\frac {(1-p)}{p^{2}}}}$
Normal distribution${\displaystyle f\left(x\mid \mu ,\sigma ^{2}\right)={\frac {1}{\sqrt {2\pi \sigma ^{2}}}}e^{-{\frac {(x-\mu )^{2}}{2\sigma ^{2}}}}}$${\displaystyle \mu }$${\displaystyle \sigma ^{2}}$
Uniform distribution (continuous)${\displaystyle f(x\mid a,b)={\begin{cases}{\frac {1}{b-a}}&{\text{for }}a\leq x\leq b,\\[3pt]0&{\text{for }}x<a{\text{ or }}x>b\end{cases}}}$${\displaystyle {\frac {a+b}{2}}}$${\displaystyle {\frac {(b-a)^{2}}{12}}}$
Exponential distribution${\displaystyle f(x\mid \lambda )=\lambda e^{-\lambda x}}$${\displaystyle {\frac {1}{\lambda }}}$${\displaystyle {\frac {1}{\lambda ^{2}}}}$
Poisson distribution${\displaystyle f(k\mid \lambda )={\frac {e^{-\lambda }\lambda ^{k}}{k!}}}$${\displaystyle \lambda }$${\displaystyle \lambda }$
' def test_table_involve_after_code(self): """test table involve code, code被提取出去了,过滤掉空的和坏的table.""" diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index d322c6b1..7f3886eb 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -59,7 +59,7 @@ def setUp(self): for line in f: self.data_json.append(json.loads(line.strip())) - assert len(self.data_json) == 16 + assert len(self.data_json) == 17 # Config for HTML extraction self.config = { @@ -358,7 +358,7 @@ def test_table_involve_inline_code(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_list = result.get_content_list()._get_data()[0][0]['content']['html'] - assert content_list == """
FunctionDescriptionExample
print()Prints a message to the console.print("Hello, World!")
len()Returns the length of an object.len([1, 2, 3])
range()Generates a sequence of numbers.range(1, 10)
""" + assert content_list == r"""
FunctionDescriptionExample
`print()`Prints a message to the console.`print("Hello, World!")`
`len()`Returns the length of an object.`len([1, 2, 3])`
`range()`Generates a sequence of numbers.`range(1, 10)`
""" def test_table_tail_text(self): """table的tail文本保留.""" @@ -406,5 +406,17 @@ def test_table_include_math_p(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_list = result.get_content_list()._get_data() - # TODO math模块需要处理下$符号但是非公式 assert len(content_list[0]) == 17 + assert content_list[0][3]['content']['html'] == r"
up vote 17 down vote favorite 5I'm having problems with exercises on proving whether or not a given number is prime. Is $83^{27} + 1$ prime? prime-numbers factoring
" + + def test_table_include_math_p_2(self): + """table包含math和其他内容.""" + chain = ExtractSimpleFactory.create(self.config) + self.assertIsNotNone(chain) + test_data = self.data_json[16] + # Create DataJson from test data + input_data = DataJson(test_data) + result = chain.extract(input_data) + content_list = result.get_content_list()._get_data() + assert content_list[0][2]['content']['html'] == "
单位换算:$1 \\text{km} = 10^3 \\text{m}$
长度质量时间
$1m=10^2cm$$1kg=10^3g$$1h=3600s$
运动学:$v = \\frac{dx}{dt}$ $a = \\frac{dv}{dt}$
" + From 07f1de4689f7edf2323689fd864e77e80a3dc2ad Mon Sep 17 00:00:00 2001 From: dt-yy Date: Fri, 7 Mar 2025 16:06:47 +0800 Subject: [PATCH 30/46] fix pylint --- .../extractor/test_extractor_chain.py | 20 +------------------ 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 35c5cf3a..3cd7feb1 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -59,11 +59,7 @@ def setUp(self): for line in f: self.data_json.append(json.loads(line.strip())) -<<<<<<< HEAD assert len(self.data_json) == 17 -======= - assert len(self.data_json) == 14 ->>>>>>> 620f5e2739380e5091b8d096ecd6242e219e10ae # Config for HTML extraction self.config = { @@ -380,7 +376,6 @@ def test_table_tail_text(self): content_md = result.get_content_list().to_mm_md() assert '| ID: 975' in content_md -<<<<<<< HEAD def test_table_element_include_enter(self): """table的元素中间有换行.""" chain = ExtractSimpleFactory.create(self.config) @@ -428,17 +423,4 @@ def test_table_include_math_p_2(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_list = result.get_content_list()._get_data() - assert content_list[0][2]['content']['html'] == "
单位换算:$1 \\text{km} = 10^3 \\text{m}$
长度质量时间
$1m=10^2cm$$1kg=10^3g$$1h=3600s$
运动学:$v = \\frac{dx}{dt}$ $a = \\frac{dv}{dt}$
" - -======= - def test_clean_tags(self): - """测试clean_tag的preExtractor是否生效.""" - chain = ExtractSimpleFactory.create(self.config) - self.assertIsNotNone(chain) - test_data = self.data_json[13] - input_data = DataJson(test_data) - result = chain.extract(input_data) - content_md = result.get_content_list().to_mm_md() - print(content_md) - self.assertNotIn('begingroup', content_md) ->>>>>>> 620f5e2739380e5091b8d096ecd6242e219e10ae + assert content_list[0][2]['content']['html'] == '
单位换算:$1 \\text{km} = 10^3 \\text{m}$
长度质量时间
$1m=10^2cm$$1kg=10^3g$$1h=3600s$
运动学:$v = \\frac{dx}{dt}$ $a = \\frac{dv}{dt}$
' From dbe26d620db9a724cb844229c3b19a3e6f2da606 Mon Sep 17 00:00:00 2001 From: dt-yy Date: Fri, 7 Mar 2025 16:19:55 +0800 Subject: [PATCH 31/46] =?UTF-8?q?=E8=A7=A3=E5=86=B3list=E5=92=8Ctable?= =?UTF-8?q?=E7=AD=89=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../good_data/html_data_input.jsonl | 5 +---- tests/llm_web_kit/extractor/test_extractor_chain.py | 12 +++++++++++- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl index 1dd414bd..6c191184 100644 --- a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl @@ -11,11 +11,8 @@ {"track_id": "oracle_doc", "dataset_name": "test_pipeline_suit", "url": "https://docs.oracle.com/en-us/iaas/tools/java/3.57.1/com/oracle/bmc/integration/model/CustomEndpointDetails.html","data_source_category": "HTML", "path":"oracle_doc.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "table_involve_inline_code", "dataset_name": "test_table_involve_inline_code", "url": "https://docs.oracle.com/en-us/iaas/tools/java/3.57.1/com/oracle/bmc/integration/model/CustomEndpointDetails.html","data_source_category": "HTML", "path":"table_involve_inline_code.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "table_tail_text", "dataset_name": "test_table_tail_text", "url": "https://dchublists.com/?do=hublist&id=hub-975&language=en","data_source_category": "HTML", "path":"table_tail_text.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} -<<<<<<< HEAD {"track_id": "table_elem_include_enter", "dataset_name": "table_elem_include_enter", "url": "https://fardapaper.ir/financial-development-equity-capital","data_source_category": "HTML", "path":"test_table_elem_include_enter.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "list_empty", "dataset_name": "test_list_empty", "url": "https://productcenter.ru/products/27276/naturalnoie-krymskoie-mylo-ruchnoi-raboty-39-raznovidnostiei","data_source_category": "HTML", "path":"test_list_empty.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "table_include_math_p", "dataset_name": "table_include_math_p", "url": "https://math.stackexchange.com/questions/458323/is-8327-1-a-prime-number?answertab=active","data_source_category": "HTML", "path":"table_include_math_p.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "table_include_table_math", "dataset_name": "table_include_table_math", "url": "https://test","data_source_category": "HTML", "path":"table_include_table_math.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} -======= -{"track_id": "test_clean_tags", "dataset_name": "test_pipeline_suit", "url": "https://math.stackexchange.com/questions/4082284/solving-for-vector-contained-in-a-diagonal-matrix","data_source_category": "HTML", "path":"test_clean_tags.html", "file_bytes": 1000, "page_layout_type":"forum", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} ->>>>>>> 620f5e2739380e5091b8d096ecd6242e219e10ae +{"track_id": "test_clean_tags", "dataset_name": "test_pipeline_suit", "url": "https://math.stackexchange.com/questions/4082284/solving-for-vector-contained-in-a-diagonal-matrix","data_source_category": "HTML", "path":"test_clean_tags.html", "file_bytes": 1000, "page_layout_type":"forum", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 3cd7feb1..40f9c9a5 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -59,7 +59,7 @@ def setUp(self): for line in f: self.data_json.append(json.loads(line.strip())) - assert len(self.data_json) == 17 + assert len(self.data_json) == 18 # Config for HTML extraction self.config = { @@ -424,3 +424,13 @@ def test_table_include_math_p_2(self): result = chain.extract(input_data) content_list = result.get_content_list()._get_data() assert content_list[0][2]['content']['html'] == '
单位换算:$1 \\text{km} = 10^3 \\text{m}$
长度质量时间
$1m=10^2cm$$1kg=10^3g$$1h=3600s$
运动学:$v = \\frac{dx}{dt}$ $a = \\frac{dv}{dt}$
' + + def test_clean_tags(self): + """测试clean_tag的preExtractor是否生效.""" + chain = ExtractSimpleFactory.create(self.config) + self.assertIsNotNone(chain) + test_data = self.data_json[17] + input_data = DataJson(test_data) + result = chain.extract(input_data) + content_md = result.get_content_list().to_mm_md() + self.assertNotIn('begingroup', content_md) From 45071b35441a657ea989a68c0ca51e47b338dc5b Mon Sep 17 00:00:00 2001 From: dt-yy Date: Mon, 10 Mar 2025 11:47:19 +0800 Subject: [PATCH 32/46] add list test --- llm_web_kit/extractor/html/recognizer/list.py | 49 ++++++++++++++----- .../good_data/html/list_nest_three.html | 30 ++++++++++++ .../good_data/html_data_input.jsonl | 3 +- .../extractor/test_extractor_chain.py | 15 +++++- 4 files changed, 83 insertions(+), 14 deletions(-) create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/list_nest_three.html diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index d564d41e..537571a0 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -3,7 +3,7 @@ from lxml.etree import _Element as HtmlElement from overrides import override - +from llm_web_kit.exception.exception import HtmlListRecognizerException from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType @@ -22,13 +22,14 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm Returns: """ - ordered, content_list, _ = self.__get_attribute(parsed_content) + ordered, content_list, _, list_nest_level= self.__get_attribute(parsed_content) ele_node = { 'type': DocElementType.LIST, 'raw_content': raw_html_segment, 'content': { 'items': content_list, - 'ordered': ordered + 'ordered': ordered, + 'list_nest_level': list_nest_level } } @@ -51,6 +52,7 @@ def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_htm if self.is_cc_html(html): new_html_lst.append((html, raw_html)) else: + print(f"0000000html: {html}") lst = self._extract_list(html) new_html_lst.extend(lst) return new_html_lst @@ -148,13 +150,36 @@ def __extract_list_element(self, ele: HtmlElement) -> tuple[int, bool, list[list return list_nest_level, is_ordered, content_list, raw_html, tail_text def __get_list_type(self, list_ele:HtmlElement) -> int: - """获取list嵌套的类型.""" - if list_ele.tag not in ['ul', 'ol', 'dl', 'menu', 'dir']: - return 0 - ancestor_count = list_ele.xpath('count(ancestor::ul | ancestor::ol)') - # 层级 = 祖先列表数量 + 自身(1层) - return int(ancestor_count) + 1 + """获取list嵌套的层级。 + + 计算一个列表元素的最大嵌套深度,通过递归遍历所有子元素。 + 例如: + - 没有嵌套的列表返回1 + - 有一层嵌套的列表返回2 + - 有两层嵌套的列表返回3 + + Args: + list_ele: 列表HTML元素 + Returns: + int: 列表的最大嵌套深度 + """ + list_type = ['ul', 'ol', 'dl', 'menu', 'dir'] + + def get_max_depth(element): + max_child_depth = 0 + for child in element.iterchildren(): + if child.tag in list_type: + # 找到嵌套列表,其深度至少为1 + child_depth = 1 + get_max_depth(child) + max_child_depth = max(max_child_depth, child_depth) + else: + # 对非列表元素递归检查其子元素 + child_depth = get_max_depth(child) + max_child_depth = max(max_child_depth, child_depth) + return max_child_depth + return get_max_depth(list_ele) + 1 + def __extract_list_item_text(self, root:HtmlElement) -> list[list]: """提取列表项的文本. 列表项里的文本的分段策略采用最简单的方式: @@ -208,7 +233,7 @@ def __get_attribute(self, html:str) -> Tuple[bool, dict, str]: ordered = ele.attrib.get('ordered', 'False') in ['True', 'true'] content_list = json.loads(ele.text) raw_html = ele.attrib.get('html') - return ordered, content_list, raw_html + list_nest_level = ele.attrib.get('list_nest_level', 0) + return ordered, content_list, raw_html, list_nest_level else: - # TODO 抛出异常, 需要自定义 - raise ValueError(f'{html}中没有cctitle标签') + raise HtmlListRecognizerException(f'{html}中没有cctitle标签') diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/list_nest_three.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/list_nest_three.html new file mode 100644 index 00000000..018a85ab --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/list_nest_three.html @@ -0,0 +1,30 @@ + +
+
外层列表项
+
+ +
    +
  1. 第二层列表项 + +
      +
    • 第三层列表项 1
    • +
    • 第三层列表项 2
    • +
    +
  2. +
  3. 第二层其他项
  4. +
+
+ +
外层另一个列表项
+
+ + +
  • 第二层菜单项 + + +
  • 第三层目录项
  • +
    +
  • +
    +
    +
    \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl index 6c191184..857adf62 100644 --- a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl @@ -15,4 +15,5 @@ {"track_id": "list_empty", "dataset_name": "test_list_empty", "url": "https://productcenter.ru/products/27276/naturalnoie-krymskoie-mylo-ruchnoi-raboty-39-raznovidnostiei","data_source_category": "HTML", "path":"test_list_empty.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "table_include_math_p", "dataset_name": "table_include_math_p", "url": "https://math.stackexchange.com/questions/458323/is-8327-1-a-prime-number?answertab=active","data_source_category": "HTML", "path":"table_include_math_p.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "table_include_table_math", "dataset_name": "table_include_table_math", "url": "https://test","data_source_category": "HTML", "path":"table_include_table_math.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} -{"track_id": "test_clean_tags", "dataset_name": "test_pipeline_suit", "url": "https://math.stackexchange.com/questions/4082284/solving-for-vector-contained-in-a-diagonal-matrix","data_source_category": "HTML", "path":"test_clean_tags.html", "file_bytes": 1000, "page_layout_type":"forum", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file +{"track_id": "test_clean_tags", "dataset_name": "test_pipeline_suit", "url": "https://math.stackexchange.com/questions/4082284/solving-for-vector-contained-in-a-diagonal-matrix","data_source_category": "HTML", "path":"test_clean_tags.html", "file_bytes": 1000, "page_layout_type":"forum", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} +{"track_id": "list_nest_three", "dataset_name": "list_nest_three", "url": "http://test.com","data_source_category": "HTML", "path":"list_nest_three.html", "file_bytes": 1000, "page_layout_type":"forum", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 40f9c9a5..a6a7dcd7 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -59,7 +59,7 @@ def setUp(self): for line in f: self.data_json.append(json.loads(line.strip())) - assert len(self.data_json) == 18 + assert len(self.data_json) == 19 # Config for HTML extraction self.config = { @@ -434,3 +434,16 @@ def test_clean_tags(self): result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() self.assertNotIn('begingroup', content_md) + + def test_list_nest_three(self): + """测试列表嵌套三层.""" + chain = ExtractSimpleFactory.create(self.config) + self.assertIsNotNone(chain) + test_data = self.data_json[18] + input_data = DataJson(test_data) + result = chain.extract(input_data) + result_content_list = result.get_content_list()._get_data() + assert result_content_list[0][0]['content']['nested_level'] == 3 + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From 50468bc2e7867c3e5fbdc30c2858d0d28fcd7f23 Mon Sep 17 00:00:00 2001 From: dt-yy Date: Mon, 10 Mar 2025 11:49:00 +0800 Subject: [PATCH 33/46] add list test --- llm_web_kit/extractor/html/recognizer/list.py | 15 ++++++++------- .../llm_web_kit/extractor/test_extractor_chain.py | 3 --- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index 537571a0..c314dad8 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -3,6 +3,7 @@ from lxml.etree import _Element as HtmlElement from overrides import override + from llm_web_kit.exception.exception import HtmlListRecognizerException from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) @@ -22,7 +23,7 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm Returns: """ - ordered, content_list, _, list_nest_level= self.__get_attribute(parsed_content) + ordered, content_list, _, list_nest_level = self.__get_attribute(parsed_content) ele_node = { 'type': DocElementType.LIST, 'raw_content': raw_html_segment, @@ -52,7 +53,7 @@ def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_htm if self.is_cc_html(html): new_html_lst.append((html, raw_html)) else: - print(f"0000000html: {html}") + print(f'0000000html: {html}') lst = self._extract_list(html) new_html_lst.extend(lst) return new_html_lst @@ -151,13 +152,13 @@ def __extract_list_element(self, ele: HtmlElement) -> tuple[int, bool, list[list def __get_list_type(self, list_ele:HtmlElement) -> int: """获取list嵌套的层级。 - + 计算一个列表元素的最大嵌套深度,通过递归遍历所有子元素。 例如: - 没有嵌套的列表返回1 - 有一层嵌套的列表返回2 - 有两层嵌套的列表返回3 - + Args: list_ele: 列表HTML元素 @@ -165,7 +166,7 @@ def __get_list_type(self, list_ele:HtmlElement) -> int: int: 列表的最大嵌套深度 """ list_type = ['ul', 'ol', 'dl', 'menu', 'dir'] - + def get_max_depth(element): max_child_depth = 0 for child in element.iterchildren(): @@ -179,7 +180,7 @@ def get_max_depth(element): max_child_depth = max(max_child_depth, child_depth) return max_child_depth return get_max_depth(list_ele) + 1 - + def __extract_list_item_text(self, root:HtmlElement) -> list[list]: """提取列表项的文本. 列表项里的文本的分段策略采用最简单的方式: @@ -233,7 +234,7 @@ def __get_attribute(self, html:str) -> Tuple[bool, dict, str]: ordered = ele.attrib.get('ordered', 'False') in ['True', 'true'] content_list = json.loads(ele.text) raw_html = ele.attrib.get('html') - list_nest_level = ele.attrib.get('list_nest_level', 0) + list_nest_level = ele.attrib.get('list_nest_level', 0) return ordered, content_list, raw_html, list_nest_level else: raise HtmlListRecognizerException(f'{html}中没有cctitle标签') diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index a6a7dcd7..a475d2c0 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -444,6 +444,3 @@ def test_list_nest_three(self): result = chain.extract(input_data) result_content_list = result.get_content_list()._get_data() assert result_content_list[0][0]['content']['nested_level'] == 3 - -if __name__ == "__main__": - unittest.main() \ No newline at end of file From 77532d44b375cb559f90709aa43ecf1d4b1507b3 Mon Sep 17 00:00:00 2001 From: dt-yy Date: Mon, 10 Mar 2025 12:55:46 +0800 Subject: [PATCH 34/46] add list test --- llm_web_kit/extractor/html/recognizer/list.py | 1 - tests/llm_web_kit/extractor/test_extractor_chain.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index c314dad8..7694ba1a 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -53,7 +53,6 @@ def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_htm if self.is_cc_html(html): new_html_lst.append((html, raw_html)) else: - print(f'0000000html: {html}') lst = self._extract_list(html) new_html_lst.extend(lst) return new_html_lst diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index a475d2c0..b58b6964 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -443,4 +443,4 @@ def test_list_nest_three(self): input_data = DataJson(test_data) result = chain.extract(input_data) result_content_list = result.get_content_list()._get_data() - assert result_content_list[0][0]['content']['nested_level'] == 3 + assert int(result_content_list[0][0]['content']['list_nest_level']) == 3 From b173d3c5bacb958c8acf245d2bdc60d923d43297 Mon Sep 17 00:00:00 2001 From: dt-yy Date: Thu, 13 Mar 2025 14:14:03 +0800 Subject: [PATCH 35/46] improve performence --- llm_web_kit/extractor/html/extractor.py | 46 +++++++++++------ .../extractor/html/recognizer/audio.py | 27 ++++++++-- .../html/recognizer/cc_math/common.py | 2 +- .../extractor/html/recognizer/cccode.py | 37 ++++++++------ .../extractor/html/recognizer/ccmath.py | 15 +++--- .../extractor/html/recognizer/image.py | 15 +++--- llm_web_kit/extractor/html/recognizer/list.py | 22 ++++---- .../extractor/html/recognizer/recognizer.py | 29 +++++++---- .../extractor/html/recognizer/table.py | 51 +++++++++++-------- llm_web_kit/extractor/html/recognizer/text.py | 50 +++++++++--------- .../extractor/html/recognizer/title.py | 34 +++++++------ .../extractor/html/recognizer/video.py | 5 +- llm_web_kit/input/datajson.py | 11 ++-- llm_web_kit/tools/cli.py | 1 - tests/llm_web_kit/cli_sdk/test_cli_sdk.py | 2 +- .../assets/recognizer/raw_html_attr.html | 2 +- .../extractor/html/recognizer/test_code.py | 16 +++--- .../extractor/html/recognizer/test_image.py | 5 +- .../extractor/html/recognizer/test_list.py | 5 +- .../extractor/html/recognizer/test_math.py | 25 ++++----- .../extractor/html/recognizer/test_para.py | 6 +-- .../html/recognizer/test_recognizer.py | 32 ++++++------ .../extractor/html/recognizer/test_table.py | 48 ++++++++--------- .../extractor/html/recognizer/test_title.py | 7 +-- .../extractor/test_extractor_chain.py | 1 + 25 files changed, 287 insertions(+), 207 deletions(-) diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index a3d4a5f6..e62a82c9 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -2,6 +2,7 @@ from typing import List, Tuple import commentjson as json +from lxml.html import HtmlElement from overrides import override from llm_web_kit.config.cfg_reader import load_config @@ -20,6 +21,7 @@ from llm_web_kit.extractor.html.recognizer.title import TitleRecognizer from llm_web_kit.extractor.html.recognizer.video import VideoRecognizer from llm_web_kit.input.datajson import ContentList, DataJson +from llm_web_kit.libs.doc_element_type import DocElementType from llm_web_kit.libs.html_utils import element_to_html, html_to_element from llm_web_kit.libs.path_lib import get_py_pkg_root_dir @@ -92,12 +94,19 @@ def _do_extract(self, data_json: DataJson) -> DataJson: page_layout_type:str = data_json.get('page_layout_type', HTMLPageLayoutType.LAYOUT_ARTICLE) # 默认是文章类型 main_html, method = self._extract_main_html(raw_html, base_url, page_layout_type) - parsed_html = [(main_html,raw_html)] + main_html_element = html_to_element(main_html) + # parsed_html = [(main_html,raw_html)] + parsed_html = [(main_html_element, raw_html)] + """ + for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list, + self._extract_image, + self._extract_title, self._extract_paragraph]: + parsed_html = extract_func(base_url, parsed_html, raw_html) + """ for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list, self._extract_image, self._extract_title, self._extract_paragraph]: parsed_html = extract_func(base_url, parsed_html, raw_html) - content_list:ContentList = self._export_to_content_list(base_url, parsed_html, raw_html) data_json['content_list'] = content_list @@ -119,7 +128,8 @@ def _extract_main_html(self, raw_html:str, base_url:str, page_layout_type:str) - dict_result = self.__magic_html_extractor.extract(raw_html, base_url=base_url, precision=False, html_type=page_layout_type) return dict_result['html'], dict_result['xp_num'] - def _extract_code(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: + # def _extract_code(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: + def _extract_code(self, base_url:str, html_lst:List[Tuple[HtmlElement, HtmlElement]], raw_html:str) -> List[Tuple[HtmlElement,HtmlElement]]: """从html文本中提取代码. Args: @@ -257,43 +267,43 @@ def __is_valid_node(self, node: dict) -> bool: if not node: raise HtmlFileExtractorException('node is empty') node_type = node.get('type') - valid_types = {'list', 'code', 'equation-interline', 'image', 'table', 'title', 'paragraph'} + valid_types = {DocElementType.LIST, DocElementType.CODE, DocElementType.EQUATION_INTERLINE, DocElementType.IMAGE, DocElementType.TABLE, DocElementType.IMAGE, DocElementType.PARAGRAPH} if node_type not in valid_types: raise HtmlFileExtractorException(f'Invalid node type: {node_type}') # 检查列表类型的节点 - if node.get('type') == 'list': + if node.get('type') == DocElementType.LIST: items = node.get('content', {}).get('items', []) # 过滤掉None、空列表,以及只包含None或空值的列表 return bool(items) and any( isinstance(item, (dict, list)) and bool(item) for item in items) # 检测code类型的节点 - if node.get('type') == 'code': + if node.get('type') == DocElementType.CODE: code_content = node.get('content', {}).get('code_content') # 如果代码内容为None或空字符串,则视为无效节点 return bool(code_content and code_content.strip()) # 检测行间公式类型的节点 - if node.get('type') == 'equation-interline': + if node.get('type') == DocElementType.EQUATION_INTERLINE: math_content = node.get('content', {}).get('math_content') # 如果公式内容为None或空字符串,则视为无效节点 return bool(math_content and math_content.strip()) # 检测image类型的节点 - if node.get('type') == 'image': + if node.get('type') == DocElementType.IMAGE: content = node.get('content', {}) # 检查url、path或data字段是否至少有一个不为空 return bool(content.get('url') or content.get('path') or content.get('data')) # 检测table类型的节点 - if node.get('type') == 'table': + if node.get('type') == DocElementType.TABLE: html = node.get('content', {}).get('html') # 如果表格的html内容为None或空字符串,则视为无效节点 return bool(html and html.strip()) # 检测title类型的节点 - if node.get('type') == 'title': + if node.get('type') == DocElementType.TITLE: title_content = node.get('content', {}).get('title_content') # 如果标题内容为None或空字符串,则视为无效节点 return bool(title_content and title_content.strip()) # 检测段落类型的节点 - if node.get('type') == 'paragraph': + if node.get('type') == DocElementType.PARAGRAPH: content = node.get('content', []) # 检查content列表是否存在且不为空,并且至少有一个非空的内容项 return bool(content) and any( @@ -302,7 +312,7 @@ def __is_valid_node(self, node: dict) -> bool: ) return True - def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> ContentList: + def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[HtmlElement,HtmlElement]], raw_html:str) -> ContentList: """将解析结果存入content_list格式中. Args: @@ -319,7 +329,9 @@ def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[str,str]], r ccnode_html, cc_tag = self.__get_cc_node(parsed_html) parser:BaseHTMLElementRecognizer = self.__to_content_list_mapper.get(cc_tag) if parser: - node = parser.to_content_list_node(base_url, ccnode_html, raw_html) + raw_html_str = element_to_html(raw_html) + # raw_html_str = raw_html + node = parser.to_content_list_node(base_url, ccnode_html, raw_html_str) if node and self.__is_valid_node(node): one_page.append(node) else: @@ -327,7 +339,7 @@ def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[str,str]], r content_list = ContentList([one_page]) # 对于网页来说仅有一页,如果多页,则剩下的每个都是一个论坛的回复 return content_list - def __get_cc_node(self, html:str) -> (str, str): + def __get_cc_node(self, html:HtmlElement) -> (HtmlElement, str): """获取html文本的根标签名。只获取一个,如果html文本中包含多个cc标签,则抛异常。 Args: @@ -336,7 +348,8 @@ def __get_cc_node(self, html:str) -> (str, str): Returns: str: 根标签名 """ - el = html_to_element(html) + # el = html_to_element(html) + el = html if el.tag in self.__to_content_list_mapper.keys(): return html, el.tag else: @@ -347,7 +360,8 @@ def __get_cc_node(self, html:str) -> (str, str): raise HtmlFileExtractorException(f'html文本中没有cc标签: {html}') if len(nodes) > 3: raise HtmlFileExtractorException(f'html文本中包含多个cc标签: {html}') - return element_to_html(nodes[0]), nodes[0].tag + # return element_to_html(nodes[0]), nodes[0].tag + return nodes[0], nodes[0].tag def __build_extractor(self): """ diff --git a/llm_web_kit/extractor/html/recognizer/audio.py b/llm_web_kit/extractor/html/recognizer/audio.py index 24acc343..4f3d18ae 100644 --- a/llm_web_kit/extractor/html/recognizer/audio.py +++ b/llm_web_kit/extractor/html/recognizer/audio.py @@ -1,15 +1,17 @@ from typing import List, Tuple +from lxml.html import HtmlElement from overrides import override from llm_web_kit.extractor.html.recognizer.recognizer import \ BaseHTMLElementRecognizer +from llm_web_kit.libs.doc_element_type import DocElementType class AudioRecognizer(BaseHTMLElementRecognizer): """解析音频元素.""" @override - def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: + def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement,HtmlElement]], raw_html:str) -> List[Tuple[HtmlElement,HtmlElement]]: """父类,解析音频元素. Args: @@ -22,5 +24,24 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html: raise NotImplementedError @override - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: - raise NotImplementedError + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: + """ + 把音频元素转换为content list node. + Args: + base_url: + parsed_content: + raw_html_segment: + + Returns: + + """ + node = { + 'type': DocElementType.AUDIO, + 'raw_content': parsed_content.attrib.get('html', ''), + 'content': { + 'url': parsed_content.attrib.get('url', ''), + 'path': parsed_content.attrib.get('path', ''), + 'data': parsed_content.attrib.get('data', '') + } + } + return node diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/common.py b/llm_web_kit/extractor/html/recognizer/cc_math/common.py index f98e4cd8..fe102f13 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/common.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/common.py @@ -169,7 +169,7 @@ def extract_asciimath(self, s: str) -> str: parsed = asciimath2tex.translate(s) return parsed - def get_math_render(self, html: str) -> str: + def get_math_render(self, html: HtmlElement) -> str: """获取数学公式渲染器. 示例: MathJax: diff --git a/llm_web_kit/extractor/html/recognizer/cccode.py b/llm_web_kit/extractor/html/recognizer/cccode.py index 4a638fee..33374410 100644 --- a/llm_web_kit/extractor/html/recognizer/cccode.py +++ b/llm_web_kit/extractor/html/recognizer/cccode.py @@ -7,7 +7,6 @@ tag_pre_code) from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) -from llm_web_kit.libs.html_utils import element_to_html, html_to_element class CodeRecognizer(BaseHTMLElementRecognizer): @@ -17,9 +16,9 @@ class CodeRecognizer(BaseHTMLElementRecognizer): def recognize( self, base_url: str, - main_html_lst: List[Tuple[str, str]], - raw_html: str, - ) -> List[Tuple[str, str]]: + main_html_lst: List[Tuple[HtmlElement, HtmlElement]], + raw_html: str + ) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析代码元素. Args: @@ -38,7 +37,8 @@ def recognize( if self.is_cc_html(html): rtn.append((html, raw_html)) continue - root: HtmlElement = html_to_element(html) + # root: HtmlElement = html_to_element(html) + root = html while True: # 最常见: #
    @@ -77,31 +77,36 @@ def remove_empty_code(r: HtmlElement): remove_empty_code(x) remove_empty_code(root) - - html_str: str = element_to_html(root) - - rtn.extend(BaseHTMLElementRecognizer.html_split_by_tags(html_str, CCTag.CC_CODE)) - + # html_str: str = element_to_html(root) + rtn.extend(BaseHTMLElementRecognizer.html_split_by_tags(root, CCTag.CC_CODE)) return rtn @override - def to_content_list_node(self, base_url:str, parsed_content: str, raw_html_segment:str) -> dict: - code_node: HtmlElement = html_to_element(parsed_content) + def to_content_list_node(self, base_url:str, parsed_content: HtmlElement, raw_html_segment:str) -> dict: + """ + 把代码元素转换为content list node. + Args: + base_url: + parsed_content: HtmlElement对象 + raw_html_segment: + + Returns: + """ d = { 'type': 'code', # "bbox": [], 'raw_content': raw_html_segment, - 'inline': code_node.get('inline', 'false') == 'true', + 'inline': parsed_content.get('inline', 'false') == 'true', 'content': { - 'code_content': code_node.text, + 'code_content': parsed_content.text, }, } - if lang := code_node.get('language', None): + if lang := parsed_content.get('language', None): d['content']['language'] = lang - if by := code_node.get('by', None): + if by := parsed_content.get('by', None): d['content']['by'] = by return d diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py index 41a53688..6831a6ef 100644 --- a/llm_web_kit/extractor/html/recognizer/ccmath.py +++ b/llm_web_kit/extractor/html/recognizer/ccmath.py @@ -24,7 +24,7 @@ def __init__(self): super().__init__() @override - def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_html: str) -> List[Tuple[str, str]]: + def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析数学公式元素. Args: @@ -46,7 +46,7 @@ def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_htm return result @override - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: """将content转换成content_list_node. 每种类型的html元素都有自己的content-list格式:参考 docs/specification/output_format/content_list_spec.md 例如代码的返回格式: @@ -68,7 +68,8 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm Returns: dict: content_list_node """ - tree = self._build_html_tree(parsed_content) + # tree = self._build_html_tree(parsed_content) + tree = parsed_content if tree is None: raise HtmlMathRecognizerException(f'Failed to load html: {parsed_content}') @@ -103,7 +104,7 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm else: raise HtmlMathRecognizerException(f'No ccmath element found in content: {parsed_content}') - def process_ccmath_html(self, cc_html: str, o_html: str, math_render: str) -> List[Tuple[str, str]]: + def process_ccmath_html(self, cc_html: HtmlElement, o_html: HtmlElement, math_render: str) -> List[Tuple[HtmlElement, HtmlElement]]: """处理数学公式,将外层标签修改为 ccmath. Args: @@ -114,7 +115,8 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: str) -> Li List[Tuple[str, str]]: 处理后的HTML对 """ # node是从cc_html中解析出来的lxml节点 - tree = self._build_html_tree(cc_html) + # tree = self._build_html_tree(cc_html) + tree = cc_html if tree is None: raise HtmlMathRecognizerException(f'Failed to load html: {cc_html}') @@ -159,7 +161,8 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: str) -> Li tag_common_modify.modify_tree(cm, math_render, original_html, node, parent) # 打印处理后的html # print(self._element_to_html(tree)) - return self.html_split_by_tags(self._element_to_html(tree), [CCTag.CC_MATH_INTERLINE]) + # return self.html_split_by_tags(self._element_to_html(tree), [CCTag.CC_MATH_INTERLINE]) + return self.html_split_by_tags(tree, [CCTag.CC_MATH_INTERLINE]) if __name__ == '__main__': diff --git a/llm_web_kit/extractor/html/recognizer/image.py b/llm_web_kit/extractor/html/recognizer/image.py index 7be5b862..af362110 100644 --- a/llm_web_kit/extractor/html/recognizer/image.py +++ b/llm_web_kit/extractor/html/recognizer/image.py @@ -18,7 +18,7 @@ class ImageRecognizer(BaseHTMLElementRecognizer): IMG_LABEL = ['.jpg', '.jpeg', '.png', '.gft', '.webp', '.bmp', '.svg', 'data:image', '.gif'] # '.pdf' @override - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: """将content转换成content_list_node. 每种类型的html元素都有自己的content-list格式:参考 docs/specification/output_format/content_list_spec.md 例如代码的返回格式: @@ -43,7 +43,8 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm Returns: dict: content_list_node """ - html_obj = self._build_html_tree(parsed_content) + # html_obj = self._build_html_tree(parsed_content) + html_obj = parsed_content if html_obj.tag == CCTag.CC_IMAGE: return self.__ccimg_to_content_list(raw_html_segment, html_obj) @@ -66,7 +67,7 @@ def __ccimg_to_content_list(self, raw_html_segment: str, html_obj: HtmlElement) return result @override - def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_html: str) -> List[Tuple[str, str]]: + def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析图片元素. Args: @@ -88,9 +89,10 @@ def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_htm ccimg_html.append(html_li) return ccimg_html - def __parse_html_img(self, base_url: str, html_str: Tuple[str, str]) -> List[Tuple[str, str]]: + def __parse_html_img(self, base_url: str, html_str: Tuple[HtmlElement, HtmlElement]) -> List[Tuple[HtmlElement, HtmlElement]]: """解析html,获取img标签.""" - html_obj = self._build_html_tree(html_str[0]) + # html_obj = self._build_html_tree(html_str[0]) + html_obj = html_str[0] image_related_selectors = [ '//*[contains(@class, "image-embed") or contains(@id, "image-embed")]', # 可能包含嵌入图片的自定义标签 '//*[starts-with(@src, "data:image/") and not(self::img)]', @@ -168,7 +170,8 @@ def __parse_img_elements(self, base_url: str, img_elements: HtmlElement, html_ob self._replace_element(elem, new_ccimage) if is_valid_img: - updated_html = self._element_to_html(html_obj) + # updated_html = self._element_to_html(html_obj) + updated_html = html_obj return (updated_html, img_tag) else: return (None, None) diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index 7694ba1a..2615f60e 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -1,7 +1,7 @@ import json from typing import Any, List, Tuple -from lxml.etree import _Element as HtmlElement +from lxml.html import HtmlElement from overrides import override from llm_web_kit.exception.exception import HtmlListRecognizerException @@ -13,7 +13,7 @@ class ListRecognizer(BaseHTMLElementRecognizer): """解析列表元素.""" - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: """专化为列表元素的解析. Args: @@ -23,6 +23,8 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm Returns: """ + if not isinstance(parsed_content, HtmlElement): + raise HtmlListRecognizerException(f'parsed_content 必须是 HtmlElement 类型,而不是 {type(parsed_content)}') ordered, content_list, _, list_nest_level = self.__get_attribute(parsed_content) ele_node = { 'type': DocElementType.LIST, @@ -37,7 +39,7 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm return ele_node @override - def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_html: str) -> List[Tuple[str, str]]: + def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析列表元素. Args: @@ -57,7 +59,7 @@ def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_htm new_html_lst.extend(lst) return new_html_lst - def _extract_list(self, raw_html: str) -> List[Tuple[str, str]]: + def _extract_list(self, raw_html: HtmlElement) -> List[Tuple[HtmlElement, HtmlElement]]: """提取列表元素. 不支持嵌套列表,如果有嵌套的情况,则内部列表将作为一个单独的段落,内部列表的每个列表项作为一个单独的句子,使用句号结尾。 列表在html中有以下几个标签: @@ -70,12 +72,13 @@ def _extract_list(self, raw_html: str) -> List[Tuple[str, str]]: Returns: List[Tuple[str, str]]: 列表元素, 第一个str是xxx, 第二个str是原始的html内容 """ - tree = self._build_html_tree(raw_html) + # tree = self._build_html_tree(raw_html) + tree = raw_html self.__do_extract_list(tree) # 最后切割html - new_html = self._element_to_html(tree) + # new_html = self._element_to_html(tree) + new_html = tree lst = self.html_split_by_tags(new_html, CCTag.CC_LIST) - return lst def __do_extract_list(self, root:HtmlElement) -> None: @@ -219,7 +222,7 @@ def __extract_list_item_text_recusive(el: HtmlElement) -> list[list]: return text_paragraph - def __get_attribute(self, html:str) -> Tuple[bool, dict, str]: + def __get_attribute(self, html:HtmlElement) -> Tuple[bool, dict, str]: """获取element的属性. Args: @@ -228,7 +231,8 @@ def __get_attribute(self, html:str) -> Tuple[bool, dict, str]: Returns: Tuple[str]: 第一个元素是是否有序; 第二个元素是个python list,内部是文本和行内公式,具体格式参考list的content_list定义。第三个元素是列表原始的html内容 """ - ele = self._build_html_tree(html) + # ele = self._build_html_tree(html) + ele = html if ele is not None and ele.tag == CCTag.CC_LIST: ordered = ele.attrib.get('ordered', 'False') in ['True', 'true'] content_list = json.loads(ele.text) diff --git a/llm_web_kit/extractor/html/recognizer/recognizer.py b/llm_web_kit/extractor/html/recognizer/recognizer.py index d8a4b7c8..a7dd3e13 100644 --- a/llm_web_kit/extractor/html/recognizer/recognizer.py +++ b/llm_web_kit/extractor/html/recognizer/recognizer.py @@ -28,7 +28,7 @@ class BaseHTMLElementRecognizer(ABC): """基本的元素解析类.""" @abstractmethod - def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: + def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html:str) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析html中的元素. Args: @@ -37,11 +37,12 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html: raw_html: 原始完整的html Returns: + List[Tuple[HtmlElement, HtmlElement]]: 处理后的HTML元素列表 """ raise NotImplementedError @abstractmethod - def to_content_list_node(self, base_url:str, parsed_content: str, raw_html_segment:str) -> dict: + def to_content_list_node(self, base_url:str, parsed_content: HtmlElement, raw_html_segment:str) -> dict: """将content转换成content_list_node. 每种类型的html元素都有自己的content-list格式:参考 docs/specification/output_format/content_list_spec.md 例如代码的返回格式: @@ -114,7 +115,7 @@ def _replace_element(self, element:HtmlElement, cc_element:HtmlElement) -> None: replace_element(element, cc_element) @staticmethod - def html_split_by_tags(html_segment: str, split_tag_names:str | list) -> List[Tuple[str,str]]: + def html_split_by_tags(root: HtmlElement, split_tag_names:str | list) -> List[Tuple[HtmlElement,HtmlElement]]: """根据split_tag_name将html分割成不同的部分. Args: @@ -122,7 +123,7 @@ def html_split_by_tags(html_segment: str, split_tag_names:str | list) -> List[Tu split_tag_names: str|list: 分割标签名, 例如 'p' 或者 'div' 或者 ['p', 'div'] """ copy_attri = True # 是否copy 父节点的属性 - root = html_to_element(html_segment) + # root = html_to_element(html_segment) if isinstance(split_tag_names, str): # 如果参数是str,转换成list split_tag_names = [split_tag_names] @@ -179,7 +180,8 @@ def __split_node(elem: HtmlElement): for sub_elem in elem: if sub_elem.tag in split_tag_names: # previous elements - nodes = raw_nodes = element_to_html(path[0]) + # nodes = raw_nodes = element_to_html(path[0]) + nodes = raw_nodes = path[0] if not __is_element_text_empty(path[0]): yield nodes, raw_nodes @@ -191,7 +193,11 @@ def __split_node(elem: HtmlElement): if not html_source_segment: mylogger.error(f'{sub_elem.tag} has no html attribute') # TODO raise exception - nodes, raw_nodes = element_to_html(path[0]), html_source_segment + # nodes, raw_nodes = element_to_html(path[0]), html_source_segment + if html_source_segment: + nodes, raw_nodes = path[0], html_to_element(html_source_segment) + else: + nodes, raw_nodes = path[0], None # if not __is_element_text_empty(path[0]): yield nodes, raw_nodes # 这个地方无需检查是否为空,因为这个是分割元素,必须返还 @@ -208,7 +214,8 @@ def __split_node(elem: HtmlElement): copied.tail = elem.tail if not path: - nodes = raw_nodes = element_to_html(copied) + nodes = raw_nodes = copied + # raw_nodes = element_to_html(copied) if not __is_element_text_empty(copied): yield nodes, raw_nodes @@ -216,18 +223,18 @@ def __split_node(elem: HtmlElement): return rtn @staticmethod - def is_cc_html(html: str, tag_name: str | list = None) -> bool: + def is_cc_html(el: HtmlElement, tag_name: str | list = None) -> bool: """判断html片段是否是cc标签. 判断的时候由于自定义ccmath等标签可能会含有父标签,因此要逐层判断tagname. 含有父html 完整路径的如:...,这种情况也会被识别为cc标签. - TODO 保证进来的cc标签没有父标签,只有一个根标签。 + Args: - html: str: html片段 + el: str|HtmlElement: html片段或HtmlElement对象 tag_name: str|list: cc标签,如ccmath, cccode, 如果指定了那么就只检查这几个标签是否在html里,否则检查所有cc标签 """ # cc标签是指自定义标签,例如等,输入html片段,判断是否是cc标签 - el = html_to_element(html) + # el = html_to_element(html) if el is None: return False diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 28694c7f..8fc8e9bd 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -9,6 +9,7 @@ from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) from llm_web_kit.libs.doc_element_type import DocElementType +from llm_web_kit.libs.html_utils import remove_element class TableRecognizer(BaseHTMLElementRecognizer): @@ -20,8 +21,8 @@ def __init__(self): @override def recognize(self, base_url: str, - main_html_lst: List[Tuple[str, str]], - raw_html: str) -> List[Tuple[str, str]]: + main_html_lst: List[Tuple[HtmlElement, HtmlElement]], + raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析表格元素. Args: @@ -30,6 +31,7 @@ def recognize(self, raw_html: 原始完整的html Returns: + List[Tuple[HtmlElement, HtmlElement]]: 处理后的HTML元素列表 """ final_result = list() for cc_html, o_html in main_html_lst: @@ -41,23 +43,27 @@ def recognize(self, return final_result @override - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: - if not parsed_content: - raise HtmlTableRecognizerException(f'table parsed_content{parsed_content}为空') + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: + if not isinstance(parsed_content, HtmlElement): + raise HtmlTableRecognizerException(f'parsed_content 必须是 HtmlElement 类型,而不是 {type(parsed_content)}') + table_type, table_nest_level, table_body = self.__get_attribute(parsed_content) + + # 确保 table_body 不为 None 且是字符串类型 + html_content = table_body if table_body is not None else '' + # 使用传入的 raw_html_segment 或将 parsed_content 转换为字符串 d = { 'type': DocElementType.TABLE, - # "bbox": [], 'raw_content': raw_html_segment, 'content': { - 'html': table_body, - }, + 'html': html_content, + 'is_complex': table_type, + 'table_nest_level': table_nest_level + } } - d['content']['is_complex'] = table_type - d['content']['table_nest_level'] = table_nest_level return d - def __is_contain_cc_html(self, cc_html: str) -> bool: + def __is_contain_cc_html(self, cc_html: HtmlElement) -> bool: """判断html片段是否是cc标签.""" return BaseHTMLElementRecognizer.is_cc_html(cc_html) @@ -125,11 +131,11 @@ def __is_table_nested(self, element) -> int: max_level = max(max_level, level) return max_level - def __extract_tables(self, ele: str) -> List[Tuple[str, str]]: + def __extract_tables(self, ele: HtmlElement) -> List[Tuple[HtmlElement, HtmlElement]]: """提取html中的table元素.""" - tree = self._build_html_tree(ele) + tree = ele self.__do_extract_tables(tree) - new_html = self._element_to_html(tree) + new_html = tree lst = self.html_split_by_tags(new_html, CCTag.CC_TABLE) return lst @@ -149,16 +155,18 @@ def __get_table_type(self, child: HtmlElement) -> str: def __check_table_include_math_code(self, raw_html: HtmlElement): """检查table中的内容,包括普通文本、数学公式和代码.""" - math_html = self._element_to_html(raw_html) + math_raw_html = self._element_to_html(raw_html) + math_html = raw_html math_recognizer = MathRecognizer() math_res_parts = math_recognizer.recognize( base_url='', main_html_lst=[(math_html, math_html)], - raw_html=math_html + raw_html=math_raw_html ) result = [] for math_item in math_res_parts: - ele_item = self._build_html_tree(math_item[0]) + # ele_item = self._build_html_tree(math_item[0]) + ele_item = math_item[0] def process_node(node): """处理行内公式、行间公式、行间代码、行内代码.""" @@ -216,16 +224,16 @@ def __simplify_td_th_content(self, table_nest_level, elem: HtmlElement) -> None: # 处理非表格元素 math_res = self.__check_table_include_math_code(child) parse_res.extend(math_res) - elem.remove(child) + remove_element(child) # 将非表格内容拼接后放在表格前面 if parse_res: - elem.text = ' '.join(parse_res) + (elem.text or '') + elem.text = ' '.join(parse_res) else: # 没有嵌套表格,直接简化 math_res = self.__check_table_include_math_code(elem) parse_res.extend(math_res) for item in list(elem.iterchildren()): - elem.remove(item) + remove_element(item) if parse_res: elem.text = ' '.join(parse_res) return @@ -275,7 +283,8 @@ def __do_extract_tables(self, root: HtmlElement) -> None: def __get_attribute(self, html: str) -> Tuple[bool, Any, Any]: """获取element的属性.""" - ele = self._build_html_tree(html) + # ele = self._build_html_tree(html) + ele = html if ele is not None and ele.tag == CCTag.CC_TABLE: table_type = ele.attrib.get('table_type') table_nest_level = ele.attrib.get('table_nest_level') diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py index d4e9e7c7..c106e4da 100644 --- a/llm_web_kit/extractor/html/recognizer/text.py +++ b/llm_web_kit/extractor/html/recognizer/text.py @@ -2,14 +2,14 @@ import string from typing import List, Tuple -from lxml import etree +from lxml import html from lxml.html import HtmlElement from overrides import override from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType -from llm_web_kit.libs.html_utils import element_to_html +from llm_web_kit.libs.html_utils import element_to_html, html_to_element special_symbols = [ # TODO 从文件读取 '®', # 注册商标符号 @@ -42,27 +42,28 @@ class TextParagraphRecognizer(BaseHTMLElementRecognizer): """解析文本段落元素.""" @override - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: """ 把文本段落元素转换为content list node. Args: base_url: - parsed_content: + parsed_content: 可能是字符串或HtmlElement对象 raw_html_segment: Returns: """ - el = self._build_html_tree(parsed_content) + # 如果是字符串则转换为HtmlElement,否则直接使用 + el = parsed_content node = { 'type': DocElementType.PARAGRAPH, - 'raw_content': el.attrib.get('html', ''), + 'raw_content': raw_html_segment, 'content': json.loads(el.text), } return node @override - def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: + def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement | str, HtmlElement | str]], raw_html:str) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析文本段落元素. Args: @@ -73,31 +74,34 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html: Returns: """ new_html_lst = [] - for html, raw_html in main_html_lst: - if self.is_cc_html(html): - new_html_lst.append((html, raw_html)) + for html_element, raw_html_element in main_html_lst: + # 如果是字符串则转换为 HtmlElement + # html_element = html_to_element(html) if isinstance(html, str) else html + # raw_html_element = html_to_element(raw_html) if isinstance(raw_html, str) else raw_html + if self.is_cc_html(html_element): + new_html_lst.append((html_element, raw_html_element)) else: - root_el = self._build_html_tree(html) - lst = list(self.__extract_paragraphs(root_el)) - # 然后对lst[Element, raw_html] 进行处理. 提出Element里的文字,做成<>标签 + lst = list(self.__extract_paragraphs(html_element)) new_lst = self.__to_cctext_lst(lst) new_html_lst.extend(new_lst) return new_html_lst - def __to_cctext_lst(self, lst: List[Tuple[HtmlElement, str]]) -> List[Tuple[str, str]]: + def __to_cctext_lst(self, lst: List[Tuple[HtmlElement | str, HtmlElement | str]]) -> List[Tuple[HtmlElement, HtmlElement]]: """将lst[Element, raw_html] 进行处理. 提出Element里的文字,做成<>标签. Args: - lst: List[Tuple[HtmlElement, str]]: Element和raw_html组成的列表 + lst: List[Tuple[HtmlElement | str, HtmlElement | str]]: Element和raw_html组成的列表 """ new_lst = [] for el, raw_html in lst: - para_text = self.__get_paragraph_text(el) - if para_text: - cctext_el = self._build_cc_element(CCTag.CC_TEXT, json.dumps(para_text, ensure_ascii=False, indent=4), '', html=raw_html) - cc_node_html = self._element_to_html(cctext_el) - new_lst.append((cc_node_html, raw_html)) + # 如果是字符串则转换为 HtmlElement + el_element = html_to_element(el) if isinstance(el, str) else el + raw_html_element = html_to_element(raw_html) if isinstance(raw_html, str) else raw_html + para_text = self.__get_paragraph_text(el_element) + if para_text: + cctext_el = self._build_cc_element(CCTag.CC_TEXT, json.dumps(para_text, ensure_ascii=False, indent=4), '', html=element_to_html(raw_html_element)) + new_lst.append((cctext_el, raw_html_element)) return new_lst def __combine_text(self, text1:str, text2:str, lang='en') -> str: @@ -169,7 +173,7 @@ def __extract_paragraphs(self, root: HtmlElement): 解析后的文本段落元素 """ path: List[HtmlElement] = [] - parser = etree.HTMLParser(collect_ids=False, encoding='utf-8', remove_comments=True, remove_pis=True) + parser = html.HTMLParser(collect_ids=False, encoding='utf-8', remove_comments=True, remove_pis=True) def is_contain_readable_text(text): return text.strip() if text else text @@ -224,8 +228,8 @@ def helper(elem: HtmlElement): if has_direct_text(sub_elem) or (sub_elem.tag == 'p' and has_text(sub_elem)): rebuild_path() path[-1].append(copy_helper(sub_elem)) - yield path[0], element_to_html(path[0]) - + # yield path[0], element_to_html(path[0]) + yield path[0], path[0] # detach the yielded tree rebuild_path() continue diff --git a/llm_web_kit/extractor/html/recognizer/title.py b/llm_web_kit/extractor/html/recognizer/title.py index 8f2043db..9b18bdb4 100644 --- a/llm_web_kit/extractor/html/recognizer/title.py +++ b/llm_web_kit/extractor/html/recognizer/title.py @@ -1,6 +1,7 @@ from typing import List, Tuple -from lxml.etree import _Element as HtmlElement +# from lxml.etree import _Element as HtmlElement +from lxml.html import HtmlElement from overrides import override from llm_web_kit.extractor.html.recognizer.recognizer import ( @@ -12,7 +13,7 @@ class TitleRecognizer(BaseHTMLElementRecognizer): """解析多级标题元素.""" @override - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: """将html转换成content_list_node. Args: @@ -37,8 +38,8 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm return cctitle_content_node @override - def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: - """父类,解析多级标题元素. + def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]: + """父类,解析标题元素. Args: base_url: str: 基础url @@ -46,9 +47,12 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html: raw_html: 原始完整的html Returns: + List[Tuple[HtmlElement, HtmlElement]]: 处理后的HTML元素列表 """ new_html_lst = [] for html, raw_html in main_html_lst: + if isinstance(html, str): + html = self._build_html_tree(html) if self.is_cc_html(html): new_html_lst.append((html, raw_html)) else: @@ -56,22 +60,19 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html: new_html_lst.extend(lst) return new_html_lst - def _extract_title(self, raw_html:str) -> List[Tuple[str,str]]: - """ - 提取多级标题元素 + def _extract_title(self, raw_html: HtmlElement) -> List[Tuple[HtmlElement, HtmlElement]]: + """提取多级标题元素 Args: - raw_html: + raw_html: HtmlElement对象 Returns: - List[Tuple[str,str]]: 多级标题元素, 第一个str是xxx, 第二个str是原始的html内容 - + List[Tuple[HtmlElement, HtmlElement]]: 多级标题元素列表 """ - tree = self._build_html_tree(raw_html) - self.__do_extract_title(tree) # 遍历这个tree, 找到所有h1, h2, h3, h4, h5, h6标签, 并得到其对应的原始的html片段 + tree = raw_html + self.__do_extract_title(tree) # 遍历这个tree, 找到所有h1, h2, h3, h4, h5, h6标签 # 最后切割html - new_html = self._element_to_html(tree) + new_html = tree lst = self.html_split_by_tags(new_html, CCTag.CC_TITLE) - return lst def __do_extract_title(self, root:HtmlElement) -> None: @@ -137,9 +138,10 @@ def __extract_title_text_recusive(el: HtmlElement, with_tail: bool = True) -> li return ' '.join(blk for blk in blks if blk) - def __get_attribute(self, html:str) -> Tuple[int, str]: + def __get_attribute(self, html:HtmlElement) -> Tuple[int, str]: """获取element的属性.""" - ele = self._build_html_tree(html) + # ele = self._build_html_tree(html) + ele = html # 找到cctitle标签 if ele is not None: level = ele.attrib.get('level') diff --git a/llm_web_kit/extractor/html/recognizer/video.py b/llm_web_kit/extractor/html/recognizer/video.py index 227736a1..bed7df5a 100644 --- a/llm_web_kit/extractor/html/recognizer/video.py +++ b/llm_web_kit/extractor/html/recognizer/video.py @@ -1,5 +1,6 @@ from typing import List, Tuple +from lxml.html import HtmlElement from overrides import override from llm_web_kit.extractor.html.recognizer.recognizer import \ @@ -9,7 +10,7 @@ class VideoRecognizer(BaseHTMLElementRecognizer): """解析视元素.""" @override - def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: + def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement,HtmlElement]], raw_html:str) -> List[Tuple[HtmlElement,HtmlElement]]: """父类,解析视频元素. Args: @@ -22,5 +23,5 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html: raise NotImplementedError @override - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: raise NotImplementedError diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py index 20c3842f..90517a23 100644 --- a/llm_web_kit/input/datajson.py +++ b/llm_web_kit/input/datajson.py @@ -6,7 +6,8 @@ from overrides import override from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType -from llm_web_kit.libs.html_utils import (get_element_text, html_to_element, +from llm_web_kit.libs.html_utils import (element_to_html, get_element_text, + html_to_element, html_to_markdown_table, table_cells_count) @@ -118,9 +119,11 @@ def to_main_html(self) -> str: for page in content_lst: for content_lst_node in page: raw_html = content_lst_node['raw_content'] - if raw_html: - html += raw_html - + if isinstance(raw_html, str): + html_segment = raw_html # 直接使用字符串 + else: + html_segment = element_to_html(raw_html) # 转换HtmlElement为字符串 + html += html_segment return html def to_json(self, pretty=False) -> str: diff --git a/llm_web_kit/tools/cli.py b/llm_web_kit/tools/cli.py index c2260e1f..ca8a41b4 100644 --- a/llm_web_kit/tools/cli.py +++ b/llm_web_kit/tools/cli.py @@ -58,7 +58,6 @@ def cli(input_path, output_path, debug_mode): extractor = HTMLFileFormatExtractor({}) data_e = extractor.extract(DataJson(input_data)) output_json = data_e.to_json() - if output_path: output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) diff --git a/tests/llm_web_kit/cli_sdk/test_cli_sdk.py b/tests/llm_web_kit/cli_sdk/test_cli_sdk.py index 6aad22ba..a556e0ba 100644 --- a/tests/llm_web_kit/cli_sdk/test_cli_sdk.py +++ b/tests/llm_web_kit/cli_sdk/test_cli_sdk.py @@ -73,11 +73,11 @@ def test_process_html_file_path(self, runner, json_with_file_path, tmp_path): def test_stdout_output(self, runner, json_with_html_path): """测试输出到标准输出.""" + print('json_with_html_path', json_with_html_path) result = runner.invoke(cli, ['-i', str(json_with_html_path)]) assert result.exit_code == 0 assert result.output - output_data = json.loads(result.output) assert 'content_list' in output_data assert isinstance(output_data['content_list'], list) diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/raw_html_attr.html b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/raw_html_attr.html index 5b33ffa1..e90fe9f1 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/raw_html_attr.html +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/raw_html_attr.html @@ -1,5 +1,5 @@ -E=MC^2 +E=MC^2 这是python
       
    diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_code.py b/tests/llm_web_kit/extractor/html/recognizer/test_code.py
    index 40f758c1..e5edec1c 100644
    --- a/tests/llm_web_kit/extractor/html/recognizer/test_code.py
    +++ b/tests/llm_web_kit/extractor/html/recognizer/test_code.py
    @@ -269,12 +269,12 @@ def test_code_rec(self):
                 base_url = test_case['input'][1]
                 print(base_url)
                 raw_html = raw_html_path.read_text()
    -            parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html)
    -            parts = [
    -                part[0]
    -                for part in parts
    -                if CCTag.CC_CODE in part[0] or CCTag.CC_CODE_INLINE in part[0]
    -            ]
    +            parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html)
    +            # parts = [
    +            #    part[0]
    +            #    for part in parts
    +            #    if CCTag.CC_CODE in part[0] or CCTag.CC_CODE_INLINE in part[0]
    +            # ]
                 # for part in parts:
                 #     part_el = html_to_element(part)
                 #     answer = get_element_text(part_el).strip()
    @@ -283,7 +283,7 @@ def test_code_rec(self):
                 #     print("--------------------------------------------------")
                 answers = []
                 for part in parts:
    -                part_el = html_to_element(part)
    +                part_el = part[0]
                     cccodes = part_el.xpath(f'.//{CCTag.CC_CODE}') + part_el.xpath(
                         f'.//{CCTag.CC_CODE_INLINE}'
                     )
    @@ -556,4 +556,4 @@ def test_lineno_4(self):
     
     """
             # 无须检查内容,只要不爆错就可以了
    -        _ = self.rec.recognize('', [(html, html)], html)
    +        _ = self.rec.recognize('', [(html_to_element(html), html_to_element(html))], html)
    diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_image.py b/tests/llm_web_kit/extractor/html/recognizer/test_image.py
    index ab3cd733..9f374848 100644
    --- a/tests/llm_web_kit/extractor/html/recognizer/test_image.py
    +++ b/tests/llm_web_kit/extractor/html/recognizer/test_image.py
    @@ -3,6 +3,7 @@
     
     from llm_web_kit.extractor.html.recognizer.image import ImageRecognizer
     from llm_web_kit.extractor.html.recognizer.recognizer import CCTag
    +from llm_web_kit.libs.html_utils import html_to_element
     
     TEST_CASES_HTML = [
         {
    @@ -98,7 +99,7 @@ def test_recognize(self):
                 raw_html_path = base_dir.joinpath(test_case['input'])
                 base_url = test_case['base_url']
                 raw_html = raw_html_path.read_text(encoding='utf-8')
    -            parts = self.img_recognizer.recognize(base_url, [(raw_html, raw_html)], raw_html)
    +            parts = self.img_recognizer.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html)
                 self.assertEqual(len(parts), test_case['expected'])
                 ccimg_datas = [ccimg[0] for ccimg in parts if CCTag.CC_IMAGE in ccimg[0] and 'by="svg"' not in ccimg[0]]
                 if ccimg_datas:
    @@ -109,7 +110,7 @@ def test_recognize(self):
         def test_to_content_list_node(self):
             for test_case in TEST_CC_CASE:
                 try:
    -                res = self.img_recognizer.to_content_list_node(test_case['url'], test_case['parsed_content'],
    +                res = self.img_recognizer.to_content_list_node(test_case['url'], html_to_element(test_case['parsed_content']),
                                                                    test_case['html'])
                     self.assertEqual(res, test_case['expected'])
                     self.assertEqual(res['content']['alt'], test_case['alt'])
    diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_list.py b/tests/llm_web_kit/extractor/html/recognizer/test_list.py
    index 2cc10aac..0696618f 100644
    --- a/tests/llm_web_kit/extractor/html/recognizer/test_list.py
    +++ b/tests/llm_web_kit/extractor/html/recognizer/test_list.py
    @@ -2,6 +2,7 @@
     import unittest
     
     from llm_web_kit.extractor.html.recognizer.list import ListRecognizer
    +from llm_web_kit.libs.html_utils import html_to_element
     
     
     class TestSimpleListRecognize(unittest.TestCase):
    @@ -17,10 +18,10 @@ def setUp(self):
                 self.__complex_list_content = file.read()
     
         def test_simple_list(self):
    -        html_part = self.__list_recognize.recognize('http://url.com', [(self.__simple_list_content, self.__complex_list_content)], self.__simple_list_content)
    +        html_part = self.__list_recognize.recognize('http://url.com', [(html_to_element(self.__simple_list_content), html_to_element(self.__complex_list_content))], self.__simple_list_content)
             assert len(html_part) == 6
     
         def test_complex_list(self):
             # TODO: Fix this test
    -        html_part = self.__list_recognize.recognize('http://url.com', [(self.__simple_list_content, self.__complex_list_content)], self.__complex_list_content)
    +        html_part = self.__list_recognize.recognize('http://url.com', [(html_to_element(self.__simple_list_content), html_to_element(self.__complex_list_content))], self.__complex_list_content)
             assert len(html_part) == 6
    diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py
    index 985306ee..ba569eb4 100644
    --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py
    +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py
    @@ -4,7 +4,7 @@
     from llm_web_kit.exception.exception import HtmlMathRecognizerException
     from llm_web_kit.extractor.html.recognizer.ccmath import CCMATH, MathRecognizer
     from llm_web_kit.extractor.html.recognizer.recognizer import CCTag
    -from llm_web_kit.libs.html_utils import html_to_element
    +from llm_web_kit.libs.html_utils import element_to_html, html_to_element
     
     TEST_CASES = [
         # 基本公式测试用例
    @@ -344,7 +344,7 @@ def test_math_recognizer(self):
                 with self.subTest(input=test_case['input'], raw_html=test_case['raw_html']):
                     output_html = self.math_recognizer.recognize(
                         'https://www.baidu.com',
    -                    test_case['input'],
    +                    [(html_to_element(test_case['input'][0][0]), html_to_element(test_case['input'][0][1]))],
                         test_case['raw_html']
                     )
                     print(output_html)
    @@ -352,9 +352,7 @@ def test_math_recognizer(self):
                     self.assertEqual(len(output_html), len(test_case['expected']), msg=f'result is: {len(output_html)}, expected is: {expect_len}')
                     for i in range(len(output_html)):
                         expect = test_case['expected'][i][0]
    -                    print(output_html[i][0])
    -                    print(expect)
    -                    self.assertEqual(output_html[i][0], expect, msg=f'result is: {output_html[i][0]}, expected is: {expect}')
    +                    self.assertEqual(element_to_html(output_html[i][0]), expect, msg=f'result is: {output_html[i][0]}, expected is: {expect}')
     
         def test_math_recognizer_html(self):
             for test_case in TEST_CASES_HTML:
    @@ -362,16 +360,19 @@ def test_math_recognizer_html(self):
                 # print('raw_html_path::::::::', raw_html_path)
                 base_url = test_case['base_url']
                 raw_html = raw_html_path.read_text()
    -            parts = self.math_recognizer.recognize(base_url, [(raw_html, raw_html)], raw_html)
    +            parts = self.math_recognizer.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html)
                 # print(parts)
                 # 将parts列表中第一个元素拼接保存到文件,带随机数
                 # import random
                 # with open('parts'+str(random.randint(1, 100))+".html", 'w') as f:
                 #     for part in parts:
                 #         f.write(str(part[0]))
    +            print(parts)
                 # 检查行间公式抽取正确性
    -            parts = [part[0] for part in parts if CCTag.CC_MATH_INTERLINE in part[0]]
    -            print(len(parts))
    +            new_parts = []
    +            for part in parts:
    +                new_parts.append((element_to_html(part[0]), element_to_html(part[1])))
    +            parts = [part[0] for part in new_parts if CCTag.CC_MATH_INTERLINE in part[0]]
                 expect_text = base_dir.joinpath(test_case['expected']).read_text().strip()
                 expect_formulas = [formula for formula in expect_text.split('\n') if formula]
                 self.assertEqual(len(parts), len(expect_formulas))
    @@ -390,8 +391,6 @@ def test_math_recognizer_html(self):
                 if test_case.get('expected_inline', None):
                     print('expected_inline::::::::', test_case['expected_inline'])
                     parts = [part[0] for part in parts if CCTag.CC_MATH_INLINE in part[0]]
    -                print(len(parts))
    -                print(parts)
     
         def write_to_html(self, answers, file_name):
             file_name = file_name.split('.')[0]
    @@ -405,9 +404,11 @@ def test_to_content_list_node(self):
                 with self.subTest(input=test_case['input']):
                     output_node = self.math_recognizer.to_content_list_node(
                         test_case['input'][0],
    -                    test_case['input'][1],
    +                    html_to_element(test_case['input'][1]),
                         test_case['input'][2]
                     )
    +                print('output_node::::::::', output_node)
    +                print(test_case['expected'])
                     self.assertEqual(output_node, test_case['expected'])
     
             # 测试没有ccmath标签的情况
    @@ -419,7 +420,7 @@ def test_to_content_list_node(self):
             with self.assertRaises(HtmlMathRecognizerException) as exc_info:
                 self.math_recognizer.to_content_list_node(
                     invalid_content[0],
    -                invalid_content[1],
    +                html_to_element(invalid_content[1]),
                     invalid_content[2]
                 )
             self.assertIn('No ccmath element found in content', str(exc_info.exception))
    diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_para.py b/tests/llm_web_kit/extractor/html/recognizer/test_para.py
    index 42e988bd..adb38c59 100644
    --- a/tests/llm_web_kit/extractor/html/recognizer/test_para.py
    +++ b/tests/llm_web_kit/extractor/html/recognizer/test_para.py
    @@ -20,21 +20,21 @@ def test_recognize_simple_para(self):
                 html = f.read()
     
             # 执行识别
    -        result = self.recognizer.recognize('', [(html, html)], html)
    +        result = self.recognizer.recognize('', [(html_to_element(html), html_to_element(html))], html)
     
             # 验证结果
             self.assertEqual(len(result), 2)  # 应该识别出2个段落
     
             # 验证第一个段落
             first_para = result[0][0]
    -        ccel = html_to_element(first_para)
    +        ccel = first_para
             jso = json.loads(ccel.text)
             self.assertEqual(jso[0]['c'], '质量方程')
             self.assertEqual(jso[0]['t'], 'text')
     
             # 验证第二个段落
             second_para = result[1][0]
    -        text = html_to_element(second_para).text
    +        text = second_para.text
             jso = json.loads(text)
             self.assertEqual(jso[0]['c'], '爱因斯坦的方程')
             self.assertEqual(jso[0]['t'], 'text')
    diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_recognizer.py b/tests/llm_web_kit/extractor/html/recognizer/test_recognizer.py
    index 7bedf512..86b303e6 100644
    --- a/tests/llm_web_kit/extractor/html/recognizer/test_recognizer.py
    +++ b/tests/llm_web_kit/extractor/html/recognizer/test_recognizer.py
    @@ -3,6 +3,7 @@
     
     from llm_web_kit.extractor.html.recognizer.recognizer import \
         BaseHTMLElementRecognizer
    +from llm_web_kit.libs.html_utils import element_to_html, html_to_element
     
     
     class TestBaseHTMLElementRecognizer(unittest.TestCase):
    @@ -10,51 +11,50 @@ def test_html_split_by_tags_1(self):
             with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/image.html', 'r') as file:
                 html_content = file.read()
     
    -        result = BaseHTMLElementRecognizer.html_split_by_tags(html_content, ['img'])
    +        result = BaseHTMLElementRecognizer.html_split_by_tags(html_to_element(html_content), ['img'])
             assert len(result) == 7
     
         def test_html_split_by_tags_2(self):
             with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/cccode.html', 'r') as file:
                 html_content = file.read()
     
    -        result = BaseHTMLElementRecognizer.html_split_by_tags(html_content, ['cccode'])
    +        result = BaseHTMLElementRecognizer.html_split_by_tags(html_to_element(html_content), ['cccode'])
             assert len(result) == 3
     
         def test_html_split_by_tags_3(self):
             with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/raw_html_attr.html', 'r') as file:
                 html_content = file.read()
    -
    -        result = BaseHTMLElementRecognizer.html_split_by_tags(html_content, ['ccmath'])
    +        result = BaseHTMLElementRecognizer.html_split_by_tags(html_to_element(html_content), ['ccmath'])
             assert len(result) == 2
    -        assert result[0][1] == '$E=MC^2$'
    +        assert element_to_html(result[0][1]) == '$E=MC^2$'
     
         def test_html_split_by_tags_with_parent_nodes(self):
             """测试是否能够正确带上父节点."""
             with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/with_parent_nodes.html', 'r') as file:
                 html_content = file.read()
     
    -        result_with_parent = BaseHTMLElementRecognizer.html_split_by_tags(html_content, 'cccode')
    +        result_with_parent = BaseHTMLElementRecognizer.html_split_by_tags(html_to_element(html_content), 'cccode')
             assert len(result_with_parent) == 7
    -        assert result_with_parent[0][0] == """
    + assert element_to_html(result_with_parent[0][0]) == """
    这里是text 这里是span
    """ - assert result_with_parent[2][0] == '
    print("BBBBBB")
    ' - assert result_with_parent[3][0] == """
    + assert element_to_html(result_with_parent[2][0]) == '
    print("BBBBBB")
    ' + assert element_to_html(result_with_parent[3][0]) == """
    这里是tail

    这里是div text 这里是span2

    """ - result = BaseHTMLElementRecognizer.html_split_by_tags(html_content, 'cccode') + result = BaseHTMLElementRecognizer.html_split_by_tags(html_to_element(html_content), 'cccode') assert len(result) == 7 def test_is_cctag(self): with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/iscctag.html', 'r') as file: html_content = file.read() - assert BaseHTMLElementRecognizer.is_cc_html(html_content, 'cccode') - assert BaseHTMLElementRecognizer.is_cc_html(html_content, 'ccmath') - assert BaseHTMLElementRecognizer.is_cc_html(html_content, 'ccimage') - assert not BaseHTMLElementRecognizer.is_cc_html(html_content, 'ccvideo') - assert not BaseHTMLElementRecognizer.is_cc_html(html_content, 'cctitle') - assert BaseHTMLElementRecognizer.is_cc_html(html_content, ['cccode', 'ccxxx']) + assert BaseHTMLElementRecognizer.is_cc_html(html_to_element(html_content), 'cccode') + assert BaseHTMLElementRecognizer.is_cc_html(html_to_element(html_content), 'ccmath') + assert BaseHTMLElementRecognizer.is_cc_html(html_to_element(html_content), 'ccimage') + assert not BaseHTMLElementRecognizer.is_cc_html(html_to_element(html_content), 'ccvideo') + assert not BaseHTMLElementRecognizer.is_cc_html(html_to_element(html_content), 'cctitle') + assert BaseHTMLElementRecognizer.is_cc_html(html_to_element(html_content), ['cccode', 'ccxxx']) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index afb9418f..33a3fa5f 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -45,7 +45,7 @@ def test_involve_cctale(self): raw_html_path = base_dir.joinpath(test_case['input'][0]) base_url = test_case['input'][1] raw_html = raw_html_path.read_text() - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) self.assertEqual(len(parts), 4) def test_not_involve_table(self): @@ -54,7 +54,7 @@ def test_not_involve_table(self): raw_html_path = base_dir.joinpath(test_case['input'][1]) base_url = test_case['input'][1] raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) self.assertEqual(len(parts), 1) def test_only_involve_table(self): @@ -63,9 +63,9 @@ def test_only_involve_table(self): raw_html_path = base_dir.joinpath(test_case['input'][2]) base_url = test_case['input'][1] raw_html = raw_html_path.read_text() - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) self.assertEqual(len(parts), 2) - table_body = html_to_element(parts[1][0]).text_content() + table_body = parts[1][0].text_content() assert table_body == r'
    Mrs S Hindle
    ShowCCRCC
    Driffield 5th October 2006CH. Ricksbury Royal HeroCH. Keyingham Branwell
    Manchester 16th January 2008CH. Lochbuie GeordieMerryoth Maeve
    Darlington 20th September 2009CH. Maibee Make BelieveCH. Loranka Just Like Heaven JW
    Blackpool 22nd June 2012CH. Loranka Sherrie BabyDear Magic Touch De La Fi Au Songeur
    Welsh Kennel Club 2014Brymarden Carolina SunriseCh. Wandris Evan Elp Us
    Welsh Kennel Club 2014Ch. Charnell Clematis of SalegreenCH. Byermoor Queens Maid
    ' def test_table_include_img_label(self): @@ -74,9 +74,9 @@ def test_table_include_img_label(self): raw_html_path = base_dir.joinpath(test_case['input'][6]) base_url = test_case['input'][1] raw_html = raw_html_path.read_text() - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 - simple_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}')[0] + simple_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}')[0] simple_table_type = simple_table_tag.attrib assert simple_table_type['table_type'] == 'simple' @@ -86,9 +86,9 @@ def test_cc_simple_table(self): raw_html_path = base_dir.joinpath(test_case['input'][7]) base_url = test_case['input'][8] raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 - content = html_to_element(parts[1][0]).text_content() + content = parts[1][0].text_content() assert content == r'
    Рейтинг:Рейтинг 5.00 из 5 на основе опроса 3 пользователей
    Тип товара:Препараты для омоложения
    Форма:Крем
    Объем:50 мл
    Рецепт:Отпускается без рецепта
    Способ хранения:Хранить при температуре 4-20°
    Примечание:Беречь от детей
    Оплата:Наличными/банковской картой
    Доступность в Северске:В наличии
    Доставка:2-7 Дней
    Цена:84 ₽
    ' def test_cc_complex_table(self): @@ -97,11 +97,11 @@ def test_cc_complex_table(self): raw_html_path = base_dir.joinpath(test_case['input'][8]) base_url = test_case['input'][8] raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 - content = html_to_element(parts[1][0]).text_content() + content = parts[1][0].text_content() assert content == r'
    ফেব্রুয়ারি ২০২৪
    সোমমঙ্গলবুধবৃহশুক্রশনিরবি
    « জানুয়ারি
    ১০১১
    ১২১৩১৪১৫১৬১৭১৮
    ১৯২০২১২২২৩২৪২৫
    ২৬২৭২৮২৯
    ' - table_type = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}')[0] + table_type = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}')[0] assert table_type.attrib['table_type'] == 'complex' def test_simple_complex_table(self): @@ -110,12 +110,12 @@ def test_simple_complex_table(self): raw_html_path = base_dir.joinpath(test_case['input'][3]) base_url = test_case['input'][1] raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) - simple_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}')[0] + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + simple_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}')[0] simple_table_type = simple_table_tag.attrib assert simple_table_type['table_type'] == 'simple' assert simple_table_type == {'table_type': 'simple', 'table_nest_level': '1', 'html': '\n \n \n \n \n \n \n \n \n
    12
    34
    \n\n'} - complex_table_tag = html_to_element(parts[2][0]).xpath(f'.//{CCTag.CC_TABLE}')[0] + complex_table_tag = parts[2][0].xpath(f'.//{CCTag.CC_TABLE}')[0] complex_table_type = complex_table_tag.attrib assert complex_table_type['table_type'] == 'complex' assert complex_table_type == {'table_type': 'complex', 'table_nest_level': '1', 'html': '\n \n \n \n \n \n \n \n \n \n \n \n \n \n
    123
    4
    567
    \n '} @@ -127,7 +127,7 @@ def test_table_to_content_list_node_simple(self): base_url = test_case['input'][1] raw_html = raw_html_path.read_text(encoding='utf-8') parsed_content = raw_html - result = self.rec.to_content_list_node(base_url, parsed_content, raw_html) + result = self.rec.to_content_list_node(base_url, html_to_element(parsed_content), raw_html) expect = base_dir.joinpath(test_case['expected'][0]) expect_json = expect.read_text(encoding='utf-8') assert result['type'] == json.loads(expect_json)['type'] @@ -142,7 +142,7 @@ def test_table_to_content_list_node_complex(self): raw_html_path = base_dir.joinpath(test_case['input'][5]) expect_path = base_dir.joinpath(test_case['expected'][1]) raw_html = raw_html_path.read_text(encoding='utf-8') - result = self.rec.to_content_list_node(expect_path, raw_html, raw_html) + result = self.rec.to_content_list_node(expect_path, html_to_element(raw_html), raw_html) fr = open(expect_path, 'r', encoding='utf-8') expect_result = json.loads(fr.read()) assert result == expect_result @@ -153,8 +153,8 @@ def test_table_involve_equation(self): raw_html_path = base_dir.joinpath(test_case['input'][9]) base_url = 'https://en.m.wikipedia.org/wiki/Variance' raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) - complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + complex_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}') assert complex_table_tag[0].text == r'
    Name of the probability distributionProbability distribution functionMeanVariance
    Binomial distribution${\displaystyle \Pr \,(X=k)={\binom {n}{k}}p^{k}(1-p)^{n-k}}$${\displaystyle np}$${\displaystyle np(1-p)}$
    Geometric distribution${\displaystyle \Pr \,(X=k)=(1-p)^{k-1}p}$${\displaystyle {\frac {1}{p}}}$${\displaystyle {\frac {(1-p)}{p^{2}}}}$
    Normal distribution${\displaystyle f\left(x\mid \mu ,\sigma ^{2}\right)={\frac {1}{\sqrt {2\pi \sigma ^{2}}}}e^{-{\frac {(x-\mu )^{2}}{2\sigma ^{2}}}}}$${\displaystyle \mu }$${\displaystyle \sigma ^{2}}$
    Uniform distribution (continuous)${\displaystyle f(x\mid a,b)={\begin{cases}{\frac {1}{b-a}}&{\text{for }}a\leq x\leq b,\\[3pt]0&{\text{for }}x<a{\text{ or }}x>b\end{cases}}}$${\displaystyle {\frac {a+b}{2}}}$${\displaystyle {\frac {(b-a)^{2}}{12}}}$
    Exponential distribution${\displaystyle f(x\mid \lambda )=\lambda e^{-\lambda x}}$${\displaystyle {\frac {1}{\lambda }}}$${\displaystyle {\frac {1}{\lambda ^{2}}}}$
    Poisson distribution${\displaystyle f(k\mid \lambda )={\frac {e^{-\lambda }\lambda ^{k}}{k!}}}$${\displaystyle \lambda }$${\displaystyle \lambda }$
    ' def test_table_involve_after_code(self): @@ -163,8 +163,8 @@ def test_table_involve_after_code(self): raw_html_path = base_dir.joinpath(test_case['input'][10]) base_url = 'https://en.m.wikipedia.org/wiki/Variance' raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) - assert html_to_element(parts[0][0]).xpath(f'.//{CCTag.CC_TABLE}')[0].text is None + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + assert parts[0][0].xpath(f'.//{CCTag.CC_TABLE}')[0].text is None @unittest.skip(reason='在code模块解决了table嵌套多行代码问题') def test_table_involve_code(self): @@ -173,8 +173,8 @@ def test_table_involve_code(self): raw_html_path = base_dir.joinpath(test_case['input'][11]) base_url = 'https://en.m.wikipedia.org/wiki/Variance' raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) - complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + complex_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}') expect_path = base_dir.joinpath(test_case['expected'][3]) content = open(expect_path, 'r', encoding='utf-8').read() assert complex_table_tag[0].text == content.strip('\n') @@ -186,8 +186,8 @@ def test_table_involve_complex_code(self): raw_html_path = base_dir.joinpath(test_case['input'][12]) base_url = 'https://en.m.wikipedia.org/wiki/Variance' raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) - complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + complex_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}') expect_path = base_dir.joinpath(test_case['expected'][3]) content = open(expect_path, 'r', encoding='utf-8').read() assert complex_table_tag[0].text == content.strip('\n') diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_title.py b/tests/llm_web_kit/extractor/html/recognizer/test_title.py index d3eedc2d..8cc8eeeb 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_title.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_title.py @@ -4,6 +4,7 @@ import pytest from llm_web_kit.extractor.html.recognizer.title import TitleRecognizer +from llm_web_kit.libs.html_utils import element_to_html @pytest.fixture @@ -17,9 +18,9 @@ def test_title_recognizer(title_recognizer): result = title_recognizer.recognize('http://www.baidu.com', [(html_content, html_content)], html_content) assert len(result) == 10 - assert result[0][0] == """大模型好,大模型棒1""" - assert result[6][0] == """大模型好,大模型棒5 大模型很棒""" @@ -27,5 +28,5 @@ def test_title_tails_and_levels(title_recognizer): html_content = """

    TEST:import *TEST

    Tail

    aaa

    """ result = title_recognizer.recognize('http://www.baidu.com', [(html_content, html_content)], html_content) assert len(result) == 2 - assert result[0][0] == '
    TEST: `import *` TEST
    ' + assert element_to_html(result[0][0]) == '
    TEST: `import *` TEST
    ' pass diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index b58b6964..33454e1b 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -279,6 +279,7 @@ def test_code_pre_mixed(self): # Create DataJson from test data input_data = DataJson(test_data) result = chain.extract(input_data) + # print("code_pre_mixed", result.get_content_list().to_mm_md()) self.assertIn("""``` this (DEFAULT_SERVER_NAME, DEFAULT_SERVER_PORT); ``` From e07a1192557392d35e9690e49cbff44d74391d60 Mon Sep 17 00:00:00 2001 From: dt-yy Date: Mon, 17 Mar 2025 14:24:52 +0800 Subject: [PATCH 36/46] improve performence --- llm_web_kit/extractor/html/extractor.py | 1 - .../extractor/html/recognizer/cccode.py | 1 - .../html/recognizer/code/tag_code.py | 11 ++- .../extractor/html/recognizer/recognizer.py | 26 +++-- .../extractor/html/recognizer/table.py | 92 ++++++++---------- pytest.lprof | Bin 0 -> 3600 bytes 6 files changed, 69 insertions(+), 62 deletions(-) create mode 100644 pytest.lprof diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index 62433f3d..7daf2c5b 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -128,7 +128,6 @@ def _extract_main_html(self, raw_html:str, base_url:str, page_layout_type:str) - dict_result = self.__magic_html_extractor.extract(raw_html, base_url=base_url, precision=False, html_type=page_layout_type) return dict_result['html'], dict_result['xp_num'] - # def _extract_code(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: def _extract_code(self, base_url:str, html_lst:List[Tuple[HtmlElement, HtmlElement]], raw_html:str) -> List[Tuple[HtmlElement,HtmlElement]]: """从html文本中提取代码. diff --git a/llm_web_kit/extractor/html/recognizer/cccode.py b/llm_web_kit/extractor/html/recognizer/cccode.py index 33374410..fe5744a7 100644 --- a/llm_web_kit/extractor/html/recognizer/cccode.py +++ b/llm_web_kit/extractor/html/recognizer/cccode.py @@ -11,7 +11,6 @@ class CodeRecognizer(BaseHTMLElementRecognizer): """解析代码元素.""" - @override def recognize( self, diff --git a/llm_web_kit/extractor/html/recognizer/code/tag_code.py b/llm_web_kit/extractor/html/recognizer/code/tag_code.py index 98d9aa3f..5554114e 100644 --- a/llm_web_kit/extractor/html/recognizer/code/tag_code.py +++ b/llm_web_kit/extractor/html/recognizer/code/tag_code.py @@ -10,8 +10,15 @@ def __get_html_element(root: HtmlElement, node_path: list[str]) -> HtmlElement: - path = '/'.join(node_path) - path = '/'.join(path.removeprefix('/').split('/')[1:]) + path_parts = [] + for element in node_path: + path_parts.extend(element.split('/')) + start_idx = 0 + if path_parts: + start_idx = 1 + while start_idx <= len(path_parts) and path_parts[start_idx - 1] in ('', '/'): + start_idx += 1 + path = '/'.join(path_parts[start_idx:]) if not path: return root node = root.find(path, {'og': 'http://ogp.me/ns'}) diff --git a/llm_web_kit/extractor/html/recognizer/recognizer.py b/llm_web_kit/extractor/html/recognizer/recognizer.py index ff25a7e9..6ab1a5ef 100644 --- a/llm_web_kit/extractor/html/recognizer/recognizer.py +++ b/llm_web_kit/extractor/html/recognizer/recognizer.py @@ -238,20 +238,28 @@ def is_cc_html(el: HtmlElement, tag_name: str | list = None) -> bool: el: str|HtmlElement: html片段或HtmlElement对象 tag_name: str|list: cc标签,如ccmath, cccode, 如果指定了那么就只检查这几个标签是否在html里,否则检查所有cc标签 """ - # cc标签是指自定义标签,例如等,输入html片段,判断是否是cc标签 - # el = html_to_element(html) if el is None: return False + # 默认cc标签列表 + default_tag_names = [ + CCTag.CC_CODE, CCTag.CC_MATH_INTERLINE, CCTag.CC_IMAGE, CCTag.CC_VIDEO, + CCTag.CC_AUDIO, CCTag.CC_TABLE, CCTag.CC_LIST, CCTag.CC_TEXT, CCTag.CC_TITLE + ] + + # 确定需要检查的标签集合 if tag_name: if isinstance(tag_name, str): - tag_to_check = [tag_name] + tags = {tag_name} else: - tag_to_check = tag_name + tags = set(tag_name) else: - tag_to_check = [CCTag.CC_CODE, CCTag.CC_MATH_INTERLINE, CCTag.CC_IMAGE, CCTag.CC_VIDEO, CCTag.CC_AUDIO, CCTag.CC_TABLE, CCTag.CC_LIST, CCTag.CC_TEXT, CCTag.CC_TITLE] + tags = set(default_tag_names) + + # 如果当前元素的标签匹配,直接返回True + if el.tag in tags: + return True - for tag in tag_to_check: - if el.tag == tag or el.xpath(f'.//{tag}') : - return True - return False + # 构建XPath表达式,检查子元素是否包含目标标签 + xpath_expr = ' or '.join([f'self::{tag}' for tag in tags]) + return bool(el.xpath(f'.//*[{xpath_expr}]')) diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 62ce0193..92fbaa95 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -17,6 +17,7 @@ class TableRecognizer(BaseHTMLElementRecognizer): def __init__(self): super().__init__() + self.math_recognizer = MathRecognizer() @override def recognize(self, @@ -67,43 +68,27 @@ def __is_contain_cc_html(self, cc_html: HtmlElement) -> bool: """判断html片段是否是cc标签.""" return BaseHTMLElementRecognizer.is_cc_html(cc_html) - def __is_table_empty(self, table) -> bool: - """检查表格是否为空(递归检查嵌套元素) - - :param table: lxml.html.HtmlElement 对象,表示一个 元素 - :return: 如果表格为空,返回 True;否则返回 False - """ - def is_element_empty(elem): - # 检查元素本身的文本内容 - if elem.text and elem.text.strip(): - return False - # 检查所有子元素 - for child in elem.iterchildren(): - # 如果是嵌套表格,递归检查表格是否为空 - if child.tag == 'table': - if not self.__is_table_empty(child): - return False - # 其他元素需要递归检查 - elif not is_element_empty(child): - return False - # 检查尾部文本(如 后的文本) - if elem.tail and elem.tail.strip(): - return False - return True - - # 检查所有单元格 - for cell in table.xpath('.//td | .//th'): - # 检查单元格内容 + def __is_table_empty(self, table: HtmlElement) -> bool: + """table是否为空.""" + # 合并单元格查询 + cells = table.xpath('.//td | .//th') + for cell in cells: if cell.text and cell.text.strip(): return False - # 递归检查子元素 - if not is_element_empty(cell): - return False + stack = [cell] + while stack: + elem = stack.pop() + if elem.text and elem.text.strip(): + return False + if elem.tail and elem.tail.strip(): + return False + # 添加子元素到栈中(倒序保证处理顺序) + stack.extend(reversed(elem.getchildren())) return True - def __is_simple_table(self, tree) -> bool: + def __is_simple_table(self, tree: HtmlElement) -> bool: """处理table元素,判断是是否复杂:是否包含合并单元格.""" - cells = tree.xpath('.//td') + tree.xpath('.//th') + cells = tree.xpath('.//td | .//th') for cell in cells: colspan_str = cell.get('colspan', '1') rowspan_str = cell.get('rowspan', '1') @@ -117,18 +102,28 @@ def __is_simple_table(self, tree) -> bool: return False return True - def __is_table_nested(self, element) -> int: - """计算表格的嵌套层级(非表格返回0,根据原始table判断的.""" + def __is_table_nested(self, element: HtmlElement) -> int: + """计算表格的嵌套层级.""" if element.tag != 'table': return 0 - # 获取当前表格下所有的表格(包括自身) - all_tables = [element] + element.xpath('.//table') - max_level = 1 # 初始层级为1(当前表格) - # 计算每个表格的层级,取最大值 - for table in all_tables: - ancestor_count = len(table.xpath('ancestor::table')) - level = ancestor_count + 1 - max_level = max(max_level, level) + + # 初始化栈结构:存储(当前元素, 当前层级) + stack = [(element, 1)] + max_level = 1 + + # 深度优先遍历 + while stack: + current, current_level = stack.pop() + # 更新最大层级 + max_level = max(max_level, current_level) + # 遍历子元素(倒序保证处理顺序) + for child in reversed(current.getchildren()): + if child.tag == 'table': + # 遇到子表格时层级+1 + stack.append((child, current_level + 1)) + else: + # 非表格元素保持当前层级 + stack.append((child, current_level)) return max_level def __extract_tables(self, ele: HtmlElement) -> List[Tuple[HtmlElement, HtmlElement]]: @@ -141,10 +136,11 @@ def __extract_tables(self, ele: HtmlElement) -> List[Tuple[HtmlElement, HtmlElem def __get_table_type(self, child: HtmlElement) -> str: """获取table的类型.""" + assert isinstance(child, HtmlElement) empty_flag = self.__is_table_empty(child) - level = self.__is_table_nested(child) if empty_flag: return 'empty' + level = self.__is_table_nested(child) # 是否跨行跨列 flag = (self.__is_simple_table(child) and level < 2) if flag: @@ -157,8 +153,7 @@ def __check_table_include_math_code(self, raw_html: HtmlElement): """检查table中的内容,包括普通文本、数学公式和代码.""" math_raw_html = self._element_to_html(raw_html) math_html = raw_html - math_recognizer = MathRecognizer() - math_res_parts = math_recognizer.recognize( + math_res_parts = self.math_recognizer.recognize( base_url='', main_html_lst=[(math_html, math_html)], raw_html=math_raw_html @@ -252,7 +247,7 @@ def __get_table_body(self, table_type, table_nest_level, table_root): table_root.attrib.clear() table_root.attrib.update(cleaned_attrs) # text进行strip操作,tail保留(部分内容留在tail中) - for elem in chain([table_root], table_root.iterdescendants()): + for elem in chain([table_root], table_root.iterchildren()): if elem.text is not None: elem.text = elem.text.strip().replace('\\n', '') if elem.tail is not None: @@ -281,10 +276,9 @@ def __do_extract_tables(self, root: HtmlElement) -> None: for child in root.iterchildren(): self.__do_extract_tables(child) - def __get_attribute(self, html: str) -> Tuple[bool, Any, Any]: + def __get_attribute(self, ele: HtmlElement) -> Tuple[bool, Any, Any]: """获取element的属性.""" # ele = self._build_html_tree(html) - ele = html if ele is not None and ele.tag == CCTag.CC_TABLE: table_type = ele.attrib.get('table_type') table_nest_level = ele.attrib.get('table_nest_level') @@ -292,7 +286,7 @@ def __get_attribute(self, html: str) -> Tuple[bool, Any, Any]: table_body = ele.text return table_flag, table_nest_level, table_body else: - raise HtmlTableRecognizerException(f'{html}中没有cctable标签') + raise HtmlTableRecognizerException(f'{ele}中没有cctable标签') def __get_content_list_table_type(self, table_type): """complex|simple 转为True|False.""" diff --git a/pytest.lprof b/pytest.lprof new file mode 100644 index 0000000000000000000000000000000000000000..8c2a78687c45eef36fd5b2102592da3aff9277b4 GIT binary patch literal 3600 zcmcJSYgm-m5y#nOm*pY|auLBGDBjWnTE$>Z@PeqE<1`nQL{Z)>%PVZy3oeU_HpVn2 zMBSJYSDVDBPt?TNJguqLN@`=HxfDgbYrLV2S~VJNM8#Nd{m)rQ(tb|(!1FtE=FB^1 zX3m_mbInW4Q9a3eBnFy{%)6lb=bnHjN-&AyRwdj`5y4`8bObBxVfmKVm3%KX8^p9c zGt2i==QuIh9?E=xe6iw#dZmAm{tz$9*1czd{1BxX#haPRfg{XA3YE@L8A7@d3dbqg zC?4ytj5lK^W=jIwI|&@0YhnDSTuf1W0AI>b`XCl?TE0!O4S+K;ftF+g{)LSf-vpe) zy5?3DtMiZz9}3w;$dVT+7cN1zIR~<@0V@UqF4HIj{dEoKirhj?xqyEI?mrCph7K6W zbP;rwRvUyhO4(?UsdPLX!fW(`L9EJF!t3%|D$yXkA^XuEqX4ad@kxN2vVm^T2HeIs zw+;f_0Sqbt+{KrsNWeWPKNDZ;ryv8ItuUFeaWa5u&@NCgOP_8CW~qs#A7c@j4G3aV zZ%_!&AM$b#$-%J*JesO!PWf~W-D(?tiCrvBU zy&9hbO?T){-Q{g7?MSLI3$0FNKZ^d8BXak}!ztO3at4Uza8=*Yth3RkbYhr~ZC2cg zx#L8|vD6YSX3UG?wQ(#T+lhlE_q1nodL^!(I86gRbH#h*4`pAq>)DOLC6A(GXt^#>3qNj0nYPLe2D6dwBRx*hh8;`IqOv6 z4i+Jq!9k*=T&@lqNA@h;nig_Mgi6n*#Wf15qqZ zbc4-nkc6cGtY}n6c&Y@gVs|3ICX;9jgASdiu$SwD`J7PWCD4LE5(k&2aP$`jBD`$YAa0hvz-Z0J!yNBd z0;82x**tnx8YTJ*J5%S251djyw}E@8(&{a*YI9^6y=)N;XH+!aqAM|C-_maE!*ZG$ zE=F(SZX;LFIio0uQPEq;JnS_UvQ^Bs!Aib{`HQ@C)_;e2zy_{zxt8WyMDA-Hncw4Y zEmBQ^J^Q_{^A9{Laky<2Rm#D3-!@5`Sl+B$tCu#qI0804loq5u@V8EwV-3ELQr)ZD+S}Jo zG>@;>La9+k(RQwiU4T9rfF|_j9USA`fWcf0doY#C_r_k7#~g-zGJ@7)ig$Uy7pG|f z<7hJKf%1bVz#%LtV#|UsUpI*>v=yc#ADNrdOviaU z-^yT`&iT3}4V2Aro14%32lzbQk;Vn)v$b=Vvgvy;E#6g);Xdw(u}B}^nikP>=X%WR zzR5q$eOBoSTxVhBO@U`wgzQC2x5A?iGL8DOpOJt^__Rg=Ebjnf0CS=NvFQH#wEgU0 zd9zF?-w?2oiUJC|i1{>>!UV2~?tsmRvg`q9><36k0oge{Jpr>gSiLZn$9=3f!WYZX z2en0<8=v(j+x_=9x}`k?VKSPE_&F6WBWWwwe}s&sk(})gGDK|nB#AQ?g|@qm-%}lN zDEEf|qT%jm{1S>m-hS7_-9RtDDr4Kj@_@`03ENeDrpOGMkOtLmfYrYNq@opm!xhq< zT8+ZnqDs04m$AXFmZUS61m37UQBFl1yxwqiWGF|d@W+jida0|2PLwa~yH|c&+w6gG z98JgP(@`u>@$yP@JuJYSrH1tZ}Rd4A>foD$?vfkrB zPu&^DC!qNjicf#WVh9S7<9%^nH?r;W;-QA$N>yIfZriqGl>=!RceoP3%~U`sAnpNx z4tQS&Oa(M{1(X49jRRBwlEwlYDB=RH0T(34^8s$aUG5aqSc10P=Ef*rj7N96r|X*T ztn}hH(f7ZI#?Ul_Sem7_ULfnK(`qaySuZ{+<1p#fK2x>J?)8t(jmeC-(ChG$>XOX= E0!ynirvLx| literal 0 HcmV?d00001 From 92ac356c0c828ddd2500c6faaef2fcf3edd64799 Mon Sep 17 00:00:00 2001 From: dt-yy Date: Mon, 17 Mar 2025 14:30:46 +0800 Subject: [PATCH 37/46] remove more file --- pytest.lprof | Bin 3600 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 pytest.lprof diff --git a/pytest.lprof b/pytest.lprof deleted file mode 100644 index 8c2a78687c45eef36fd5b2102592da3aff9277b4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3600 zcmcJSYgm-m5y#nOm*pY|auLBGDBjWnTE$>Z@PeqE<1`nQL{Z)>%PVZy3oeU_HpVn2 zMBSJYSDVDBPt?TNJguqLN@`=HxfDgbYrLV2S~VJNM8#Nd{m)rQ(tb|(!1FtE=FB^1 zX3m_mbInW4Q9a3eBnFy{%)6lb=bnHjN-&AyRwdj`5y4`8bObBxVfmKVm3%KX8^p9c zGt2i==QuIh9?E=xe6iw#dZmAm{tz$9*1czd{1BxX#haPRfg{XA3YE@L8A7@d3dbqg zC?4ytj5lK^W=jIwI|&@0YhnDSTuf1W0AI>b`XCl?TE0!O4S+K;ftF+g{)LSf-vpe) zy5?3DtMiZz9}3w;$dVT+7cN1zIR~<@0V@UqF4HIj{dEoKirhj?xqyEI?mrCph7K6W zbP;rwRvUyhO4(?UsdPLX!fW(`L9EJF!t3%|D$yXkA^XuEqX4ad@kxN2vVm^T2HeIs zw+;f_0Sqbt+{KrsNWeWPKNDZ;ryv8ItuUFeaWa5u&@NCgOP_8CW~qs#A7c@j4G3aV zZ%_!&AM$b#$-%J*JesO!PWf~W-D(?tiCrvBU zy&9hbO?T){-Q{g7?MSLI3$0FNKZ^d8BXak}!ztO3at4Uza8=*Yth3RkbYhr~ZC2cg zx#L8|vD6YSX3UG?wQ(#T+lhlE_q1nodL^!(I86gRbH#h*4`pAq>)DOLC6A(GXt^#>3qNj0nYPLe2D6dwBRx*hh8;`IqOv6 z4i+Jq!9k*=T&@lqNA@h;nig_Mgi6n*#Wf15qqZ zbc4-nkc6cGtY}n6c&Y@gVs|3ICX;9jgASdiu$SwD`J7PWCD4LE5(k&2aP$`jBD`$YAa0hvz-Z0J!yNBd z0;82x**tnx8YTJ*J5%S251djyw}E@8(&{a*YI9^6y=)N;XH+!aqAM|C-_maE!*ZG$ zE=F(SZX;LFIio0uQPEq;JnS_UvQ^Bs!Aib{`HQ@C)_;e2zy_{zxt8WyMDA-Hncw4Y zEmBQ^J^Q_{^A9{Laky<2Rm#D3-!@5`Sl+B$tCu#qI0804loq5u@V8EwV-3ELQr)ZD+S}Jo zG>@;>La9+k(RQwiU4T9rfF|_j9USA`fWcf0doY#C_r_k7#~g-zGJ@7)ig$Uy7pG|f z<7hJKf%1bVz#%LtV#|UsUpI*>v=yc#ADNrdOviaU z-^yT`&iT3}4V2Aro14%32lzbQk;Vn)v$b=Vvgvy;E#6g);Xdw(u}B}^nikP>=X%WR zzR5q$eOBoSTxVhBO@U`wgzQC2x5A?iGL8DOpOJt^__Rg=Ebjnf0CS=NvFQH#wEgU0 zd9zF?-w?2oiUJC|i1{>>!UV2~?tsmRvg`q9><36k0oge{Jpr>gSiLZn$9=3f!WYZX z2en0<8=v(j+x_=9x}`k?VKSPE_&F6WBWWwwe}s&sk(})gGDK|nB#AQ?g|@qm-%}lN zDEEf|qT%jm{1S>m-hS7_-9RtDDr4Kj@_@`03ENeDrpOGMkOtLmfYrYNq@opm!xhq< zT8+ZnqDs04m$AXFmZUS61m37UQBFl1yxwqiWGF|d@W+jida0|2PLwa~yH|c&+w6gG z98JgP(@`u>@$yP@JuJYSrH1tZ}Rd4A>foD$?vfkrB zPu&^DC!qNjicf#WVh9S7<9%^nH?r;W;-QA$N>yIfZriqGl>=!RceoP3%~U`sAnpNx z4tQS&Oa(M{1(X49jRRBwlEwlYDB=RH0T(34^8s$aUG5aqSc10P=Ef*rj7N96r|X*T ztn}hH(f7ZI#?Ul_Sem7_ULfnK(`qaySuZ{+<1p#fK2x>J?)8t(jmeC-(ChG$>XOX= E0!ynirvLx| From fd4eb0da84b6f72e31d58c24b73ba1e7b0cc98ff Mon Sep 17 00:00:00 2001 From: dt-yy Date: Mon, 17 Mar 2025 14:36:48 +0800 Subject: [PATCH 38/46] update cccode --- .../extractor/html/recognizer/code/tag_code.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/code/tag_code.py b/llm_web_kit/extractor/html/recognizer/code/tag_code.py index 5554114e..98d9aa3f 100644 --- a/llm_web_kit/extractor/html/recognizer/code/tag_code.py +++ b/llm_web_kit/extractor/html/recognizer/code/tag_code.py @@ -10,15 +10,8 @@ def __get_html_element(root: HtmlElement, node_path: list[str]) -> HtmlElement: - path_parts = [] - for element in node_path: - path_parts.extend(element.split('/')) - start_idx = 0 - if path_parts: - start_idx = 1 - while start_idx <= len(path_parts) and path_parts[start_idx - 1] in ('', '/'): - start_idx += 1 - path = '/'.join(path_parts[start_idx:]) + path = '/'.join(node_path) + path = '/'.join(path.removeprefix('/').split('/')[1:]) if not path: return root node = root.find(path, {'og': 'http://ogp.me/ns'}) From b7eaf8369c596b2fde561ebcb554dff652c5ad3e Mon Sep 17 00:00:00 2001 From: dt-yy Date: Mon, 17 Mar 2025 14:44:06 +0800 Subject: [PATCH 39/46] update perf improvement --- llm_web_kit/extractor/html/recognizer/audio.py | 11 +---------- .../extractor/html/recognizer/test_math.py | 3 --- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/audio.py b/llm_web_kit/extractor/html/recognizer/audio.py index 4f3d18ae..6fc04eb6 100644 --- a/llm_web_kit/extractor/html/recognizer/audio.py +++ b/llm_web_kit/extractor/html/recognizer/audio.py @@ -35,13 +35,4 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h Returns: """ - node = { - 'type': DocElementType.AUDIO, - 'raw_content': parsed_content.attrib.get('html', ''), - 'content': { - 'url': parsed_content.attrib.get('url', ''), - 'path': parsed_content.attrib.get('path', ''), - 'data': parsed_content.attrib.get('data', '') - } - } - return node + raise NotImplementedError diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index ba569eb4..b63956f0 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -347,7 +347,6 @@ def test_math_recognizer(self): [(html_to_element(test_case['input'][0][0]), html_to_element(test_case['input'][0][1]))], test_case['raw_html'] ) - print(output_html) expect_len = len(test_case['expected']) self.assertEqual(len(output_html), len(test_case['expected']), msg=f'result is: {len(output_html)}, expected is: {expect_len}') for i in range(len(output_html)): @@ -367,7 +366,6 @@ def test_math_recognizer_html(self): # with open('parts'+str(random.randint(1, 100))+".html", 'w') as f: # for part in parts: # f.write(str(part[0])) - print(parts) # 检查行间公式抽取正确性 new_parts = [] for part in parts: @@ -389,7 +387,6 @@ def test_math_recognizer_html(self): # self.write_to_html(answers, test_case['input'][0]) # 检查行内公式抽取正确性 if test_case.get('expected_inline', None): - print('expected_inline::::::::', test_case['expected_inline']) parts = [part[0] for part in parts if CCTag.CC_MATH_INLINE in part[0]] def write_to_html(self, answers, file_name): From f6f880ead5b2b0cdde363e9cef545bb641b656a2 Mon Sep 17 00:00:00 2001 From: dt-yy Date: Mon, 17 Mar 2025 16:04:31 +0800 Subject: [PATCH 40/46] fix pylint --- llm_web_kit/extractor/html/recognizer/audio.py | 1 - llm_web_kit/extractor/html/recognizer/table.py | 6 ++---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/audio.py b/llm_web_kit/extractor/html/recognizer/audio.py index 6fc04eb6..f9e74a7b 100644 --- a/llm_web_kit/extractor/html/recognizer/audio.py +++ b/llm_web_kit/extractor/html/recognizer/audio.py @@ -5,7 +5,6 @@ from llm_web_kit.extractor.html.recognizer.recognizer import \ BaseHTMLElementRecognizer -from llm_web_kit.libs.doc_element_type import DocElementType class AudioRecognizer(BaseHTMLElementRecognizer): diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 92fbaa95..093a153c 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -126,9 +126,8 @@ def __is_table_nested(self, element: HtmlElement) -> int: stack.append((child, current_level)) return max_level - def __extract_tables(self, ele: HtmlElement) -> List[Tuple[HtmlElement, HtmlElement]]: + def __extract_tables(self, tree: HtmlElement) -> List[Tuple[HtmlElement, HtmlElement]]: """提取html中的table元素.""" - tree = ele self.__do_extract_tables(tree) new_html = tree lst = self.html_split_by_tags(new_html, CCTag.CC_TABLE) @@ -160,9 +159,8 @@ def __check_table_include_math_code(self, raw_html: HtmlElement): ) result = [] for math_item in math_res_parts: - # ele_item = self._build_html_tree(math_item[0]) ele_item = math_item[0] - + def process_node(node): """处理行内公式、行间公式、行间代码、行内代码.""" if node.tag == CCTag.CC_MATH_INLINE: From 10b7480d7b8236cbb326265ef157462fe21ab4d2 Mon Sep 17 00:00:00 2001 From: dt-yy Date: Mon, 17 Mar 2025 16:19:55 +0800 Subject: [PATCH 41/46] fix pylint --- llm_web_kit/extractor/html/recognizer/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 093a153c..f405101e 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -160,7 +160,7 @@ def __check_table_include_math_code(self, raw_html: HtmlElement): result = [] for math_item in math_res_parts: ele_item = math_item[0] - + def process_node(node): """处理行内公式、行间公式、行间代码、行内代码.""" if node.tag == CCTag.CC_MATH_INLINE: From 8d44af63670f7d0af76454a35b08f340869e385a Mon Sep 17 00:00:00 2001 From: dt-yy Date: Mon, 17 Mar 2025 17:29:56 +0800 Subject: [PATCH 42/46] update perf --- llm_web_kit/extractor/html/recognizer/cc_math/common.py | 4 ++++ llm_web_kit/extractor/html/recognizer/text.py | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/common.py b/llm_web_kit/extractor/html/recognizer/cc_math/common.py index fe102f13..f1825a7c 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/common.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/common.py @@ -96,6 +96,10 @@ class MATH_TYPE_PATTERN: ['\\begin{align}', '\\end{align}'], ['\\begin{alignat}', '\\end{alignat}'], ['\\begin{array}', '\\end{array}'], + # ['\\begin{equation}', '\\end{equation}'], + # ['\\begin{align}', '\\end{align}'], + # ['\\begin{alignat}', '\\end{alignat}'], + # ['\\begin{array}', '\\end{array}'], # 添加通用的begin/end匹配 ['\\begin{.*?}', '\\end{.*?}'], ], diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py index 94abc50c..309a6797 100644 --- a/llm_web_kit/extractor/html/recognizer/text.py +++ b/llm_web_kit/extractor/html/recognizer/text.py @@ -76,8 +76,6 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement | str, H new_html_lst = [] for html_element, raw_html_element in main_html_lst: # 如果是字符串则转换为 HtmlElement - # html_element = html_to_element(html) if isinstance(html, str) else html - # raw_html_element = html_to_element(raw_html) if isinstance(raw_html, str) else raw_html if self.is_cc_html(html_element): new_html_lst.append((html_element, raw_html_element)) else: From b3723b825ef9781aea92de9f52ed99820d6d7c40 Mon Sep 17 00:00:00 2001 From: dt-yy Date: Mon, 17 Mar 2025 17:33:22 +0800 Subject: [PATCH 43/46] update perf --- llm_web_kit/extractor/html/recognizer/cc_math/common.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/common.py b/llm_web_kit/extractor/html/recognizer/cc_math/common.py index f1825a7c..b544d3f8 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/common.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/common.py @@ -92,10 +92,6 @@ class MATH_TYPE_PATTERN: ['\\[', '\\]'], ['$$', '$$'], ['[tex]', '[/tex]'], # 这个网站自定义的分割,https://www.physicsforums.com/threads/turning-to-a-single-logarithm-then-simply.269419/ - ['\\begin{equation}', '\\end{equation}'], - ['\\begin{align}', '\\end{align}'], - ['\\begin{alignat}', '\\end{alignat}'], - ['\\begin{array}', '\\end{array}'], # ['\\begin{equation}', '\\end{equation}'], # ['\\begin{align}', '\\end{align}'], # ['\\begin{alignat}', '\\end{alignat}'], From a31953bc4963709b78801d2bacabcea961e43ce8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=B4=E6=A2=93=E9=93=AD?= Date: Tue, 18 Mar 2025 15:52:12 +0800 Subject: [PATCH 44/46] feat: fast tag code extract (#1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 代码性能优化 --- .../html/recognizer/code/tag_code.py | 154 ++++++------------ 1 file changed, 47 insertions(+), 107 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/code/tag_code.py b/llm_web_kit/extractor/html/recognizer/code/tag_code.py index 98d9aa3f..016419de 100644 --- a/llm_web_kit/extractor/html/recognizer/code/tag_code.py +++ b/llm_web_kit/extractor/html/recognizer/code/tag_code.py @@ -1,3 +1,5 @@ +from typing import Optional + from lxml.html import HtmlElement from llm_web_kit.extractor.html.recognizer.code.common import ( @@ -29,111 +31,6 @@ def __is_all_chars_in_code_element(node: HtmlElement) -> bool: return full_text == code_text -def __group_code_by_distance( - root: HtmlElement, - node_paths: list[list[str]], - dist: list[list[int]], -) -> list[str]: - father = list(range(len(node_paths))) - - def get_father(x: int) -> int: - if father[x] == x: - return x - father[x] = get_father(father[x]) - return father[x] - - edges: list[tuple[int, int, int]] = [] - root_paths: list[list[str]] = [] - for i in range(len(node_paths)): - root_paths.append(node_paths[i]) - for j in range(i + 1, len(node_paths)): - edges.append((dist[i][j], i, j)) - edges = sorted(edges) - - used_edge = 0 - meet = set() - for edge in edges: - _, i, j = edge - i = get_father(i) - j = get_father(j) - if i != j and (i, j) not in meet: - common_node_idx = min(len(root_paths[i]), len(root_paths[j])) - for idx, (x, y) in enumerate(zip(root_paths[i], root_paths[j])): - if idx == 0: - continue - if x != y: - common_node_idx = idx - break - maybe_tree_root = __get_html_element(root, root_paths[i][:common_node_idx]) - - if len(maybe_tree_root.xpath(f'.//{CCTag.CC_CODE}|.//{CCTag.CC_CODE_INLINE}')) > 0: - meet.add((i, j)) - continue - - if not __is_all_chars_in_code_element(maybe_tree_root): - meet.add((i, j)) - continue - - root_paths[i] = root_paths[i][:common_node_idx] - used_edge += 1 - father[j] = i - - root_paths = [ - root_path for i, root_path in enumerate(root_paths) if i == get_father(i) - ] - - removed = set() - root_paths_joined = sorted( - list(set(['/'.join(root_path) for root_path in root_paths])) - ) - for x in root_paths_joined: - for y in root_paths_joined: - if len(x) < len(y) and y.startswith(x): - removed.add(y) - return [x for x in root_paths_joined if x not in removed] - - -def __compute_distance_matrix(node_paths: list[list[str]]) -> list[list[int]]: - """ - 计算节点路径的距离矩阵,具体步骤: - 1. 创建距离矩阵,计算每两个节点之间的距离 - 2. 距离计算方法:从共同祖先节点到两个节点的路径长度之和 - 例如: - 节点1路径:/html/body/div/code - 节点2路径:/html/body/pre/code - 共同祖先到 body,距离为 2(div->code) + 2(pre->code) = 4 - 节点1和节点2的距离为 4 - - 距离矩阵(对称矩阵): - [0, 1, 2, 3], - [1, 0, 1, 2], - [2, 1, 0, 1], - [3, 2, 1, 0] - - Args: - node_paths: 节点路径 - - Returns: - list[list[int]]: 距离矩阵 - """ - def get_lca_depth(path1: list[str], path2: list[str]) -> int: - for i, (x, y) in enumerate(zip(path1, path2)): - if x != y: - return i - return min(len(path1), len(path2)) - - n = len(node_paths) - dist = [[0] * n for _ in range(n)] - - for i in range(n): - for j in range(i + 1, n): - lca_depth = get_lca_depth(node_paths[i], node_paths[j]) - d = len(node_paths[i]) + len(node_paths[j]) - 2 * lca_depth - dist[i][j] = dist[j][i] = d - - return dist - - def __get_code_node_paths(html_el: HtmlElement) -> list[list[str]]: """获取 html_el 中所有 code 标签的路径 只获取最外层的code标签, 如果code标签内还有code标签,则不获取。 @@ -223,6 +120,49 @@ def __detect_inline_code(root: HtmlElement, node_paths: list[list[str]]) -> tupl return new_node_paths, inline_code +def __group_code(root: HtmlElement, node_paths: list[list[str]]) -> list[str]: + root_paths = [] + + def next_parent(code_node: HtmlElement, code_tags: int) -> tuple[Optional[HtmlElement], int]: + parent: Optional[HtmlElement] = code_node.getparent() + while parent is not None: + new_code_tags = len(parent.xpath('.//code')) + if new_code_tags == code_tags: + parent = parent.getparent() + else: + return parent, new_code_tags + return None, 0 + + while len(node_paths): + code_node = __get_html_element(root, node_paths[0]) + code_tags = 1 + + parent, new_code_tags = next_parent(code_node, code_tags) + while parent is not None: + if not __is_all_chars_in_code_element(parent): + break + + if len(parent.xpath(f'.//{CCTag.CC_CODE}|.//{CCTag.CC_CODE_INLINE}')) > 0: + break + + code_node = parent + code_tags = new_code_tags + + parent, new_code_tags = next_parent(code_node, code_tags) + + root_path = code_node.getroottree().getpath(code_node) + root_paths.append(root_path) + + new_node_path = [] + for node_path in node_paths: + if '/'.join(node_path).startswith(root_path): + continue + new_node_path.append(node_path) + node_paths = new_node_path + + return root_paths + + def modify_tree(root: HtmlElement) -> None: """将 html 树中所有 code 标签转换为代码块. @@ -239,8 +179,8 @@ def modify_tree(root: HtmlElement) -> None: elif len(node_paths) == 1: tree_roots = ['/'.join(node_paths[0])] else: - dist_matrix = __compute_distance_matrix(node_paths) # 计算距离矩阵 - tree_roots = __group_code_by_distance(root, node_paths, dist_matrix) # 根据距离矩阵,对code标签进行分组 + tree_roots = __group_code(root, node_paths) # 根据距离矩阵,对code标签进行分组 + tree_roots = sorted(tree_roots) nodes = __get_code_blocks_nodes(root, tree_roots) # 获取所有需要被转换为代码块的节点,并进行标签替换 for node in nodes: From 4c4587b0727683eb4bcad2a12e58874ca1585f01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=B4=E6=A2=93=E9=93=AD?= Date: Tue, 18 Mar 2025 19:22:30 +0800 Subject: [PATCH 45/46] fix: wrong tag nums (#2) * feat: fast tag code * Trigger Build * fix: typo * fix: maynot be one code * fix: type --------- Co-authored-by: wuziming --- llm_web_kit/extractor/html/recognizer/code/tag_code.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/extractor/html/recognizer/code/tag_code.py b/llm_web_kit/extractor/html/recognizer/code/tag_code.py index 016419de..760d312b 100644 --- a/llm_web_kit/extractor/html/recognizer/code/tag_code.py +++ b/llm_web_kit/extractor/html/recognizer/code/tag_code.py @@ -135,7 +135,7 @@ def next_parent(code_node: HtmlElement, code_tags: int) -> tuple[Optional[HtmlEl while len(node_paths): code_node = __get_html_element(root, node_paths[0]) - code_tags = 1 + code_tags = len(code_node.xpath('.//code')) parent, new_code_tags = next_parent(code_node, code_tags) while parent is not None: From 48329cdd9a5dcbbbb0384e2f43a4baa2eb91d55a Mon Sep 17 00:00:00 2001 From: dt-yy Date: Thu, 20 Mar 2025 14:41:09 +0800 Subject: [PATCH 46/46] improve perf --- llm_web_kit/extractor/html/extractor.py | 11 +- .../extractor/html/recognizer/table.py | 11 +- llm_web_kit/extractor/html/recognizer/text.py | 6 + llm_web_kit/input/datajson.py | 92 ++- llm_web_kit/libs/doc_element_type.py | 3 +- llm_web_kit/libs/statics.py | 4 +- tests/llm_web_kit/cli_sdk/test_cli_sdk.py | 1 + .../good_data/html/content_list_empty.html | 1 + .../good_data/html/exclude_complex_table.html | 528 ++++++++++++++++++ .../good_data/html/para_is_short.html | 1 + .../good_data/html_data_input.jsonl | 5 +- .../table_to_content_list_complex_res.json | 2 +- .../table_to_content_list_simple_res.json | 2 +- .../recognizer/assets/recognizer/text.html | 4 +- .../extractor/html/recognizer/test_table.py | 1 + .../extractor/html/recognizer/test_text.py | 2 +- .../test_ContentListStaticsPostExtractor.py | 6 +- .../extractor/test_extractor_chain.py | 52 +- tests/llm_web_kit/input/test_datajson.py | 63 ++- 19 files changed, 734 insertions(+), 61 deletions(-) create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/content_list_empty.html create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/exclude_complex_table.html create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_is_short.html diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index 7daf2c5b..5a91a41f 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -95,14 +95,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson: main_html, method = self._extract_main_html(raw_html, base_url, page_layout_type) main_html_element = html_to_element(main_html) - # parsed_html = [(main_html,raw_html)] parsed_html = [(main_html_element, raw_html)] - """ - for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list, - self._extract_image, - self._extract_title, self._extract_paragraph]: - parsed_html = extract_func(base_url, parsed_html, raw_html) - """ for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list, self._extract_image, self._extract_title, self._extract_paragraph]: @@ -266,7 +259,7 @@ def __is_valid_node(self, node: dict) -> bool: if not node: raise HtmlFileExtractorException('node is empty') node_type = node.get('type') - valid_types = {DocElementType.TITLE, DocElementType.LIST, DocElementType.CODE, DocElementType.EQUATION_INTERLINE, DocElementType.IMAGE, DocElementType.TABLE, DocElementType.IMAGE, DocElementType.PARAGRAPH} + valid_types = {DocElementType.TITLE, DocElementType.LIST, DocElementType.CODE, DocElementType.EQUATION_INTERLINE, DocElementType.IMAGE, DocElementType.SIMPLE_TABLE, DocElementType.COMPLEX_TABLE, DocElementType.IMAGE, DocElementType.PARAGRAPH} if node_type not in valid_types: raise HtmlFileExtractorException(f'Invalid node type: {node_type}') # 检查列表类型的节点 @@ -292,7 +285,7 @@ def __is_valid_node(self, node: dict) -> bool: # 检查url、path或data字段是否至少有一个不为空 return bool(content.get('url') or content.get('path') or content.get('data')) # 检测table类型的节点 - if node.get('type') == DocElementType.TABLE: + if node.get('type') == DocElementType.SIMPLE_TABLE or node.get('type') == DocElementType.COMPLEX_TABLE: html = node.get('content', {}).get('html') # 如果表格的html内容为None或空字符串,则视为无效节点 return bool(html and html.strip()) diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index f405101e..3effe18a 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -53,8 +53,12 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h # 确保 table_body 不为 None 且是字符串类型 html_content = table_body if table_body is not None else '' # 使用传入的 raw_html_segment 或将 parsed_content 转换为字符串 + if table_type: + cc_table_type = DocElementType.COMPLEX_TABLE + else: + cc_table_type = DocElementType.SIMPLE_TABLE d = { - 'type': DocElementType.TABLE, + 'type': cc_table_type, 'raw_content': raw_html_segment, 'content': { 'html': html_content, @@ -88,10 +92,11 @@ def __is_table_empty(self, table: HtmlElement) -> bool: def __is_simple_table(self, tree: HtmlElement) -> bool: """处理table元素,判断是是否复杂:是否包含合并单元格.""" + print('tree', self._element_to_html(tree)) cells = tree.xpath('.//td | .//th') for cell in cells: - colspan_str = cell.get('colspan', '1') - rowspan_str = cell.get('rowspan', '1') + colspan_str = cell.get('colspan', '1').strip('"\'\\') + rowspan_str = cell.get('rowspan', '1').strip('"\'\\') try: colspan = int(colspan_str) rowspan = int(rowspan_str) diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py index 309a6797..a47a393d 100644 --- a/llm_web_kit/extractor/html/recognizer/text.py +++ b/llm_web_kit/extractor/html/recognizer/text.py @@ -225,6 +225,12 @@ def helper(elem: HtmlElement): path[-1].append(copied) path.append(copied) + # elem直接有text,则直接添加返回 + if has_direct_text(elem): + rebuild_path() + path[-1].append(copy_helper(elem)) + yield path[0], path[0] + rebuild_path() for sub_elem in elem: if has_direct_text(sub_elem) or (sub_elem.tag == 'p' and has_text(sub_elem)): rebuild_path() diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py index 28f2a907..26246b58 100644 --- a/llm_web_kit/input/datajson.py +++ b/llm_web_kit/input/datajson.py @@ -5,6 +5,7 @@ from overrides import override +from llm_web_kit.exception.exception import ExtractorChainInputException from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType from llm_web_kit.libs.html_utils import (element_to_html, get_element_text, html_to_element, @@ -52,11 +53,13 @@ def __init__(self): self.__list_item_start = '-' # md里的列表项前缀 self.__list_para_prefix = ' ' # 两个空格,md里的列表项非第一个段落的前缀:如果多个段落的情况,第二个以及之后的段落前缀 self.__md_special_chars = ['#', '`', ] # TODO: 先去掉$,会影响行内公式,后面再处理 + self.__nodes_document_type = [DocElementType.MM_NODE_LIST, DocElementType.PARAGRAPH, DocElementType.LIST, DocElementType.SIMPLE_TABLE, DocElementType.COMPLEX_TABLE, DocElementType.TITLE, DocElementType.IMAGE, DocElementType.AUDIO, DocElementType.VIDEO, DocElementType.CODE, DocElementType.EQUATION_INTERLINE] + self.__inline_types_document_type = [ParagraphTextType.EQUATION_INLINE, ParagraphTextType.CODE_INLINE] def to_html(self): raise NotImplementedError('This method must be implemented by the subclass.') - def to_txt(self, exclude_nodes=DocElementType.MM_NODE_LIST): + def to_txt(self, exclude_nodes=DocElementType.MM_NODE_LIST, exclude_inline_types=[]): """把content_list转化为txt格式. Args: @@ -69,7 +72,7 @@ def to_txt(self, exclude_nodes=DocElementType.MM_NODE_LIST): for page in content_lst: for content_lst_node in page: if content_lst_node['type'] not in exclude_nodes: - txt_content = self.__content_lst_node_2_txt(content_lst_node) + txt_content = self.__content_lst_node_2_txt(content_lst_node, exclude_inline_types) if txt_content and len(txt_content) > 0: text_blocks.append(txt_content) @@ -77,7 +80,7 @@ def to_txt(self, exclude_nodes=DocElementType.MM_NODE_LIST): txt = txt.strip() + self.__text_end # 加上结尾换行符 return txt - def __to_md(self, exclude_nodes=[]): + def __to_md(self, exclude_nodes=[], exclude_inline_types=[]): """把content_list转化为md格式. Args: @@ -90,7 +93,7 @@ def __to_md(self, exclude_nodes=[]): for page in content_lst: for content_lst_node in page: if content_lst_node['type'] not in exclude_nodes: - txt_content = self.__content_lst_node_2_md(content_lst_node) + txt_content = self.__content_lst_node_2_md(content_lst_node, exclude_inline_types) if txt_content and len(txt_content) > 0: md_blocks.append(txt_content) @@ -98,12 +101,31 @@ def __to_md(self, exclude_nodes=[]): md = md.strip() + self.__text_end # 加上结尾换行符 return md - def to_nlp_md(self): - md = self.__to_md(exclude_nodes=DocElementType.MM_NODE_LIST) + def __validate_exclude_nodes(self, exclude_nodes, exclude_inline_types): + if isinstance(exclude_nodes, str): + exclude_nodes = [exclude_nodes] + if isinstance(exclude_inline_types, str): + exclude_inline_types = [exclude_inline_types] + if not isinstance(exclude_nodes, list): + raise ExtractorChainInputException('exclude_nodes must be a list type.') + if not isinstance(exclude_inline_types, list): + raise ExtractorChainInputException('exclude_inline_types must be a list type.') + for node in exclude_nodes: + if node not in self.__nodes_document_type: + raise ExtractorChainInputException(f'exclude_nodes contains invalid element type: {node}') + for inline_type in exclude_inline_types: + if inline_type not in self.__inline_types_document_type: + raise ExtractorChainInputException(f'exclude_inline_types contains invalid inline type: {inline_type}') + return exclude_nodes, exclude_inline_types + + def to_nlp_md(self, exclude_nodes=[], exclude_inline_types=[]): + exclude_nodes, exclude_inline_types = self.__validate_exclude_nodes(exclude_nodes, exclude_inline_types) + md = self.__to_md(exclude_nodes + DocElementType.MM_NODE_LIST, exclude_inline_types) return md - def to_mm_md(self): - md = self.__to_md() + def to_mm_md(self, exclude_nodes=[], exclude_inline_types=[]): + self.__validate_exclude_nodes(exclude_nodes, exclude_inline_types) + md = self.__to_md(exclude_nodes, exclude_inline_types) return md def to_main_html(self) -> str: @@ -140,7 +162,7 @@ def to_dict(self) -> dict: def _get_data(self) -> List[Dict]: raise NotImplementedError('This method must be implemented by the subclass.') - def __content_lst_node_2_md(self, content_lst_node: dict) -> str: + def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types: list = []) -> str: """把content_list里定义的每种元素块转化为markdown格式. Args: @@ -202,7 +224,7 @@ def __content_lst_node_2_md(self, content_lst_node: dict) -> str: return md_title elif node_type == DocElementType.PARAGRAPH: paragraph_el_lst = content_lst_node['content'] - one_para = self.__join_one_para(paragraph_el_lst) + one_para = self.__join_one_para(paragraph_el_lst, exclude_inline_types) return one_para elif node_type == DocElementType.LIST: items_paras = [] @@ -210,7 +232,7 @@ def __content_lst_node_2_md(self, content_lst_node: dict) -> str: for item_idx, item in enumerate(content_lst_node['content']['items']): paras_of_item = [] for para in item: - one_para = self.__join_one_para(para) + one_para = self.__join_one_para(para, exclude_inline_types) paras_of_item.append(one_para) # 由于markdown的列表项里可以有多个段落,这里拼装成md列表段落格式 list_prefix = f'{item_idx + 1}.' if is_ordered else self.__list_item_start # 有序列表和无需列表前缀 @@ -218,7 +240,7 @@ def __content_lst_node_2_md(self, content_lst_node: dict) -> str: items_paras.append(item_paras_md) md_list = '\n'.join(items_paras) return md_list - elif node_type == DocElementType.TABLE: + elif node_type == DocElementType.SIMPLE_TABLE: # 对文本格式来说,普通表格直接转为md表格,复杂表格返还原始html html_table = content_lst_node['content']['html'] if html_table is not None: @@ -227,12 +249,15 @@ def __content_lst_node_2_md(self, content_lst_node: dict) -> str: if cells_count <= 1: # 单个单元格的表格,直接返回文本 text = get_element_text(html_to_element(html_table)).strip() return text - is_complex = content_lst_node['content']['is_complex'] - if is_complex: - return html_table - else: - md_table = html_to_markdown_table(html_table) - return md_table + md_table = html_to_markdown_table(html_table) + return md_table + else: + return '' + elif node_type == DocElementType.COMPLEX_TABLE: + html_table = content_lst_node['content']['html'] + if html_table is not None: + html_table = html_table.strip() + return html_table else: return '' else: @@ -274,7 +299,7 @@ def __para_2_md_list_item(self, paras_of_item: list, list_prefix: str) -> str: return md_list_item - def __content_lst_node_2_txt(self, content_lst_node: dict) -> str: + def __content_lst_node_2_txt(self, content_lst_node: dict, exclude_inline_types=[]) -> str: """把content_list里定义的每种元素块转化为纯文本格式. Args: @@ -330,35 +355,38 @@ def __content_lst_node_2_txt(self, content_lst_node: dict) -> str: return title_content elif node_type == DocElementType.PARAGRAPH: paragraph_el_lst = content_lst_node['content'] - one_para = self.__join_one_para(paragraph_el_lst) + one_para = self.__join_one_para(paragraph_el_lst, exclude_inline_types) return one_para elif node_type == DocElementType.LIST: items_paras = [] for item in content_lst_node['content']['items']: paras_of_item = [] for para in item: - one_para = self.__join_one_para(para) + one_para = self.__join_one_para(para, exclude_inline_types) paras_of_item.append(one_para) items_paras.append(paras_of_item) items_paras = [self.__txt_para_splitter.join(item) for item in items_paras] return self.__txt_para_splitter.join(items_paras) # 对于txt格式来说一个列表项里多个段落没啥问题,但是对于markdown来说,多个段落要合并成1个,否则md格式无法表达。 - elif node_type == DocElementType.TABLE: + elif node_type == DocElementType.SIMPLE_TABLE: # 对文本格式来说,普通表格直接转为md表格,复杂表格返还原始html html_table = content_lst_node['content']['html'] if html_table is not None: html_table = html_table.strip() - is_complex = content_lst_node['content']['is_complex'] - if is_complex: - return html_table - else: - md_table = html_to_markdown_table(html_table) - return md_table + md_table = html_to_markdown_table(html_table) + return md_table + else: + return '' + elif node_type == DocElementType.COMPLEX_TABLE: + html_table = content_lst_node['content']['html'] + if html_table is not None: + html_table = html_table.strip() + return html_table else: return '' else: raise ValueError(f'content_lst_node contains invalid element type: {node_type}') # TODO: 自定义异常 - def __join_one_para(self, para: list) -> str: + def __join_one_para(self, para: list, exclude_inline_types: list = []) -> str: """把一个段落的元素块连接起来. Args: @@ -368,6 +396,8 @@ def __join_one_para(self, para: list) -> str: """ one_para = [] for el in para: + if el['t'] in exclude_inline_types: + continue if el['t'] == ParagraphTextType.TEXT: c = el['c'] if not c or not c.strip(): @@ -393,10 +423,10 @@ def _validate(self, json_obj: dict): json_obj (dict): _description_ """ if not isinstance(json_obj, dict): - raise ValueError('json_obj must be a dict type.') + raise ExtractorChainInputException('json_obj must be a dict type.') if DataJsonKey.CONTENT_LIST in json_obj: if not isinstance(json_obj.get(DataJsonKey.CONTENT_LIST, ''), list): - raise ValueError('content_list must be a list type.') + raise ExtractorChainInputException('content_list must be a list type.') class ContentList(StructureMapper): diff --git a/llm_web_kit/libs/doc_element_type.py b/llm_web_kit/libs/doc_element_type.py index c3c63fdb..dd962ed7 100644 --- a/llm_web_kit/libs/doc_element_type.py +++ b/llm_web_kit/libs/doc_element_type.py @@ -8,7 +8,8 @@ class ParagraphTextType(object): class DocElementType(object): PARAGRAPH = 'paragraph' LIST = 'list' - TABLE = 'table' + SIMPLE_TABLE = 'simple_table' + COMPLEX_TABLE = 'complex_table' EQUATION_INTERLINE = 'equation-interline' CODE = 'code' TITLE = 'title' diff --git a/llm_web_kit/libs/statics.py b/llm_web_kit/libs/statics.py index df640617..006cdcc8 100644 --- a/llm_web_kit/libs/statics.py +++ b/llm_web_kit/libs/statics.py @@ -94,10 +94,10 @@ def process_list_items(items, parent_type): elif element_type == DocElementType.LIST: # 使用递归函数处理列表项 process_list_items(element['content']['items'], DocElementType.LIST) - elif element_type == DocElementType.TABLE: + elif element_type == DocElementType.COMPLEX_TABLE: # 统计复杂表格数量 if element.get('content', {}).get('is_complex', False): - item_type = f'{DocElementType.TABLE}.complex' + item_type = f'{DocElementType.COMPLEX_TABLE}.complex' current_count = self.statics.get(item_type, 0) self.statics[item_type] = current_count + 1 diff --git a/tests/llm_web_kit/cli_sdk/test_cli_sdk.py b/tests/llm_web_kit/cli_sdk/test_cli_sdk.py index a556e0ba..f0085a69 100644 --- a/tests/llm_web_kit/cli_sdk/test_cli_sdk.py +++ b/tests/llm_web_kit/cli_sdk/test_cli_sdk.py @@ -78,6 +78,7 @@ def test_stdout_output(self, runner, json_with_html_path): assert result.exit_code == 0 assert result.output + print('result.output', result.output) output_data = json.loads(result.output) assert 'content_list' in output_data assert isinstance(output_data['content_list'], list) diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/content_list_empty.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/content_list_empty.html new file mode 100644 index 00000000..30fce8a5 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/content_list_empty.html @@ -0,0 +1 @@ +北京大平层,奶油风浪漫到家!
    \n-
    \n设计案例: 168m轻法式大平层设计
    \n项目地址:北京市大兴区
    \n-
    \n在这个168平方米的轻法式大平层设计中,全屋以浪漫的奶白色为主色调,搭配驼色,营造出空间的呼吸感。客餐厅一体设计,地面铺满柔光砖,裸调的高级质感扑面而来。
    \n
    \n转角沙发与充满设计感的小型休闲椅相搭配,家居格调瞬间提升。威尼斯棕大理石餐桌的加入,为餐厅增添了更多的层次感和温柔。坐在沙发上,可以一览餐厅和厨房的空间,增加了互动性。
    \n
    \n墙面采用暖白色,搭配一些局部的原木色护墙板,让空间的视觉效果更加灵动,不易产生疲劳感。阳光透过窗户洒进室内,整个空间显得格外治愈,喜欢这种明亮纯粹的家。 diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/exclude_complex_table.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/exclude_complex_table.html new file mode 100644 index 00000000..e0b1d2bb --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/exclude_complex_table.html @@ -0,0 +1,528 @@ +\n\n \n \n\n \n\n\n + +\n \n WikiProcessors – smartmontools\n \n + \n + \n + \n + \n + \n + \n + \n + \n + \n + \n + \n + \n \n + \n + \n + \n + \n + \n + \n + + \n + \n + + \n +\n + +\n\t
    \n\t\t
    \n\t\t\t\n\n +
    \n
    \n

    smartmontools +

    \n
    \n
    \n
    \n \n \n \n
    \n \n + \n
    \n \n
    \n \n
    \n

    Context Navigation

    \n \n +
    \n +
    \n
    \n
    \n
    \n + + + \n + + \n
    Version 3 (modified by trac, 5 years ago)\n (diff)
    \n

    \n--\n

    \n\n
    \n
    \n \n
    +

    Wiki Processors

    \n

    \nProcessors are WikiMacros designed to provide alternative markup + formats for the Wiki engine. Processors + can be thought of as macro functions to process user-edited text. \n

    \n

    + \nWiki processors can be used in any Wiki text throughout Trac, such as:\n

    \n +

    Using Processors

    \n

    \nTo use a processor on a block of + text, first delimit the lines using a Wiki code block:\n

    \n +
    {{{\nThe lines\nthat should be processed...\n}}}\n
    +

    \nImmediately after the {{{ or on the line just below, add #! + followed by the processor name:\n

    \n +
    {{{\n#!processorname\nThe lines\nthat should be processed...\n}}}\n
    +

    \nThis is the \"shebang\" notation, familiar to most UNIX users.\n

    \n

    \nBesides + their content, some Wiki processors can also accept parameters, which are then + given as key=value pairs after the processor name and on the same line. If + value has to contain space, as it's often the case for the style parameter, + a quoted string can be used (key=\"value with space\").\n

    \n

    \nAs some + processors are meant to process Wiki markup, it's quite possible to nest + processor blocks.\nYou may want to indent the content of nested blocks for increased + clarity, this extra indentation will be ignored when processing the content.\n

    \n

    Examples

    \n\n + + + + + + + \n + + + \n + + \n + + + \n + + \n + + +
    Wiki Markup Display \n
    +
    \n

    \nExample 1: Inserting raw + HTML\n

    \n
    \n +
    +
    {{{\n#!html\n<h1 style=\"color: grey\">This is raw HTML</h1>\n}}}\n
    +
    +

    This is raw HTML

    \n +
    +
    \n

    \nExample 2: Highlighted + Python code in a <div> block with custom style\n

    \n +
    \n +
    +
    {{{#!div style=\"background: #ffd; border: 3px ridge\"\n\nThis is an example of embedded \"code\" block:\n\n  {{{\n  #!python\n  def hello():\n      return \"world\"\n  }}}\n\n}}}\n
    +
    +
    +

    \nThis is an example of embedded \"code\" block:\n

    \n
    +
    +
    def hello():\n    return \"world\"\n
    +
    +
    +
    +
    +
    \n

    \nExample 3: Searching tickets + from a wiki page, by keywords.\n

    \n
    \n +
    +
    {{{\n#!html\n<form action=\"/query\" method=\"get\"><div>\n<input type=\"text\" name=\"keywords\" value=\"~\" size=\"30\"/>\n<input type=\"submit\" value=\"Search by Keywords\"/>\n<!-- To control what fields show up use hidden fields\n<input type=\"hidden\" name=\"col\" value=\"id\"/>\n<input type=\"hidden\" name=\"col\" value=\"summary\"/>\n<input type=\"hidden\" name=\"col\" value=\"status\"/>\n<input type=\"hidden\" name=\"col\" value=\"milestone\"/>\n<input type=\"hidden\" name=\"col\" value=\"version\"/>\n<input type=\"hidden\" name=\"col\" value=\"owner\"/>\n<input type=\"hidden\" name=\"col\" value=\"priority\"/>\n<input type=\"hidden\" name=\"col\" value=\"component\"/>\n-->\n</div></form>\n}}}\n
    +
    +
    +
    \n\n\n
    +
    \n +
    \n

    Available Processors

    \n

    \nThe following + processors are included in the Trac distribution:\n

    \n\n + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    #!default Present the text verbatim in a preformatted text block. This is the same as + specifying no processor name (and no #!). \n
    #!comment Do not process the text in this section, i.e. contents exist only in the plain + text - not in the rendered page. \n
    #!rtl Introduce a Right-To-Left block with appropriate CSS direction and styling. + (since 0.12.2) \n
    \n
    HTML + related \n
    #!html Insert custom HTML in a wiki page. \n
    #!htmlcomment Insert an HTML comment in a wiki page. (since 0.12) \n
    Note that #!html blocks have to be self-contained, i.e. + you can't start an HTML element in one block and close it later in a second + block. Use the following processors for achieving a similar effect. \n
    #!div Wrap wiki content inside a <div> element. \n
    #!span Wrap wiki content inside a <span> element. \n
    #!td Wrap wiki content inside a <td> element. (since 0.12) \n
    #!th Wrap wiki content inside a <th> element. (since 0.12) \n
    #!tr Can optionally be used for wrapping #!td and #!th + blocks, either for specifying row attributes or better visual grouping. + (since 0.12) \n
    #!table Can optionally be used for wrapping #!tr, #!td and + #!th blocks, for specifying table attributes. One current + limitation however is that tables cannot be nested. (since 0.12) \n +
    See WikiHtml for example usage + and more details about these processors. \n
    \n
    Other Markups \n
    #!rst Trac support for Restructured Text. See WikiRestructuredText. \n
    #!textile Supported if Textile is installed. See a Textile reference. \n
    \n
    Code Highlighting Support + \n
    #!c
    #!cpp + (C++)
    #!python
    + #!perl
    #!ruby +
    #!php
    + #!asp
    #!java +
    #!js (Javascript)
    + #!sql
    #!xml + (XML or HTML)
    #!sh (Bourne/Bash shell) +
    etc.
    Trac includes processors to provide inline syntax highlighting for source code + in various languages.

    Trac relies on Pygments for + syntax coloring.

    See TracSyntaxColoring for information + about which languages are supported and how to enable support for more + languages. \n
    \n
    \n

    \nSince 1.1.2 the default, coding highlighting and MIME-type processors support + the argument lineno for adding line numbering to the code block. When a + value is specified, as in lineno=3, the numbering will start at the + specified value. When used in combination with the lineno argument, the + marks argument is also supported for highlighting lines. A single line + number, set of line numbers and range of line numbers are allowed. For example, + marks=3, marks=3-6, marks=3,5,7 and + marks=3-5,7 are all allowed. The specified values are relative to the + numbered lines, so if lineno=2 is specified to start the line numbering at + 2, marks=2 will result in the first line being highlighted.\n

    \n

    + \nUsing the MIME type as processor, it is possible to syntax-highlight the same + languages that are supported when browsing source code.\n

    \n\n + + + + + + + + + + + +
    MIME Type Processors \n
    +

    \nSome examples:\n

    \n +
    {{{#!text/html\n<h1>text</h1>\n}}}\n
    +
    +

    \nThe result will be syntax highlighted HTML code:\n

    \n
    +
    +
    <h1>text</h1>\n
    +
    +
    +

    \nThe same is valid for all other mime types + supported.\n

    \n +
    +
    {{{#!diff\n--- Version 55\n+++ Version 56\n@@ -115,8 +115,9 @@\n     name='TracHelloWorld', version='1.0',\n     packages=find_packages(exclude=['*.tests*']),\n-    entry_points = \"\"\"\n-        [trac.plugins]\n-        helloworld = myplugs.helloworld\n-    \"\"\",\n+    entry_points = {\n+        'trac.plugins': [\n+            'helloworld = myplugs.helloworld',\n+        ],\n+    },\n )\n}}}\n
    +
    +

    \n#!diff has a particularly nice renderer:\n +

    \n
    +
    \n\n
      \n \n
    • \n

      \n + Version\n \n

      \n \n \n \n \n \n + \n + + + + \n \n \n \n \n \n + \n \n \n \n \n \n + \n\n \n \n \n \n \n \n + + \n \n \n \n + + \n \n \n \n \n + + \n \n \n \n + \n \n\n \n\n \n\n \n\n \n + + \n \n\n \n\n \n\n \n\n \n \n \n \n \n + \n + + \n + + \n + + \n + + \n + + + \n + + \n + + \n + \n \n \n + + \n + + \n + + \n + + \n + + \n + + \n + + \n + + \n + + \n + \n \n \n \n\n \n + + \n \n\n \n \n \n \n \n \n + + \n \n \n \n + \n \n\n \n\n \n\n \n\n \n \n \n \n +
      \n \n \n \n \n \n  
      115115    + name='TracHelloWorld', version='1.0', +
      116116    + packages=find_packages(exclude=['*.tests*']), +
      117     entry_points = + \"\"\"
      118         + [trac.plugins]
      119         + helloworld = myplugs.helloworld
      120     \"\"\", +
       117    entry_points = + {
       118        + 'trac.plugins': [
       119        +     'helloworld = + myplugs.helloworld',
       120        + ],
       121    },
      121 + 122)
      \n
    • \n \n
    \n\n
    +
    +
    \n

    \nLine numbers can be added to code blocks and lines can be highlighted + (since 1.1.2).\n

    \n +
    {{{#!python lineno=3 marks=3,9-10,16\ndef expand_markup(stream, ctxt=None):\n    \"\"\"A Genshi stream filter for expanding `genshi.Markup` events.\n\n    Note: Expansion may not be possible if the fragment is badly\n    formed, or partial.\n    \"\"\"\n    for event in stream:\n        if isinstance(event[1], Markup):\n            try:\n                for subevent in HTML(event[1]):\n                    yield subevent\n            except ParseError:\n                yield event\n        else:\n            yield event\n}}}\n
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Line 
    3def expand_markup(stream, ctxt=None):
    4    \"\"\"A Genshi stream filter for expanding + `genshi.Markup` events.
    5
    6    Note: Expansion may not be possible if the + fragment is badly
    7    formed, or partial.
    8    \"\"\"
    9    for event in stream:
    10        if isinstance(event[1], Markup):
    11            try:
    12                for subevent in HTML(event[1]):
    13                    yield subevent
    14            except ParseError:
    15                yield event
    16        else:
    17            yield event
    +
    +

    \nFor more processor macros developed and/or contributed by users, visit the Trac + Hacks community site.\n

    \n

    \nDeveloping processors is no different from + Wiki macros. In fact, they work the same way, only the usage syntax differs. See WikiMacros#DevelopingCustomMacros + for more information.\n

    \n +
    \n

    \nSee also: WikiMacros, WikiHtml, WikiRestructuredText, TracSyntaxColoring, WikiFormatting, TracGuide\n

    \n +
    \n \n \n \n \n
    \n \n\n \n + + \n + + \n
    \n

    Download in other formats:

    \n \n
    \n + \n
    +
    \n \"Trac\n

    Powered by Trac + 1.2.5
    \n By Edgewall Software. +

    \n

    Validator: Check + XHTML

    \n +
    \n\n\t\t\n \n\n
    \n +\n + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_is_short.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_is_short.html new file mode 100644 index 00000000..a7126065 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_is_short.html @@ -0,0 +1 @@ +\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nA plain blog about politics: Acceptable\n\n\n\n\n\n\n\n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n\n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n
    \n\n
    \n
    \n
    \n
    \n
    \n
    \n

    Monday, December 5, 2011

    \n
    \n
    \n
    \n\n\n\n

    \nAcceptable\n

    \n
    \n
    \n
    \n
    \nSince I've commented quite a bit on polling that as I read it shows Mitt Romney broadly acceptable to most Republican voters, I definitely need to say something about a new poll today that doesn't exactly show that. Gallup got around to actually asking that very question (\"Please tell me if you would find ___ to be an acceptable  nominee for president from the Republican Party, or not\"). The answers mostly showed the weakness of the field, with six of the eight candidates asked about scoring well below 50% acceptable. But the clear most-acceptable candidate is Newt Gingrich, with a 62/34 acceptable/not acceptable ratio, while Romney is only at 54/41.
    \n
    \nThere are a lot of ways to look at this, but overall it's certainly a piece of evidence that the anti-Romney vote is, well, around 40%. Only a piece of evidence, however. It's not clear how hard these kinds of numbers might be, in either direction. On the positive side, it seems unlikely that Newt would remain over 60% once more Republicans know that he's been lobbying for Freddie Mac, and supported the individual mandate on health insurance, and made a climate change ad with Nancy Pelosi, and all the rest of it. On the other hand, it's certainly possible that the \"unacceptable\" answers are awful soft, for Romney and for everyone else.
    \n
    \nIn particular, as Greg pointed out, Romney only does three points better on the \"acceptable\" scale with moderate Republicans than does Newt. This isn't the first indication we've had that Romney isn't doing as well with moderate Republicans as one would think he should be. Whether that means he has some room to grow or that he's just not an appealing politician is, I guess, still entirely up in the air at this point.
    \n
    \nI still overall don't see a low cap on Romney's support, but of course all the evidence counts, and polling in general begins to be a little more important the closer we get to actual voting. I'll be continuing to track anything more we get on this one.\n
    \n
    \n\n
    \n
    \n\n

    14 comments:

    \n
    \n\n \n\n\n\n", + src=\"https://whatsknow.com/wp-content/cache/autoptimize/js/autoptimize_fe6b5f33f1d030f29a946c59f754e0ce.js\"> \n\n\n\n \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 36a775b5..2470c060 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -130,6 +130,7 @@ def test_table_to_content_list_node_simple(self): result = self.rec.to_content_list_node(base_url, html_to_element(parsed_content), raw_html) expect = base_dir.joinpath(test_case['expected'][0]) expect_json = expect.read_text(encoding='utf-8') + print(result) assert result['type'] == json.loads(expect_json)['type'] assert result['content']['is_complex'] == json.loads(expect_json)['content']['is_complex'] assert result['raw_content'] == json.loads(expect_json)['raw_content'] diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py index c497c5cd..9f713c14 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py @@ -66,7 +66,7 @@ def test_text_1(self): '中共中央政治局召开会议审议《成-2020年10月16日新闻联播', 'zh')[:7] == '知识乱象\n中共' result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content) - assert element_to_html(result[909][0])[1413:1422] == '知识乱象\\n 中共' + assert '知识乱象\\n 中共' in element_to_html(result[908][0]) def test_text_2(self): """ diff --git a/tests/llm_web_kit/extractor/html/test_ContentListStaticsPostExtractor.py b/tests/llm_web_kit/extractor/html/test_ContentListStaticsPostExtractor.py index da4a4d7e..cd7196c9 100644 --- a/tests/llm_web_kit/extractor/html/test_ContentListStaticsPostExtractor.py +++ b/tests/llm_web_kit/extractor/html/test_ContentListStaticsPostExtractor.py @@ -54,7 +54,7 @@ def setUp(self): } }, { - 'type': 'table', + 'type': 'complex_table', 'raw_content': '', 'content': { 'html': '
    12
    ', @@ -75,5 +75,5 @@ def test_content_list_statics_post_extractor(self): self.assertEqual(data_json.get(DataJsonKey.METAINFO, {}).get(DataJsonKey.STATICS, {}).get('paragraph.text'), 2) self.assertEqual(data_json.get(DataJsonKey.METAINFO, {}).get(DataJsonKey.STATICS, {}).get('paragraph.equation-inline'), 1) self.assertEqual(data_json.get(DataJsonKey.METAINFO, {}).get(DataJsonKey.STATICS, {}).get('equation-interline'), 1) - self.assertEqual(data_json.get(DataJsonKey.METAINFO, {}).get(DataJsonKey.STATICS, {}).get('table'), 1) - self.assertEqual(data_json.get(DataJsonKey.METAINFO, {}).get(DataJsonKey.STATICS, {}).get('table.complex'), 1) + self.assertEqual(data_json.get(DataJsonKey.METAINFO, {}).get(DataJsonKey.STATICS, {}).get('complex_table'), 1) + self.assertEqual(data_json.get(DataJsonKey.METAINFO, {}).get(DataJsonKey.STATICS, {}).get('complex_table.complex'), 1) diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index a0bb3f09..05d1649b 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -59,7 +59,7 @@ def setUp(self): for line in f: self.data_json.append(json.loads(line.strip())) - assert len(self.data_json) == 20 + assert len(self.data_json) == 23 # Config for HTML extraction self.config = { @@ -141,13 +141,13 @@ def test_html_pipeline(self): # 然后是simple table html_content = html_content_list[4] - self.assertEqual(html_content['type'], DocElementType.TABLE) + self.assertEqual(html_content['type'], DocElementType.SIMPLE_TABLE) self.assertEqual(html_content['content']['is_complex'], False) assert html_content['content']['html'].startswith('' not in content_txt + assert '
    ' not in content_txt + + def test_para_is_short(self): + """测试para识别后内容太短.""" + chain = ExtractSimpleFactory.create(self.config) + self.assertIsNotNone(chain) + test_data = self.data_json[22] + input_data = DataJson(test_data) + result = chain.extract(input_data) + content_txt = result.get_content_list().to_nlp_md() + assert len(content_txt) == 3985 diff --git a/tests/llm_web_kit/input/test_datajson.py b/tests/llm_web_kit/input/test_datajson.py index ac0b9ea1..2ec08637 100644 --- a/tests/llm_web_kit/input/test_datajson.py +++ b/tests/llm_web_kit/input/test_datajson.py @@ -2,7 +2,9 @@ import pytest +from llm_web_kit.exception.exception import ExtractorChainInputException from llm_web_kit.input.datajson import ContentList, DataJson, DataJsonKey +from llm_web_kit.libs.doc_element_type import DocElementType def test_datajson_init(): @@ -98,14 +100,71 @@ def test_datajson_serialization(): def test_datajson_validation(): # Test invalid input type - with pytest.raises(ValueError): + with pytest.raises(ExtractorChainInputException): DataJson([]) # List instead of dict # Test invalid content_list type - with pytest.raises(ValueError): + with pytest.raises(ExtractorChainInputException): DataJson({DataJsonKey.CONTENT_LIST: 'invalid'}) # String instead of list +def test_datajson_exclude_nodes_to_nlp_md(): + data = { + DataJsonKey.DATASET_NAME: 'test_dataset', + DataJsonKey.FILE_FORMAT: 'html', + DataJsonKey.CONTENT_LIST: [[{ + 'type': 'simple_table', + 'raw_content': "
    Title: T.J. Byrne, Slide of floor plan, Poor Law Commission cottage, 1872.
    Authors: T.J., Byrne
    Fewer, Michael
    Keywords: T.J. Byrne
    Cottages
    Poor Law Commission
    Issue Date: 2011
    2011
    Description: T.J. Byrne's slide of a one storey cottage, labelled 'Mr Barney's Plan', recommended by the Poor Law Commission, 1872.
    URI: https://hdl.handle.net/10599/5719
    Appears in Collections:Published Items
    T.J. Byrne Collection
    ", + 'content': { + 'html': "
    Title:T.J. Byrne, Slide of floor plan, Poor Law Commission cottage, 1872.
    Authors:T.J., Byrne Fewer, Michael
    Keywords:T.J. Byrne Cottages Poor Law Commission
    Issue Date:2011 2011
    Description:T.J. Byrne's slide of a one storey cottage, labelled 'Mr Barney's Plan', recommended by the Poor Law Commission, 1872.
    URI:https://hdl.handle.net/10599/5719
    Appears in Collections:Published Items T.J. Byrne Collection
    ", + 'is_complex': False, + 'table_nest_level': '1' + } + }]] + } + datajson = DataJson(data) + md = datajson.get_content_list().to_nlp_md(exclude_nodes=DocElementType.COMPLEX_TABLE) + assert '' not in md + + +def test_datajson_exclude_nodes_to_mmd(): + data = { + DataJsonKey.DATASET_NAME: 'test_dataset', + DataJsonKey.FILE_FORMAT: 'html', + DataJsonKey.CONTENT_LIST: [[{ + 'type': 'simple_table', + 'raw_content': "
    Title: T.J. Byrne, Slide of floor plan, Poor Law Commission cottage, 1872.
    Authors: T.J., Byrne
    Fewer, Michael
    Keywords: T.J. Byrne
    Cottages
    Poor Law Commission
    Issue Date: 2011
    2011
    Description: T.J. Byrne's slide of a one storey cottage, labelled 'Mr Barney's Plan', recommended by the Poor Law Commission, 1872.
    URI: https://hdl.handle.net/10599/5719
    Appears in Collections:Published Items
    T.J. Byrne Collection
    ", + 'content': { + 'html': "
    Title:T.J. Byrne, Slide of floor plan, Poor Law Commission cottage, 1872.
    Authors:T.J., Byrne Fewer, Michael
    Keywords:T.J. Byrne Cottages Poor Law Commission
    Issue Date:2011 2011
    Description:T.J. Byrne's slide of a one storey cottage, labelled 'Mr Barney's Plan', recommended by the Poor Law Commission, 1872.
    URI:https://hdl.handle.net/10599/5719
    Appears in Collections:Published Items T.J. Byrne Collection
    ", + 'is_complex': False, + 'table_nest_level': '1' + } + }, { + 'type': 'complex_table', + 'raw_content': "
    Title: T.J. Byrne, Slide of floor plan, Poor Law Commission cottage, 1872.
    Authors: T.J., Byrne
    Fewer, Michael
    Keywords: T.J. Byrne
    Cottages
    Poor Law Commission
    Issue Date: 2011
    2011
    Description: T.J. Byrne's slide of a one storey cottage, labelled 'Mr Barney's Plan', recommended by the Poor Law Commission, 1872.
    URI: https://hdl.handle.net/10599/5719
    Appears in Collections:Published Items
    T.J. Byrne Collection
    ", + 'content': { + 'html': "
    Title:T.J. Byrne, Slide of floor plan, Poor Law Commission cottage, 1872.
    Authors:T.J., Byrne Fewer, Michael
    Keywords:T.J. Byrne Cottages Poor Law Commission
    Issue Date:2011 2011
    Description:T.J. Byrne's slide of a one storey cottage, labelled 'Mr Barney's Plan', recommended by the Poor Law Commission, 1872.
    URI:https://hdl.handle.net/10599/5719
    Appears in Collections:Published Items T.J. Byrne Collection
    ", + 'is_complex': True, + 'table_nest_level': '1' + } + }, { + 'type': 'image', + 'raw_content': "\"Curtindo", + 'content': { + 'url': 'https://naproadavida.com/wp-content/uploads/2020/11/20201024-Airbnb-SP-Consolacao_getaway_manha_Sony-1.jpg', + 'data': None, + 'alt': 'Curtindo o apartamento com piscina no centro de SP. ', + 'title': 'Curtindo o apartamento com piscina no centro de SP. ', + 'caption': None + } + }]] + } + datajson = DataJson(data) + md = datajson.get_content_list().to_mm_md(exclude_nodes=DocElementType.COMPLEX_TABLE) + assert '' not in md + assert 'Curtindo o apartamento com piscina no centro de SP.' in md + + def test_data_json_deepcopy(): """从一个外部dict构建datajson, 改变datajson,不改变外部dict.""" d = {'track_id': '32266dfa-c335-45c5-896e-56f057889d28',