From 69d420cf2c8b1357bc9b454269da995991e4c669 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 13:01:20 +0800 Subject: [PATCH 01/31] resolve nest table --- .../extractor/html/recognizer/table.py | 137 ++- .../recognizer/table_include_code_expect.json | 299 +++++ .../assets/recognizer/table_involve_code.html | 1001 +++++++++++++++++ .../table_to_content_list_complex_res.json | 3 +- .../extractor/html/recognizer/test_table.py | 23 +- 5 files changed, 1421 insertions(+), 42 deletions(-) create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_code.html diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index e2c70a39..b41f8834 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -1,9 +1,12 @@ -from typing import List, Tuple +from itertools import chain +from typing import Any, List, Tuple from lxml.html import HtmlElement from overrides import override from llm_web_kit.exception.exception import HtmlTableRecognizerExp +from llm_web_kit.extractor.html.recognizer.cccode import CodeRecognizer +from llm_web_kit.extractor.html.recognizer.ccmath import MathRecognizer from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) from llm_web_kit.libs.doc_element_type import DocElementType @@ -42,7 +45,7 @@ def recognize(self, def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: if not parsed_content: raise HtmlTableRecognizerExp(f'table parsed_content{parsed_content}为空') - table_type, table_body = self.__get_attribute(parsed_content) + table_type, table_nest_level, table_body = self.__get_attribute(parsed_content) d = { 'type': DocElementType.TABLE, # "bbox": [], @@ -52,6 +55,7 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm }, } d['content']['is_complex'] = table_type + d['content']['table_nest_level'] = table_nest_level return d def __is_contain_cc_html(self, cc_html: str) -> bool: @@ -64,6 +68,7 @@ def __is_table_empty(self, table) -> bool: :param table: lxml.html.HtmlElement 对象,表示一个 元素 :return: 如果表格为空,返回 True;否则返回 False """ + def is_element_empty(elem): # 检查元素本身的文本内容 if elem.text and elem.text.strip(): @@ -81,6 +86,7 @@ def is_element_empty(elem): if elem.tail and elem.tail.strip(): return False return True + # 检查所有单元格 for cell in table.xpath('.//td | .//th'): # 检查单元格内容 @@ -101,7 +107,8 @@ def __is_simple_table(self, tree) -> bool: colspan = int(colspan_str) rowspan = int(rowspan_str) except ValueError as e: - raise HtmlTableRecognizerExp(f'table的合并单元格属性值colspan:{colspan_str}或rowspan:{rowspan_str}不是有效的整数') from e + raise HtmlTableRecognizerExp( + f'table的合并单元格属性值colspan:{colspan_str}或rowspan:{rowspan_str}不是有效的整数') from e if (colspan > 1) or (rowspan > 1): return False return True @@ -114,28 +121,28 @@ def __is_table_contain_img(self, tree) -> bool: else: return False - def __is_table_nested(self, tree) -> bool: - """判断table元素是否嵌套.""" - nested_tables = tree.xpath('//table//table') - if len(nested_tables) == 0: - return True - else: - return False + def __is_table_nested(self, tree) -> int: + """获取表格元素的嵌套层级(非表格元素返回0,顶层表格返回1,嵌套表格返回层级数).""" + if tree.tag != 'table': + return 0 # 非表格元素返回0 + # 计算祖先中的 table 数量(不包括自身),再加1表示自身层级 + return len(tree.xpath('ancestor::table')) + 1 - def __extract_tables(self, ele: HtmlElement) -> List[str]: + def __extract_tables(self, ele: HtmlElement) -> list[tuple[str, str]]: """提取html中的table元素.""" - tree = self._build_html_tree(ele) - self.__do_extract_tables(tree) - new_html = self._element_to_html(tree) + self.__do_extract_tables(ele) + new_html = self._element_to_html(ele) lst = self.html_split_by_tags(new_html, CCTag.CC_TABLE) return lst def __get_table_type(self, child: HtmlElement) -> str: """获取table的类型.""" empty_flag = self.__is_table_empty(child) + level = self.__is_table_nested(child) if empty_flag: return 'empty' - flag = self.__is_simple_table(child) and self.__is_table_nested(child) + # 是否跨行跨列 + flag = (self.__is_simple_table(child) and level < 2) if flag: table_type = 'simple' else: @@ -147,36 +154,91 @@ def __extract_table_element(self, ele: HtmlElement) -> str: for item in ele.iterchildren(): return self._element_to_html(item) - def __simplify_td_th_content(self, elem): + def __check_table_include_math_code(self, raw_html: HtmlElement): + """check table中是否包含math.""" + math_html = self._element_to_html(raw_html) + ele_res = list() + math_recognizer = MathRecognizer() + math_res_parts = math_recognizer.recognize(base_url='', main_html_lst=[(math_html, math_html)], + raw_html=math_html) + code_recognizer = CodeRecognizer() + code_res_parts = code_recognizer.recognize(base_url='', main_html_lst=math_res_parts, + raw_html=math_html) + for math_item in code_res_parts: + ele_item = self._build_html_tree(math_item[0]) + ccinline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INLINE}') + ccinline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE_INLINE}') + ccinterline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INTERLINE}') + ccinterline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE}') + if ccinline_math_node: + formulas = [ + el.text if el.text.strip() else '' + for el in ccinline_math_node + ] + ele_res.extend(formulas) # 添加字符串 + elif ccinterline_math_node: + codes = [ + el.text if el.text.strip() else '' + for el in ccinterline_math_node + ] + ele_res.extend(codes) + elif ccinline_code_node: + inline_codes = [ + el.text if el.text.strip() else '' + for el in ccinline_code_node + ] + ele_res.extend(inline_codes) + elif ccinterline_code_node: + ccinterline_codes = [ + el.text if el.text else '' + for el in ccinterline_code_node + ] + ele_res.extend(ccinterline_codes) + else: + ele_res.extend([ + text.strip() + for text in self._build_html_tree(math_item[1]).itertext() + if text.strip() + ]) + return ele_res + + def __simplify_td_th_content(self, elem: HtmlElement) -> None: """简化
内容,仅保留文本内容.""" - if elem.tag in ['td', 'th'] and len(elem.xpath('.//table')) == 0: - result = '
'.join([text for text in elem.itertext() if text.strip()]) - for child in list(elem): - elem.remove(child) - elem.text = result - elif elem.tag in ['td', 'th'] and len(elem.xpath('.//table')) > 0: - for item in elem.iterchildren(): - self.__simplify_td_th_content(item) + if elem.tag in ['td', 'th']: + # 简化单元格中的元素 + parse_res = list() + math_res = self.__check_table_include_math_code(elem) + parse_res.extend(math_res) + for item in list(elem.iterchildren()): + elem.remove(item) + elem.text = '
'.join(parse_res) + return + for child in elem.iter('td', 'th'): + self.__simplify_td_th_content(child) def __get_table_body(self, table_type, table_root): """获取并处理table body,返回处理后的HTML字符串。""" if table_type == 'empty': return None allowed_attributes = ['colspan', 'rowspan'] - for child in list(table_root.iterchildren()): - if child.tag is not None: - self.__get_table_body(table_type, child) - for ele in table_root.iter('td', 'th'): - self.__simplify_td_th_content(ele) + # 清理除了colspan和rowspan之外的属性 if len(table_root.attrib) > 0: cleaned_attrs = {k: v for k, v in table_root.attrib.items() if k in allowed_attributes} table_root.attrib.clear() table_root.attrib.update(cleaned_attrs) - if table_root.text is not None: - table_root.text = table_root.text.strip() - for elem in table_root.iter(): - if elem.tail is not None: + # text进行strip操作,tail去掉(有较多空换行) + for elem in chain([table_root], table_root.iterdescendants()): + if elem.text: + elem.text = elem.text.strip() + if elem.tail: elem.tail = elem.tail.strip() + + self.__simplify_td_th_content(table_root) + # 迭代 + for child in table_root.iterchildren(): + if child is not None: + self.__get_table_body(table_type, child) + return self._element_to_html(table_root) def __do_extract_tables(self, root: HtmlElement) -> None: @@ -184,23 +246,26 @@ def __do_extract_tables(self, root: HtmlElement) -> None: if root.tag in ['table']: table_raw_html = self._element_to_html(root) table_type = self.__get_table_type(root) + table_nest_level = self.__is_table_nested(root) tail_text = root.tail table_body = self.__get_table_body(table_type, root) cc_element = self._build_cc_element( - CCTag.CC_TABLE, table_body, tail_text, table_type=table_type, html=table_raw_html) + CCTag.CC_TABLE, table_body, tail_text, table_type=table_type, table_nest_level=table_nest_level, + html=table_raw_html) self._replace_element(root, cc_element) return for child in root.iterchildren(): self.__do_extract_tables(child) - def __get_attribute(self, html: str) -> Tuple[int, str]: + def __get_attribute(self, html: str) -> tuple[bool, Any, Any]: """获取element的属性.""" ele = self._build_html_tree(html) if ele is not None and ele.tag == CCTag.CC_TABLE: table_type = ele.attrib.get('table_type') + table_nest_level = ele.attrib.get('table_nest_level') table_flag = self.__get_content_list_table_type(table_type) table_body = ele.text - return table_flag, table_body + return table_flag, table_nest_level, table_body else: raise HtmlTableRecognizerExp(f'{html}中没有cctable标签') diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json new file mode 100644 index 00000000..15a9cf34 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json @@ -0,0 +1,299 @@ +
1<br>2<br>3<br>4<br>5<br>6<br>7<br>8<br>9<br>10<br>11<br>12<br>13<br>14<br>15<br>16<br>17<br>18<br>19<br>20<br>21<br>22<br>23<br>24<br>25<br>26<br>27<br>28<br>29<br>30<br>31<br>32<br>33<br>34<br>35<br>36<br>37<br>38<br>39<br>40<br>41<br>42<br>43<br>44<br>45<br>46<br>47<br>48<br>49<br>50<br>51<br>52<br>53<br>54<br>55<br>56<br>57<br>58<br>59<br>60<br>61<br>62<br>63<br>64<br>65<br>66<br>67<br>68<br>69<br>70<br>71<br>72<br>73<br>74<br>75<br>76<br>77<br>78<br>79<br>80<br>81<br>82<br>83<br>84<br>85<br>86<br>87<br>88<br>89<br>90<br>91<br>92<br>93<br>94<br>95<br>96<br>97<br>98<br>99<br>100<br>101<br>102<br>103<br>104<br>105<br>106<br>107<br>108<br>109<br>110<br>111<br>112<br>113<br>114<br>115<br>116<br>117<br>118<br>119<br>120<br>121<br>122<br>123<br>124<br>125<br>126<br>127<br>128<br>129<br>130<br>131<br>132<br>133<br>134<br>135<br>136<br>137<br>138<br>139<br>140<br>141<br>142<br>143<br>144<br>145<br>146<br>147<br>148<br>149<br>150<br>151<br>152<br>153<br>154<br>155<br>156<br>157<br>158<br>159<br>160<br>161<br>162<br>163<br>164<br>165<br>166<br>167<br>168<br>169<br>170<br>171<br>172<br>173<br>174<br>175<br>176<br>177<br>178<br>179<br>180<br>181<br>182<br>183<br>184<br>185<br>186<br>187<br>188<br>189<br>190<br>191<br>192<br>193<br>194<br>195<br>196<br>197<br>198<br>199<br>200<br>201<br>202<br>203<br>204<br>205<br>206<br>207<br>208<br>209<br>210<br>211<br>212<br>213<br>214<br>215<br>216<br>217<br>218<br>219<br>220<br>221<br>222<br>223<br>224<br>225<br>226<br>227<br>228<br>229<br>230<br>231<br>232<br>233<br>234<br>235<br>236<br>237<br>238<br>239<br>240<br>241<br>242<br>243<br>244<br>245<br>246<br>247<br>248<br>249<br>250<br>251<br>252<br>253<br>254<br>255<br>256<br>257<br>258<br>259<br>260<br>261<br>262<br>263<br>264<br>265<br>266<br>267<br>268<br>269<br>270<br>271<br>272<br>273<br>274<br>275<br>276<br>277<br>278<br>279<br>280<br>281<br>282<br>283<br>284<br>285<br>286<br>287<br>288<br>289<br>290<br>291<br>292<br>293<br>294<br>295<br>296<br>297<br>298<br>299<%@ page language="java"import="java.util.*"pageEncoding="utf-8"%> +<% +String path = request.getContextPath(); +String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/"; +%> + +<!DOCTYPE HTML PUBLIC"-//W3C//DTD HTML 4.01 Transitional//EN"> +<html> +<head> +<title>My JSP'register.jsp'starting page</title> +</head> + +<body> +<script type="text/javascript"> +function validate(){ +if(registerForm.uname.value==""){ +alert("账号不能为空!"); +return; +} +if(registerForm.upwd.value==""){ +alert("密码不能为空!"); +return; +} +registerForm.submit(); +} +</script> + +<form name="registerForm"action="DoregServlet"method="post"> + +用户名:<input type="text"name="uname"><br> +密 码: <input type="password"name="upwd"> <br> +<input type="submit"value="注册"> +<a href="denglu.jsp">登录</a> +</form> + +</body> +</html> + + + +packagecom.servlet; + +importjava.io.IOException; +importjava.io.PrintWriter; + +importjavax.servlet.ServletException; +importjavax.servlet.http.HttpServlet; +importjavax.servlet.http.HttpServletRequest; +importjavax.servlet.http.HttpServletResponse; + +importcom.dao.UsersDao; + +publicclassservlet3extendsHttpServlet { + +publicservlet3() { +super(); +} + + +publicvoiddestroy() { +super.destroy();// Just puts "destroy" string in log +// Put your code here +} + + +publicvoiddoGet(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { +doPost (request, response); + +} + + +publicvoiddoPost(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { + +String uname = request.getParameter("uname"); +String upwd = request.getParameter("upwd"); +UsersDao usersDao =newUsersDao(); +inti=usersDao.reg(uname, upwd); +if(i>0){ + +response.setHeader("refresh","2;url=login.jsp"); +}else{ + +response.setHeader("refresh","2;url=reg.jsp"); +} +} + +/** +* Initialization of the servlet. <br> +* +* @throws ServletException if an error occurs +*/ +publicvoidinit()throwsServletException { +// Put your code here +} + +} + + + + + +packagecom.sf.servlet; + +importjava.io.IOException; +importjava.io.PrintWriter; + +importjavax.servlet.ServletException; +importjavax.servlet.http.HttpServlet; +importjavax.servlet.http.HttpServletRequest; +importjavax.servlet.http.HttpServletResponse; + +importcom.sf.dao.MsgDao; +importcom.sf.dao.UsersDao; + +publicclassDoregservletextendsHttpServlet { + +/** +* Constructor of the object. +*/ +publicDoregservlet() { +super(); +} + +/** +* Destruction of the servlet. <br> +*/ +publicvoiddestroy() { +super.destroy();// Just puts "destroy" string in log +// Put your code here +} + +publicvoiddoGet(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { + +response.setContentType("text/html"); +PrintWriter out = response.getWriter(); +request.setCharacterEncoding("utf-8"); +String uname = request.getParameter("uname"); +String upwd = request.getParameter("upwd"); + +UsersDao ud =newUsersDao(); +MsgDao md =newMsgDao(); +if(ud.register(uname, upwd) >0) { +request.getSession().setAttribute("uname", uname); +request.getRequestDispatcher("denglu.jsp").forward(request, +response); +}else{ +out.print("注册失败,请重新注册......."); +response.setHeader("refresh","3;url=reg.jsp"); +} +} +publicvoiddoPost(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { + +doGet(request,response); +} + +/** +* Initialization of the servlet. <br> +* +* @throws ServletException if an error occurs +*/ +publicvoidinit()throwsServletException { +// Put your code here +} + +} + + + + + +packagecom.servlet; + +importjava.io.IOException; +importjava.io.PrintWriter; + +importjavax.servlet.ServletException; +importjavax.servlet.http.HttpServlet; +importjavax.servlet.http.HttpServletRequest; +importjavax.servlet.http.HttpServletResponse; + +importcom.dao.MsgDao; + +publicclassservlet5extendsHttpServlet { + +publicservlet5() { +super(); +} + +publicvoiddestroy() { +super.destroy();// Just puts "destroy" string in log +// Put your code here +} + + +publicvoiddoGet(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { + +doPost(request, response); +} + + +publicvoiddoPost(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { + +request.setCharacterEncoding("utf-8"); + +intid=Integer.parseInt(request.getParameter("id")); +MsgDao md=newMsgDao(); +md.delMail(id); +response.getWriter().print("刪除成功....."); +response.setHeader("refresh","2;url=main.jsp"); +response.sendRedirect("main2.jsp"); +} + + +publicvoidinit()throwsServletException { + +} + +} + + + + + + + +packagecom.sf.servlet; + +importjava.io.IOException; +importjava.io.PrintWriter; + +importjavax.servlet.ServletException; +importjavax.servlet.http.HttpServlet; +importjavax.servlet.http.HttpServletRequest; +importjavax.servlet.http.HttpServletResponse; + +importcom.sf.dao.MsgDao; +importcom.sf.entity.Msg; + +publicclassDowriteservletextendsHttpServlet { + +/** +* Constructor of the object. +*/ +publicDowriteservlet() { +super(); +} + +/** +* Destruction of the servlet. <br> +*/ +publicvoiddestroy() { +super.destroy();// Just puts "destroy" string in log +// Put your code here +} + +publicvoiddoGet(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { + +response.setContentType("text/html"); +PrintWriter out = response.getWriter(); +request.setCharacterEncoding("utf-8"); +String uname = (String) request.getSession().getAttribute("uname"); +String sendto = request.getParameter("receiver"); +String title = request.getParameter("title"); +String content = request.getParameter("content"); + +Msg m =newMsg(); +m.setMsgcontent(content); +m.setUsername(uname); +m.setSendto(sendto); +m.setTitle(title); + +MsgDao md =newMsgDao(); +md.addMsg(m); + +out.print("发送成功....."); +response.setHeader("refresh","3;url=main.jsp"); +} + +publicvoiddoPost(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { + +doGet(request,response); } + +/** +* Initialization of the servlet. <br> +* +* @throws ServletException if an error occurs +*/ +publicvoidinit()throwsServletException { +} + +}
\ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_code.html b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_code.html new file mode 100644 index 00000000..d1961838 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_code.html @@ -0,0 +1,1001 @@ + + + + + + + + + + + + + + 第十三周作业 - 徐涛% - 博客园 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+
+
+ +
+
+

+ + 第十三周作业 + + + + +

+
+
+
+
+ + + + + + + + + + +
+
1
+
2
+
3
+
4
+
5
+
6
+
7
+
8
+
9
+
10
+
11
+
12
+
13
+
14
+
15
+
16
+
17
+
18
+
19
+
20
+
21
+
22
+
23
+
24
+
25
+
26
+
27
+
28
+
29
+
30
+
31
+
32
+
33
+
34
+
35
+
36
+
37
+
38
+
39
+
40
+
41
+
42
+
43
+
44
+
45
+
46
+
47
+
48
+
49
+
50
+
51
+
52
+
53
+
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+
68
+
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+
80
+
81
+
82
+
83
+
84
+
85
+
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+
94
+
95
+
96
+
97
+
98
+
99
+
100
+
101
+
102
+
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
+
111
+
112
+
113
+
114
+
115
+
116
+
117
+
118
+
119
+
120
+
121
+
122
+
123
+
124
+
125
+
126
+
127
+
128
+
129
+
130
+
131
+
132
+
133
+
134
+
135
+
136
+
137
+
138
+
139
+
140
+
141
+
142
+
143
+
144
+
145
+
146
+
147
+
148
+
149
+
150
+
151
+
152
+
153
+
154
+
155
+
156
+
157
+
158
+
159
+
160
+
161
+
162
+
163
+
164
+
165
+
166
+
167
+
168
+
169
+
170
+
171
+
172
+
173
+
174
+
175
+
176
+
177
+
178
+
179
+
180
+
181
+
182
+
183
+
184
+
185
+
186
+
187
+
188
+
189
+
190
+
191
+
192
+
193
+
194
+
195
+
196
+
197
+
198
+
199
+
200
+
201
+
202
+
203
+
204
+
205
+
206
+
207
+
208
+
209
+
210
+
211
+
212
+
213
+
214
+
215
+
216
+
217
+
218
+
219
+
220
+
221
+
222
+
223
+
224
+
225
+
226
+
227
+
228
+
229
+
230
+
231
+
232
+
233
+
234
+
235
+
236
+
237
+
238
+
239
+
240
+
241
+
242
+
243
+
244
+
245
+
246
+
247
+
248
+
249
+
250
+
251
+
252
+
253
+
254
+
255
+
256
+
257
+
258
+
259
+
260
+
261
+
262
+
263
+
264
+
265
+
266
+
267
+
268
+
269
+
270
+
271
+
272
+
273
+
274
+
275
+
276
+
277
+
278
+
279
+
280
+
281
+
282
+
283
+
284
+
285
+
286
+
287
+
288
+
289
+
290
+
291
+
292
+
293
+
294
+
295
+
296
+
297
+
298
+
299
+ +
+
+
<%@ page language="java" import="java.util.*" pageEncoding="utf-8"%>
+
<%
+
String path = request.getContextPath();
+
String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
+
%>
+
 
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+
<html>
+
  <head>
+
    <title>My JSP 'register.jsp' starting page</title>   
+
  </head>
+
  
+
  <body>
+
  <script type="text/javascript">
+
        function validate(){
+
            if(registerForm.uname.value==""){
+
                alert("账号不能为空!");
+
                return;
+
            }
+
            if(registerForm.upwd.value==""){
+
                alert("密码不能为空!");
+
                return;
+
            }
+
            registerForm.submit();
+
        }
+
    </script>
+
 
+
    <form  name="registerForm" action="DoregServlet" method="post">
+
 
+
        用户名:<input type="text" name="uname"><br>
+
        密   码: <input type="password" name="upwd"> <br>
+
        <input type="submit" value="注册" >
+
        <a href="denglu.jsp">登录</a>
+
    </form>
+
     
+
  </body>
+
</html>
+
 
+
  
+
 
+
package com.servlet;
+
 
+
import java.io.IOException;
+
import java.io.PrintWriter;
+
 
+
import javax.servlet.ServletException;
+
import javax.servlet.http.HttpServlet;
+
import javax.servlet.http.HttpServletRequest;
+
import javax.servlet.http.HttpServletResponse;
+
 
+
import com.dao.UsersDao;
+
 
+
public class servlet3 extends HttpServlet {
+
 
+
    public servlet3() {
+
        super();
+
    }
+
 
+
     
+
    public void destroy() {
+
        super.destroy(); // Just puts "destroy" string in log
+
        // Put your code here
+
    }
+
 
+
 
+
    public void doGet(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
        doPost (request, response);
+
         
+
    }
+
 
+
     
+
    public void doPost(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
 
+
        String uname = request.getParameter("uname");
+
        String upwd = request.getParameter("upwd");
+
        UsersDao usersDao = new UsersDao();
+
        int i=usersDao.reg(uname, upwd);
+
        if(i>0){
+
         
+
            response.setHeader("refresh""2;url=login.jsp");
+
        }else{
+
     
+
            response.setHeader("refresh""2;url=reg.jsp");
+
        }
+
    }
+
 
+
    /**
+
     * Initialization of the servlet. <br>
+
     *
+
     * @throws ServletException if an error occurs
+
     */
+
    public void init() throws ServletException {
+
        // Put your code here
+
    }
+
 
+
}
+
 
+
  
+
 
+
  
+
 
+
package com.sf.servlet;
+
 
+
import java.io.IOException;
+
import java.io.PrintWriter;
+
 
+
import javax.servlet.ServletException;
+
import javax.servlet.http.HttpServlet;
+
import javax.servlet.http.HttpServletRequest;
+
import javax.servlet.http.HttpServletResponse;
+
 
+
import com.sf.dao.MsgDao;
+
import com.sf.dao.UsersDao;
+
 
+
public class Doregservlet extends HttpServlet {
+
 
+
    /**
+
     * Constructor of the object.
+
     */
+
    public Doregservlet() {
+
        super();
+
    }
+
 
+
    /**
+
     * Destruction of the servlet. <br>
+
     */
+
    public void destroy() {
+
        super.destroy(); // Just puts "destroy" string in log
+
        // Put your code here
+
    }
+
 
+
    public void doGet(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
 
+
        response.setContentType("text/html");
+
        PrintWriter out = response.getWriter();
+
        request.setCharacterEncoding("utf-8");
+
        String uname = request.getParameter("uname");
+
        String upwd = request.getParameter("upwd");
+
 
+
        UsersDao ud = new UsersDao();
+
        MsgDao md = new MsgDao();
+
        if (ud.register(uname, upwd) > 0) {
+
            request.getSession().setAttribute("uname", uname);
+
            request.getRequestDispatcher("denglu.jsp").forward(request,
+
                    response);
+
        else {
+
            out.print("注册失败,请重新注册.......");
+
            response.setHeader("refresh""3;url=reg.jsp");
+
        }
+
    }
+
    public void doPost(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
 
+
        doGet(request,response);
+
    }
+
 
+
    /**
+
     * Initialization of the servlet. <br>
+
     *
+
     * @throws ServletException if an error occurs
+
     */
+
    public void init() throws ServletException {
+
        // Put your code here
+
    }
+
 
+
}
+
 
+
  
+
 
+
  
+
 
+
package com.servlet;
+
 
+
import java.io.IOException;
+
import java.io.PrintWriter;
+
 
+
import javax.servlet.ServletException;
+
import javax.servlet.http.HttpServlet;
+
import javax.servlet.http.HttpServletRequest;
+
import javax.servlet.http.HttpServletResponse;
+
 
+
import com.dao.MsgDao;
+
 
+
public class servlet5 extends HttpServlet {
+
 
+
    public servlet5() {
+
        super();
+
    }
+
 
+
    public void destroy() {
+
        super.destroy(); // Just puts "destroy" string in log
+
        // Put your code here
+
    }
+
 
+
     
+
    public void doGet(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
 
+
        doPost(request,  response);
+
    }
+
 
+
     
+
    public void doPost(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
 
+
        request.setCharacterEncoding("utf-8");
+
          
+
        int id=Integer.parseInt(request.getParameter("id"));
+
        MsgDao md=new MsgDao();
+
        md.delMail(id);   
+
        response.getWriter().print("刪除成功.....");
+
        response.setHeader("refresh""2;url=main.jsp");
+
        response.sendRedirect("main2.jsp");
+
    }
+
 
+
     
+
    public void init() throws ServletException {
+
     
+
    }
+
 
+
}
+
 
+
  
+
 
+
  
+
 
+
  
+
 
+
package com.sf.servlet;
+
 
+
import java.io.IOException;
+
import java.io.PrintWriter;
+
 
+
import javax.servlet.ServletException;
+
import javax.servlet.http.HttpServlet;
+
import javax.servlet.http.HttpServletRequest;
+
import javax.servlet.http.HttpServletResponse;
+
 
+
import com.sf.dao.MsgDao;
+
import com.sf.entity.Msg;
+
 
+
public class Dowriteservlet extends HttpServlet {
+
 
+
    /**
+
     * Constructor of the object.
+
     */
+
    public Dowriteservlet() {
+
        super();
+
    }
+
 
+
    /**
+
     * Destruction of the servlet. <br>
+
     */
+
    public void destroy() {
+
        super.destroy(); // Just puts "destroy" string in log
+
        // Put your code here
+
    }
+
 
+
    public void doGet(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
 
+
        response.setContentType("text/html");
+
        PrintWriter out = response.getWriter();
+
        request.setCharacterEncoding("utf-8");
+
        String uname = (String) request.getSession().getAttribute("uname");
+
        String sendto = request.getParameter("receiver");
+
        String title = request.getParameter("title");
+
        String content = request.getParameter("content");
+
 
+
        Msg m = new Msg();
+
        m.setMsgcontent(content);
+
        m.setUsername(uname);
+
        m.setSendto(sendto);
+
        m.setTitle(title);
+
 
+
        MsgDao md = new MsgDao();
+
        md.addMsg(m);
+
 
+
        out.print("发送成功.....");
+
        response.setHeader("refresh""3;url=main.jsp");
+
    }
+
 
+
    public void doPost(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
 
+
        doGet(request,response);     }
+
 
+
    /**
+
     * Initialization of the servlet. <br>
+
     *
+
     * @throws ServletException if an error occurs
+
     */
+
    public void init() throws ServletException {
+
    }
+
 
+
}
+ +
+ +
+
+
+ +
+
posted @ +2022-05-29 20:20  +徐涛%  +阅读(70)  +评论(0)  +编辑  +收藏  +举报 +
+
+ + +
+
+ + +
+
+ +
+ +
+
+
+
+
+ + + + +
+
+
+
+ +
+ +
+
+ +
+
+
+ +
+ + + + + + + + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json index f1c6da6a..b0baf47d 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json @@ -3,6 +3,7 @@ "raw_content": "<table><caption>ফেব্রুয়ারি ২০২৪</caption><thead><tr><th>সোম</th><th>মঙ্গল</th><th>বুধ</th><th>বৃহ</th><th>শুক্র</th><th>শনি</th><th>রবি</th></tr></thead><tfoot><tr><td colspan=\\\"3\\\">« জানুয়ারি</td><td></td><td colspan=\\\"3\\\"></td></tr></tfoot><tbody><tr><td colspan=\\\"3\\\"></td><td>১</td><td>২</td><td>৩</td><td>৪</td></tr><tr><td>৫</td><td>৬</td><td>৭</td><td>৮</td><td>৯</td><td>১০</td><td>১১</td></tr><tr><td>১২</td><td>১৩</td><td>১৪</td><td>১৫</td><td>১৬</td><td>১৭</td><td>১৮</td></tr><tr><td>১৯</td><td>২০</td><td>২১</td><td>২২</td><td>২৩</td><td>২৪</td><td>২৫</td></tr><tr><td>২৬</td><td>২৭</td><td>২৮</td><td>২৯</td><td colspan=\\\"3\\\"></td></tr></tbody></table>", "content": { "html": "
ফেব্রুয়ারি ২০২৪
সোমমঙ্গলবুধবৃহশুক্রশনিরবি
« জানুয়ারি
১০১১
১২১৩১৪১৫১৬১৭১৮
১৯২০২১২২২৩২৪২৫
২৬২৭২৮২৯
", - "is_complex": true + "is_complex": true, + "table_nest_level": null } } diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 19e1b106..08f3492c 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -19,7 +19,8 @@ 'assets/recognizer/table_simple_cc.html', 'assets/recognizer/table_include_rowspan_colspan.html', 'assets/recognizer/table_involve_equation.html', - 'assets/recognizer/table_include_after_code.html' + 'assets/recognizer/table_include_after_code.html', + 'assets/recognizer/table_involve_code.html' ), 'expected': [ @@ -86,7 +87,7 @@ def test_cc_simple_table(self): parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) assert len(parts) == 3 content = html_to_element(parts[1][0]).text_content() - assert content == r'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Рейтинг:Рейтинг<br>5.00<br>из 5 на основе опроса<br>3<br>пользователей
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84<br>₽
\n' + assert content == r'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Рейтинг:<br>\n<br>\n<br>\n<br>\nРейтинг<br>5.00<br>из 5 на основе опроса<br>3<br>пользователей<br>\n<br>\n<br>\n<br>\n
Тип товара:<br>\n<br>\n<br>\n<br>\nПрепараты для омоложения<br>\n<br>\n<br>\n<br>\n
Форма:<br>\n<br>\n<br>\n<br>\nКрем<br>\n<br>\n<br>\n<br>\n
Объем:<br>\n<br>\n<br>\n<br>\n50 мл<br>\n<br>\n<br>\n<br>\n
Рецепт:<br>\n<br>\n<br>\n<br>\nОтпускается без рецепта<br>\n<br>\n<br>\n<br>\n
Способ хранения:<br>\n<br>\n<br>\n<br>\nХранить при температуре 4-20°<br>\n<br>\n<br>\n<br>\n
Примечание:<br>\n<br>\n<br>\n<br>\nБеречь от детей<br>\n<br>\n<br>\n<br>\n
Оплата:<br>\n<br>\n<br>\n<br>\nНаличными/банковской картой<br>\n<br>\n<br>\n<br>\n
Доступность в Северске:<br>\n<br>\n<br>\n<br>\nВ наличии<br>\n<br>\n<br>\n<br>\n
Доставка:<br>\n<br>\n<br>\n<br>\n2-7 Дней<br>\n<br>\n<br>\n<br>\n
Цена:<br>\n<br>\n<br>\n<br>\n84<br>₽<br>\n<br>\n<br>\n<br>\n
\n' def test_cc_complex_table(self): """cc跨行跨列的表格.""" @@ -111,11 +112,11 @@ def test_simple_complex_table(self): simple_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}')[0] simple_table_type = simple_table_tag.attrib assert simple_table_type['table_type'] == 'simple' - assert simple_table_type == {'table_type': 'simple', 'html': '\n \n \n \n \n \n \n \n \n
12
34
\n\n'} + assert simple_table_type == {'table_type': 'simple', 'table_nest_level': '1', 'html': '\n \n \n \n \n \n \n \n \n
12
34
\n\n'} complex_table_tag = html_to_element(parts[2][0]).xpath(f'.//{CCTag.CC_TABLE}')[0] complex_table_type = complex_table_tag.attrib assert complex_table_type['table_type'] == 'complex' - assert complex_table_type == {'table_type': 'complex', 'html': '\n \n \n \n \n \n \n \n \n \n \n \n \n \n
123
4
567
\n '} + assert complex_table_type == {'table_type': 'complex', 'table_nest_level': '1', 'html': '\n \n \n \n \n \n \n \n \n \n \n \n \n \n
123
4
567
\n '} def test_table_to_content_list_node_simple(self): """测试table的 to content list node方法.""" @@ -151,7 +152,8 @@ def test_table_involve_equation(self): base_url = 'https://en.m.wikipedia.org/wiki/Variance' raw_html = raw_html_path.read_text(encoding='utf-8') parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) - assert parts is not None + complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') + assert complex_table_tag[0].text == r'
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution{\displaystyle \Pr \,(X=k)={\binom {n}{k}}p^{k}(1-p)^{n-k}}{\displaystyle np}{\displaystyle np(1-p)}
Geometric distribution{\displaystyle \Pr \,(X=k)=(1-p)^{k-1}p}{\displaystyle {\frac {1}{p}}}{\displaystyle {\frac {(1-p)}{p^{2}}}}
Normal distribution{\displaystyle f\left(x\mid \mu ,\sigma ^{2}\right)={\frac {1}{\sqrt {2\pi \sigma ^{2}}}}e^{-{\frac {(x-\mu )^{2}}{2\sigma ^{2}}}}}{\displaystyle \mu }{\displaystyle \sigma ^{2}}
Uniform distribution (continuous){\displaystyle f(x\mid a,b)={\begin{cases}{\frac {1}{b-a}}&{\text{for }}a\leq x\leq b,\\[3pt]0&{\text{for }}x<a{\text{ or }}x>b\end{cases}}}{\displaystyle {\frac {a+b}{2}}}{\displaystyle {\frac {(b-a)^{2}}{12}}}
Exponential distribution{\displaystyle f(x\mid \lambda )=\lambda e^{-\lambda x}}{\displaystyle {\frac {1}{\lambda }}}{\displaystyle {\frac {1}{\lambda ^{2}}}}
Poisson distribution{\displaystyle f(k\mid \lambda )={\frac {e^{-\lambda }\lambda ^{k}}{k!}}}{\displaystyle \lambda }{\displaystyle \lambda }
' def test_table_involve_after_code(self): """test table involve code, code被提取出去了,过滤掉空的和坏的table.""" @@ -161,3 +163,14 @@ def test_table_involve_after_code(self): raw_html = raw_html_path.read_text(encoding='utf-8') parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) assert html_to_element(parts[0][0]).xpath(f'.//{CCTag.CC_TABLE}')[0].text is None + + def test_table_involve_code(self): + """table involve code.""" + for test_case in TEST_CASES: + raw_html_path = base_dir.joinpath(test_case['input'][11]) + base_url = 'https://en.m.wikipedia.org/wiki/Variance' + raw_html = raw_html_path.read_text(encoding='utf-8') + parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') + content = open('assets/recognizer/table_include_code_expect.json', 'r', encoding='utf-8').read() + assert complex_table_tag[0].text == content From e7c379248180ca57384269ed030a32eef7ddd6b6 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 13:09:03 +0800 Subject: [PATCH 02/31] update extract table --- llm_web_kit/extractor/html/recognizer/table.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index b41f8834..3598aaba 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -128,10 +128,11 @@ def __is_table_nested(self, tree) -> int: # 计算祖先中的 table 数量(不包括自身),再加1表示自身层级 return len(tree.xpath('ancestor::table')) + 1 - def __extract_tables(self, ele: HtmlElement) -> list[tuple[str, str]]: + def __extract_tables(self, ele: str) -> list[tuple[str, str]]: """提取html中的table元素.""" - self.__do_extract_tables(ele) - new_html = self._element_to_html(ele) + tree = self._build_html_tree(ele) + self.__do_extract_tables(tree) + new_html = self._element_to_html(tree) lst = self.html_split_by_tags(new_html, CCTag.CC_TABLE) return lst From f0347ff6421dc53cf8906c6598b1d6f4b49e8308 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 13:24:15 +0800 Subject: [PATCH 03/31] remove table tail --- tests/llm_web_kit/extractor/html/recognizer/test_table.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 08f3492c..48c17998 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -87,8 +87,7 @@ def test_cc_simple_table(self): parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) assert len(parts) == 3 content = html_to_element(parts[1][0]).text_content() - assert content == r'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Рейтинг:<br>\n<br>\n<br>\n<br>\nРейтинг<br>5.00<br>из 5 на основе опроса<br>3<br>пользователей<br>\n<br>\n<br>\n<br>\n
Тип товара:<br>\n<br>\n<br>\n<br>\nПрепараты для омоложения<br>\n<br>\n<br>\n<br>\n
Форма:<br>\n<br>\n<br>\n<br>\nКрем<br>\n<br>\n<br>\n<br>\n
Объем:<br>\n<br>\n<br>\n<br>\n50 мл<br>\n<br>\n<br>\n<br>\n
Рецепт:<br>\n<br>\n<br>\n<br>\nОтпускается без рецепта<br>\n<br>\n<br>\n<br>\n
Способ хранения:<br>\n<br>\n<br>\n<br>\nХранить при температуре 4-20°<br>\n<br>\n<br>\n<br>\n
Примечание:<br>\n<br>\n<br>\n<br>\nБеречь от детей<br>\n<br>\n<br>\n<br>\n
Оплата:<br>\n<br>\n<br>\n<br>\nНаличными/банковской картой<br>\n<br>\n<br>\n<br>\n
Доступность в Северске:<br>\n<br>\n<br>\n<br>\nВ наличии<br>\n<br>\n<br>\n<br>\n
Доставка:<br>\n<br>\n<br>\n<br>\n2-7 Дней<br>\n<br>\n<br>\n<br>\n
Цена:<br>\n<br>\n<br>\n<br>\n84<br>₽<br>\n<br>\n<br>\n<br>\n
\n' - + assert content == r"\n\n\n\n\n\n\n\n\n\n\n\n\n
Рейтинг:Рейтинг<br>5.00<br>3
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84<br>₽
" def test_cc_complex_table(self): """cc跨行跨列的表格.""" for test_case in TEST_CASES: From 5e176944beb0aa2b34b49a3a274380856c831bdd Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 13:34:44 +0800 Subject: [PATCH 04/31] normalize line endings --- llm_web_kit/extractor/html/recognizer/table.py | 2 +- tests/llm_web_kit/extractor/html/recognizer/test_table.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 3598aaba..9d5dbb37 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -232,7 +232,7 @@ def __get_table_body(self, table_type, table_root): if elem.text: elem.text = elem.text.strip() if elem.tail: - elem.tail = elem.tail.strip() + elem.tail = None self.__simplify_td_th_content(table_root) # 迭代 diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 48c17998..9f26c523 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -87,7 +87,8 @@ def test_cc_simple_table(self): parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) assert len(parts) == 3 content = html_to_element(parts[1][0]).text_content() - assert content == r"\n\n\n\n\n\n\n\n\n\n\n\n\n
Рейтинг:Рейтинг<br>5.00<br>3
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84<br>₽
" + assert content == r'\n\n\n\n\n\n\n\n\n\n\n\n\n
Рейтинг:Рейтинг<br>5.00<br>3
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84<br>₽
' + def test_cc_complex_table(self): """cc跨行跨列的表格.""" for test_case in TEST_CASES: From c15dea1fcdda4d59bfcb5b3a8b49a37c62cc7989 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 14:02:39 +0800 Subject: [PATCH 05/31] update test case --- tests/llm_web_kit/extractor/html/recognizer/test_table.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 9f26c523..b8b67029 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -26,7 +26,8 @@ 'expected': [ ('assets/recognizer/table_to_content_list_simple_res.json'), ('assets/recognizer/table_to_content_list_complex_res.json'), - ('assets/recognizer/table_include_image_expcet.json') + ('assets/recognizer/table_include_image_expcet.json'), + ('assets/recognizer/table_include_code_expect.json') ], } ] @@ -172,5 +173,6 @@ def test_table_involve_code(self): raw_html = raw_html_path.read_text(encoding='utf-8') parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') - content = open('assets/recognizer/table_include_code_expect.json', 'r', encoding='utf-8').read() + expect_path = base_dir.joinpath(test_case['expected'][3]) + content = open(expect_path, 'r', encoding='utf-8').read() assert complex_table_tag[0].text == content From d34a8a7416f3b238b49ea81ff5eff6ee37a396b7 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 14:45:30 +0800 Subject: [PATCH 06/31] update format --- llm_web_kit/extractor/html/extractor.py | 2 +- llm_web_kit/extractor/html/recognizer/table.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index 1d3facb3..bc3fe05b 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -93,7 +93,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson: main_html, method = self._extract_main_html(raw_html, base_url, page_layout_type) parsed_html = [(main_html,raw_html)] - for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list, + for extract_func in [self._extract_table, self._extract_code, self._extract_math, self._extract_list, self._extract_image, self._extract_title, self._extract_paragraph]: parsed_html = extract_func(base_url, parsed_html, raw_html) diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 9d5dbb37..64528ea2 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -128,7 +128,7 @@ def __is_table_nested(self, tree) -> int: # 计算祖先中的 table 数量(不包括自身),再加1表示自身层级 return len(tree.xpath('ancestor::table')) + 1 - def __extract_tables(self, ele: str) -> list[tuple[str, str]]: + def __extract_tables(self, ele: str) -> list[Tuple[str, str]]: """提取html中的table元素.""" tree = self._build_html_tree(ele) self.__do_extract_tables(tree) @@ -233,7 +233,6 @@ def __get_table_body(self, table_type, table_root): elem.text = elem.text.strip() if elem.tail: elem.tail = None - self.__simplify_td_th_content(table_root) # 迭代 for child in table_root.iterchildren(): @@ -258,7 +257,7 @@ def __do_extract_tables(self, root: HtmlElement) -> None: for child in root.iterchildren(): self.__do_extract_tables(child) - def __get_attribute(self, html: str) -> tuple[bool, Any, Any]: + def __get_attribute(self, html: str) -> Tuple[bool, Any, Any]: """获取element的属性.""" ele = self._build_html_tree(html) if ele is not None and ele.tag == CCTag.CC_TABLE: From 87a24954be0bfe65f42dfd6d6df559661a02c928 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 14:48:59 +0800 Subject: [PATCH 07/31] update format --- llm_web_kit/extractor/html/recognizer/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 64528ea2..232573ea 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -128,7 +128,7 @@ def __is_table_nested(self, tree) -> int: # 计算祖先中的 table 数量(不包括自身),再加1表示自身层级 return len(tree.xpath('ancestor::table')) + 1 - def __extract_tables(self, ele: str) -> list[Tuple[str, str]]: + def __extract_tables(self, ele: str) -> List[Tuple[str, str]]: """提取html中的table元素.""" tree = self._build_html_tree(ele) self.__do_extract_tables(tree) From 98610905a5a36e778fd85631e0fa8ffb8f9d68e9 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 14:59:22 +0800 Subject: [PATCH 08/31] update format --- llm_web_kit/extractor/html/extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index bc3fe05b..1d3facb3 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -93,7 +93,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson: main_html, method = self._extract_main_html(raw_html, base_url, page_layout_type) parsed_html = [(main_html,raw_html)] - for extract_func in [self._extract_table, self._extract_code, self._extract_math, self._extract_list, + for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list, self._extract_image, self._extract_title, self._extract_paragraph]: parsed_html = extract_func(base_url, parsed_html, raw_html) From a77735f93a337c181e01cc3af3c03b2f691058b8 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 18:01:25 +0800 Subject: [PATCH 09/31] change parse order --- llm_web_kit/extractor/html/extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index 1d3facb3..bc3fe05b 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -93,7 +93,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson: main_html, method = self._extract_main_html(raw_html, base_url, page_layout_type) parsed_html = [(main_html,raw_html)] - for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list, + for extract_func in [self._extract_table, self._extract_code, self._extract_math, self._extract_list, self._extract_image, self._extract_title, self._extract_paragraph]: parsed_html = extract_func(base_url, parsed_html, raw_html) From 419b2c1024efc37ce15d864bc0d66615bcff6f53 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 21:32:30 +0800 Subject: [PATCH 10/31] add list nest level --- llm_web_kit/extractor/html/recognizer/list.py | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index 1dbea3fc..315b8ac8 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -1,5 +1,5 @@ import json -from typing import List, Tuple +from typing import List, Tuple, Any from lxml.etree import _Element as HtmlElement from overrides import override @@ -88,16 +88,16 @@ def __do_extract_list(self, root:HtmlElement) -> None: list_tag_names = ['ul', 'ol', 'dl', 'menu', 'dir'] if root.tag in list_tag_names: - is_ordered, content_list, raw_html, tail_text = self.__extract_list_element(root) + list_nest_level, is_ordered, content_list, raw_html, tail_text = self.__extract_list_element(root) text = json.dumps(content_list, ensure_ascii=False, indent=4) - cc_element = self._build_cc_element(CCTag.CC_LIST, text, tail_text, ordered=is_ordered, html=raw_html) + cc_element = self._build_cc_element(CCTag.CC_LIST, text, tail_text, ordered=is_ordered, list_nest_level=list_nest_level, html=raw_html) self._replace_element(root, cc_element) # cc_element 替换掉原来的列表元素 return for child in root.iterchildren(): self.__do_extract_list(child) - def __extract_list_element(self, ele: HtmlElement) -> Tuple[bool, list, str, str]: + def __extract_list_element(self, ele: HtmlElement) -> tuple[int, bool, list[list[list]], str, Any]: """ 提取列表元素: 假如有如下列表: @@ -135,6 +135,7 @@ def __extract_list_element(self, ele: HtmlElement) -> Tuple[bool, list, str, str (bool, str, str): 第一个元素是是否有序; 第二个元素是个python list,内部是文本和行内公式,具体格式参考list的content_list定义。第三个元素是列表原始的html内容 """ is_ordered = ele.tag in ['ol', 'dl'] + list_nest_level = self.__get_list_type(ele) tail_text = ele.tail content_list = [] raw_html = self._element_to_html(ele) @@ -144,7 +145,17 @@ def __extract_list_element(self, ele: HtmlElement) -> Tuple[bool, list, str, str text_paragraph = self.__extract_list_item_text(item) content_list.append(text_paragraph) - return is_ordered, content_list, raw_html, tail_text + return list_nest_level, is_ordered, content_list, raw_html, tail_text + + def __get_list_type(self, list_ele:HtmlElement) -> int: + """ + 获取list嵌套的类型 + """ + if list_ele.tag not in ['ul', 'ol', 'dl', 'menu', 'dir']: + return 0 + ancestor_count = list_ele.xpath('count(ancestor::ul | ancestor::ol)') + # 层级 = 祖先列表数量 + 自身(1层) + return int(ancestor_count) + 1 def __extract_list_item_text(self, root:HtmlElement) -> list[list]: """提取列表项的文本. From c40b1ead2135c737a3f3c8943b7a54eea7f09595 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 21:43:47 +0800 Subject: [PATCH 11/31] fix pylint --- llm_web_kit/extractor/html/recognizer/list.py | 6 ++---- .../assets/recognizer/table_include_code_expect.json | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index 315b8ac8..d564d41e 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -1,5 +1,5 @@ import json -from typing import List, Tuple, Any +from typing import Any, List, Tuple from lxml.etree import _Element as HtmlElement from overrides import override @@ -148,9 +148,7 @@ def __extract_list_element(self, ele: HtmlElement) -> tuple[int, bool, list[list return list_nest_level, is_ordered, content_list, raw_html, tail_text def __get_list_type(self, list_ele:HtmlElement) -> int: - """ - 获取list嵌套的类型 - """ + """获取list嵌套的类型.""" if list_ele.tag not in ['ul', 'ol', 'dl', 'menu', 'dir']: return 0 ancestor_count = list_ele.xpath('count(ancestor::ul | ancestor::ol)') diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json index 15a9cf34..4f6fc9ed 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json @@ -296,4 +296,4 @@ doGet(request,response); } publicvoidinit()throwsServletException { } -}
\ No newline at end of file +} From 6c7ca2dddf0c29f772bc1e0dcd0df99ec0b9d545 Mon Sep 17 00:00:00 2001 From: quyuan Date: Thu, 27 Feb 2025 16:38:35 +0800 Subject: [PATCH 12/31] update table nest spec.md --- .../output_format/content_list_spec.md | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/docs/specification/output_format/content_list_spec.md b/docs/specification/output_format/content_list_spec.md index 5c27c663..8bbecc32 100644 --- a/docs/specification/output_format/content_list_spec.md +++ b/docs/specification/output_format/content_list_spec.md @@ -83,7 +83,8 @@ "html": "
12
", "title": "example table", "note": "数据来源于...", - "is_complex": false // 是否是复杂表格(跨行、跨列的, 默认为false + "is_complex": false // 是否是复杂表格(跨行、跨列的/嵌套表格, 默认为false, + "table_nest_level": 1 //table的嵌套层级 } }, { @@ -285,20 +286,22 @@ "html": "
12
", "title": "example table", "note": "数据来源于...", - "is_complex": false // 是否是复杂表格(跨行、跨列的, 默认为false + "is_complex": false // 是否是复杂表格(跨行、跨列的, 默认为false, + "table_nest_level": 1 //表格嵌套层级 } } ``` -| 字段 | 类型 | 描述 | 是否必须 | -| ------------------ | ------- | ---------------------------------------- | -------- | -| type | string | 值固定为table | 是 | -| bbox | array | \[x1, y1, x2, y2\] | 可选 | -| raw_content | string | 原始文本内容 | 可选 | -| content.html | string | 表格的html内容 | 是 | -| content.title | string | 表格的title属性 | 可选 | -| content.note | string | 表格的note属性 | 可选 | -| content.is_complex | boolean | 是否是复杂表格(跨行、跨列的, 默认为false | 可选 | +| 字段 | 类型 | 描述 | 是否必须 | +| ------------------------ | ------- | ------------------------------------------------- | -------- | +| type | string | 值固定为table | 是 | +| bbox | array | \[x1, y1, x2, y2\] | 可选 | +| raw_content | string | 原始文本内容 | 可选 | +| content.html | string | 表格的html内容 | 是 | +| content.title | string | 表格的title属性 | 可选 | +| content.note | string | 表格的note属性 | 可选 | +| content.is_complex | boolean | 是否是复杂表格(跨行、跨列的/嵌套表格, 默认为false | 可选 | +| content.table_nest_level | int | table嵌套层级(单个table为1,两层为2,以此类推) | 可选 | ### 列表段 From 9e1545293c7d4d6ec10362312a2adb9db6700f6b Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 3 Mar 2025 12:36:38 +0800 Subject: [PATCH 13/31] update parse order --- llm_web_kit/extractor/html/extractor.py | 2 +- .../table_involve_complex_code.html | 237 ++++++++++++++++++ .../extractor/html/recognizer/test_code.py | 2 +- .../extractor/html/recognizer/test_table.py | 17 +- 4 files changed, 254 insertions(+), 4 deletions(-) create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_complex_code.html diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index bc3fe05b..1d3facb3 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -93,7 +93,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson: main_html, method = self._extract_main_html(raw_html, base_url, page_layout_type) parsed_html = [(main_html,raw_html)] - for extract_func in [self._extract_table, self._extract_code, self._extract_math, self._extract_list, + for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list, self._extract_image, self._extract_title, self._extract_paragraph]: parsed_html = extract_func(base_url, parsed_html, raw_html) diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_complex_code.html b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_complex_code.html new file mode 100644 index 00000000..b929d7e0 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_complex_code.html @@ -0,0 +1,237 @@ + + + + ClientNetworkWrapper.java (Example JavaDoc) + + + + + + + + + + + + + + +
+

+ + + + + + + + + + + + + + + + + +
FileDocCategorySizeDatePackage
ClientNetworkWrapper.javaAPI DocExample2389Thu Nov 08 00:23:44 GMT 2001com.ora.rmibook.chapter3
+

ClientNetworkWrapper

+ public class ClientNetworkWrapper extends NetworkBaseClass implements + PrinterConstants + + + + +
+ + + + +
+
+
+

+ + + + + + + + + + + + + +
Fields Summary
private String +
_serverMachine
+
+
private int +
_serverPort
+
+
+ + + + + + + + +
Constructors Summary
public ClientNetworkWrapper()
+

+ + + + +
+
+
+

+        this (DEFAULT_SERVER_NAME, DEFAULT_SERVER_PORT);
+    
Test Test Test
ABC
+DEF
TEST TEST TEST
+
+
public ClientNetworkWrapper(String + serverMachine, int serverPort) +
+

+ + + + +
+
+
+

+        _serverMachine = serverMachine;
+        _serverPort = serverPort;
+    
+
+
+ + + + + + + + + + + + + + + + +
Methods Summary
private voidreadStatusFromSocket(java.net.Socket + connection) +
+

+ + + + +
+
+
+

+        InputStream inputStream = connection.getInputStream();
+        DataInputStream dataInputStream = new DataInputStream(inputStream);
+        BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
+        boolean response = dataInputStream.readBoolean();
+
+        if (response) {
+            return;
+        }
+        PrinterException error = new PrinterException(inputStream);
+
+        throw error;
+    
+
+
public voidsendDocumentToPrinter(java.io.InputStream actualDocument) +
+

+ + + + +
+
+
+

+        sendDocumentToPrinter(actualDocument, DEFAULT_DOCUMENT_TYPE,
+            DEFAULT_PRINT_TWO_SIDED, DEFAULT_PRINT_QUALITY);
+    
+
+
public voidsendDocumentToPrinter(java.io.InputStream actualDocument, int documentType, boolean printTwoSided, + int printQuality) +
+

+ + + + +
+
+
+

+        DocumentDescription documentToSend;
+
+        try {
+            documentToSend = new DocumentDescription(actualDocument, documentType, printTwoSided, printQuality);
+        } catch (IOException e) {
+            throw new ConnectionException();
+        }
+        sendDocumentToPrinter(documentToSend);
+    
+
+
public voidsendDocumentToPrinter(DocumentDescription documentDescription) +
+

+ + + + +
+
+
+

+        Socket connection = null;
+
+        try {
+            connection = new Socket(_serverMachine, _serverPort);
+            documentDescription.writeToStream(connection.getOutputStream());
+            readStatusFromSocket(connection);
+        } catch (IOException e) {
+            e.printStackTrace();
+            throw new ConnectionException();
+        }
+        closeSocket(connection);
+    
+
+
+

+ + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_code.py b/tests/llm_web_kit/extractor/html/recognizer/test_code.py index 40f758c1..143591b1 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_code.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_code.py @@ -268,7 +268,7 @@ def test_code_rec(self): raw_html_path = base_dir.joinpath(test_case['input'][0]) base_url = test_case['input'][1] print(base_url) - raw_html = raw_html_path.read_text() + raw_html = raw_html_path.read_text(encoding="utf-8") parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) parts = [ part[0] diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index b8b67029..e92e7297 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -20,7 +20,8 @@ 'assets/recognizer/table_include_rowspan_colspan.html', 'assets/recognizer/table_involve_equation.html', 'assets/recognizer/table_include_after_code.html', - 'assets/recognizer/table_involve_code.html' + 'assets/recognizer/table_involve_code.html', + 'assets/recognizer/table_involve_complex_code.html' ), 'expected': [ @@ -175,4 +176,16 @@ def test_table_involve_code(self): complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') expect_path = base_dir.joinpath(test_case['expected'][3]) content = open(expect_path, 'r', encoding='utf-8').read() - assert complex_table_tag[0].text == content + assert complex_table_tag[0].text == content.strip("\n") + + def test_table_involve_complex_code(self): + """table involve complex code""" + for test_case in TEST_CASES: + raw_html_path = base_dir.joinpath(test_case['input'][12]) + base_url = 'https://en.m.wikipedia.org/wiki/Variance' + raw_html = raw_html_path.read_text(encoding='utf-8') + parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') + expect_path = base_dir.joinpath(test_case['expected'][3]) + content = open(expect_path, 'r', encoding='utf-8').read() + assert complex_table_tag[0].text == content.strip("\n") From 4a61728b43ce10d3230f8e54900ed39931f865e4 Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 3 Mar 2025 12:49:57 +0800 Subject: [PATCH 14/31] update parse order --- tests/llm_web_kit/extractor/html/recognizer/test_code.py | 2 +- tests/llm_web_kit/extractor/html/recognizer/test_table.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_code.py b/tests/llm_web_kit/extractor/html/recognizer/test_code.py index 143591b1..5b55ed42 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_code.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_code.py @@ -268,7 +268,7 @@ def test_code_rec(self): raw_html_path = base_dir.joinpath(test_case['input'][0]) base_url = test_case['input'][1] print(base_url) - raw_html = raw_html_path.read_text(encoding="utf-8") + raw_html = raw_html_path.read_text(encoding='utf-8') parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) parts = [ part[0] diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index e92e7297..87ccbce8 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -176,10 +176,10 @@ def test_table_involve_code(self): complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') expect_path = base_dir.joinpath(test_case['expected'][3]) content = open(expect_path, 'r', encoding='utf-8').read() - assert complex_table_tag[0].text == content.strip("\n") + assert complex_table_tag[0].text == content.strip('\n') def test_table_involve_complex_code(self): - """table involve complex code""" + """table involve complex code.""" for test_case in TEST_CASES: raw_html_path = base_dir.joinpath(test_case['input'][12]) base_url = 'https://en.m.wikipedia.org/wiki/Variance' @@ -188,4 +188,4 @@ def test_table_involve_complex_code(self): complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') expect_path = base_dir.joinpath(test_case['expected'][3]) content = open(expect_path, 'r', encoding='utf-8').read() - assert complex_table_tag[0].text == content.strip("\n") + assert complex_table_tag[0].text == content.strip('\n') From 1b0e1e92993c9ff587db304e84f9a23cc3756acb Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 3 Mar 2025 13:37:23 +0800 Subject: [PATCH 15/31] update parse order --- tests/llm_web_kit/extractor/html/recognizer/test_code.py | 2 +- tests/llm_web_kit/extractor/html/recognizer/test_table.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_code.py b/tests/llm_web_kit/extractor/html/recognizer/test_code.py index 5b55ed42..40f758c1 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_code.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_code.py @@ -268,7 +268,7 @@ def test_code_rec(self): raw_html_path = base_dir.joinpath(test_case['input'][0]) base_url = test_case['input'][1] print(base_url) - raw_html = raw_html_path.read_text(encoding='utf-8') + raw_html = raw_html_path.read_text() parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) parts = [ part[0] diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 87ccbce8..e569d340 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -178,6 +178,7 @@ def test_table_involve_code(self): content = open(expect_path, 'r', encoding='utf-8').read() assert complex_table_tag[0].text == content.strip('\n') + @unittest.skip(reason='在code模块解决了这个问题') def test_table_involve_complex_code(self): """table involve complex code.""" for test_case in TEST_CASES: From 78ca0283c79ca84f05b5bdb1f1d87b0a4eb5ddfa Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 3 Mar 2025 14:34:59 +0800 Subject: [PATCH 16/31] =?UTF-8?q?update=20list=E6=A0=87=E5=87=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../output_format/content_list_spec.md | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/docs/specification/output_format/content_list_spec.md b/docs/specification/output_format/content_list_spec.md index 8bbecc32..f04e2c85 100644 --- a/docs/specification/output_format/content_list_spec.md +++ b/docs/specification/output_format/content_list_spec.md @@ -108,7 +108,8 @@ ] ] ], - "ordered": true + "ordered": true, + "list_nest_level": 1 //list的嵌套层级 } } ], @@ -327,18 +328,20 @@ ] ] ], - "ordered": true + "ordered": true, + "list_nest_level": 1 //list嵌套层级 } } ``` -| 字段 | 类型 | 描述 | 是否必须 | -| --------------- | ------- | --------------------------------------------------- | -------- | -| type | string | 值固定为list | 是 | -| bbox | array | \[x1, y1, x2, y2\] | 可选 | -| raw_content | string | 原始文本内容 | 可选 | -| content.items | array | 列表项,每个元素是N个段落,段落里的元素是文本或公式 | 是 | -| content.ordered | boolean | 是否是有序列表 | 可选 | +| 字段 | 类型 | 描述 | 是否必须 | +| ----------------------- | ------- | --------------------------------------------------- | -------- | +| type | string | 值固定为list | 是 | +| bbox | array | \[x1, y1, x2, y2\] | 可选 | +| raw_content | string | 原始文本内容 | 可选 | +| content.items | array | 列表项,每个元素是N个段落,段落里的元素是文本或公式 | 是 | +| content.ordered | boolean | 是否是有序列表 | 可选 | +| content.list_nest_level | int | list的嵌套层级(单层list list_nest_level为1) | 可选 | items字段说明 From efcd7a21d99878c4f99d2bac3acfdeabb9d3d7f8 Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 3 Mar 2025 15:55:38 +0800 Subject: [PATCH 17/31] add table involve inline code --- .../html/table_involve_inline_code.html | 26 +++++++++++++++++++ .../good_data/html_data_input.jsonl | 1 + .../extractor/test_extractor_chain.py | 15 +++++++++++ 3 files changed, 42 insertions(+) create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_involve_inline_code.html diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_involve_inline_code.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_involve_inline_code.html new file mode 100644 index 00000000..0f927ee3 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_involve_inline_code.html @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + +
FunctionDescriptionExample
print()Prints a message to the console.print("Hello, World!")
len()Returns the length of an object.len([1, 2, 3])
range()Generates a sequence of numbers.range(1, 10)
diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl index 1efe87b6..5f08bdbf 100644 --- a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl @@ -9,3 +9,4 @@ {"track_id": "rfc-doc", "dataset_name": "test_pipeline_suit", "url": "https://www.test.com","data_source_category": "HTML", "path":"doc.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "legato_doc", "dataset_name": "test_pipeline_suit", "url": "https://www.test.com","data_source_category": "HTML", "path":"legato_docs.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "oracle_doc", "dataset_name": "test_pipeline_suit", "url": "https://docs.oracle.com/en-us/iaas/tools/java/3.57.1/com/oracle/bmc/integration/model/CustomEndpointDetails.html","data_source_category": "HTML", "path":"oracle_doc.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} +{"track_id": "table_involve_inline_code", "dataset_name": "test_table_involve_inline_code", "url": "https://docs.oracle.com/en-us/iaas/tools/java/3.57.1/com/oracle/bmc/integration/model/CustomEndpointDetails.html","data_source_category": "HTML", "path":"table_involve_inline_code.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 4972673b..a6671f4f 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -344,3 +344,18 @@ def test_oracle_doc_comment(self): result = chain.extract(input_data) main_html = result.get_content_list().to_main_html() assert 'public int hashCode()' in main_html + + def test_table_involve_inline_code(self): + """ + table里面包含行内code + Returns: + + """ + chain = ExtractSimpleFactory.create(self.config) + self.assertIsNotNone(chain) + test_data = self.data_json[11] + # Create DataJson from test data + input_data = DataJson(test_data) + result = chain.extract(input_data) + content_list = result.get_content_list()._get_data() + print(content_list) From 0776f6efa0a118bad026147d923052cb662ae3a0 Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 3 Mar 2025 16:58:41 +0800 Subject: [PATCH 18/31] add test case --- tests/llm_web_kit/extractor/test_extractor_chain.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index a6671f4f..ef596204 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -357,5 +357,5 @@ def test_table_involve_inline_code(self): # Create DataJson from test data input_data = DataJson(test_data) result = chain.extract(input_data) - content_list = result.get_content_list()._get_data() - print(content_list) + content_list = result.get_content_list()._get_data()[0][0]['content']['html'] + assert content_list == """
FunctionDescriptionExample
print()Prints a message to the console.print("Hello, World!")
len()Returns the length of an object.len([1, 2, 3])
range()Generates a sequence of numbers.range(1, 10)
""" From 3fda2a69efa527e95275d39dde6f3e26df4045fc Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 3 Mar 2025 17:07:10 +0800 Subject: [PATCH 19/31] fix test case --- tests/llm_web_kit/extractor/test_extractor_chain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index ef596204..884c9860 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -59,7 +59,7 @@ def setUp(self): for line in f: self.data_json.append(json.loads(line.strip())) - assert len(self.data_json) == 11 + assert len(self.data_json) == 12 # Config for HTML extraction self.config = { From e0196bfbb27f473cdd0c45ad2b48342d20e8754b Mon Sep 17 00:00:00 2001 From: dt-yy Date: Wed, 5 Mar 2025 16:28:59 +0800 Subject: [PATCH 20/31] add table tail --- .../extractor/html/recognizer/table.py | 35 +- .../good_data/html/table_tail_text.html | 367 ++++++++++++++++++ .../good_data/html_data_input.jsonl | 3 +- .../extractor/test_extractor_chain.py | 13 +- 4 files changed, 405 insertions(+), 13 deletions(-) create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_tail_text.html diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index fa24dd6d..6908398e 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -196,11 +196,22 @@ def __check_table_include_math_code(self, raw_html: HtmlElement): ] ele_res.extend(ccinterline_codes) else: - ele_res.extend([ - text.strip() - for text in self._build_html_tree(math_item[1]).itertext() - if text.strip() - ]) + tree = self._build_html_tree(math_item[1]) + texts = [] + for element in tree.iter(): + if element.text and element.text.strip(): + text = element.text.strip() + # 如果有tail,直接拼接到text后面 + if element.tail and element.tail.strip(): + text += element.tail.strip() + texts.append(text) + elif element.tail and element.tail.strip(): + # 如果只有tail且前面有内容,则拼接到最后一个text + if texts: + texts[-1] += element.tail.strip() + else: + texts.append(element.tail.strip()) + ele_res.extend(texts) return ele_res def __simplify_td_th_content(self, elem: HtmlElement) -> None: @@ -212,7 +223,8 @@ def __simplify_td_th_content(self, elem: HtmlElement) -> None: parse_res.extend(math_res) for item in list(elem.iterchildren()): elem.remove(item) - elem.text = '
'.join(parse_res) + if parse_res: + elem.text = '
'.join(parse_res) return for child in elem.iter('td', 'th'): self.__simplify_td_th_content(child) @@ -227,18 +239,19 @@ def __get_table_body(self, table_type, table_root): cleaned_attrs = {k: v for k, v in table_root.attrib.items() if k in allowed_attributes} table_root.attrib.clear() table_root.attrib.update(cleaned_attrs) - # text进行strip操作,tail去掉(有较多空换行) + # text进行strip操作,tail保留(部分内容留在tail中) for elem in chain([table_root], table_root.iterdescendants()): - if elem.text: + if elem.text is not None: elem.text = elem.text.strip() - if elem.tail: - elem.tail = None + if elem.tail is not None: + elem.tail = elem.tail.strip() + if not elem.tail: + elem.tail = None self.__simplify_td_th_content(table_root) # 迭代 for child in table_root.iterchildren(): if child is not None: self.__get_table_body(table_type, child) - return self._element_to_html(table_root) def __do_extract_tables(self, root: HtmlElement) -> None: diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_tail_text.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_tail_text.html new file mode 100644 index 00000000..4044b9a3 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_tail_text.html @@ -0,0 +1,367 @@ + + + + + + + + + 🇷🇺 | Show hub - Big-Empty DC++ Dchublist NMDC and ADCs хабов Huburi Хаблист + + + + + + + + + + + + + + + + + + + + + + + + +
+ +

Big-Empty

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Client + https://dchublists.com/clients/FlylinkDC_x64.exe +
StatusOnline | ID: 975
URL + https://dchublists.com/hub-975 +
Address +NMDC | dchub://big-empty.ru +
ASN + Style-Com LLC +
Failover + Not available +
NameBig-Empty
Topic + Not available +
Description + Хаб сети Arbital +
Category + Not available +
Software + PtokaX 0.5.3.0 +
Owner + Self +
Location + RU Russian Federation +
Users + 25 | 55 +
Clones0
Share + 4.39 TB | 90.60 TB +
User limit10000
Share limit0 B
Slot limit0
Hub limit0
Reliability99.04%
Checked + 2024-12-09 03:06:01 | 2021-05-07 +
Votes + +0 | -0 | 0 +
Website + Not available +
Email + Not available +
+
+

Online users

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NickShare
Darv1n1.55 TB
PtokaX0 B
1975628.43 GB
AndyDesktop0 B
Crtyujgfdscvgjh35.54 GB
DaymarixZZZ37.57 GB
Evgeniy_D76.15 GB
Julia0 B
Kuzma0 B
Larsenv0 B
MAXMED8888888864.10 GB
Qwerty_ytr_R724237.12 GB
SERG_B149.65 GB
Sculli156.92 GB
Shareaza404613.03 GB
Soliton14.68 GB
Sweaborg794.15 GB
Viktor138283179.23 GB
[fly]Fire_dU3JR10.72 GB
[fly]Monkey_QGrFy124.72 GB
[fly]Moon_x7m61.13 GB
kotbaun0 B
marcs3.62 GB
minili59.30 GB
y2b4k698df328djei3261.82 GB
+
+
+ +

Comments

+ There are no comments for this hub, you can write one here. +
+
+ + + + +\n\t\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\t\n\t\n\n\n\n\n\n\n\n
\n\t\t\t\t\t\n\t
\n\t\t\t\t\t\t\t\t
\n\t\t\t\n\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\n\n
Выберите свой город:
\n
\n\n
\n
Выберите из списка:
\n
    \n
  • Абакан
  • \n
  • Ачинск
  • \n
  • Альметьевск
  • \n
  • Ангарск
  • \n
  • Архангельск
  • \n
  • Армавир
  • \n
  • Артём
  • \n
  • Арзамас
  • \n
  • Астрахань
  • \n
  • Балаково
  • \n
  • Балашиха
  • \n
  • Барнаул
  • \n
  • Батайск
  • \n
  • Белгород
  • \n
  • Бердск
  • \n
  • Березники
  • \n
  • Бийск
  • \n
  • Благовещенск
  • \n
  • Братск
  • \n
  • Брянск
  • \n
  • Чебоксары
  • \n
  • Челябинск
  • \n
  • Череповец
  • \n
  • Черкесск
  • \n
  • Чита
  • \n
  • Дербент
  • \n
  • Димитровград
  • \n
  • Долгопрудный
  • \n
  • Домодедово
  • \n
  • Дзержинск
  • \n
  • Екатеринбург
  • \n
  • Елец
  • \n
  • Электросталь
  • \n
  • Элиста
  • \n
  • Энгельс
  • \n
  • Ессентуки
  • \n
  • Евпатория
  • \n
  • Грозный
  • \n
  • Хабаровск
  • \n
  • Хасавюрт
  • \n
  • Химки
  • \n
  • Иркутск
  • \n
  • Иваново
  • \n
  • Ижевск
  • \n
  • Йошкар-Ола
  • \n
  • Калининград
  • \n
  • Калуга
  • \n
  • Каменск-Уральский
  • \n
  • Камышин
  • \n
  • Каспийск
  • \n
  • Казань
  • \n
  • Кемерово
  • \n
  • Керчь
  • \n
  • Киров
  • \n
  • Кисловодск
  • \n
  • Коломна
  • \n
  • Комсомольск-на-Амуре
  • \n
  • Копейск
  • \n
  • Королёв
  • \n
  • Кострома
  • \n
  • Ковров
  • \n
  • Краснодар
  • \n
  • Красногорск
  • \n
  • Красноярск
  • \n
  • Курган
  • \n
  • Курск
  • \n
  • Кызыл
  • \n
  • Липецк
  • \n
  • Люберцы
  • \n
  • Магнитогорск
  • \n
  • Махачкала
  • \n
  • Майкоп
  • \n
  • Миасс
  • \n
  • Мурманск
  • \n
  • Муром
  • \n
  • Мытищи
  • \n
  • Набережные Челны
  • \n
  • Находка
  • \n
  • Нальчик
  • \n
  • Назрань
  • \n
  • Нефтекамск
  • \n
  • Нефтеюганск
  • \n
  • Невинномысск
  • \n
  • Нижнекамск
  • \n
  • Нижневартовск
  • \n
  • Нижний Новгород
  • \n
  • Нижний Тагил
  • \n
  • Ногинск
  • \n
  • Норильск
  • \n
  • Новочебоксарск
  • \n
  • Новочеркасск
  • \n
  • Новокуйбышевск
  • \n
  • Новокузнецк
  • \n
  • Новомосковск
  • \n
  • Новороссийск
  • \n
  • Новошахтинск
  • \n
  • Новосибирск
  • \n
  • Новый Уренгой
  • \n
  • Ноябрьск
  • \n
  • Обнинск
  • \n
  • Одинцово
  • \n
  • Октябрьский
  • \n
  • Омск
  • \n
  • Орехово-Зуево
  • \n
  • Оренбург
  • \n
  • Орск
  • \n
  • Орёл
  • \n
  • Пенза
  • \n
  • Пермь
  • \n
  • Первоуральск
  • \n
  • Петропавловск-Камчатский
  • \n
  • Петрозаводск
  • \n
  • Подольск
  • \n
  • Прокопьевск
  • \n
  • Псков
  • \n
  • Пушкино
  • \n
  • Пятигорск
  • \n
  • Раменское
  • \n
  • Реутов
  • \n
  • Ростов-на-Дону
  • \n
  • Рубцовск
  • \n
  • Рязань
  • \n
  • Рыбинск
  • \n
  • Салават
  • \n
  • Самара
  • \n
  • Санкт-Петербург
  • \n
  • Саранск
  • \n
  • Саратов
  • \n
  • Сергиев Посад
  • \n
  • Серпухов
  • \n
  • Севастополь
  • \n
  • Северодвинск
  • \n
  • Северск
  • \n
  • Шахты
  • \n
  • Щёлково
  • \n
  • Симферополь
  • \n
  • Смоленск
  • \n
  • Сочи
  • \n
  • Старый Оскол
  • \n
  • Ставрополь
  • \n
  • Стерлитамак
  • \n
  • Сургут
  • \n
  • Сыктывкар
  • \n
  • Сызрань
  • \n
  • Таганрог
  • \n
  • Тамбов
  • \n
  • Тольятти
  • \n
  • Томск
  • \n
  • Тула
  • \n
  • Тверь
  • \n
  • Тюмень
  • \n
  • Уфа
  • \n
  • Улан-Удэ
  • \n
  • Ульяновск
  • \n
  • Уссурийск
  • \n
  • Великий Новгород
  • \n
  • Владикавказ
  • \n
  • Владимир
  • \n
  • Владивосток
  • \n
  • Волгодонск
  • \n
  • Волгоград
  • \n
  • Вологда
  • \n
  • Волжский
  • \n
  • Воронеж
  • \n
  • Якутск
  • \n
  • Ярославль
  • \n
  • Южно-Сахалинск
  • \n
  • Жуковский
  • \n
  • Златоуст
  • \n
\n
\n
\n\n\n\n
\n\n\n\n
\n
Не нашли свой город?
\n \n
\n\n
\n\t\n\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\tHot line8 800 752 18 22\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t
\n\t\t\t\t\t
\n\t\t
\n\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t \n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t\t\t\n\t\t\t
\n\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t\n\t\t\t\t
\n\t\t\t
\t\t\n\t\t\t\n\t\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t\t\t\t\t\n\t\t\t\t\t\t
\n\t\t\t\t\n\t\t\t\t2\n\t\t\t
\n\t\t\t\t\t
\n\t\t\t
\n\n\t\t\t\n\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t \n\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t2\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t\t\n\t\t
\n\t
\n\t\t
\n\t\n
\n
\n\n\t\t\n\t
\n\t
\n\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t
\n\t
\t\n\t\t
\n\t\t\t
\n\t\t\t\t
\n\t\n
\t\t\t
\n\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t

Miaflow в Северске

\n
\n\t
\n\t\t
Рейтинг 5.00 из 5 на основе опроса 3 пользователей
\t\t\t\t\t\t\t\t(3 отзыва клиентов)\n\t\t\t\t\t\t
\n\t
  В наличии
\n
\n\t\t\t
\n\t
84 
\n\t\t
\n\t\t\t\t\t\t\t\t\n\t\t\t
\n
\n
\n\t

Miaflow — это инновационный крем для омоложения лица, разработанный с использованием передовых технологий. Его уникальная формула, насыщенная ценными компонентами природы, обеспечивает интенсивный уход за кожей, возвращая ей молодость и сияние.

\n
\n\t\t\t\t
Заказать
\n\n
\n\n\t\n\t\n\tКатегория: Препараты для омоложения\n\t\n\t\n
\n
\n
\n
\n

* Не является лекарственным средством

\n
\n
\n
\n \"Оплата\"\n
\n

Оплата:

\n

при получении, наличными или банковской картой

\n
\n
\n
\n \"Доставка\"\n
\n

Доставка в Северске:

\n

1 - 7 дней, почтой или транспортными компаниями

\n
\n
\n
\n
Поделиться: 
\t\t\t\t
\n\t\t\t
\n\t\t
\n\t
\n\t\n
\n\t\t
\n\t\t
\n\t\t\t\n\t\t\t\n\t\t
\n\t\t
\n\t\t
Заказать\n\t\t\tMiaflow\t\t
\n\t\t\t
\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t

Преимущества

\n
    \n
  • Уменьшение морщин и линий
  • \n
  • Повышение упругости и эластичности кожи
  • \n
  • Омолаживающий эффект с первого применения
  • \n
  • Защита от вредного воздействия окружающей среды
  • \n
  • Глубокое увлажнение и питание
  • \n
\n

Принцип действия Miaflow

\n

Miaflow активирует естественные процессы обновления кожи, восстанавливая ее структуру и придавая заметный лифтинг-эффект. Это достигается благодаря уникальной комбинации активных ингредиентов.

\n

Состав Miaflow:

\n
    \n
  1. Концентрат пантов алтайского марала: Стимулирует обновление клеток, укрепляет структуру кожи.
  2. \n
  3. Концентрат трепанга: Обеспечивает увлажнение и смягчение, борется с признаками усталости.
  4. \n
  5. Каменное масло: Питает и улучшает тонус кожи.
  6. \n
  7. Живица кедровая и лиственничная: Прекрасные антисептики, поддерживают чистоту пор, способствуют заживлению.
  8. \n
  9. Эфирные масла кедра, тыквы, конопли, пихты, облепихи, чайного дерева, гвоздики: Обеспечивают ароматерапевтический эффект и усиливают регенерацию кожи.
  10. \n
\n

Клинические исследования

\n

Проведенные исследования показали, что более 90% участников заметили улучшение состояния кожи после использования Miaflow. Восстановление упругости, сокращение морщин, и природное сияние — вот результаты, подтвержденные клинически.

\n

Показания к применению

\n
    \n
  • Сухая и увядающая кожа
  • \n
  • Первые признаки старения
  • \n
  • Потеря упругости и эластичности
  • \n
\n

Способ применения Miaflow

\n

Наносите крем на чистую кожу лица и шеи массажными движениями до полного впитывания. Используйте утром и вечером для достижения максимального эффекта.

\n

Противопоказания Miaflow

\n

Не рекомендуется использовать при индивидуальной непереносимости к компонентам. Перед применением рекомендуется провести тест на небольшом участке кожи. В случае раздражения прекратите использование.

\n\t\t\t\t\t\t\t\t

Где купить Miaflow?

\n

Miaflow не продается в обычных аптеках в Северске и других регионах России. Однако, вы можете купить его у нас на сайте по выгодной цене 84  с удобной доставкой. Успешно достигните своих целей с данным средством!

\n\t\t\t\t\n\t\t\t\t
\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t \n
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Рейтинг:
Рейтинг 5.00 из 5 на основе опроса 3 пользователей
Тип товара: Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84 
\n
\n\t\t\t\t\t\t\t\\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t
\n\t\t

3 отзывов о Miaflow

\n\t\t\t\t\t
    \n\t\t\t\t
  1. \n\t
    \n\t\t
    \n\n\t\t\t\n\t\t\t
    \n\n\t\t\t\t
    Оценка 5 из 5
    \n\t

    \n\t\tЕлена Евстегнеева \n\t\t\t\t \n\t

    \n\n\t\n\t\t\t
    \n\t\t
    \n\t\t

    Моя кожа претерпела настоящую революцию с Miaflow! Уже через неделю заметила, как морщины стали менее заметными, а цвет лица стал более ровным. Крем приятно наносится, быстро впитывается, и самое главное — результат на лице!

    \n
    \t
    \n
  2. \n
  3. \n\t
    \n\t\t
    \n\n\t\t\t\n\t\t\t
    \n\n\t\t\t\t
    Оценка 5 из 5
    \n\t

    \n\t\tЕрмаков Иван \n\t\t\t\t \n\t

    \n\n\t\n\t\t\t
    \n\t\t
    \n\t\t

    Совершенно случайно попробовал, и теперь я не могу себе представить свой уход без него. Кожа стала более упругой, а яркие следы усталости просто исчезли. Отличный продукт, с которым я чувствую себя настоящим джентльменом!

    \n
    \t
    \n
  4. \n
  5. \n\t
    \n\t\t
    \n\n\t\t\t\n\t\t\t
    \n\n\t\t\t\t
    Оценка 5 из 5
    \n\t

    \n\t\tЦветкова Ксения \n\t\t\t\t \n\t

    \n\n\t\n\t\t\t
    \n\t\t
    \n\t\t

    Мне было сложно найти подходящий уход для кожи после 50, но этот крем превзошел все мои ожидания! Мои друзья даже спрашивают, что я делаю, чтобы выглядеть так молодо. Этот крем — настоящее волшебство для кожи, и я рекомендую его каждой женщине!

    \n
    \t
    \n
  6. \n\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t

Средний рейтинг

\n\t\t\t\t\t\t

5.00

\n\t\t\t\t\t\t
Оценка 5.00 из 5
\t\t\t\t\t\t
\n\t\t\t\t\t\t\t3 Отзыв\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
5
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
100%
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
4
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
0%
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
3
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
0%
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
2
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
0%
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
1
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
0%
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\tНапишите отзыв

Ваш адрес email не будет опубликован. Обязательные поля помечены *

\n

\n

\n\n

\t
\n\t\t\t\t
\n\t\t\t
\n\t\t
\n\t\t
\n
\t\t\t\t\t\t\t\t\n\t\t\t\t
\n\t\t\t\t\t\t\t\t\n\t\t\t\t
\n\t\t\t\t\t
\"Miaflow\"
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\tMiaflow\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t84 ₽\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n
\n
\n
\n\n\n\n\n\n
\n


\n

\n

Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

\n


\n

\n
\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\tMiaflow\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t84 ₽\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
\"Miaflow\"
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n
\n
\n
\n\n\n\n\n\n
\n


\n

\n

Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

\n


\n

\n
\t\t\t\t\t\t\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t
\n\n\t\t
\n\n\t
\n\n
\n\n
\n\n
\n\n\n\n\t
\n\n\t\t\t\t\t

Сопутствующие товары

\n\t\t\t\t\n\t\t
    \n\t\t\t\n\t\t\t\t\t
  • \n\t\t
    \n\t
    \n\t\t\t
    \n\t\t\t\t\t\t\t\t\n\t\t\t\t\t
    -25%
    \t\t\t
    \n\"Venzen\"\t\t
    \n\t\t\t
    Quick View \t\t
    \n\t\t\t
    \n\t
    \n\t\t
    \n\t\t\t
    \n\t\t\t\t\t\t\n\t\t\t\n\t\t\t\n
    Оценка 5.00 из 5
    \n\n\t1,990  1,490 \n\n
    Заказать\n
    \n\n\n
    \n\t\t\t
    \n
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\tVenzen\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\t1490 ₽\n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \"Venzen\"\n\t\t\t\t\t\t\t\t\tsrc=\"https://1bad.ru/wp-content/uploads/2022/09/venzen.jpg
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n
    \n
    \n
    \n\n\n\n\n\n
    \n


    \n

    \n

    Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

    \n


    \n

    \n
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t
    \n\n
    \n\t\t
    \n\t
    \n
    \n
  • \n\t\t\t\n\t\t\t\t\t
  • \n\t\t
    \n\t
    \n\t\t\t
    \n\t\t\t\t\t\t\t\t\n\t\t\t
    \n\"Night\t\t
    \n\t\t\t
    Quick View \t\t
    \n\t\t\t
    \n\t
    \n\t\t
    \n\t\t\t
    \n\t\t\t\t\t\t\n\t\t\t\n\t\t\t\n
    Оценка 5.00 из 5
    \n\n\t149 \n\n
    Заказать\n
    \n\n\n
    \n\t\t\t
    \n
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\tNight Miracle\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\t149.00 ₽\n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \"Night
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n
    \n
    \n
    \n\n\n\n\n\n
    \n


    \n

    \n

    Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

    \n


    \n

    \n
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t
    \n\n
    \n\t\t
    \n\t
    \n
    \n
  • \n\t\t\t\n\t\t\t\t\t
  • \n\t\t
    \n\t
    \n\t\t\t
    \n\t\t\t\t\t\t\t\t\n\t\t\t
    \n\"Молодильный\t\t
    \n\t\t\t
    Quick View \t\t
    \n\t\t\t
    \n\t
    \n\t\t
    \n\t\t\t
    \n\t\t\t\t\t\t\n\t\t\t\n\t\t\t\n
    Оценка 5.00 из 5
    \n\n\t149 \n\n
    Заказать\n
    \n\n\n
    \n\t\t\t
    \n
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\tМолодильный спас\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\t149.00 ₽\n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \"Молодильный
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n
    \n
    \n
    \n\n\n\n\n\n
    \n


    \n

    \n

    Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

    \n


    \n

    \n
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t
    \n\n
    \n\t\t
    \n\t
    \n
    \n
  • \n\t\t\t\n\t\t\t\t\t
  • \n\t\t
    \n\t
    \n\t\t\t
    \n\t\t\t\t\t\t\t\t\n\t\t\t
    \n\"Zenza\t\t
    \n\t\t\t
    Quick View \t\t
    \n\t\t\t
    \n\t
    \n\t\t
    \n\t\t\t
    \n\t\t\t\t\t\t\n\t\t\t\n\t\t\t\n
    Оценка 5.00 из 5
    \n\n\t147 \n\n
    Заказать\n
    \n\n\n
    \n\t\t\t
    \n
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\tZenza Cream\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\t147.00 ₽\n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \"Zenza
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n
    \n
    \n
    \n\n\n\n\n\n
    \n


    \n

    \n

    Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

    \n


    \n

    \n
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t
    \n\n
    \n\t\t
    \n\t
    \n
    \n
  • \n\t\t\t\n\t\t
\n\t
\n\t\t\n
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t
\t\t\t\t\t
\n\t
\n\n
\t\n\t\t
\n\t\t\t
\n\t\t\t\tЧто Вы ищете?\n\t\t\t\t
Закрыть
\n\t\t\t
\n\t\t\t\t
\n\t\t\t\n\t\t
\n\t\t\t\n\t\t\t\n\t\t\t
\n\t\t\t\t
    \n\t\t\t\t
\n\t\t\t
\n\t\t
\n\t\t\n\t
\n\t\t\n\t\t
\t\n\t
\n\n\n
\n\n\t
\n\t\t
\n\n\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t

Вся информация на сайте - справочная. Перед применением лекарственных препаратов проконсультируйтесь с врачом. Дистанционная продажа БАД и лекарственных средств не осуществляется.

\n\n\t\t\t\t\t\t
\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t
\n\n\t\t\t\t\t\t

© 2023 1bad.ru 18+. Все права защищены.

\n\t\t\t\t\t
\n\t\t\t\t\t\n\t\t\t\t\t
\n\t\t\t\t\t\t

Адрес: г. Северск, ул. Курчатова, 11a

\n\t\t\t\t\t\t

Телефон: 8 800 752 18 22

\n\t\t\t\t\t\t

Почта: seversk@1bad.ru

\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t
\n\t
\n\n
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\t\t\n\t\t\n
\r\n
\r\n
\r\n
\r\n
\r\n
\r\n \r\n
\r\n
\r\n
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t\t
\r\n
\r\n
\r\n
\r\n
\r\n
    \r\n
  • \r\n
  • \r\n
\r\n\t\t\t\t\t\t\t\t\t\tSelect the fields to be shown. Others will be hidden. Drag and drop to rearrange the order.
    \r\n\t\t\t\t\t\t\t\t\t\t\t
  • Image
  • SKU
  • Rating
  • Price
  • Stock
  • Availability
  • Add to cart
  • Description
  • Content
  • Weight
  • Dimensions
  • Additional information
  • Attributes
  • Custom attributes
  • Custom fields
\r\n
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t
\r\n
\r\n
\r\n
\r\n
\r\n \r\n
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t\t\t\t\t \r\n\t\t\t\t\t\t\t\t\t
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t\t\t\t
\r\n\t\t\t\t\t\t\t\t\t\tClick outside to hide the compare bar
\r\n\t\t\t\t\t\t\t\t \r\n\t\t\t\t\t\t\t\t \r\n\t\t\t\t\t\t\t\t
\r\n
\r\n
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t\t\t\t\tCompare
\r\n
\r\n
\r\n
\r\n Compare\r\n \r\n × \r\n
\r\n
\r\n
\r\n Let's Compare!\r\n Continue shopping\r\n
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t
\n\n\n\n\n\t\n\t\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" \ No newline at end of file +\n\n\n\n\n\t\n\t\n\t\n\t\n\n\t\n\tMiaflow крем для лица: купить в Северске, цены в интернет-аптеке - 1bad.ru\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\t\n\t\n\n\n\n\n\n\n\n
\n\t\t\t\t\t\n\t
\n\t\t\t\t\t\t\t\t
\n\t\t\t\n\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\n\n
Выберите свой город:
\n
\n\n
\n
Выберите из списка:
\n
    \n
  • Абакан
  • \n
  • Ачинск
  • \n
  • Альметьевск
  • \n
  • Ангарск
  • \n
  • Архангельск
  • \n
  • Армавир
  • \n
  • Артём
  • \n
  • Арзамас
  • \n
  • Астрахань
  • \n
  • Балаково
  • \n
  • Балашиха
  • \n
  • Барнаул
  • \n
  • Батайск
  • \n
  • Белгород
  • \n
  • Бердск
  • \n
  • Березники
  • \n
  • Бийск
  • \n
  • Благовещенск
  • \n
  • Братск
  • \n
  • Брянск
  • \n
  • Чебоксары
  • \n
  • Челябинск
  • \n
  • Череповец
  • \n
  • Черкесск
  • \n
  • Чита
  • \n
  • Дербент
  • \n
  • Димитровград
  • \n
  • Долгопрудный
  • \n
  • Домодедово
  • \n
  • Дзержинск
  • \n
  • Екатеринбург
  • \n
  • Елец
  • \n
  • Электросталь
  • \n
  • Элиста
  • \n
  • Энгельс
  • \n
  • Ессентуки
  • \n
  • Евпатория
  • \n
  • Грозный
  • \n
  • Хабаровск
  • \n
  • Хасавюрт
  • \n
  • Химки
  • \n
  • Иркутск
  • \n
  • Иваново
  • \n
  • Ижевск
  • \n
  • Йошкар-Ола
  • \n
  • Калининград
  • \n
  • Калуга
  • \n
  • Каменск-Уральский
  • \n
  • Камышин
  • \n
  • Каспийск
  • \n
  • Казань
  • \n
  • Кемерово
  • \n
  • Керчь
  • \n
  • Киров
  • \n
  • Кисловодск
  • \n
  • Коломна
  • \n
  • Комсомольск-на-Амуре
  • \n
  • Копейск
  • \n
  • Королёв
  • \n
  • Кострома
  • \n
  • Ковров
  • \n
  • Краснодар
  • \n
  • Красногорск
  • \n
  • Красноярск
  • \n
  • Курган
  • \n
  • Курск
  • \n
  • Кызыл
  • \n
  • Липецк
  • \n
  • Люберцы
  • \n
  • Магнитогорск
  • \n
  • Махачкала
  • \n
  • Майкоп
  • \n
  • Миасс
  • \n
  • Мурманск
  • \n
  • Муром
  • \n
  • Мытищи
  • \n
  • Набережные Челны
  • \n
  • Находка
  • \n
  • Нальчик
  • \n
  • Назрань
  • \n
  • Нефтекамск
  • \n
  • Нефтеюганск
  • \n
  • Невинномысск
  • \n
  • Нижнекамск
  • \n
  • Нижневартовск
  • \n
  • Нижний Новгород
  • \n
  • Нижний Тагил
  • \n
  • Ногинск
  • \n
  • Норильск
  • \n
  • Новочебоксарск
  • \n
  • Новочеркасск
  • \n
  • Новокуйбышевск
  • \n
  • Новокузнецк
  • \n
  • Новомосковск
  • \n
  • Новороссийск
  • \n
  • Новошахтинск
  • \n
  • Новосибирск
  • \n
  • Новый Уренгой
  • \n
  • Ноябрьск
  • \n
  • Обнинск
  • \n
  • Одинцово
  • \n
  • Октябрьский
  • \n
  • Омск
  • \n
  • Орехово-Зуево
  • \n
  • Оренбург
  • \n
  • Орск
  • \n
  • Орёл
  • \n
  • Пенза
  • \n
  • Пермь
  • \n
  • Первоуральск
  • \n
  • Петропавловск-Камчатский
  • \n
  • Петрозаводск
  • \n
  • Подольск
  • \n
  • Прокопьевск
  • \n
  • Псков
  • \n
  • Пушкино
  • \n
  • Пятигорск
  • \n
  • Раменское
  • \n
  • Реутов
  • \n
  • Ростов-на-Дону
  • \n
  • Рубцовск
  • \n
  • Рязань
  • \n
  • Рыбинск
  • \n
  • Салават
  • \n
  • Самара
  • \n
  • Санкт-Петербург
  • \n
  • Саранск
  • \n
  • Саратов
  • \n
  • Сергиев Посад
  • \n
  • Серпухов
  • \n
  • Севастополь
  • \n
  • Северодвинск
  • \n
  • Северск
  • \n
  • Шахты
  • \n
  • Щёлково
  • \n
  • Симферополь
  • \n
  • Смоленск
  • \n
  • Сочи
  • \n
  • Старый Оскол
  • \n
  • Ставрополь
  • \n
  • Стерлитамак
  • \n
  • Сургут
  • \n
  • Сыктывкар
  • \n
  • Сызрань
  • \n
  • Таганрог
  • \n
  • Тамбов
  • \n
  • Тольятти
  • \n
  • Томск
  • \n
  • Тула
  • \n
  • Тверь
  • \n
  • Тюмень
  • \n
  • Уфа
  • \n
  • Улан-Удэ
  • \n
  • Ульяновск
  • \n
  • Уссурийск
  • \n
  • Великий Новгород
  • \n
  • Владикавказ
  • \n
  • Владимир
  • \n
  • Владивосток
  • \n
  • Волгодонск
  • \n
  • Волгоград
  • \n
  • Вологда
  • \n
  • Волжский
  • \n
  • Воронеж
  • \n
  • Якутск
  • \n
  • Ярославль
  • \n
  • Южно-Сахалинск
  • \n
  • Жуковский
  • \n
  • Златоуст
  • \n
\n
\n
\n\n\n\n
\n\n\n\n
\n
Не нашли свой город?
\n \n
\n\n
\n\t\n\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\tHot line8 800 752 18 22\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t
\n\t\t\t\t\t
\n\t\t
\n\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t \n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t\t\t\n\t\t\t
\n\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t\n\t\t\t\t
\n\t\t\t
\t\t\n\t\t\t\n\t\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t\t\t\t\t\n\t\t\t\t\t\t
\n\t\t\t\t\n\t\t\t\t2\n\t\t\t
\n\t\t\t\t\t
\n\t\t\t
\n\n\t\t\t\n\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t \n\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t2\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t\t\n\t\t
\n\t
\n\t\t
\n\t\n
\n
\n\n\t\t\n\t
\n\t
\n\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t
\n\t
\t\n\t\t
\n\t\t\t
\n\t\t\t\t
\n\t\n
\t\t\t
\n\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t

Miaflow в Северске

\n
\n\t
\n\t\t
Рейтинг 5.00 из 5 на основе опроса 3 пользователей
\t\t\t\t\t\t\t\t(3 отзыва клиентов)\n\t\t\t\t\t\t
\n\t
  В наличии
\n
\n\t\t\t
\n\t
84 
\n\t\t
\n\t\t\t\t\t\t\t\t\n\t\t\t
\n
\n
\n\t

Miaflow — это инновационный крем для омоложения лица, разработанный с использованием передовых технологий. Его уникальная формула, насыщенная ценными компонентами природы, обеспечивает интенсивный уход за кожей, возвращая ей молодость и сияние.

\n
\n\t\t\t\t
Заказать
\n\n
\n\n\t\n\t\n\tКатегория: Препараты для омоложения\n\t\n\t\n
\n
\n
\n
\n

* Не является лекарственным средством

\n
\n
\n
\n \"Оплата\"\n
\n

Оплата:

\n

при получении, наличными или банковской картой

\n
\n
\n
\n \"Доставка\"\n
\n

Доставка в Северске:

\n

1 - 7 дней, почтой или транспортными компаниями

\n
\n
\n
\n
Поделиться: 
\t\t\t\t
\n\t\t\t
\n\t\t
\n\t
\n\t\n
\n\t\t
\n\t\t
\n\t\t\t\n\t\t\t\n\t\t
\n\t\t
\n\t\t
Заказать\n\t\t\tMiaflow\t\t
\n\t\t\t
\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t

Преимущества

\n
    \n
  • Уменьшение морщин и линий
  • \n
  • Повышение упругости и эластичности кожи
  • \n
  • Омолаживающий эффект с первого применения
  • \n
  • Защита от вредного воздействия окружающей среды
  • \n
  • Глубокое увлажнение и питание
  • \n
\n

Принцип действия Miaflow

\n

Miaflow активирует естественные процессы обновления кожи, восстанавливая ее структуру и придавая заметный лифтинг-эффект. Это достигается благодаря уникальной комбинации активных ингредиентов.

\n

Состав Miaflow:

\n
    \n
  1. Концентрат пантов алтайского марала: Стимулирует обновление клеток, укрепляет структуру кожи.
  2. \n
  3. Концентрат трепанга: Обеспечивает увлажнение и смягчение, борется с признаками усталости.
  4. \n
  5. Каменное масло: Питает и улучшает тонус кожи.
  6. \n
  7. Живица кедровая и лиственничная: Прекрасные антисептики, поддерживают чистоту пор, способствуют заживлению.
  8. \n
  9. Эфирные масла кедра, тыквы, конопли, пихты, облепихи, чайного дерева, гвоздики: Обеспечивают ароматерапевтический эффект и усиливают регенерацию кожи.
  10. \n
\n

Клинические исследования

\n

Проведенные исследования показали, что более 90% участников заметили улучшение состояния кожи после использования Miaflow. Восстановление упругости, сокращение морщин, и природное сияние — вот результаты, подтвержденные клинически.

\n

Показания к применению

\n
    \n
  • Сухая и увядающая кожа
  • \n
  • Первые признаки старения
  • \n
  • Потеря упругости и эластичности
  • \n
\n

Способ применения Miaflow

\n

Наносите крем на чистую кожу лица и шеи массажными движениями до полного впитывания. Используйте утром и вечером для достижения максимального эффекта.

\n

Противопоказания Miaflow

\n

Не рекомендуется использовать при индивидуальной непереносимости к компонентам. Перед применением рекомендуется провести тест на небольшом участке кожи. В случае раздражения прекратите использование.

\n\t\t\t\t\t\t\t\t

Где купить Miaflow?

\n

Miaflow не продается в обычных аптеках в Северске и других регионах России. Однако, вы можете купить его у нас на сайте по выгодной цене 84  с удобной доставкой. Успешно достигните своих целей с данным средством!

\n\t\t\t\t\n\t\t\t\t
\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t \n
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Рейтинг:
Рейтинг 5.00 из 5 на основе опроса 3 пользователей
Тип товара: Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84 
\n
\n\t\t\t\t\t\t\t\\n\t\t\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t
\n\t\t

3 отзывов о Miaflow

\n\t\t\t\t\t
    \n\t\t\t\t
  1. \n\t
    \n\t\t
    \n\n\t\t\t\n\t\t\t
    \n\n\t\t\t\t
    Оценка 5 из 5
    \n\t

    \n\t\tЕлена Евстегнеева \n\t\t\t\t \n\t

    \n\n\t\n\t\t\t
    \n\t\t
    \n\t\t

    Моя кожа претерпела настоящую революцию с Miaflow! Уже через неделю заметила, как морщины стали менее заметными, а цвет лица стал более ровным. Крем приятно наносится, быстро впитывается, и самое главное — результат на лице!

    \n
    \t
    \n
  2. \n
  3. \n\t
    \n\t\t
    \n\n\t\t\t\n\t\t\t
    \n\n\t\t\t\t
    Оценка 5 из 5
    \n\t

    \n\t\tЕрмаков Иван \n\t\t\t\t \n\t

    \n\n\t\n\t\t\t
    \n\t\t
    \n\t\t

    Совершенно случайно попробовал, и теперь я не могу себе представить свой уход без него. Кожа стала более упругой, а яркие следы усталости просто исчезли. Отличный продукт, с которым я чувствую себя настоящим джентльменом!

    \n
    \t
    \n
  4. \n
  5. \n\t
    \n\t\t
    \n\n\t\t\t\n\t\t\t
    \n\n\t\t\t\t
    Оценка 5 из 5
    \n\t

    \n\t\tЦветкова Ксения \n\t\t\t\t \n\t

    \n\n\t\n\t\t\t
    \n\t\t
    \n\t\t

    Мне было сложно найти подходящий уход для кожи после 50, но этот крем превзошел все мои ожидания! Мои друзья даже спрашивают, что я делаю, чтобы выглядеть так молодо. Этот крем — настоящее волшебство для кожи, и я рекомендую его каждой женщине!

    \n
    \t
    \n
  6. \n\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t

Средний рейтинг

\n\t\t\t\t\t\t

5.00

\n\t\t\t\t\t\t
Оценка 5.00 из 5
\t\t\t\t\t\t
\n\t\t\t\t\t\t\t3 Отзыв\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
5
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
100%
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
4
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
0%
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
3
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
0%
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
2
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
0%
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
1
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
0%
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\tНапишите отзыв

Ваш адрес email не будет опубликован. Обязательные поля помечены *

\n

\n

\n\n

\t
\n\t\t\t\t
\n\t\t\t
\n\t\t
\n\t\t
\n
\t\t\t\t\t\t\t\t\n\t\t\t\t
\n\t\t\t\t\t\t\t\t\n\t\t\t\t
\n\t\t\t\t\t
\"Miaflow\"
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\tMiaflow\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t84 ₽\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n
\n
\n
\n\n\n\n\n\n
\n


\n

\n

Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

\n


\n

\n
\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\tMiaflow\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t84 ₽\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t\t\t
\"Miaflow\"
\n\t\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t
\n\t\t\t\t\t\t\t
\n
\n
\n
\n\n\n\n\n\n
\n


\n

\n

Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

\n


\n

\n
\t\t\t\t\t\t\n\t\t\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t
\n\n\t\t
\n\n\t
\n\n
\n\n
\n\n
\n\n\n\n\t
\n\n\t\t\t\t\t

Сопутствующие товары

\n\t\t\t\t\n\t\t
    \n\t\t\t\n\t\t\t\t\t
  • \n\t\t
    \n\t
    \n\t\t\t
    \n\t\t\t\t\t\t\t\t\n\t\t\t\t\t
    -25%
    \t\t\t
    \n\"Venzen\"\t\t
    \n\t\t\t
    Quick View \t\t
    \n\t\t\t
    \n\t
    \n\t\t
    \n\t\t\t
    \n\t\t\t\t\t\t\n\t\t\t\n\t\t\t\n
    Оценка 5.00 из 5
    \n\n\t1,990  1,490 \n\n
    Заказать\n
    \n\n\n
    \n\t\t\t
    \n
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\tVenzen\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\t1490 ₽\n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \"Venzen\"\n\t\t\t\t\t\t\t\t\tsrc=\"https://1bad.ru/wp-content/uploads/2022/09/venzen.jpg
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n
    \n
    \n
    \n\n\n\n\n\n
    \n


    \n

    \n

    Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

    \n


    \n

    \n
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t
    \n\n
    \n\t\t
    \n\t
    \n
    \n
  • \n\t\t\t\n\t\t\t\t\t
  • \n\t\t
    \n\t
    \n\t\t\t
    \n\t\t\t\t\t\t\t\t\n\t\t\t
    \n\"Night\t\t
    \n\t\t\t
    Quick View \t\t
    \n\t\t\t
    \n\t
    \n\t\t
    \n\t\t\t
    \n\t\t\t\t\t\t\n\t\t\t\n\t\t\t\n
    Оценка 5.00 из 5
    \n\n\t149 \n\n
    Заказать\n
    \n\n\n
    \n\t\t\t
    \n
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\tNight Miracle\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\t149.00 ₽\n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \"Night
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n
    \n
    \n
    \n\n\n\n\n\n
    \n


    \n

    \n

    Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

    \n


    \n

    \n
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t
    \n\n
    \n\t\t
    \n\t
    \n
    \n
  • \n\t\t\t\n\t\t\t\t\t
  • \n\t\t
    \n\t
    \n\t\t\t
    \n\t\t\t\t\t\t\t\t\n\t\t\t
    \n\"Молодильный\t\t
    \n\t\t\t
    Quick View \t\t
    \n\t\t\t
    \n\t
    \n\t\t
    \n\t\t\t
    \n\t\t\t\t\t\t\n\t\t\t\n\t\t\t\n
    Оценка 5.00 из 5
    \n\n\t149 \n\n
    Заказать\n
    \n\n\n
    \n\t\t\t
    \n
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\tМолодильный спас\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\t149.00 ₽\n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \"Молодильный
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n
    \n
    \n
    \n\n\n\n\n\n
    \n


    \n

    \n

    Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

    \n


    \n

    \n
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t
    \n\n
    \n\t\t
    \n\t
    \n
    \n
  • \n\t\t\t\n\t\t\t\t\t
  • \n\t\t
    \n\t
    \n\t\t\t
    \n\t\t\t\t\t\t\t\t\n\t\t\t
    \n\"Zenza\t\t
    \n\t\t\t
    Quick View \t\t
    \n\t\t\t
    \n\t
    \n\t\t
    \n\t\t\t
    \n\t\t\t\t\t\t\n\t\t\t\n\t\t\t\n
    Оценка 5.00 из 5
    \n\n\t147 \n\n
    Заказать\n
    \n\n\n
    \n\t\t\t
    \n
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\tZenza Cream\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t\t147.00 ₽\n\t\t\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t\t\t
    \"Zenza
    \n\t\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t
    \n\t\t\t\t\t\t
    \n
    \n
    \n
    \n\n\n\n\n\n
    \n


    \n

    \n

    Нажимая на кнопку, вы соглашаетесь с политикой конфиденциальности.

    \n


    \n

    \n
    \n\t\t\t\t\t
    \n\t\t\t\t
    \n\t\t\t
    \n\n
    \n\t\t
    \n\t
    \n
    \n
  • \n\t\t\t\n\t\t
\n\t
\n\t\t\n
\n\t\t\t\t\t\t\t\t\t
\n\t\t\t\t
\t\t\t\t\t
\n\t
\n\n
\t\n\t\t
\n\t\t\t
\n\t\t\t\tЧто Вы ищете?\n\t\t\t\t
Закрыть
\n\t\t\t
\n\t\t\t\t
\n\t\t\t\n\t\t
\n\t\t\t\n\t\t\t\n\t\t\t
\n\t\t\t\t
    \n\t\t\t\t
\n\t\t\t
\n\t\t
\n\t\t\n\t
\n\t\t\n\t\t
\t\n\t
\n\n\n
\n\n\t
\n\t\t
\n\n\t\t\t
\n\t\t\t\t
\n\t\t\t\t\t
\n\t\t\t\t\t\t

Вся информация на сайте - справочная. Перед применением лекарственных препаратов проконсультируйтесь с врачом. Дистанционная продажа БАД и лекарственных средств не осуществляется.

\n\n\t\t\t\t\t\t
\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t
\n\n\t\t\t\t\t\t

© 2023 1bad.ru 18+. Все права защищены.

\n\t\t\t\t\t
\n\t\t\t\t\t\n\t\t\t\t\t
\n\t\t\t\t\t\t

Адрес: г. Северск, ул. Курчатова, 11a

\n\t\t\t\t\t\t

Телефон: 8 800 752 18 22

\n\t\t\t\t\t\t

Почта: seversk@1bad.ru

\n\t\t\t\t\t
\n\t\t\t\t
\n\t\t\t
\n\t\t
\n\t
\n\n
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\t\t\n\t\t\n
\r\n
\r\n
\r\n
\r\n
\r\n
\r\n \r\n
\r\n
\r\n
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t\t
\r\n
\r\n
\r\n
\r\n
\r\n
    \r\n
  • \r\n
  • \r\n
\r\n\t\t\t\t\t\t\t\t\t\tSelect the fields to be shown. Others will be hidden. Drag and drop to rearrange the order.
    \r\n\t\t\t\t\t\t\t\t\t\t\t
  • Image
  • SKU
  • Rating
  • Price
  • Stock
  • Availability
  • Add to cart
  • Description
  • Content
  • Weight
  • Dimensions
  • Additional information
  • Attributes
  • Custom attributes
  • Custom fields
\r\n
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t
\r\n
\r\n
\r\n
\r\n
\r\n \r\n
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t\t\t\t\t \r\n\t\t\t\t\t\t\t\t\t
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t\t\t\t
\r\n\t\t\t\t\t\t\t\t\t\tClick outside to hide the compare bar
\r\n\t\t\t\t\t\t\t\t \r\n\t\t\t\t\t\t\t\t \r\n\t\t\t\t\t\t\t\t
\r\n
\r\n
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t\t\t\t\tCompare
\r\n
\r\n
\r\n
\r\n Compare\r\n \r\n × \r\n
\r\n
\r\n
\r\n Let's Compare!\r\n Continue shopping\r\n
\r\n
\r\n
\r\n
\r\n\t\t\t\t\t
\n\n\n\n\n\t\n\t\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \ No newline at end of file From 94f627f4e8709980ae91a01a1919dab25fc266b6 Mon Sep 17 00:00:00 2001 From: dt-yy Date: Thu, 6 Mar 2025 12:59:54 +0800 Subject: [PATCH 24/31] remove enter in table --- llm_web_kit/extractor/html/extractor.py | 2 +- llm_web_kit/libs/html_utils.py | 17 +- .../html/test_table_elem_include_enter.html | 3136 +++++++++++++++++ .../good_data/html_data_input.jsonl | 3 +- .../extractor/html/recognizer/test_table.py | 3 + .../extractor/test_extractor_chain.py | 19 +- 6 files changed, 3175 insertions(+), 5 deletions(-) create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/test_table_elem_include_enter.html diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index 1d3facb3..f68950d3 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -290,7 +290,7 @@ def __get_cc_node(self, html:str) -> (str, str): nodes = el.xpath(xpath_expr) if len(nodes) == 0: raise ValueError(f'html文本中没有cc标签: {html}') # TODO 异常处理 - if len(nodes) > 1: + if len(nodes) > 2: raise ValueError(f'html文本中包含多个cc标签: {html}') # TODO 异常处理 return element_to_html(nodes[0]), nodes[0].tag diff --git a/llm_web_kit/libs/html_utils.py b/llm_web_kit/libs/html_utils.py index c4628f9d..faf257a8 100644 --- a/llm_web_kit/libs/html_utils.py +++ b/llm_web_kit/libs/html_utils.py @@ -1,4 +1,5 @@ import html +import re from copy import deepcopy from lxml.html import HtmlElement, HTMLParser, fromstring, tostring @@ -114,6 +115,18 @@ def iter_node(element: HtmlElement): yield from iter_node(sub_element) +def _escape_table_cell(text: str) -> str: + """转义表格单元格中的特殊字符. + + 比如 |、内容中的\n等 + """ + # 首先处理换行符,将其替换为空格 + text = re.sub(r'[\r\n]+', ' ', text) + # 转义竖线和点号,避免与markdown表格语法冲突 + escaped = text.replace('|', '\\|') + return escaped + + def html_to_markdown_table(table_html_source: str) -> str: """把html代码片段转换成markdown表格. @@ -140,7 +153,7 @@ def html_to_markdown_table(table_html_source: str) -> str: # 检查第一行是否是表头并获取表头内容 first_row_tags = rows[0].xpath('.//th | .//td') - headers = [tag.text_content().strip() for tag in first_row_tags] + headers = [_escape_table_cell(tag.text_content().strip()) for tag in first_row_tags] # 如果表头存在,添加表头和分隔符,并保证表头与最大列数对齐 if headers: while len(headers) < max_cols: @@ -155,7 +168,7 @@ def html_to_markdown_table(table_html_source: str) -> str: # 添加表格内容,跳过已被用作表头的第一行(如果有的话) for row in rows[1:]: - columns = [td.text_content().strip() for td in row.xpath('.//td | .//th')] + columns = [_escape_table_cell(td.text_content().strip()) for td in row.xpath('.//td | .//th')] # 如果这一行的列数少于最大列数,则补充空白单元格 while len(columns) < max_cols: columns.append('') diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/test_table_elem_include_enter.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/test_table_elem_include_enter.html new file mode 100644 index 00000000..176f4fab --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/test_table_elem_include_enter.html @@ -0,0 +1,3136 @@ + + + + + + + + + + + + + + + + + + + + + دانلود ترجمه مقاله توسعه مالی و هزینه سرمایه حقوق سهامداران + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+ + + + + + + + + +
+ +
+
+
+
+
+
+ + +
+
+
+
+
+
+
+
+

+دانلود ترجمه مقاله توسعه مالی و هزینه سرمایه حقوق سهامداران

+
+
+
+
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + +
+ + عنوان فارسی + + +

+توسعه مالی و هزینه سرمایه حقوق سهامداران: شواهدی از چین

+
+ + عنوان انگلیسی + + +

+ Financial development and the cost of equity capital: Evidence from China

+
+ + کلمات کلیدی : + + +

+   + توسعه مالی؛ هزینه سرمایه حقوق سهامداران؛ قانون و امور مالی؛ چین

+
+ + درسهای مرتبط + + + + حسابداری +
+ + + + + + + + + + + + + + + + + + + +
+ تعداد صفحات مقاله انگلیسی : + 35 + نشریه : +ELSEVIER
+ سال انتشار : + 2015 + تعداد رفرنس مقاله : + 112
+ فرمت مقاله انگلیسی : + PDF + + نوع مقاله : + ISI +
+ پاورپوینت : + ندارد + وضعیت ترجمه مقاله : + انجام نشده است.
+
+
+
+
+ +
+
+
+
+
+ فهرست مطالب +
+
+

+1. مقدمه +2. پیشینه نهادی +3. چارچوب نظری +4. طرح تحقیق +5. نتایج تجربی +6. آنالیز بیشتر: تاثیرات فاکتورهای نهادی +7. بررسی دقت +8. نتیجه گیری +

+
+
+سفارش ترجمه +
+
+ ترجمه نمونه متن انگلیسی +
+
+

+ این مطالعه، رابطه بین توسعه مالی سطح استان و هزینه دارایی ویژه در چین را بررسی می کند. یافته های اصلی ما از این قرارند که (1) توسعه بازار سهام، بطور کل هزینه دارایی ویژه را کاهش می دهد، اما این اثر در شرکت های دولتی (SOE) و شرکت های دارای پتانسیل رشد یا شدت نوآوری زیاد، به میزان قابل توجهی کمرنگ می شود و (2) توسعه بانکداری تنها به صورت جزئی هزینه دارایی ویژه را کاهش می دهد، اما این اثر در شرکت های غیر SOE، قویتر است. تحلیل های بیشتر جایگزین های توسعه بازار سهام برای چنین عوامل نهادی مانند کیفیت حسابداری، اجرای قانون، تلفیق بازار سهام و اصلاح ساختار تقسیم سهام در کاهش هزینه دارایی ویژه را آشکار می کنند. همچنین در می یابیم که عدم وجود رقابت در بانکداری و بازاری کردن بانکداری و توسعه ضعیف اقتصاد غیردولتی تاحدی مسئول اثر ضعیف توسعه بانکداری بر هزینه دارایی ویژه می باشد. + +مقدمه: +این مطالعه، تاثر توسعه مالی منطقه ای بر هزینه دارایی ویژه در چین را با استفاده از یک نمونه بزرگ از شرکت های چینی پذیرفته شده در بورس اوراق بهادار شانگهای (SHSE) و بورس اوراق بهادار شنزن (SZSE) در دوره 1998 تا 2008، را بررسی می کند. مخصوصاً اینکه، طبق رویکرد جایاراتنه و استراهان (1996) و گویسو و همکاران (2004 الف، 2004 ب)، بررسی می کنیم که آیا توسعه مالی منطقه ای سطح استانی در یک کشور با هزینه دارایی ویژه ارتباط دارد یا خیر و چه ارتباطی و همچنین اینکه این رابطه چگونه براساس زیرساخت های نهادی مانند اجرای قانونی، کیفیت حسابداری و مقررات دیگر، شرطی می شوند.

+
+
+
+
+ نمونه متن انگلیسی مقاله +
+
+

+ This study examines the relation between province-level financial development and the cost of equity in China. Our main findings are that (1) stock market development reduces the cost of equity in general, but the effect diminishes significantly in state-owned enterprises (SOEs) and firms with high growth potential or innovation intensity and (2) banking development only marginally lowers the cost of equity, but the effect is stronger in non-SOEs. Further analysis reveals that stock market development substitutes for such institutional factors as accounting quality, law enforcement, stock market integration and the split-share structure reform in lowering the cost of equity. We also find that lack of banking competition and banking marketization and under-development of the non-state economy partially account for the weak effect of banking development on the cost of equity. + +Introduction: +This study examines the impact of regional financial development on the cost of equity capital in China, using a large sample of Chinese firms listed on the Shanghai Stock Exchange (SHSE) and Shenzhen Stock Exchange (SZSE) over the period from 1998 to 2008. Specifically, following the approach of Jayaratne and Strahan (1996) and Guiso et al. (2004a, 2004b), we investigate whether and how regional province-level financial development within the same country is associated with the cost of equity, and how the relation is conditioned upon institutional infrastructures such as legal enforcement, accounting quality and other regulations.

+
+
+
+
+ توضیحات و مشاهده مقاله انگلیسی +
+
+

+
+
+
+
+ + +
+
+
سفارش ترجمه تخصصی این مقاله
+
+
+ + + + +
+
+
+
+
+

+ دیدگاهها

+ +

هیچ دیدگاهی برای این محصول نوشته نشده است.

+
+ +
+
+
+ اولین نفری باشید که دیدگاهی را ارسال می کنید برای “دانلود ترجمه مقاله توسعه مالی و هزینه سرمایه حقوق سهامداران”

نشانی ایمیل شما منتشر نخواهد شد. بخش‌های موردنیاز علامت‌گذاری شده‌اند *

+ +

4 × دو =

+ +

+
+
+ +
+
+
+
+
+
+
+
+

پروپوزال آماده

+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+

مقالات ترجمه شده

+
+ +
+
+ +
+
+ +
+ + +
+ +
+ + +
+ +
+ + +
+ +
+ + +
+ +
+ + +
+ +
+ + +
+ +
+ + +
+ +
+ + +
+ +
+ + +
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+

پایان نامه آماده

+
+ + + + +
+

مطالب علمی

+
+ + + + +
+ +
+
+
+
+
+
+

نماد اعتماد الکترونیکی

+
+
+
+
+
+
+

پشتیبانی

+
+
logo-samandehi
+
+
+ +
+
+
+
+ + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl index 12bfb843..0c68d085 100644 --- a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl @@ -10,4 +10,5 @@ {"track_id": "legato_doc", "dataset_name": "test_pipeline_suit", "url": "https://www.test.com","data_source_category": "HTML", "path":"legato_docs.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "oracle_doc", "dataset_name": "test_pipeline_suit", "url": "https://docs.oracle.com/en-us/iaas/tools/java/3.57.1/com/oracle/bmc/integration/model/CustomEndpointDetails.html","data_source_category": "HTML", "path":"oracle_doc.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "table_involve_inline_code", "dataset_name": "test_table_involve_inline_code", "url": "https://docs.oracle.com/en-us/iaas/tools/java/3.57.1/com/oracle/bmc/integration/model/CustomEndpointDetails.html","data_source_category": "HTML", "path":"table_involve_inline_code.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} -{"track_id": "table_tail_text", "dataset_name": "test_table_tail_text", "url": "https://dchublists.com/?do=hublist&id=hub-975&language=en","data_source_category": "HTML", "path":"table_tail_text.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file +{"track_id": "table_tail_text", "dataset_name": "test_table_tail_text", "url": "https://dchublists.com/?do=hublist&id=hub-975&language=en","data_source_category": "HTML", "path":"table_tail_text.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} +{"track_id": "table_elem_include_enter", "dataset_name": "table_elem_include_enter", "url": "https://fardapaper.ir/financial-development-equity-capital","data_source_category": "HTML", "path":"test_table_elem_include_enter.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 18a40327..fe81f8e3 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -89,6 +89,7 @@ def test_cc_simple_table(self): parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) assert len(parts) == 3 content = html_to_element(parts[1][0]).text_content() + print(content) assert content == r'
Рейтинг:Рейтинг<br>5.00<br>из 5 на основе опроса<br>3<br>пользователей
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84<br>₽
' def test_cc_complex_table(self): @@ -166,6 +167,7 @@ def test_table_involve_after_code(self): parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) assert html_to_element(parts[0][0]).xpath(f'.//{CCTag.CC_TABLE}')[0].text is None + @unittest.skip(reason='在code模块解决了table嵌套多行代码问题') def test_table_involve_code(self): """table involve code.""" for test_case in TEST_CASES: @@ -176,6 +178,7 @@ def test_table_involve_code(self): complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') expect_path = base_dir.joinpath(test_case['expected'][3]) content = open(expect_path, 'r', encoding='utf-8').read() + print(content) assert complex_table_tag[0].text == content.strip('\n') @unittest.skip(reason='在code模块解决了这个问题') diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index f55f2232..14ea7127 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -59,7 +59,7 @@ def setUp(self): for line in f: self.data_json.append(json.loads(line.strip())) - assert len(self.data_json) == 13 + assert len(self.data_json) == 14 # Config for HTML extraction self.config = { @@ -369,4 +369,21 @@ def test_table_tail_text(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() + print(content_md) assert '| ID: 975' in content_md + + def test_table_element_include_enter(self): + """table的元素中间有换行.""" + chain = ExtractSimpleFactory.create(self.config) + self.assertIsNotNone(chain) + test_data = self.data_json[13] + # Create DataJson from test data + input_data = DataJson(test_data) + result = chain.extract(input_data) + content_md = result.get_content_list().to_mm_md() + print(content_md) + assert """| عنوان فارسی | توسعه مالی و هزینه سرمایه حقوق سهامداران: شواهدی از چین | +|---|---| +| عنوان انگلیسی | Financial development and the cost of equity capital: Evidence from China | +| کلمات کلیدی : |   توسعه مالی؛ هزینه سرمایه حقوق سهامداران؛ قانون و امور مالی؛ چین | +| درسهای مرتبط | حسابداری |""" in content_md From 95120d06a2ddc3e398ea208902bb680d5e6b4c79 Mon Sep 17 00:00:00 2001 From: dt-yy Date: Thu, 6 Mar 2025 13:29:21 +0800 Subject: [PATCH 25/31] remove print --- tests/llm_web_kit/extractor/html/recognizer/test_table.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index fe81f8e3..0608e825 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -89,7 +89,6 @@ def test_cc_simple_table(self): parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) assert len(parts) == 3 content = html_to_element(parts[1][0]).text_content() - print(content) assert content == r'
Рейтинг:Рейтинг<br>5.00<br>из 5 на основе опроса<br>3<br>пользователей
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84<br>₽
' def test_cc_complex_table(self): @@ -178,7 +177,6 @@ def test_table_involve_code(self): complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') expect_path = base_dir.joinpath(test_case['expected'][3]) content = open(expect_path, 'r', encoding='utf-8').read() - print(content) assert complex_table_tag[0].text == content.strip('\n') @unittest.skip(reason='在code模块解决了这个问题') From 1c5ff71860bbdb747a054e194ca741cd944f0374 Mon Sep 17 00:00:00 2001 From: yyy <102640628+dt-yy@users.noreply.github.com> Date: Thu, 6 Mar 2025 14:19:48 +0800 Subject: [PATCH 26/31] remove print --- tests/llm_web_kit/extractor/test_extractor_chain.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 14ea7127..c2e5ee2b 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -369,7 +369,6 @@ def test_table_tail_text(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - print(content_md) assert '| ID: 975' in content_md def test_table_element_include_enter(self): @@ -381,7 +380,6 @@ def test_table_element_include_enter(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - print(content_md) assert """| عنوان فارسی | توسعه مالی و هزینه سرمایه حقوق سهامداران: شواهدی از چین | |---|---| | عنوان انگلیسی | Financial development and the cost of equity capital: Evidence from China | From 371800ed25adf6dcfd6e469656c8063cf7fa09f2 Mon Sep 17 00:00:00 2001 From: dt-yy Date: Thu, 6 Mar 2025 19:38:22 +0800 Subject: [PATCH 27/31] add exception --- llm_web_kit/extractor/html/extractor.py | 70 +- .../extractor/html/recognizer/table.py | 107 +- .../good_data/html/table_include_math_p.html | 2326 +++++++++++++++++ .../good_data/html/test_list_empty.html | 2000 ++++++++++++++ .../good_data/html_data_input.jsonl | 4 +- .../extractor/test_extractor_chain.py | 25 +- 6 files changed, 4468 insertions(+), 64 deletions(-) create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_include_math_p.html create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/test_list_empty.html diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index f68950d3..53565f15 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -22,6 +22,7 @@ from llm_web_kit.libs.html_utils import element_to_html, html_to_element from llm_web_kit.libs.logger import mylogger from llm_web_kit.libs.path_lib import get_py_pkg_root_dir +from llm_web_kit.exception.exception import HtmlFileExtractorException class HTMLPageLayoutType: @@ -245,6 +246,63 @@ def _extract_paragraph(self, base_url:str, html_lst:List[Tuple[str,str]], raw_ht lst = self.__paragraph_recognizer.recognize(base_url, html_lst, raw_html) return lst + def __is_valid_node(self, node: dict) -> bool: + """检查节点是否有效(不为空). + + Args: + node (dict): 内容节点 + + Returns: + bool: 如果节点有效返回True,否则返回False + """ + if not node: + raise HtmlFileExtractorException("node is empty") + node_type = node.get('type') + valid_types = {'list', 'code', 'equation-interline', 'image', 'table', 'title', 'paragraph'} + if node_type not in valid_types: + raise HtmlFileExtractorException(f"Invalid node type: {node_type}") + # 检查列表类型的节点 + if node.get('type') == 'list': + items = node.get('content', {}).get('items', []) + # 过滤掉None、空列表,以及只包含None或空值的列表 + return bool(items) and any( + isinstance(item, (dict, list)) and bool(item) + for item in items) + #检测code类型的节点 + if node.get('type') == 'code': + code_content = node.get('content', {}).get('code_content') + # 如果代码内容为None或空字符串,则视为无效节点 + return bool(code_content and code_content.strip()) + #检测行间公式类型的节点 + if node.get('type') == 'equation-interline': + math_content = node.get('content', {}).get('math_content') + # 如果公式内容为None或空字符串,则视为无效节点 + return bool(math_content and math_content.strip()) + #检测image类型的节点 + if node.get('type') == 'image': + content = node.get('content', {}) + # 检查url、path或data字段是否至少有一个不为空 + return bool(content.get('url') or content.get('path') or content.get('data')) + #检测table类型的节点 + if node.get('type') == 'table': + html = node.get('content', {}).get('html') + # 如果表格的html内容为None或空字符串,则视为无效节点 + return bool(html and html.strip()) + #检测title类型的节点 + if node.get('type') == 'title': + title_content = node.get('content', {}).get('title_content') + # 如果标题内容为None或空字符串,则视为无效节点 + return bool(title_content and title_content.strip()) + #检测段落类型的节点 + if node.get('type') == 'paragraph': + content = node.get('content', []) + # 检查content列表是否存在且不为空,并且至少有一个非空的内容项 + return bool(content) and any( + item.get('c') and item.get('c').strip() + for item in content + ) + return True + def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> ContentList: """将解析结果存入content_list格式中. @@ -263,12 +321,10 @@ def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[str,str]], r parser:BaseHTMLElementRecognizer = self.__to_content_list_mapper.get(cc_tag) if parser: node = parser.to_content_list_node(base_url, ccnode_html, raw_html) - if node: + if node and self.__is_valid_node(node): one_page.append(node) else: - mylogger.warning(f'无法识别的html标签:{cc_tag}, {parsed_html}') - # TODO 开发成熟的时候,在这里抛出异常,让调用者记录下来,以便后续分析改进 - + raise HtmlFileExtractorException(f'无法识别的html标签:{cc_tag}, {parsed_html}') content_list = ContentList([one_page]) # 对于网页来说仅有一页,如果多页,则剩下的每个都是一个论坛的回复 return content_list @@ -289,9 +345,9 @@ def __get_cc_node(self, html:str) -> (str, str): xpath_expr = ' | '.join(f'self::{tag} | .//{tag}' for tag in self.__to_content_list_mapper.keys()) nodes = el.xpath(xpath_expr) if len(nodes) == 0: - raise ValueError(f'html文本中没有cc标签: {html}') # TODO 异常处理 - if len(nodes) > 2: - raise ValueError(f'html文本中包含多个cc标签: {html}') # TODO 异常处理 + raise HtmlFileExtractorException(f'html文本中没有cc标签: {html}') + if len(nodes) > 3: + raise HtmlFileExtractorException(f'html文本中包含多个cc标签: {html}') return element_to_html(nodes[0]), nodes[0].tag def __build_extractor(self): diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index cd7cd387..9586713f 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -3,7 +3,7 @@ from lxml.html import HtmlElement from overrides import override - +import json from llm_web_kit.exception.exception import HtmlTableRecognizerException from llm_web_kit.extractor.html.recognizer.cccode import CodeRecognizer from llm_web_kit.extractor.html.recognizer.ccmath import MathRecognizer @@ -68,7 +68,6 @@ def __is_table_empty(self, table) -> bool: :param table: lxml.html.HtmlElement 对象,表示一个 元素 :return: 如果表格为空,返回 True;否则返回 False """ - def is_element_empty(elem): # 检查元素本身的文本内容 if elem.text and elem.text.strip(): @@ -113,20 +112,19 @@ def __is_simple_table(self, tree) -> bool: return False return True - def __is_table_contain_img(self, tree) -> bool: - """判断table元素是否包含图片.""" - imgs = tree.xpath('//table//img') - if len(imgs) == 0: - return True - else: - return False - - def __is_table_nested(self, tree) -> int: - """获取表格元素的嵌套层级(非表格元素返回0,顶层表格返回1,嵌套表格返回层级数).""" - if tree.tag != 'table': - return 0 # 非表格元素返回0 - # 计算祖先中的 table 数量(不包括自身),再加1表示自身层级 - return len(tree.xpath('ancestor::table')) + 1 + def __is_table_nested(self, element) -> int: + """计算表格的嵌套层级(非表格返回0)""" + if element.tag != "table": + return 0 + # 获取当前表格下所有的表格(包括自身) + all_tables = [element] + element.xpath('.//table') + max_level = 1 # 初始层级为1(当前表格) + # 计算每个表格的层级,取最大值 + for table in all_tables: + ancestor_count = len(table.xpath('ancestor::table')) + level = ancestor_count + 1 + max_level = max(max_level, level) + return max_level def __extract_tables(self, ele: str) -> List[Tuple[str, str]]: """提取html中的table元素.""" @@ -150,61 +148,60 @@ def __get_table_type(self, child: HtmlElement) -> str: table_type = 'complex' return table_type - def __extract_table_element(self, ele: HtmlElement) -> str: - """提取表格的元素.""" - for item in ele.iterchildren(): - return self._element_to_html(item) def __check_table_include_math_code(self, raw_html: HtmlElement): - """check table中是否包含math.""" + """检查table中的内容,包括普通文本、数学公式和代码.""" math_html = self._element_to_html(raw_html) - ele_res = list() + # 处理数学公式和代码 math_recognizer = MathRecognizer() math_res_parts = math_recognizer.recognize(base_url='', main_html_lst=[(math_html, math_html)], - raw_html=math_html) + raw_html=math_html) code_recognizer = CodeRecognizer() code_res_parts = code_recognizer.recognize(base_url='', main_html_lst=math_res_parts, - raw_html=math_html) + raw_html=math_html) + + result = [] for math_item in code_res_parts: ele_item = self._build_html_tree(math_item[0]) + # 处理所有文本内容 + for text_segment in ele_item.itertext(): + cleaned_text = text_segment.strip().replace('\\n', '') + if cleaned_text: # 过滤空字符串 + #print("cleaned_text", cleaned_text) + result.append(cleaned_text) + # 处理行内公式 ccinline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INLINE}') - ccinline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE_INLINE}') - ccinterline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INTERLINE}') - ccinterline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE}') if ccinline_math_node: formulas = [ - el.text if el.text.strip() else '' - for el in ccinline_math_node + el.text.strip() for el in ccinline_math_node if el.text and el.text.strip() ] - ele_res.extend(formulas) # 添加字符串 - elif ccinterline_math_node: - codes = [ - el.text if el.text.strip() else '' - for el in ccinterline_math_node + result.extend(formulas) + + # 处理行间公式 + ccinterline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INTERLINE}') + if ccinterline_math_node: + formulas = [ + el.text.strip() for el in ccinterline_math_node if el.text and el.text.strip() ] - ele_res.extend(codes) - elif ccinline_code_node: - inline_codes = [ - el.text if el.text.strip() else '' - for el in ccinline_code_node + result.extend(formulas) + + # 处理行内代码 + ccinline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE_INLINE}') + if ccinline_code_node: + codes = [ + el.text.strip() for el in ccinline_code_node if el.text and el.text.strip() ] - ele_res.extend(inline_codes) - elif ccinterline_code_node: - ccinterline_codes = [ - el.text if el.text else '' - for el in ccinterline_code_node + result.extend(codes) + + # 处理行间代码 + ccinterline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE}') + if ccinterline_code_node: + codes = [ + el.text.strip() for el in ccinterline_code_node if el.text and el.text.strip() ] - ele_res.extend(ccinterline_codes) - else: - texts = [] - # 使用 itertext() 遍历所有文本片段 - for text_segment in ele_item.itertext(): - # 统一处理文本:去空白 + 替换字面 \n - cleaned_text = text_segment.strip().replace('\\n', '') - if cleaned_text: # 过滤空字符串 - texts.append(cleaned_text) - ele_res.extend(texts) - return ele_res + result.extend(codes) + + return result def __simplify_td_th_content(self, elem: HtmlElement) -> None: """简化
内容,仅保留文本内容.""" diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_include_math_p.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_include_math_p.html new file mode 100644 index 00000000..257b0bac --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_include_math_p.html @@ -0,0 +1,2326 @@ + + + + +factoring - Is $83^{27} +1 $ a prime number? - Mathematics Stack Exchange + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ + + + + + + + +
+ + + + + +
+ + +
+ + +
+
+ +
+ Mathematics Stack Exchange is a question and answer site for people studying math at any level and professionals in related fields. It's 100% free, no registration required. +
+
+ Sign up +
+
+ Here's how it works: +
    +
  1. Anybody can ask a question +
  2. +
  3. Anybody can answer +
  4. +
  5. The best answers are voted up and rise to the top +
  6. +
+
+
+
+ +
+ +
+ + + +
+ + + + + + + + + + + +
+ + +
+ + up vote + 17 + down vote + + favorite +
5
+ + +
+ +
+
+
+ +

I'm having problems with exercises on proving whether or not a given number is prime. Is $83^{27} + 1$ prime?

+
+ + + + + + + +
+
share|cite|improve this question
+
+ + +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+ 4 + +   +
+
+
+ Someone else points out that it can't be prime because it's even. That's probably the quickest way. May answer shows how to factor it (so it can't be prime) by a method that would work just as well if it had been $84^{27}+1$. + – Michael Hardy + Aug 2 '13 at 22:11 +
+
+ + + + + + + +
+ 5 + +   +
+
+
+ Wolfram Alpha says that $83^{27}+1= 2^2×3^4×7×109×757×2269×9613×49339×2208799×14685985270709080390792801$. Perhaps it's fun to try to prove that 3 and 7 are factors. + – lhf + Aug 2 '13 at 22:13 + +
+
+ + + + + + + +
+ 51 + +   +
+
+
+ The number is EVEN! + – Ali + Aug 3 '13 at 5:37 +
+
+ + + + + + + +
+ 3 + +   +
+
+
+ @Joseph It is "well-known" and not too hard to prove that if $b^n+1$ is prime for some integer $b>1$ then $n$ has to be zero or a power of two. And 27 is neither zero nor a power of two. Search for Generalized Fermat Prime to find a proof, or do the proof yourself. + – Jeppe Stig Nielsen + Aug 3 '13 at 6:48 + +
+
+ + + + + + + +
+ 5 + +   +
+
+
+ If it is for a test, the simple answer is that you don't have the tools to prove it prime, so it must be composite. + – Ross Millikan + Aug 3 '13 at 13:04 +
+
+
+ + +
+
+ +
+ + +
+
+

+ 9 Answers + 9 +

+ +
+
+ + + + + + +
+ + + + + + + + + + + + +
+ + +
+ + up vote + 68 + down vote + + + + accepted + +
+ +
+
+

$83$ is odd, so is any power of $83$. Hence $83^{27}+1$ is even, but the only even prime number is $2$ and this number is not $2$.

+ +

More generally, if $a,k\in\mathbb N$ and $k$ is odd, then +$$a^k+1\equiv (-1)^k+1\equiv 0\pmod{a+1}$$ +So $a+1\mid a^k+1$. In this case this yields $84=2^2\cdot 3\cdot 7$ as divisor.

+
+ + + + + + + + + +
+
share|cite|improve this answer
+ + + + +
+
+
+ + + + + + + + + + +
+ + + + + + + +
+ 3 + +   +
+
+
+ Your last statement can also be seen by geometric series: $(1+x+\cdots+x^{k-1})(x-1)=x^k-1$. For $k$ odd, substitute $x=-a$ and cancel out the minuses signs on each side to get the factor $(a+1)$ on the left with $a^k+1$ on the right. + – nayrb + Aug 2 '13 at 21:52 + +
+
+
+ + +
+
+ + + +
+ + + + + + + + + + + + +
+ + +
+ + up vote + 2 + down vote + + + + +
+ +
+
+

Let's ask WolframAlpha!

+ +
+

PrimeQ[83^27 + 1]

+
+ + + +
+

is $6\,532\,937\,361\,590\,551\,025\,727\,805\,459\,013\,652\,074\,798\,022\,177\,030\,828$ a prime number?

+ +

$83^{27} + 1$ is not a prime number

+ +

$2^2 \times 3^4 \times 7 \times 109 \times 757 \times 2269 \times 9613 \times 49339 \times 2208799 \times 14685985270709080390792801 \space\text{(14 prime factors, 10 distinct)}$

+
+ +
+ +

However, using basic knowledge that an odd times an odd is always an odd ($3 \times 3 = 9$), we see that $83$ (an odd number) raised to any power is an odd number. Then we add one to it and get an even number.

+ +

Being even (and obviously not equal to $2$), the definition of a prime tells us that the number is not prime because it is divisible by $2$ (my words):

+ +
+

prime (noun):

+ +
    +
  1. Any natural number, greater than $1$, that, when divided by any natural number, greater than $1$, other than itself or $1$ does not result in a natural number.
  2. +
  3. Any "natural number greater than $1$ that has no positive divisors other than $1$ and itself." (Wikipedia article "prime number")
  4. +
+
+
+ + + + + + + + + +
+
share|cite|improve this answer
+ + + + +
+
+
+ + + + + +
+
+ + +
+
+ + + +
+ + + + + + + + + + + + +
+ + +
+ + up vote + 2 + down vote + + + + +
+ +
+
+

well it is divisible by $84$ and in general $\forall a,m\in\mathbb {N}$ we have +$(a+1)\mid (a^{2m+1}+1)$ So....

+
+ + + + + + + + +
+
share|cite|improve this answer
+ + + +
+
+
+ + + + + +
+
+ + +
+
+ + + +
+ + + + + + + + + + + + +
+ + +
+ + up vote + 4 + down vote + + + + +
+ +
+
+

The only prime numbers of the form $a^x+b^x$, occur when $x$ is a power of two. This does not guarantee a prime, but if $x$ is not a power of $2$, then the number has algebraic factors.

+ +

In practice, there is an algebraic divisor of $a^n-b^n$, for each $m$ that divides $n$. For the equation $a^n+b^n$, one would look for divisors of $2n$ that don't divide $n$. Inthe question we have $n=27$, so the divisors of 54 that don't divide 27. That is, 2, 6, 18 and 54. For powers of 2, there is only one number that divides $2n$ but not $n$.

+
+ + + + + + + + + +
+
share|cite|improve this answer
+ + + + +
+
+
+ + + + + + + + + + +
+ + + + + + + +
+    + +   +
+
+
+ Extebded answer to include this. + – wendy.krieger + Aug 3 '13 at 23:44 +
+
+
+ + +
+
+ + + +
+ + + + + + + + + + + + +
+ + +
+ + up vote + 13 + down vote + + + + +
+ +
+
+

We have a chain of divisibilities, based on the fact that $(a-b)\mid(a^n-b^n)$, +$$ +83^1-(-1)^1\mid83^3-(-1)^3\mid83^9-(-1)^9\mid83^{27}-(-1)^{27}=83^{27}+1 +$$ +Using this chain, we get, using $a^3-b^3=(a-b)(a^2+ab+b^2)$, +$$ +\begin{align} +83^{27}+1 +&=\frac{83^{27}+1}{83^9+1}\times\frac{83^9+1}{83^3+1}\times\frac{83^3+1}{83^1+1}\times\left(83^1+1\right)\\ +&=\left(83^{18}-83^9+1\right)\times\left(83^6-83^3+1\right)\times\left(83^2-83^1+1\right)\times\left(83^1+1\right)\\[9pt] +&=34946659039493167203883141969862007\times326939801583\times6807\times84 +\end{align} +$$ +Thus, $83^{27}+1$ is not prime.

+ +

Note: none of these factors are guaranteed to be prime, just factors.

+
+ + + + + + + + + +
+
share|cite|improve this answer
+ + + + +
+
+
+ + + + + + + + + + + + + + +
+ + + + + + + +
+ 3 + +   +
+
+
+ Would the downvoter care to comment? + – robjohn + Aug 3 '13 at 19:37 +
+
+ + + + + + + +
+    + +   +
+
+
+ I like it. Some might say overly rigorous for a simple problem, but helps demonstrate some deeper thinking than just noticing that it would be even. +1 + – Asimov + Sep 28 '14 at 18:32 +
+
+
+ + +
+
+ + + +
+ + + + + + + + + + + + +
+ + +
+ + up vote + 40 + down vote + + + + +
+ +
+
+

$$ +83^{27} + 1 = \Big(83^9\Big)^3 + 1 = a^3+b^3 = (a+b)(a^2-ab+b^2) = \Big(83^9+1\Big)\Big((83^9)^2-83^9+1\Big). +$$

+ +

So, no, it's not prime.

+ +

PS (added later): Some point out that it's obviously an even number, so it's not prime. But what I do above would work just as well if it were $84$ rather than $83$.

+
+ + + + + + + + + +
+
share|cite|improve this answer
+ + + + +
+
+
+ + + + + +
+
+ + +
+
+ + + +
+ + + + + + + + + + + + +
+ + +
+ + up vote + 23 + down vote + + + + +
+ +
+
+

Note that $83\equiv -1\pmod{84}$. Thus $83^{27}+1\equiv 0\pmod{84}$.

+ +

It follows that our number is divisible by all the divisors of $84$.

+ +

It is also non-prime in other ways. For let $x=83^3$. Then our number is $x^9+1$, so is divisible by $x+1$. Similarly, we could let $y=83^9$, and conclude that our number is divisible by $y+1$.

+ +

Seriously non-prime!

+
+ + + + + + + + +
+
share|cite|improve this answer
+ + + +
+
+
+ + + + + +
+
+ + +
+
+ + + +
+ + + + + + + + + + + + +
+ + +
+ + up vote + 14 + down vote + + + + +
+ +
+
+

It is obviously not prime. $83$ is odd, therefore $83^{27}$ is odd, hence $83^{27}+1$ is even and not prime.

+
+ + + + + + + + +
+
share|cite|improve this answer
+ + + +
+
+
+ + + + + +
+
+ + +
+
+ + + +
+ + + + + + + + + + + + +
+ + +
+ + up vote + 46 + down vote + + + + +
+ +
+
+

Well, it is an even number, so...

+
+ + + + + + + + +
+
share|cite|improve this answer
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+ 1 + +   +
+
+
+ I downvoted your answer on the basis that it doesn't provide the reason behind why it is even. For example, it doesn't say that since $83$ is odd, so the powers of it must also be odd and thus, odd + 1 must be even. + – Jeel Shah + Nov 5 '13 at 3:37 +
+
+ + + + + + + +
+ 6 + +   +
+
+
+ @gekkostate: (1) If you think all the answers must provide all the reasons behind them then you're going to downvote a lot around here, as many participants, probably most of the serious ones, don't think like you do. (2) The question is at a level that requires as trivial to know that powers of odd numbers are odd, and sum of odd numbers is even, so to add that to the answer seems trivial after it's been remarked that the number is odd (and thus the OP begins to think "why?" and he completes the answer by himself). Think of this, perhaps you'll realize you rush too much to do downvote... + – DonAntonio + Nov 5 '13 at 4:51 +
+
+ + + + + + + +
+ 2 + +   +
+
+
+ I clearly failed to see the intent behind your answer but I still feel that it lacks any reasoning whatsoever. Your answer is equivalent to the highest upvoted comment so maybe, that should have been enough? Also, I hardly ever downvote questions/answers (ratio of up to down is 77/5) so I didn't really rush into this (clearly, I use downvotes sparingly). I don't want to make this into something that it is not. Let's leave this at the fact that we have a difference of opinions on answers. + – Jeel Shah + Nov 5 '13 at 14:03 +
+
+ + + + + + + +
+    + +   +
+
+
+ I just love the precise nature of this answer. +1 + – Asimov + Sep 28 '14 at 18:31 +
+
+
+ + +
+
+
+

protected by Community Jun 21 '14 at 19:06 +

+

+Thank you for your interest in this question. +Because it has attracted low-quality or spam answers that had to be removed, posting an answer now requires 10 reputation on this site. +

+Would you like to answer one of these unanswered questions instead? +

+
+ + + + + +

+Not the answer you're looking for? Browse other questions tagged or ask your own question.

+
+
+ + + +
+ + +
+
+ + + + + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/test_list_empty.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/test_list_empty.html new file mode 100644 index 00000000..96aa3568 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/test_list_empty.html @@ -0,0 +1,2000 @@ + + + +Натуральное мыло ручной работы — продажа оптом от производителя, каталог 2024 из 39 разновидностей, цены + + + + + + + + + + + + + + + + +
+
+
+
+ +
+
+20 943 +Российских производителей
+
+82 716 +Товаров российского производства
+
+
+ +
+
+ + + + + +
+ +
+
+ + +
+
+ +
+
+
+
+
+
+
+
    +
  • 134970 картинка каталога «Производство России». Продукция Натуральное мыло ручной работы, г.Симферополь 2015
  • +
  • Фото 2 Натуральное мыло ручной работы, г.Симферополь 2015
  • +
  • Фото 3 Натуральное мыло ручной работы, г.Симферополь 2015
  • +
  • Фото 4 Натуральное мыло ручной работы, г.Симферополь 2015
  • +
+
+Источник фото: knk-kosmetika.ru © +
+
+

Натуральное мыло ручной работы

+ оптом от производителя, г.Симферополь +
Продажа оптом мыла ручной работы от производителя натуральной косметики «Крымская Натуральная Коллекция», г. Симферополь
+
+
+
+
+ + +
Цена от 54 
+мин. партия: 180 шт.
+Купить оптом в 1 клик
+ +
+
+
+
+
+
    +
  • + +Описание
  • +
  • + +Вопросы
  • +
  • + +Отзывы
  • +
  • + +Контакты
+
+
+

Натуральное мыло ручной работы изготавливает и реализует по оптовой цене российский производитель и поставщик косметики под брендом «Крымская Натуральная Коллекция».

+ +

В каталоге представлено 39 разновидностей мыла.

+ +

Выпускаем:

+ +
    +
  • мыло с омолаживающим эффектом;
  • +
  • антицеллюлитное мыло;
  • +
  • мыло-скраб;
  • +
  • лечебное;
  • +
  • мыло-духи (ароматизированное).
  • +
+ +

Список ассортимента, каталог и прайс-листы отправляем по запросу на электронную почту заказчиков.

+ +

Фасовка: бруски по 43 и 75 грамм. Также предлагаем поставки мыла брусками по 850 гр.

+ +

Преимущества мыла от «Крымская Натуральная Коллекция»:

+ +
    +
  • производство «холодным» способом по уникальным рецептурам с сохранением полезных веществ и микроэлементов;
  • +
  • натуральный состав без химических добавок, консервантов, синтетических красителей и отдушек;
  • +
  • не вызывает аллергии;
  • +
  • насыщение кожи полезными веществами, минералами и витаминами;
  • +
  • ароматерапевтическое действие;
  • +
  • в составе растительные экстракты, масла и травы;
  • +
  • омолаживающий и питательный эффект для кожи;
  • +
  • придание коже легкого уникального аромата;
  • +
  • тщательный контроль качества продкции;
  • +
  • не сушит кожу и не стягивает кожу, потому что имеет максимальный PH (не более 8,5).
  • +
+ +

Срок годности: 12 мес. (в упаковке).

+ +

Также мыло ручной работы от бренда «Крымская Натуральная Коллекция» подходит:

+ +
    +
  • для мыльного массажа (глубоко очищает кожу, удаляет ороговевшие слои, способствует уменьшению объемов тела и профилактике целлюлита);
  • +
  • в качестве средства для бритья;
  • +
  • для ежедневного очищения кожи (умывание и душ);
  • +
  • в качестве мыльной маски для очищения кожи;
  • +
  • для интимной гигиены (бережно очищает, уменьшает количество воспалений, не раздражает слизистые покровы).
  • +
+ +

Также выпускаем натуральные дезодоранты, соль для ванны, натуральные кремы для лица и маски-скрабы. Смотрите список продукции в каталоге компании на выставке и на официальном сайте фабрики.

Приглашаем к сотрудничеству косметически салоны и CGF-центры, косметологии, салоны красоты, магазины, дилеров и оптовых заказчиков, корпоративных клиентов.

+ +

Продажа оптом от 180 шт. (сумма полного оптового заказ от 20000 руб.).

+ +

Оплату принимаем на расчетный счет фабрики и отгружаем заказы при 100% предоплате. Доставка по России транспортными компаниями.

+ +

Прайс-лист закажите у менеджера бренда на выставке через кнопку «Заказать прайс-лист» или по телефону.

+
+
+
+
+
+
Аватар пользователя
+
+Hani +29.05.2023 12:49 +
+
+

كيف يمكن ان اشتري من الشركة بالجملة وكيف يمكن ان اتواصل مع مدير المبيعات

+Армения, г.Ереван +
+
+
+
Аватар пользователя
+
+Жанна А. +19.03.2022 08:57 +
+
+

Здравствуйте. Пишу с коммерческим предложением. Разрешаете ли продажу на маркетплейсе? Вышлите прайс,пожалуйста.

+Россия, г.Санкт-Петербург +
+
+
+
Аватар пользователя
+
+Михрим Баратова +6.02.2022 06:22 +
+
+

Здравствуйте, можно опт цену на бруски по 850гр. В Алматы отправите? Минимальный заказ?

+Казахстан, г.Алматы +
+
+
+
Аватар пользователя
+
+Татьяна Сергеевна Дернова +7.01.2022 03:00 +
+
+

Могу ли продавать вашу продукцию на маркетплейсах? Условия?

+Россия, г.Петропавловск-Камчатский +
+
+
+
Аватар пользователя
+
+Татьяна +5.10.2021 09:14 +
+
+

Здравствуйте! Скажите, пожалуйста, сколько стоит мыло в брусках?

+Россия, г.Москва +
+ + +
+Задать вопрос +
+ + +
+ +
+
+ + + + +Я соглашаюсь с политикой конфиденциальности
+ + +
+ + + +
+ + +
+
+ + + +
+ + + + + +
+ Написать отзыв +
+ + + + Ваша оценка +
+
+ + + +
+
Преимущества
+ +
Недостатки
+ +
Комментарий
+ + +
+ + + + + Я соглашаюсь с политикой конфиденциальности + +
+ + + + +
+ + + + + + +
+ + + + +
+
+
+
+Фабрика «Крымская Натуральная Коллекция»
+
+Фабрика «Крымская Натуральная Коллекция» +
+ + +3 отзыва
+
Фабрика «Крымская Натуральная Коллекция» — российский производитель и...
+
+Контактная информация + + + + + + + + + + + + + + + + +
АдресКрым, Симферополь, ул. Бородина 10
Телефон+7 (978) 875-4152
WhatsApp+7 9782866450
Электронная почтаzakaz.knk@mail.ru
Официальный сайтknk-kosmetika.ru
+Реквизиты компании + + + + + + + + + + + + + + + + + + + +
НаименованиеИП Долгая Ирина Анатольевна
ОГРН314910226700661
ИНН910200114800
Юридический адрес295000, Респ Крым, г Симферополь
Дата регистрации24.09.2014
Виды деятельности +
+Основной ОКВЭД +46.45 Торговля оптовая парфюмерными и косметическими товарами +Дополнительные ОКВЭД +46.31.2 Торговля оптовая консервированными овощами, фруктами и орехами + + + + + + + + + + + + + + + + + + +Показать весь список... +
+
+Компания на карте +
+
+
Продукция компании22 Смотреть всё +
+ + + +
+
+
+
+
+ Фото 1 ​Ароматические освежители воздуха натуральные, г.Симферополь 2015 +
+ +
+
Цена от 178,20 
+ ​Ароматические освежители воздуха натуральные +
​Ароматические освежители воздуха натуральные изготавливает и предлагает оптовым заказчикам купить по выгодной цене российский...
+ +
+
+ +
+ + +
+ + +
+
+ + + + 0 отзывов +
+
+ + +
+
+ + + + +
+
+
+
+
+ Фото 1 Натуральная хозяйственная паста с горчицей, г.Симферополь 2015 +
+ +
+
Цена от 180 
+ Натуральная хозяйственная паста с горчицей +
Российский производитель и поставщик натуральной косметической и бытовой продукциии «Крымская Натуральная...
+ +
+
+ +
+ + +
+ + +
+
+ + + + 0 отзывов +
+
+ + +
+
+ + + + +
+
+
+
+
+ Фото 1 Морская соль для ванн, г.Симферополь 2015 +
+ +
+
Цена от 90 
+ Морская соль для ванн +
Российская фабрика-поставщик натуральной косметики из Крыма Фабрика «Крымская Натуральная Коллекция» изготавливает...
+ +
+
+ +
+ + +
+ + +
+
+ + + + 0 отзывов +
+
+ + +
+
+ + + + +
+
+
+
+
+ Фото 1 Натуральный дезодорант-антиперспирант, г.Симферополь 2015 +
+ +
+
Цена от 145,20 
+ Натуральный дезодорант-антиперспирант +
Симферопольский бренд-поставщик экологичной косметики «Крымская Натуральная Коллекция» продает по оптовой цене...
+ +
+
+ +
+ + +
+ + +
+
+ + + + 0 отзывов +
+
+ + +
+
+ + + + +
+
+
+
+
+ Фото 1 Маска-скраб для лица, г.Симферополь 2015 +
+ +
+
Цена от 102 
+ Маска-скраб для лица +
Косметическая фабрика-поставщик «Крымская Натуральная Коллекция» предлагает поставки масок-скрабов для лица в...
+ +
+
+ +
+ + +
+ + +
+
+ + + + 0 отзывов +
+
+ + +
+
+ + + + +
+
+
+
+
+ Фото 1 Натуральные кремы для лица, г.Симферополь 2015 +
+ +
+
Цена от 318 
+ Натуральные кремы для лица +
Российский производитель и поставщик косметической продукции под брендом «Крымская Натуральная Коллекция»...
+ +
+
+ +
+ + +
+ + +
+
+ + + + 0 отзывов +
+
+ + +
+
+ + + + +
+
+
+
+
+ Фото 1 Косметические масляно-солевые скрабы, г.Симферополь 2015 +
+ +
+
Цена от 178,20 
+ Косметические масляно-солевые скрабы +
Фабрика-поставщик натуральной косметики «Крымская Натуральная Коллекция» представляет широкий ассортимент...
+ +
+
+ +
+ + +
+ + +
+
+ + + + 0 отзывов +
+
+ + +
+
+ + + + +
+
+
+
+
+ Фото 1 Натуральный крем для рук «Нежное прикосновение», г.Симферополь 2015 +
+ +
+
Цена от 198 
+ Натуральный крем для рук «Нежное прикосновение» +
Натуральный крем для рук «Нежное прикосновение» изготавливает и реализует по оптовой цене производитель и поставщик...
+ +
+
+ +
+ + +
+ + +
+
+ + + + 0 отзывов +
+
+ + +
+
+ + + + +
+
+
+
+
+ Фото 1 Натуральное мыло ручной работы, г.Симферополь 2015 +
+ +
+
Цена от 54 
+ Натуральное мыло ручной работы +
Натуральное мыло ручной работы изготавливает и реализует по оптовой цене российский производитель и поставщик косметики под...
+ +
+
+ +
+ + +
+ + +
+
+ + + + 0 отзывов +
+
+ + +
+
+ + + + +
+
+
+
+
+ Фото 1 Мягкое травяное мыло «Бельди», г.Симферополь 2015 +
+ +
+
Цена от 142,80 
+ Мягкое травяное мыло «Бельди» +
Мягкое травяное мыло «Бельди» изготавливает и реализует по оптовой цене российский бренд-поставщик косметики...
+ +
+
+ +
+ + +
+ + +
+
+ + + + 0 отзывов +
+
+ + +
+
+ + + + + + + + + +
+
+
+ + +
+
+
+ +Ожидайте, идёт загрузка...
+ + + +
+ + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl index 0c68d085..f41b78ec 100644 --- a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl @@ -11,4 +11,6 @@ {"track_id": "oracle_doc", "dataset_name": "test_pipeline_suit", "url": "https://docs.oracle.com/en-us/iaas/tools/java/3.57.1/com/oracle/bmc/integration/model/CustomEndpointDetails.html","data_source_category": "HTML", "path":"oracle_doc.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "table_involve_inline_code", "dataset_name": "test_table_involve_inline_code", "url": "https://docs.oracle.com/en-us/iaas/tools/java/3.57.1/com/oracle/bmc/integration/model/CustomEndpointDetails.html","data_source_category": "HTML", "path":"table_involve_inline_code.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "table_tail_text", "dataset_name": "test_table_tail_text", "url": "https://dchublists.com/?do=hublist&id=hub-975&language=en","data_source_category": "HTML", "path":"table_tail_text.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} -{"track_id": "table_elem_include_enter", "dataset_name": "table_elem_include_enter", "url": "https://fardapaper.ir/financial-development-equity-capital","data_source_category": "HTML", "path":"test_table_elem_include_enter.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file +{"track_id": "table_elem_include_enter", "dataset_name": "table_elem_include_enter", "url": "https://fardapaper.ir/financial-development-equity-capital","data_source_category": "HTML", "path":"test_table_elem_include_enter.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} +{"track_id": "list_empty", "dataset_name": "test_list_empty", "url": "https://productcenter.ru/products/27276/naturalnoie-krymskoie-mylo-ruchnoi-raboty-39-raznovidnostiei","data_source_category": "HTML", "path":"test_list_empty.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} +{"track_id": "table_include_math_p", "dataset_name": "table_include_math_p", "url": "https://math.stackexchange.com/questions/458323/is-8327-1-a-prime-number?answertab=active","data_source_category": "HTML", "path":"table_include_math_p.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index c2e5ee2b..a62b8e27 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -59,7 +59,7 @@ def setUp(self): for line in f: self.data_json.append(json.loads(line.strip())) - assert len(self.data_json) == 14 + assert len(self.data_json) == 16 # Config for HTML extraction self.config = { @@ -385,3 +385,26 @@ def test_table_element_include_enter(self): | عنوان انگلیسی | Financial development and the cost of equity capital: Evidence from China | | کلمات کلیدی : |   توسعه مالی؛ هزینه سرمایه حقوق سهامداران؛ قانون و امور مالی؛ چین | | درسهای مرتبط | حسابداری |""" in content_md + + def test_list_empty(self): + """list抽取为空,原因是嵌套的img标签没有text""" + chain = ExtractSimpleFactory.create(self.config) + self.assertIsNotNone(chain) + test_data = self.data_json[14] + # Create DataJson from test data + input_data = DataJson(test_data) + result = chain.extract(input_data) + list_type = result.get_content_list()._get_data()[0][0]["type"] + assert list_type != "list" + + def test_table_include_math_p(self): + """table包含math和其他内容""" + chain = ExtractSimpleFactory.create(self.config) + self.assertIsNotNone(chain) + test_data = self.data_json[15] + # Create DataJson from test data + input_data = DataJson(test_data) + result = chain.extract(input_data) + content_list = result.get_content_list()._get_data() + # TODO math模块需要处理下$符号但是非公式 + assert len(content_list[0]) == 17 \ No newline at end of file From 11d0968897bda9c19eeec5633207b4d211cc0b62 Mon Sep 17 00:00:00 2001 From: dt-yy Date: Thu, 6 Mar 2025 19:40:59 +0800 Subject: [PATCH 28/31] fix pylint --- llm_web_kit/extractor/html/extractor.py | 27 +++++++++---------- .../extractor/html/recognizer/table.py | 17 ++++++------ .../extractor/test_extractor_chain.py | 12 ++++----- 3 files changed, 27 insertions(+), 29 deletions(-) diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index 53565f15..a3d4a5f6 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -5,6 +5,7 @@ from overrides import override from llm_web_kit.config.cfg_reader import load_config +from llm_web_kit.exception.exception import HtmlFileExtractorException from llm_web_kit.extractor.extractor import BaseFileFormatExtractor from llm_web_kit.extractor.html.magic_html import GeneralExtractor from llm_web_kit.extractor.html.recognizer.audio import AudioRecognizer @@ -20,9 +21,7 @@ from llm_web_kit.extractor.html.recognizer.video import VideoRecognizer from llm_web_kit.input.datajson import ContentList, DataJson from llm_web_kit.libs.html_utils import element_to_html, html_to_element -from llm_web_kit.libs.logger import mylogger from llm_web_kit.libs.path_lib import get_py_pkg_root_dir -from llm_web_kit.exception.exception import HtmlFileExtractorException class HTMLPageLayoutType: @@ -256,51 +255,51 @@ def __is_valid_node(self, node: dict) -> bool: bool: 如果节点有效返回True,否则返回False """ if not node: - raise HtmlFileExtractorException("node is empty") + raise HtmlFileExtractorException('node is empty') node_type = node.get('type') valid_types = {'list', 'code', 'equation-interline', 'image', 'table', 'title', 'paragraph'} if node_type not in valid_types: - raise HtmlFileExtractorException(f"Invalid node type: {node_type}") + raise HtmlFileExtractorException(f'Invalid node type: {node_type}') # 检查列表类型的节点 if node.get('type') == 'list': items = node.get('content', {}).get('items', []) # 过滤掉None、空列表,以及只包含None或空值的列表 return bool(items) and any( - isinstance(item, (dict, list)) and bool(item) + isinstance(item, (dict, list)) and bool(item) for item in items) - #检测code类型的节点 + # 检测code类型的节点 if node.get('type') == 'code': code_content = node.get('content', {}).get('code_content') # 如果代码内容为None或空字符串,则视为无效节点 return bool(code_content and code_content.strip()) - #检测行间公式类型的节点 + # 检测行间公式类型的节点 if node.get('type') == 'equation-interline': math_content = node.get('content', {}).get('math_content') # 如果公式内容为None或空字符串,则视为无效节点 return bool(math_content and math_content.strip()) - #检测image类型的节点 + # 检测image类型的节点 if node.get('type') == 'image': content = node.get('content', {}) # 检查url、path或data字段是否至少有一个不为空 return bool(content.get('url') or content.get('path') or content.get('data')) - #检测table类型的节点 + # 检测table类型的节点 if node.get('type') == 'table': html = node.get('content', {}).get('html') # 如果表格的html内容为None或空字符串,则视为无效节点 return bool(html and html.strip()) - #检测title类型的节点 + # 检测title类型的节点 if node.get('type') == 'title': title_content = node.get('content', {}).get('title_content') # 如果标题内容为None或空字符串,则视为无效节点 return bool(title_content and title_content.strip()) - #检测段落类型的节点 + # 检测段落类型的节点 if node.get('type') == 'paragraph': content = node.get('content', []) # 检查content列表是否存在且不为空,并且至少有一个非空的内容项 return bool(content) and any( - item.get('c') and item.get('c').strip() + item.get('c') and item.get('c').strip() for item in content - ) + ) return True def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> ContentList: @@ -347,7 +346,7 @@ def __get_cc_node(self, html:str) -> (str, str): if len(nodes) == 0: raise HtmlFileExtractorException(f'html文本中没有cc标签: {html}') if len(nodes) > 3: - raise HtmlFileExtractorException(f'html文本中包含多个cc标签: {html}') + raise HtmlFileExtractorException(f'html文本中包含多个cc标签: {html}') return element_to_html(nodes[0]), nodes[0].tag def __build_extractor(self): diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 9586713f..6d7d94e8 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -3,7 +3,7 @@ from lxml.html import HtmlElement from overrides import override -import json + from llm_web_kit.exception.exception import HtmlTableRecognizerException from llm_web_kit.extractor.html.recognizer.cccode import CodeRecognizer from llm_web_kit.extractor.html.recognizer.ccmath import MathRecognizer @@ -114,7 +114,7 @@ def __is_simple_table(self, tree) -> bool: def __is_table_nested(self, element) -> int: """计算表格的嵌套层级(非表格返回0)""" - if element.tag != "table": + if element.tag != 'table': return 0 # 获取当前表格下所有的表格(包括自身) all_tables = [element] + element.xpath('.//table') @@ -148,7 +148,6 @@ def __get_table_type(self, child: HtmlElement) -> str: table_type = 'complex' return table_type - def __check_table_include_math_code(self, raw_html: HtmlElement): """检查table中的内容,包括普通文本、数学公式和代码.""" math_html = self._element_to_html(raw_html) @@ -159,7 +158,7 @@ def __check_table_include_math_code(self, raw_html: HtmlElement): code_recognizer = CodeRecognizer() code_res_parts = code_recognizer.recognize(base_url='', main_html_lst=math_res_parts, raw_html=math_html) - + result = [] for math_item in code_res_parts: ele_item = self._build_html_tree(math_item[0]) @@ -167,7 +166,7 @@ def __check_table_include_math_code(self, raw_html: HtmlElement): for text_segment in ele_item.itertext(): cleaned_text = text_segment.strip().replace('\\n', '') if cleaned_text: # 过滤空字符串 - #print("cleaned_text", cleaned_text) + # print("cleaned_text", cleaned_text) result.append(cleaned_text) # 处理行内公式 ccinline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INLINE}') @@ -176,7 +175,7 @@ def __check_table_include_math_code(self, raw_html: HtmlElement): el.text.strip() for el in ccinline_math_node if el.text and el.text.strip() ] result.extend(formulas) - + # 处理行间公式 ccinterline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INTERLINE}') if ccinterline_math_node: @@ -184,7 +183,7 @@ def __check_table_include_math_code(self, raw_html: HtmlElement): el.text.strip() for el in ccinterline_math_node if el.text and el.text.strip() ] result.extend(formulas) - + # 处理行内代码 ccinline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE_INLINE}') if ccinline_code_node: @@ -192,7 +191,7 @@ def __check_table_include_math_code(self, raw_html: HtmlElement): el.text.strip() for el in ccinline_code_node if el.text and el.text.strip() ] result.extend(codes) - + # 处理行间代码 ccinterline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE}') if ccinterline_code_node: @@ -200,7 +199,7 @@ def __check_table_include_math_code(self, raw_html: HtmlElement): el.text.strip() for el in ccinterline_code_node if el.text and el.text.strip() ] result.extend(codes) - + return result def __simplify_td_th_content(self, elem: HtmlElement) -> None: diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index a62b8e27..d322c6b1 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -387,18 +387,18 @@ def test_table_element_include_enter(self): | درسهای مرتبط | حسابداری |""" in content_md def test_list_empty(self): - """list抽取为空,原因是嵌套的img标签没有text""" + """list抽取为空,原因是嵌套的img标签没有text.""" chain = ExtractSimpleFactory.create(self.config) self.assertIsNotNone(chain) test_data = self.data_json[14] # Create DataJson from test data input_data = DataJson(test_data) result = chain.extract(input_data) - list_type = result.get_content_list()._get_data()[0][0]["type"] - assert list_type != "list" - + list_type = result.get_content_list()._get_data()[0][0]['type'] + assert list_type != 'list' + def test_table_include_math_p(self): - """table包含math和其他内容""" + """table包含math和其他内容.""" chain = ExtractSimpleFactory.create(self.config) self.assertIsNotNone(chain) test_data = self.data_json[15] @@ -407,4 +407,4 @@ def test_table_include_math_p(self): result = chain.extract(input_data) content_list = result.get_content_list()._get_data() # TODO math模块需要处理下$符号但是非公式 - assert len(content_list[0]) == 17 \ No newline at end of file + assert len(content_list[0]) == 17 From d3f995fb436fb27ec84788728a4397b55b1e4091 Mon Sep 17 00:00:00 2001 From: dt-yy Date: Fri, 7 Mar 2025 15:57:59 +0800 Subject: [PATCH 29/31] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dtable&list=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/extractor/extractor_chain.py | 1 - .../extractor/html/recognizer/cccode.py | 1 - .../extractor/html/recognizer/table.py | 143 ++++++++++-------- .../html/table_include_table_math.html | 90 +++++++++++ .../good_data/html_data_input.jsonl | 3 +- .../extractor/html/recognizer/test_table.py | 4 +- .../extractor/test_extractor_chain.py | 18 ++- 7 files changed, 189 insertions(+), 71 deletions(-) create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_include_table_math.html diff --git a/llm_web_kit/extractor/extractor_chain.py b/llm_web_kit/extractor/extractor_chain.py index 5063a2fa..d02d17e0 100644 --- a/llm_web_kit/extractor/extractor_chain.py +++ b/llm_web_kit/extractor/extractor_chain.py @@ -46,7 +46,6 @@ def extract(self, data: DataJson) -> DataJson: # Pre extractors for pre_ext in self.__pre_extractors: data = pre_ext.pre_extract(data) - # Main extractors for ext in self.__extractors: data = ext.extract(data) diff --git a/llm_web_kit/extractor/html/recognizer/cccode.py b/llm_web_kit/extractor/html/recognizer/cccode.py index d98d5a75..4a638fee 100644 --- a/llm_web_kit/extractor/html/recognizer/cccode.py +++ b/llm_web_kit/extractor/html/recognizer/cccode.py @@ -38,7 +38,6 @@ def recognize( if self.is_cc_html(html): rtn.append((html, raw_html)) continue - root: HtmlElement = html_to_element(html) while True: # 最常见: diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 6d7d94e8..28694c7f 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -5,7 +5,6 @@ from overrides import override from llm_web_kit.exception.exception import HtmlTableRecognizerException -from llm_web_kit.extractor.html.recognizer.cccode import CodeRecognizer from llm_web_kit.extractor.html.recognizer.ccmath import MathRecognizer from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) @@ -113,7 +112,7 @@ def __is_simple_table(self, tree) -> bool: return True def __is_table_nested(self, element) -> int: - """计算表格的嵌套层级(非表格返回0)""" + """计算表格的嵌套层级(非表格返回0,根据原始table判断的.""" if element.tag != 'table': return 0 # 获取当前表格下所有的表格(包括自身) @@ -151,73 +150,90 @@ def __get_table_type(self, child: HtmlElement) -> str: def __check_table_include_math_code(self, raw_html: HtmlElement): """检查table中的内容,包括普通文本、数学公式和代码.""" math_html = self._element_to_html(raw_html) - # 处理数学公式和代码 math_recognizer = MathRecognizer() - math_res_parts = math_recognizer.recognize(base_url='', main_html_lst=[(math_html, math_html)], - raw_html=math_html) - code_recognizer = CodeRecognizer() - code_res_parts = code_recognizer.recognize(base_url='', main_html_lst=math_res_parts, - raw_html=math_html) - + math_res_parts = math_recognizer.recognize( + base_url='', + main_html_lst=[(math_html, math_html)], + raw_html=math_html + ) result = [] - for math_item in code_res_parts: + for math_item in math_res_parts: ele_item = self._build_html_tree(math_item[0]) - # 处理所有文本内容 - for text_segment in ele_item.itertext(): - cleaned_text = text_segment.strip().replace('\\n', '') - if cleaned_text: # 过滤空字符串 - # print("cleaned_text", cleaned_text) - result.append(cleaned_text) - # 处理行内公式 - ccinline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INLINE}') - if ccinline_math_node: - formulas = [ - el.text.strip() for el in ccinline_math_node if el.text and el.text.strip() - ] - result.extend(formulas) - - # 处理行间公式 - ccinterline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INTERLINE}') - if ccinterline_math_node: - formulas = [ - el.text.strip() for el in ccinterline_math_node if el.text and el.text.strip() - ] - result.extend(formulas) - - # 处理行内代码 - ccinline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE_INLINE}') - if ccinline_code_node: - codes = [ - el.text.strip() for el in ccinline_code_node if el.text and el.text.strip() - ] - result.extend(codes) - - # 处理行间代码 - ccinterline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE}') - if ccinterline_code_node: - codes = [ - el.text.strip() for el in ccinterline_code_node if el.text and el.text.strip() - ] - result.extend(codes) + def process_node(node): + """处理行内公式、行间公式、行间代码、行内代码.""" + if node.tag == CCTag.CC_MATH_INLINE: + if node.text and node.text.strip(): + result.append(f'${node.text.strip()}$') + if node.tail and node.tail.strip(): + result.append(node.tail.strip()) + # 处理行间公式 + elif node.tag == CCTag.CC_MATH_INTERLINE: + if node.text and node.text.strip(): + result.append(f'$${node.text.strip()}$$') + if node.tail and node.tail.strip(): + result.append(node.tail.strip()) + # 处理行间代码 + elif node.tag == CCTag.CC_CODE: + if node.text and node.text.strip(): + result.append(f'```{node.text.strip()}```') + if node.tail and node.tail.strip(): + result.append(node.tail.strip()) + # 处理行内代码 + elif node.tag == CCTag.CC_CODE_INLINE: + if node.text and node.text.strip(): + result.append(f'`{node.text.strip()}`') + if node.tail and node.tail.strip(): + result.append(node.tail.strip()) + else: + # 提取当前节点的文本 + if node.text and node.text.strip(): + cleaned_text = node.text.strip().replace('\\n', '') + result.append(cleaned_text) + # 处理节点的tail(元素闭合后的文本) + if node.tail and node.tail.strip(): + cleaned_tail = node.tail.strip().replace('\\n', '') + result.append(cleaned_tail) + # 递归处理子节点 + for child in node: + process_node(child) + # 从根节点开始处理 + process_node(ele_item) return result - def __simplify_td_th_content(self, elem: HtmlElement) -> None: - """简化
内容,仅保留文本内容.""" + def __simplify_td_th_content(self, table_nest_level, elem: HtmlElement) -> None: + """简化 内容,保留嵌套表格结构.""" if elem.tag in ['td', 'th']: - # 简化单元格中的元素 - parse_res = list() - math_res = self.__check_table_include_math_code(elem) - parse_res.extend(math_res) - for item in list(elem.iterchildren()): - elem.remove(item) - if parse_res: - elem.text = '
'.join(parse_res) + parse_res = [] + # 检查是否存在嵌套的表格 + if table_nest_level > 1: + # 存在嵌套表格,递归处理子节点 + for child in elem.iterchildren(): + if child.tag == 'table': + # 对嵌套表格递归调用简化处理 + self.__simplify_td_th_content(table_nest_level, child) + else: + # 处理非表格元素 + math_res = self.__check_table_include_math_code(child) + parse_res.extend(math_res) + elem.remove(child) + # 将非表格内容拼接后放在表格前面 + if parse_res: + elem.text = ' '.join(parse_res) + (elem.text or '') + else: + # 没有嵌套表格,直接简化 + math_res = self.__check_table_include_math_code(elem) + parse_res.extend(math_res) + for item in list(elem.iterchildren()): + elem.remove(item) + if parse_res: + elem.text = ' '.join(parse_res) return - for child in elem.iter('td', 'th'): - self.__simplify_td_th_content(child) + # 非 td/th 元素继续递归处理 + for child in elem.iterchildren(): + self.__simplify_td_th_content(table_nest_level, child) - def __get_table_body(self, table_type, table_root): + def __get_table_body(self, table_type, table_nest_level, table_root): """获取并处理table body,返回处理后的HTML字符串。""" if table_type == 'empty': return None @@ -233,11 +249,12 @@ def __get_table_body(self, table_type, table_root): elem.text = elem.text.strip().replace('\\n', '') if elem.tail is not None: elem.tail = elem.tail.strip().replace('\\n', '') - self.__simplify_td_th_content(table_root) + # 单元格内的多标签内容进行简化,空格拼接,公式、代码识别 + self.__simplify_td_th_content(table_nest_level, table_root) # 迭代 for child in table_root.iterchildren(): if child is not None: - self.__get_table_body(table_type, child) + self.__get_table_body(table_type, table_nest_level, child) return self._element_to_html(table_root) def __do_extract_tables(self, root: HtmlElement) -> None: @@ -247,7 +264,7 @@ def __do_extract_tables(self, root: HtmlElement) -> None: table_type = self.__get_table_type(root) table_nest_level = self.__is_table_nested(root) tail_text = root.tail - table_body = self.__get_table_body(table_type, root) + table_body = self.__get_table_body(table_type, table_nest_level, root) cc_element = self._build_cc_element( CCTag.CC_TABLE, table_body, tail_text, table_type=table_type, table_nest_level=table_nest_level, html=table_raw_html) diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_include_table_math.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_include_table_math.html new file mode 100644 index 00000000..16d7b72e --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_include_table_math.html @@ -0,0 +1,90 @@ + + + + + + + + +
+ + + + + + + + + + + + + +
+

STEM 综合展示表

+
+

基础公式:

+ E = mc^2 + + + + + + + + + + + +
单位换算: + 1 \text{km} = 10^3 \text{m} + + + + + + + + + + + + +
长度质量时间
1m=10^2cm1kg=10^3g1h=3600s
+
运动学: + v = \frac{dx}{dt} + a = \frac{dv}{dt} +
+
+

编程示例:

+
console.log("Hello World")
+ + + + + + + +
+

Python:

+
print(sum(range(1,n+1)))
+
+

对应公式:

+ \sum_{i=1}^{n} i = \frac{n(n+1)}{2} + + + + + + + + + + + +
等差数列等比数列
S_n = \frac{n(a_1+a_n)}{2}S_n = a_1\frac{1-r^n}{1-r}
+
+
+
+ + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl index f41b78ec..76b39eb6 100644 --- a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl @@ -13,4 +13,5 @@ {"track_id": "table_tail_text", "dataset_name": "test_table_tail_text", "url": "https://dchublists.com/?do=hublist&id=hub-975&language=en","data_source_category": "HTML", "path":"table_tail_text.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "table_elem_include_enter", "dataset_name": "table_elem_include_enter", "url": "https://fardapaper.ir/financial-development-equity-capital","data_source_category": "HTML", "path":"test_table_elem_include_enter.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "list_empty", "dataset_name": "test_list_empty", "url": "https://productcenter.ru/products/27276/naturalnoie-krymskoie-mylo-ruchnoi-raboty-39-raznovidnostiei","data_source_category": "HTML", "path":"test_list_empty.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} -{"track_id": "table_include_math_p", "dataset_name": "table_include_math_p", "url": "https://math.stackexchange.com/questions/458323/is-8327-1-a-prime-number?answertab=active","data_source_category": "HTML", "path":"table_include_math_p.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file +{"track_id": "table_include_math_p", "dataset_name": "table_include_math_p", "url": "https://math.stackexchange.com/questions/458323/is-8327-1-a-prime-number?answertab=active","data_source_category": "HTML", "path":"table_include_math_p.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} +{"track_id": "table_include_table_math", "dataset_name": "table_include_table_math", "url": "https://test","data_source_category": "HTML", "path":"table_include_table_math.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 0608e825..afb9418f 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -89,7 +89,7 @@ def test_cc_simple_table(self): parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) assert len(parts) == 3 content = html_to_element(parts[1][0]).text_content() - assert content == r'
Рейтинг:Рейтинг<br>5.00<br>из 5 на основе опроса<br>3<br>пользователей
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84<br>₽
' + assert content == r'
Рейтинг:Рейтинг 5.00 из 5 на основе опроса 3 пользователей
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84 ₽
' def test_cc_complex_table(self): """cc跨行跨列的表格.""" @@ -155,7 +155,7 @@ def test_table_involve_equation(self): raw_html = raw_html_path.read_text(encoding='utf-8') parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') - assert complex_table_tag[0].text == r'
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution{\displaystyle \Pr \,(X=k)={\binom {n}{k}}p^{k}(1-p)^{n-k}}{\displaystyle np}{\displaystyle np(1-p)}
Geometric distribution{\displaystyle \Pr \,(X=k)=(1-p)^{k-1}p}{\displaystyle {\frac {1}{p}}}{\displaystyle {\frac {(1-p)}{p^{2}}}}
Normal distribution{\displaystyle f\left(x\mid \mu ,\sigma ^{2}\right)={\frac {1}{\sqrt {2\pi \sigma ^{2}}}}e^{-{\frac {(x-\mu )^{2}}{2\sigma ^{2}}}}}{\displaystyle \mu }{\displaystyle \sigma ^{2}}
Uniform distribution (continuous){\displaystyle f(x\mid a,b)={\begin{cases}{\frac {1}{b-a}}&{\text{for }}a\leq x\leq b,\\[3pt]0&{\text{for }}x<a{\text{ or }}x>b\end{cases}}}{\displaystyle {\frac {a+b}{2}}}{\displaystyle {\frac {(b-a)^{2}}{12}}}
Exponential distribution{\displaystyle f(x\mid \lambda )=\lambda e^{-\lambda x}}{\displaystyle {\frac {1}{\lambda }}}{\displaystyle {\frac {1}{\lambda ^{2}}}}
Poisson distribution{\displaystyle f(k\mid \lambda )={\frac {e^{-\lambda }\lambda ^{k}}{k!}}}{\displaystyle \lambda }{\displaystyle \lambda }
' + assert complex_table_tag[0].text == r'
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution${\displaystyle \Pr \,(X=k)={\binom {n}{k}}p^{k}(1-p)^{n-k}}$${\displaystyle np}$${\displaystyle np(1-p)}$
Geometric distribution${\displaystyle \Pr \,(X=k)=(1-p)^{k-1}p}$${\displaystyle {\frac {1}{p}}}$${\displaystyle {\frac {(1-p)}{p^{2}}}}$
Normal distribution${\displaystyle f\left(x\mid \mu ,\sigma ^{2}\right)={\frac {1}{\sqrt {2\pi \sigma ^{2}}}}e^{-{\frac {(x-\mu )^{2}}{2\sigma ^{2}}}}}$${\displaystyle \mu }$${\displaystyle \sigma ^{2}}$
Uniform distribution (continuous)${\displaystyle f(x\mid a,b)={\begin{cases}{\frac {1}{b-a}}&{\text{for }}a\leq x\leq b,\\[3pt]0&{\text{for }}x<a{\text{ or }}x>b\end{cases}}}$${\displaystyle {\frac {a+b}{2}}}$${\displaystyle {\frac {(b-a)^{2}}{12}}}$
Exponential distribution${\displaystyle f(x\mid \lambda )=\lambda e^{-\lambda x}}$${\displaystyle {\frac {1}{\lambda }}}$${\displaystyle {\frac {1}{\lambda ^{2}}}}$
Poisson distribution${\displaystyle f(k\mid \lambda )={\frac {e^{-\lambda }\lambda ^{k}}{k!}}}$${\displaystyle \lambda }$${\displaystyle \lambda }$
' def test_table_involve_after_code(self): """test table involve code, code被提取出去了,过滤掉空的和坏的table.""" diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index d322c6b1..7f3886eb 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -59,7 +59,7 @@ def setUp(self): for line in f: self.data_json.append(json.loads(line.strip())) - assert len(self.data_json) == 16 + assert len(self.data_json) == 17 # Config for HTML extraction self.config = { @@ -358,7 +358,7 @@ def test_table_involve_inline_code(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_list = result.get_content_list()._get_data()[0][0]['content']['html'] - assert content_list == """
FunctionDescriptionExample
print()Prints a message to the console.print("Hello, World!")
len()Returns the length of an object.len([1, 2, 3])
range()Generates a sequence of numbers.range(1, 10)
""" + assert content_list == r"""
FunctionDescriptionExample
`print()`Prints a message to the console.`print("Hello, World!")`
`len()`Returns the length of an object.`len([1, 2, 3])`
`range()`Generates a sequence of numbers.`range(1, 10)`
""" def test_table_tail_text(self): """table的tail文本保留.""" @@ -406,5 +406,17 @@ def test_table_include_math_p(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_list = result.get_content_list()._get_data() - # TODO math模块需要处理下$符号但是非公式 assert len(content_list[0]) == 17 + assert content_list[0][3]['content']['html'] == r"
up vote 17 down vote favorite 5I'm having problems with exercises on proving whether or not a given number is prime. Is $83^{27} + 1$ prime? prime-numbers factoring
" + + def test_table_include_math_p_2(self): + """table包含math和其他内容.""" + chain = ExtractSimpleFactory.create(self.config) + self.assertIsNotNone(chain) + test_data = self.data_json[16] + # Create DataJson from test data + input_data = DataJson(test_data) + result = chain.extract(input_data) + content_list = result.get_content_list()._get_data() + assert content_list[0][2]['content']['html'] == "
单位换算:$1 \\text{km} = 10^3 \\text{m}$
长度质量时间
$1m=10^2cm$$1kg=10^3g$$1h=3600s$
运动学:$v = \\frac{dx}{dt}$ $a = \\frac{dv}{dt}$
" + From 07f1de4689f7edf2323689fd864e77e80a3dc2ad Mon Sep 17 00:00:00 2001 From: dt-yy Date: Fri, 7 Mar 2025 16:06:47 +0800 Subject: [PATCH 30/31] fix pylint --- .../extractor/test_extractor_chain.py | 20 +------------------ 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 35c5cf3a..3cd7feb1 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -59,11 +59,7 @@ def setUp(self): for line in f: self.data_json.append(json.loads(line.strip())) -<<<<<<< HEAD assert len(self.data_json) == 17 -======= - assert len(self.data_json) == 14 ->>>>>>> 620f5e2739380e5091b8d096ecd6242e219e10ae # Config for HTML extraction self.config = { @@ -380,7 +376,6 @@ def test_table_tail_text(self): content_md = result.get_content_list().to_mm_md() assert '| ID: 975' in content_md -<<<<<<< HEAD def test_table_element_include_enter(self): """table的元素中间有换行.""" chain = ExtractSimpleFactory.create(self.config) @@ -428,17 +423,4 @@ def test_table_include_math_p_2(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_list = result.get_content_list()._get_data() - assert content_list[0][2]['content']['html'] == "
单位换算:$1 \\text{km} = 10^3 \\text{m}$
长度质量时间
$1m=10^2cm$$1kg=10^3g$$1h=3600s$
运动学:$v = \\frac{dx}{dt}$ $a = \\frac{dv}{dt}$
" - -======= - def test_clean_tags(self): - """测试clean_tag的preExtractor是否生效.""" - chain = ExtractSimpleFactory.create(self.config) - self.assertIsNotNone(chain) - test_data = self.data_json[13] - input_data = DataJson(test_data) - result = chain.extract(input_data) - content_md = result.get_content_list().to_mm_md() - print(content_md) - self.assertNotIn('begingroup', content_md) ->>>>>>> 620f5e2739380e5091b8d096ecd6242e219e10ae + assert content_list[0][2]['content']['html'] == '
单位换算:$1 \\text{km} = 10^3 \\text{m}$
长度质量时间
$1m=10^2cm$$1kg=10^3g$$1h=3600s$
运动学:$v = \\frac{dx}{dt}$ $a = \\frac{dv}{dt}$
' From dbe26d620db9a724cb844229c3b19a3e6f2da606 Mon Sep 17 00:00:00 2001 From: dt-yy Date: Fri, 7 Mar 2025 16:19:55 +0800 Subject: [PATCH 31/31] =?UTF-8?q?=E8=A7=A3=E5=86=B3list=E5=92=8Ctable?= =?UTF-8?q?=E7=AD=89=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../good_data/html_data_input.jsonl | 5 +---- tests/llm_web_kit/extractor/test_extractor_chain.py | 12 +++++++++++- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl index 1dd414bd..6c191184 100644 --- a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl @@ -11,11 +11,8 @@ {"track_id": "oracle_doc", "dataset_name": "test_pipeline_suit", "url": "https://docs.oracle.com/en-us/iaas/tools/java/3.57.1/com/oracle/bmc/integration/model/CustomEndpointDetails.html","data_source_category": "HTML", "path":"oracle_doc.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "table_involve_inline_code", "dataset_name": "test_table_involve_inline_code", "url": "https://docs.oracle.com/en-us/iaas/tools/java/3.57.1/com/oracle/bmc/integration/model/CustomEndpointDetails.html","data_source_category": "HTML", "path":"table_involve_inline_code.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "table_tail_text", "dataset_name": "test_table_tail_text", "url": "https://dchublists.com/?do=hublist&id=hub-975&language=en","data_source_category": "HTML", "path":"table_tail_text.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} -<<<<<<< HEAD {"track_id": "table_elem_include_enter", "dataset_name": "table_elem_include_enter", "url": "https://fardapaper.ir/financial-development-equity-capital","data_source_category": "HTML", "path":"test_table_elem_include_enter.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "list_empty", "dataset_name": "test_list_empty", "url": "https://productcenter.ru/products/27276/naturalnoie-krymskoie-mylo-ruchnoi-raboty-39-raznovidnostiei","data_source_category": "HTML", "path":"test_list_empty.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "table_include_math_p", "dataset_name": "table_include_math_p", "url": "https://math.stackexchange.com/questions/458323/is-8327-1-a-prime-number?answertab=active","data_source_category": "HTML", "path":"table_include_math_p.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "table_include_table_math", "dataset_name": "table_include_table_math", "url": "https://test","data_source_category": "HTML", "path":"table_include_table_math.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} -======= -{"track_id": "test_clean_tags", "dataset_name": "test_pipeline_suit", "url": "https://math.stackexchange.com/questions/4082284/solving-for-vector-contained-in-a-diagonal-matrix","data_source_category": "HTML", "path":"test_clean_tags.html", "file_bytes": 1000, "page_layout_type":"forum", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} ->>>>>>> 620f5e2739380e5091b8d096ecd6242e219e10ae +{"track_id": "test_clean_tags", "dataset_name": "test_pipeline_suit", "url": "https://math.stackexchange.com/questions/4082284/solving-for-vector-contained-in-a-diagonal-matrix","data_source_category": "HTML", "path":"test_clean_tags.html", "file_bytes": 1000, "page_layout_type":"forum", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 3cd7feb1..40f9c9a5 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -59,7 +59,7 @@ def setUp(self): for line in f: self.data_json.append(json.loads(line.strip())) - assert len(self.data_json) == 17 + assert len(self.data_json) == 18 # Config for HTML extraction self.config = { @@ -424,3 +424,13 @@ def test_table_include_math_p_2(self): result = chain.extract(input_data) content_list = result.get_content_list()._get_data() assert content_list[0][2]['content']['html'] == '
单位换算:$1 \\text{km} = 10^3 \\text{m}$
长度质量时间
$1m=10^2cm$$1kg=10^3g$$1h=3600s$
运动学:$v = \\frac{dx}{dt}$ $a = \\frac{dv}{dt}$
' + + def test_clean_tags(self): + """测试clean_tag的preExtractor是否生效.""" + chain = ExtractSimpleFactory.create(self.config) + self.assertIsNotNone(chain) + test_data = self.data_json[17] + input_data = DataJson(test_data) + result = chain.extract(input_data) + content_md = result.get_content_list().to_mm_md() + self.assertNotIn('begingroup', content_md)