From 69d420cf2c8b1357bc9b454269da995991e4c669 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 13:01:20 +0800 Subject: [PATCH 01/22] resolve nest table --- .../extractor/html/recognizer/table.py | 137 ++- .../recognizer/table_include_code_expect.json | 299 +++++ .../assets/recognizer/table_involve_code.html | 1001 +++++++++++++++++ .../table_to_content_list_complex_res.json | 3 +- .../extractor/html/recognizer/test_table.py | 23 +- 5 files changed, 1421 insertions(+), 42 deletions(-) create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_code.html diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index e2c70a39..b41f8834 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -1,9 +1,12 @@ -from typing import List, Tuple +from itertools import chain +from typing import Any, List, Tuple from lxml.html import HtmlElement from overrides import override from llm_web_kit.exception.exception import HtmlTableRecognizerExp +from llm_web_kit.extractor.html.recognizer.cccode import CodeRecognizer +from llm_web_kit.extractor.html.recognizer.ccmath import MathRecognizer from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) from llm_web_kit.libs.doc_element_type import DocElementType @@ -42,7 +45,7 @@ def recognize(self, def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: if not parsed_content: raise HtmlTableRecognizerExp(f'table parsed_content{parsed_content}为空') - table_type, table_body = self.__get_attribute(parsed_content) + table_type, table_nest_level, table_body = self.__get_attribute(parsed_content) d = { 'type': DocElementType.TABLE, # "bbox": [], @@ -52,6 +55,7 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm }, } d['content']['is_complex'] = table_type + d['content']['table_nest_level'] = table_nest_level return d def __is_contain_cc_html(self, cc_html: str) -> bool: @@ -64,6 +68,7 @@ def __is_table_empty(self, table) -> bool: :param table: lxml.html.HtmlElement 对象,表示一个 元素 :return: 如果表格为空,返回 True;否则返回 False """ + def is_element_empty(elem): # 检查元素本身的文本内容 if elem.text and elem.text.strip(): @@ -81,6 +86,7 @@ def is_element_empty(elem): if elem.tail and elem.tail.strip(): return False return True + # 检查所有单元格 for cell in table.xpath('.//td | .//th'): # 检查单元格内容 @@ -101,7 +107,8 @@ def __is_simple_table(self, tree) -> bool: colspan = int(colspan_str) rowspan = int(rowspan_str) except ValueError as e: - raise HtmlTableRecognizerExp(f'table的合并单元格属性值colspan:{colspan_str}或rowspan:{rowspan_str}不是有效的整数') from e + raise HtmlTableRecognizerExp( + f'table的合并单元格属性值colspan:{colspan_str}或rowspan:{rowspan_str}不是有效的整数') from e if (colspan > 1) or (rowspan > 1): return False return True @@ -114,28 +121,28 @@ def __is_table_contain_img(self, tree) -> bool: else: return False - def __is_table_nested(self, tree) -> bool: - """判断table元素是否嵌套.""" - nested_tables = tree.xpath('//table//table') - if len(nested_tables) == 0: - return True - else: - return False + def __is_table_nested(self, tree) -> int: + """获取表格元素的嵌套层级(非表格元素返回0,顶层表格返回1,嵌套表格返回层级数).""" + if tree.tag != 'table': + return 0 # 非表格元素返回0 + # 计算祖先中的 table 数量(不包括自身),再加1表示自身层级 + return len(tree.xpath('ancestor::table')) + 1 - def __extract_tables(self, ele: HtmlElement) -> List[str]: + def __extract_tables(self, ele: HtmlElement) -> list[tuple[str, str]]: """提取html中的table元素.""" - tree = self._build_html_tree(ele) - self.__do_extract_tables(tree) - new_html = self._element_to_html(tree) + self.__do_extract_tables(ele) + new_html = self._element_to_html(ele) lst = self.html_split_by_tags(new_html, CCTag.CC_TABLE) return lst def __get_table_type(self, child: HtmlElement) -> str: """获取table的类型.""" empty_flag = self.__is_table_empty(child) + level = self.__is_table_nested(child) if empty_flag: return 'empty' - flag = self.__is_simple_table(child) and self.__is_table_nested(child) + # 是否跨行跨列 + flag = (self.__is_simple_table(child) and level < 2) if flag: table_type = 'simple' else: @@ -147,36 +154,91 @@ def __extract_table_element(self, ele: HtmlElement) -> str: for item in ele.iterchildren(): return self._element_to_html(item) - def __simplify_td_th_content(self, elem): + def __check_table_include_math_code(self, raw_html: HtmlElement): + """check table中是否包含math.""" + math_html = self._element_to_html(raw_html) + ele_res = list() + math_recognizer = MathRecognizer() + math_res_parts = math_recognizer.recognize(base_url='', main_html_lst=[(math_html, math_html)], + raw_html=math_html) + code_recognizer = CodeRecognizer() + code_res_parts = code_recognizer.recognize(base_url='', main_html_lst=math_res_parts, + raw_html=math_html) + for math_item in code_res_parts: + ele_item = self._build_html_tree(math_item[0]) + ccinline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INLINE}') + ccinline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE_INLINE}') + ccinterline_math_node = ele_item.xpath(f'//{CCTag.CC_MATH_INTERLINE}') + ccinterline_code_node = ele_item.xpath(f'//{CCTag.CC_CODE}') + if ccinline_math_node: + formulas = [ + el.text if el.text.strip() else '' + for el in ccinline_math_node + ] + ele_res.extend(formulas) # 添加字符串 + elif ccinterline_math_node: + codes = [ + el.text if el.text.strip() else '' + for el in ccinterline_math_node + ] + ele_res.extend(codes) + elif ccinline_code_node: + inline_codes = [ + el.text if el.text.strip() else '' + for el in ccinline_code_node + ] + ele_res.extend(inline_codes) + elif ccinterline_code_node: + ccinterline_codes = [ + el.text if el.text else '' + for el in ccinterline_code_node + ] + ele_res.extend(ccinterline_codes) + else: + ele_res.extend([ + text.strip() + for text in self._build_html_tree(math_item[1]).itertext() + if text.strip() + ]) + return ele_res + + def __simplify_td_th_content(self, elem: HtmlElement) -> None: """简化
内容,仅保留文本内容.""" - if elem.tag in ['td', 'th'] and len(elem.xpath('.//table')) == 0: - result = '
'.join([text for text in elem.itertext() if text.strip()]) - for child in list(elem): - elem.remove(child) - elem.text = result - elif elem.tag in ['td', 'th'] and len(elem.xpath('.//table')) > 0: - for item in elem.iterchildren(): - self.__simplify_td_th_content(item) + if elem.tag in ['td', 'th']: + # 简化单元格中的元素 + parse_res = list() + math_res = self.__check_table_include_math_code(elem) + parse_res.extend(math_res) + for item in list(elem.iterchildren()): + elem.remove(item) + elem.text = '
'.join(parse_res) + return + for child in elem.iter('td', 'th'): + self.__simplify_td_th_content(child) def __get_table_body(self, table_type, table_root): """获取并处理table body,返回处理后的HTML字符串。""" if table_type == 'empty': return None allowed_attributes = ['colspan', 'rowspan'] - for child in list(table_root.iterchildren()): - if child.tag is not None: - self.__get_table_body(table_type, child) - for ele in table_root.iter('td', 'th'): - self.__simplify_td_th_content(ele) + # 清理除了colspan和rowspan之外的属性 if len(table_root.attrib) > 0: cleaned_attrs = {k: v for k, v in table_root.attrib.items() if k in allowed_attributes} table_root.attrib.clear() table_root.attrib.update(cleaned_attrs) - if table_root.text is not None: - table_root.text = table_root.text.strip() - for elem in table_root.iter(): - if elem.tail is not None: + # text进行strip操作,tail去掉(有较多空换行) + for elem in chain([table_root], table_root.iterdescendants()): + if elem.text: + elem.text = elem.text.strip() + if elem.tail: elem.tail = elem.tail.strip() + + self.__simplify_td_th_content(table_root) + # 迭代 + for child in table_root.iterchildren(): + if child is not None: + self.__get_table_body(table_type, child) + return self._element_to_html(table_root) def __do_extract_tables(self, root: HtmlElement) -> None: @@ -184,23 +246,26 @@ def __do_extract_tables(self, root: HtmlElement) -> None: if root.tag in ['table']: table_raw_html = self._element_to_html(root) table_type = self.__get_table_type(root) + table_nest_level = self.__is_table_nested(root) tail_text = root.tail table_body = self.__get_table_body(table_type, root) cc_element = self._build_cc_element( - CCTag.CC_TABLE, table_body, tail_text, table_type=table_type, html=table_raw_html) + CCTag.CC_TABLE, table_body, tail_text, table_type=table_type, table_nest_level=table_nest_level, + html=table_raw_html) self._replace_element(root, cc_element) return for child in root.iterchildren(): self.__do_extract_tables(child) - def __get_attribute(self, html: str) -> Tuple[int, str]: + def __get_attribute(self, html: str) -> tuple[bool, Any, Any]: """获取element的属性.""" ele = self._build_html_tree(html) if ele is not None and ele.tag == CCTag.CC_TABLE: table_type = ele.attrib.get('table_type') + table_nest_level = ele.attrib.get('table_nest_level') table_flag = self.__get_content_list_table_type(table_type) table_body = ele.text - return table_flag, table_body + return table_flag, table_nest_level, table_body else: raise HtmlTableRecognizerExp(f'{html}中没有cctable标签') diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json new file mode 100644 index 00000000..15a9cf34 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json @@ -0,0 +1,299 @@ +
1<br>2<br>3<br>4<br>5<br>6<br>7<br>8<br>9<br>10<br>11<br>12<br>13<br>14<br>15<br>16<br>17<br>18<br>19<br>20<br>21<br>22<br>23<br>24<br>25<br>26<br>27<br>28<br>29<br>30<br>31<br>32<br>33<br>34<br>35<br>36<br>37<br>38<br>39<br>40<br>41<br>42<br>43<br>44<br>45<br>46<br>47<br>48<br>49<br>50<br>51<br>52<br>53<br>54<br>55<br>56<br>57<br>58<br>59<br>60<br>61<br>62<br>63<br>64<br>65<br>66<br>67<br>68<br>69<br>70<br>71<br>72<br>73<br>74<br>75<br>76<br>77<br>78<br>79<br>80<br>81<br>82<br>83<br>84<br>85<br>86<br>87<br>88<br>89<br>90<br>91<br>92<br>93<br>94<br>95<br>96<br>97<br>98<br>99<br>100<br>101<br>102<br>103<br>104<br>105<br>106<br>107<br>108<br>109<br>110<br>111<br>112<br>113<br>114<br>115<br>116<br>117<br>118<br>119<br>120<br>121<br>122<br>123<br>124<br>125<br>126<br>127<br>128<br>129<br>130<br>131<br>132<br>133<br>134<br>135<br>136<br>137<br>138<br>139<br>140<br>141<br>142<br>143<br>144<br>145<br>146<br>147<br>148<br>149<br>150<br>151<br>152<br>153<br>154<br>155<br>156<br>157<br>158<br>159<br>160<br>161<br>162<br>163<br>164<br>165<br>166<br>167<br>168<br>169<br>170<br>171<br>172<br>173<br>174<br>175<br>176<br>177<br>178<br>179<br>180<br>181<br>182<br>183<br>184<br>185<br>186<br>187<br>188<br>189<br>190<br>191<br>192<br>193<br>194<br>195<br>196<br>197<br>198<br>199<br>200<br>201<br>202<br>203<br>204<br>205<br>206<br>207<br>208<br>209<br>210<br>211<br>212<br>213<br>214<br>215<br>216<br>217<br>218<br>219<br>220<br>221<br>222<br>223<br>224<br>225<br>226<br>227<br>228<br>229<br>230<br>231<br>232<br>233<br>234<br>235<br>236<br>237<br>238<br>239<br>240<br>241<br>242<br>243<br>244<br>245<br>246<br>247<br>248<br>249<br>250<br>251<br>252<br>253<br>254<br>255<br>256<br>257<br>258<br>259<br>260<br>261<br>262<br>263<br>264<br>265<br>266<br>267<br>268<br>269<br>270<br>271<br>272<br>273<br>274<br>275<br>276<br>277<br>278<br>279<br>280<br>281<br>282<br>283<br>284<br>285<br>286<br>287<br>288<br>289<br>290<br>291<br>292<br>293<br>294<br>295<br>296<br>297<br>298<br>299<%@ page language="java"import="java.util.*"pageEncoding="utf-8"%> +<% +String path = request.getContextPath(); +String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/"; +%> + +<!DOCTYPE HTML PUBLIC"-//W3C//DTD HTML 4.01 Transitional//EN"> +<html> +<head> +<title>My JSP'register.jsp'starting page</title> +</head> + +<body> +<script type="text/javascript"> +function validate(){ +if(registerForm.uname.value==""){ +alert("账号不能为空!"); +return; +} +if(registerForm.upwd.value==""){ +alert("密码不能为空!"); +return; +} +registerForm.submit(); +} +</script> + +<form name="registerForm"action="DoregServlet"method="post"> + +用户名:<input type="text"name="uname"><br> +密 码: <input type="password"name="upwd"> <br> +<input type="submit"value="注册"> +<a href="denglu.jsp">登录</a> +</form> + +</body> +</html> + + + +packagecom.servlet; + +importjava.io.IOException; +importjava.io.PrintWriter; + +importjavax.servlet.ServletException; +importjavax.servlet.http.HttpServlet; +importjavax.servlet.http.HttpServletRequest; +importjavax.servlet.http.HttpServletResponse; + +importcom.dao.UsersDao; + +publicclassservlet3extendsHttpServlet { + +publicservlet3() { +super(); +} + + +publicvoiddestroy() { +super.destroy();// Just puts "destroy" string in log +// Put your code here +} + + +publicvoiddoGet(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { +doPost (request, response); + +} + + +publicvoiddoPost(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { + +String uname = request.getParameter("uname"); +String upwd = request.getParameter("upwd"); +UsersDao usersDao =newUsersDao(); +inti=usersDao.reg(uname, upwd); +if(i>0){ + +response.setHeader("refresh","2;url=login.jsp"); +}else{ + +response.setHeader("refresh","2;url=reg.jsp"); +} +} + +/** +* Initialization of the servlet. <br> +* +* @throws ServletException if an error occurs +*/ +publicvoidinit()throwsServletException { +// Put your code here +} + +} + + + + + +packagecom.sf.servlet; + +importjava.io.IOException; +importjava.io.PrintWriter; + +importjavax.servlet.ServletException; +importjavax.servlet.http.HttpServlet; +importjavax.servlet.http.HttpServletRequest; +importjavax.servlet.http.HttpServletResponse; + +importcom.sf.dao.MsgDao; +importcom.sf.dao.UsersDao; + +publicclassDoregservletextendsHttpServlet { + +/** +* Constructor of the object. +*/ +publicDoregservlet() { +super(); +} + +/** +* Destruction of the servlet. <br> +*/ +publicvoiddestroy() { +super.destroy();// Just puts "destroy" string in log +// Put your code here +} + +publicvoiddoGet(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { + +response.setContentType("text/html"); +PrintWriter out = response.getWriter(); +request.setCharacterEncoding("utf-8"); +String uname = request.getParameter("uname"); +String upwd = request.getParameter("upwd"); + +UsersDao ud =newUsersDao(); +MsgDao md =newMsgDao(); +if(ud.register(uname, upwd) >0) { +request.getSession().setAttribute("uname", uname); +request.getRequestDispatcher("denglu.jsp").forward(request, +response); +}else{ +out.print("注册失败,请重新注册......."); +response.setHeader("refresh","3;url=reg.jsp"); +} +} +publicvoiddoPost(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { + +doGet(request,response); +} + +/** +* Initialization of the servlet. <br> +* +* @throws ServletException if an error occurs +*/ +publicvoidinit()throwsServletException { +// Put your code here +} + +} + + + + + +packagecom.servlet; + +importjava.io.IOException; +importjava.io.PrintWriter; + +importjavax.servlet.ServletException; +importjavax.servlet.http.HttpServlet; +importjavax.servlet.http.HttpServletRequest; +importjavax.servlet.http.HttpServletResponse; + +importcom.dao.MsgDao; + +publicclassservlet5extendsHttpServlet { + +publicservlet5() { +super(); +} + +publicvoiddestroy() { +super.destroy();// Just puts "destroy" string in log +// Put your code here +} + + +publicvoiddoGet(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { + +doPost(request, response); +} + + +publicvoiddoPost(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { + +request.setCharacterEncoding("utf-8"); + +intid=Integer.parseInt(request.getParameter("id")); +MsgDao md=newMsgDao(); +md.delMail(id); +response.getWriter().print("刪除成功....."); +response.setHeader("refresh","2;url=main.jsp"); +response.sendRedirect("main2.jsp"); +} + + +publicvoidinit()throwsServletException { + +} + +} + + + + + + + +packagecom.sf.servlet; + +importjava.io.IOException; +importjava.io.PrintWriter; + +importjavax.servlet.ServletException; +importjavax.servlet.http.HttpServlet; +importjavax.servlet.http.HttpServletRequest; +importjavax.servlet.http.HttpServletResponse; + +importcom.sf.dao.MsgDao; +importcom.sf.entity.Msg; + +publicclassDowriteservletextendsHttpServlet { + +/** +* Constructor of the object. +*/ +publicDowriteservlet() { +super(); +} + +/** +* Destruction of the servlet. <br> +*/ +publicvoiddestroy() { +super.destroy();// Just puts "destroy" string in log +// Put your code here +} + +publicvoiddoGet(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { + +response.setContentType("text/html"); +PrintWriter out = response.getWriter(); +request.setCharacterEncoding("utf-8"); +String uname = (String) request.getSession().getAttribute("uname"); +String sendto = request.getParameter("receiver"); +String title = request.getParameter("title"); +String content = request.getParameter("content"); + +Msg m =newMsg(); +m.setMsgcontent(content); +m.setUsername(uname); +m.setSendto(sendto); +m.setTitle(title); + +MsgDao md =newMsgDao(); +md.addMsg(m); + +out.print("发送成功....."); +response.setHeader("refresh","3;url=main.jsp"); +} + +publicvoiddoPost(HttpServletRequest request, HttpServletResponse response) +throwsServletException, IOException { + +doGet(request,response); } + +/** +* Initialization of the servlet. <br> +* +* @throws ServletException if an error occurs +*/ +publicvoidinit()throwsServletException { +} + +}
\ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_code.html b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_code.html new file mode 100644 index 00000000..d1961838 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_code.html @@ -0,0 +1,1001 @@ + + + + + + + + + + + + + + 第十三周作业 - 徐涛% - 博客园 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+
+
+ +
+
+

+ + 第十三周作业 + + + + +

+
+
+
+
+ + + + + + + + + + +
+
1
+
2
+
3
+
4
+
5
+
6
+
7
+
8
+
9
+
10
+
11
+
12
+
13
+
14
+
15
+
16
+
17
+
18
+
19
+
20
+
21
+
22
+
23
+
24
+
25
+
26
+
27
+
28
+
29
+
30
+
31
+
32
+
33
+
34
+
35
+
36
+
37
+
38
+
39
+
40
+
41
+
42
+
43
+
44
+
45
+
46
+
47
+
48
+
49
+
50
+
51
+
52
+
53
+
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+
68
+
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+
80
+
81
+
82
+
83
+
84
+
85
+
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+
94
+
95
+
96
+
97
+
98
+
99
+
100
+
101
+
102
+
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
+
111
+
112
+
113
+
114
+
115
+
116
+
117
+
118
+
119
+
120
+
121
+
122
+
123
+
124
+
125
+
126
+
127
+
128
+
129
+
130
+
131
+
132
+
133
+
134
+
135
+
136
+
137
+
138
+
139
+
140
+
141
+
142
+
143
+
144
+
145
+
146
+
147
+
148
+
149
+
150
+
151
+
152
+
153
+
154
+
155
+
156
+
157
+
158
+
159
+
160
+
161
+
162
+
163
+
164
+
165
+
166
+
167
+
168
+
169
+
170
+
171
+
172
+
173
+
174
+
175
+
176
+
177
+
178
+
179
+
180
+
181
+
182
+
183
+
184
+
185
+
186
+
187
+
188
+
189
+
190
+
191
+
192
+
193
+
194
+
195
+
196
+
197
+
198
+
199
+
200
+
201
+
202
+
203
+
204
+
205
+
206
+
207
+
208
+
209
+
210
+
211
+
212
+
213
+
214
+
215
+
216
+
217
+
218
+
219
+
220
+
221
+
222
+
223
+
224
+
225
+
226
+
227
+
228
+
229
+
230
+
231
+
232
+
233
+
234
+
235
+
236
+
237
+
238
+
239
+
240
+
241
+
242
+
243
+
244
+
245
+
246
+
247
+
248
+
249
+
250
+
251
+
252
+
253
+
254
+
255
+
256
+
257
+
258
+
259
+
260
+
261
+
262
+
263
+
264
+
265
+
266
+
267
+
268
+
269
+
270
+
271
+
272
+
273
+
274
+
275
+
276
+
277
+
278
+
279
+
280
+
281
+
282
+
283
+
284
+
285
+
286
+
287
+
288
+
289
+
290
+
291
+
292
+
293
+
294
+
295
+
296
+
297
+
298
+
299
+ +
+
+
<%@ page language="java" import="java.util.*" pageEncoding="utf-8"%>
+
<%
+
String path = request.getContextPath();
+
String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
+
%>
+
 
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+
<html>
+
  <head>
+
    <title>My JSP 'register.jsp' starting page</title>   
+
  </head>
+
  
+
  <body>
+
  <script type="text/javascript">
+
        function validate(){
+
            if(registerForm.uname.value==""){
+
                alert("账号不能为空!");
+
                return;
+
            }
+
            if(registerForm.upwd.value==""){
+
                alert("密码不能为空!");
+
                return;
+
            }
+
            registerForm.submit();
+
        }
+
    </script>
+
 
+
    <form  name="registerForm" action="DoregServlet" method="post">
+
 
+
        用户名:<input type="text" name="uname"><br>
+
        密   码: <input type="password" name="upwd"> <br>
+
        <input type="submit" value="注册" >
+
        <a href="denglu.jsp">登录</a>
+
    </form>
+
     
+
  </body>
+
</html>
+
 
+
  
+
 
+
package com.servlet;
+
 
+
import java.io.IOException;
+
import java.io.PrintWriter;
+
 
+
import javax.servlet.ServletException;
+
import javax.servlet.http.HttpServlet;
+
import javax.servlet.http.HttpServletRequest;
+
import javax.servlet.http.HttpServletResponse;
+
 
+
import com.dao.UsersDao;
+
 
+
public class servlet3 extends HttpServlet {
+
 
+
    public servlet3() {
+
        super();
+
    }
+
 
+
     
+
    public void destroy() {
+
        super.destroy(); // Just puts "destroy" string in log
+
        // Put your code here
+
    }
+
 
+
 
+
    public void doGet(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
        doPost (request, response);
+
         
+
    }
+
 
+
     
+
    public void doPost(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
 
+
        String uname = request.getParameter("uname");
+
        String upwd = request.getParameter("upwd");
+
        UsersDao usersDao = new UsersDao();
+
        int i=usersDao.reg(uname, upwd);
+
        if(i>0){
+
         
+
            response.setHeader("refresh""2;url=login.jsp");
+
        }else{
+
     
+
            response.setHeader("refresh""2;url=reg.jsp");
+
        }
+
    }
+
 
+
    /**
+
     * Initialization of the servlet. <br>
+
     *
+
     * @throws ServletException if an error occurs
+
     */
+
    public void init() throws ServletException {
+
        // Put your code here
+
    }
+
 
+
}
+
 
+
  
+
 
+
  
+
 
+
package com.sf.servlet;
+
 
+
import java.io.IOException;
+
import java.io.PrintWriter;
+
 
+
import javax.servlet.ServletException;
+
import javax.servlet.http.HttpServlet;
+
import javax.servlet.http.HttpServletRequest;
+
import javax.servlet.http.HttpServletResponse;
+
 
+
import com.sf.dao.MsgDao;
+
import com.sf.dao.UsersDao;
+
 
+
public class Doregservlet extends HttpServlet {
+
 
+
    /**
+
     * Constructor of the object.
+
     */
+
    public Doregservlet() {
+
        super();
+
    }
+
 
+
    /**
+
     * Destruction of the servlet. <br>
+
     */
+
    public void destroy() {
+
        super.destroy(); // Just puts "destroy" string in log
+
        // Put your code here
+
    }
+
 
+
    public void doGet(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
 
+
        response.setContentType("text/html");
+
        PrintWriter out = response.getWriter();
+
        request.setCharacterEncoding("utf-8");
+
        String uname = request.getParameter("uname");
+
        String upwd = request.getParameter("upwd");
+
 
+
        UsersDao ud = new UsersDao();
+
        MsgDao md = new MsgDao();
+
        if (ud.register(uname, upwd) > 0) {
+
            request.getSession().setAttribute("uname", uname);
+
            request.getRequestDispatcher("denglu.jsp").forward(request,
+
                    response);
+
        else {
+
            out.print("注册失败,请重新注册.......");
+
            response.setHeader("refresh""3;url=reg.jsp");
+
        }
+
    }
+
    public void doPost(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
 
+
        doGet(request,response);
+
    }
+
 
+
    /**
+
     * Initialization of the servlet. <br>
+
     *
+
     * @throws ServletException if an error occurs
+
     */
+
    public void init() throws ServletException {
+
        // Put your code here
+
    }
+
 
+
}
+
 
+
  
+
 
+
  
+
 
+
package com.servlet;
+
 
+
import java.io.IOException;
+
import java.io.PrintWriter;
+
 
+
import javax.servlet.ServletException;
+
import javax.servlet.http.HttpServlet;
+
import javax.servlet.http.HttpServletRequest;
+
import javax.servlet.http.HttpServletResponse;
+
 
+
import com.dao.MsgDao;
+
 
+
public class servlet5 extends HttpServlet {
+
 
+
    public servlet5() {
+
        super();
+
    }
+
 
+
    public void destroy() {
+
        super.destroy(); // Just puts "destroy" string in log
+
        // Put your code here
+
    }
+
 
+
     
+
    public void doGet(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
 
+
        doPost(request,  response);
+
    }
+
 
+
     
+
    public void doPost(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
 
+
        request.setCharacterEncoding("utf-8");
+
          
+
        int id=Integer.parseInt(request.getParameter("id"));
+
        MsgDao md=new MsgDao();
+
        md.delMail(id);   
+
        response.getWriter().print("刪除成功.....");
+
        response.setHeader("refresh""2;url=main.jsp");
+
        response.sendRedirect("main2.jsp");
+
    }
+
 
+
     
+
    public void init() throws ServletException {
+
     
+
    }
+
 
+
}
+
 
+
  
+
 
+
  
+
 
+
  
+
 
+
package com.sf.servlet;
+
 
+
import java.io.IOException;
+
import java.io.PrintWriter;
+
 
+
import javax.servlet.ServletException;
+
import javax.servlet.http.HttpServlet;
+
import javax.servlet.http.HttpServletRequest;
+
import javax.servlet.http.HttpServletResponse;
+
 
+
import com.sf.dao.MsgDao;
+
import com.sf.entity.Msg;
+
 
+
public class Dowriteservlet extends HttpServlet {
+
 
+
    /**
+
     * Constructor of the object.
+
     */
+
    public Dowriteservlet() {
+
        super();
+
    }
+
 
+
    /**
+
     * Destruction of the servlet. <br>
+
     */
+
    public void destroy() {
+
        super.destroy(); // Just puts "destroy" string in log
+
        // Put your code here
+
    }
+
 
+
    public void doGet(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
 
+
        response.setContentType("text/html");
+
        PrintWriter out = response.getWriter();
+
        request.setCharacterEncoding("utf-8");
+
        String uname = (String) request.getSession().getAttribute("uname");
+
        String sendto = request.getParameter("receiver");
+
        String title = request.getParameter("title");
+
        String content = request.getParameter("content");
+
 
+
        Msg m = new Msg();
+
        m.setMsgcontent(content);
+
        m.setUsername(uname);
+
        m.setSendto(sendto);
+
        m.setTitle(title);
+
 
+
        MsgDao md = new MsgDao();
+
        md.addMsg(m);
+
 
+
        out.print("发送成功.....");
+
        response.setHeader("refresh""3;url=main.jsp");
+
    }
+
 
+
    public void doPost(HttpServletRequest request, HttpServletResponse response)
+
            throws ServletException, IOException {
+
 
+
        doGet(request,response);     }
+
 
+
    /**
+
     * Initialization of the servlet. <br>
+
     *
+
     * @throws ServletException if an error occurs
+
     */
+
    public void init() throws ServletException {
+
    }
+
 
+
}
+ +
+ +
+
+
+ +
+
posted @ +2022-05-29 20:20  +徐涛%  +阅读(70)  +评论(0)  +编辑  +收藏  +举报 +
+
+ + +
+
+ + +
+
+ +
+ +
+
+
+
+
+ + + + +
+
+
+
+ +
+ +
+
+ +
+
+
+ +
+ + + + + + + + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json index f1c6da6a..b0baf47d 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json @@ -3,6 +3,7 @@ "raw_content": "<table><caption>ফেব্রুয়ারি ২০২৪</caption><thead><tr><th>সোম</th><th>মঙ্গল</th><th>বুধ</th><th>বৃহ</th><th>শুক্র</th><th>শনি</th><th>রবি</th></tr></thead><tfoot><tr><td colspan=\\\"3\\\">« জানুয়ারি</td><td></td><td colspan=\\\"3\\\"></td></tr></tfoot><tbody><tr><td colspan=\\\"3\\\"></td><td>১</td><td>২</td><td>৩</td><td>৪</td></tr><tr><td>৫</td><td>৬</td><td>৭</td><td>৮</td><td>৯</td><td>১০</td><td>১১</td></tr><tr><td>১২</td><td>১৩</td><td>১৪</td><td>১৫</td><td>১৬</td><td>১৭</td><td>১৮</td></tr><tr><td>১৯</td><td>২০</td><td>২১</td><td>২২</td><td>২৩</td><td>২৪</td><td>২৫</td></tr><tr><td>২৬</td><td>২৭</td><td>২৮</td><td>২৯</td><td colspan=\\\"3\\\"></td></tr></tbody></table>", "content": { "html": "
ফেব্রুয়ারি ২০২৪
সোমমঙ্গলবুধবৃহশুক্রশনিরবি
« জানুয়ারি
১০১১
১২১৩১৪১৫১৬১৭১৮
১৯২০২১২২২৩২৪২৫
২৬২৭২৮২৯
", - "is_complex": true + "is_complex": true, + "table_nest_level": null } } diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 19e1b106..08f3492c 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -19,7 +19,8 @@ 'assets/recognizer/table_simple_cc.html', 'assets/recognizer/table_include_rowspan_colspan.html', 'assets/recognizer/table_involve_equation.html', - 'assets/recognizer/table_include_after_code.html' + 'assets/recognizer/table_include_after_code.html', + 'assets/recognizer/table_involve_code.html' ), 'expected': [ @@ -86,7 +87,7 @@ def test_cc_simple_table(self): parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) assert len(parts) == 3 content = html_to_element(parts[1][0]).text_content() - assert content == r'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Рейтинг:Рейтинг<br>5.00<br>из 5 на основе опроса<br>3<br>пользователей
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84<br>₽
\n' + assert content == r'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Рейтинг:<br>\n<br>\n<br>\n<br>\nРейтинг<br>5.00<br>из 5 на основе опроса<br>3<br>пользователей<br>\n<br>\n<br>\n<br>\n
Тип товара:<br>\n<br>\n<br>\n<br>\nПрепараты для омоложения<br>\n<br>\n<br>\n<br>\n
Форма:<br>\n<br>\n<br>\n<br>\nКрем<br>\n<br>\n<br>\n<br>\n
Объем:<br>\n<br>\n<br>\n<br>\n50 мл<br>\n<br>\n<br>\n<br>\n
Рецепт:<br>\n<br>\n<br>\n<br>\nОтпускается без рецепта<br>\n<br>\n<br>\n<br>\n
Способ хранения:<br>\n<br>\n<br>\n<br>\nХранить при температуре 4-20°<br>\n<br>\n<br>\n<br>\n
Примечание:<br>\n<br>\n<br>\n<br>\nБеречь от детей<br>\n<br>\n<br>\n<br>\n
Оплата:<br>\n<br>\n<br>\n<br>\nНаличными/банковской картой<br>\n<br>\n<br>\n<br>\n
Доступность в Северске:<br>\n<br>\n<br>\n<br>\nВ наличии<br>\n<br>\n<br>\n<br>\n
Доставка:<br>\n<br>\n<br>\n<br>\n2-7 Дней<br>\n<br>\n<br>\n<br>\n
Цена:<br>\n<br>\n<br>\n<br>\n84<br>₽<br>\n<br>\n<br>\n<br>\n
\n' def test_cc_complex_table(self): """cc跨行跨列的表格.""" @@ -111,11 +112,11 @@ def test_simple_complex_table(self): simple_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}')[0] simple_table_type = simple_table_tag.attrib assert simple_table_type['table_type'] == 'simple' - assert simple_table_type == {'table_type': 'simple', 'html': '\n \n \n \n \n \n \n \n \n
12
34
\n\n'} + assert simple_table_type == {'table_type': 'simple', 'table_nest_level': '1', 'html': '\n \n \n \n \n \n \n \n \n
12
34
\n\n'} complex_table_tag = html_to_element(parts[2][0]).xpath(f'.//{CCTag.CC_TABLE}')[0] complex_table_type = complex_table_tag.attrib assert complex_table_type['table_type'] == 'complex' - assert complex_table_type == {'table_type': 'complex', 'html': '\n \n \n \n \n \n \n \n \n \n \n \n \n \n
123
4
567
\n '} + assert complex_table_type == {'table_type': 'complex', 'table_nest_level': '1', 'html': '\n \n \n \n \n \n \n \n \n \n \n \n \n \n
123
4
567
\n '} def test_table_to_content_list_node_simple(self): """测试table的 to content list node方法.""" @@ -151,7 +152,8 @@ def test_table_involve_equation(self): base_url = 'https://en.m.wikipedia.org/wiki/Variance' raw_html = raw_html_path.read_text(encoding='utf-8') parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) - assert parts is not None + complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') + assert complex_table_tag[0].text == r'
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution{\displaystyle \Pr \,(X=k)={\binom {n}{k}}p^{k}(1-p)^{n-k}}{\displaystyle np}{\displaystyle np(1-p)}
Geometric distribution{\displaystyle \Pr \,(X=k)=(1-p)^{k-1}p}{\displaystyle {\frac {1}{p}}}{\displaystyle {\frac {(1-p)}{p^{2}}}}
Normal distribution{\displaystyle f\left(x\mid \mu ,\sigma ^{2}\right)={\frac {1}{\sqrt {2\pi \sigma ^{2}}}}e^{-{\frac {(x-\mu )^{2}}{2\sigma ^{2}}}}}{\displaystyle \mu }{\displaystyle \sigma ^{2}}
Uniform distribution (continuous){\displaystyle f(x\mid a,b)={\begin{cases}{\frac {1}{b-a}}&{\text{for }}a\leq x\leq b,\\[3pt]0&{\text{for }}x<a{\text{ or }}x>b\end{cases}}}{\displaystyle {\frac {a+b}{2}}}{\displaystyle {\frac {(b-a)^{2}}{12}}}
Exponential distribution{\displaystyle f(x\mid \lambda )=\lambda e^{-\lambda x}}{\displaystyle {\frac {1}{\lambda }}}{\displaystyle {\frac {1}{\lambda ^{2}}}}
Poisson distribution{\displaystyle f(k\mid \lambda )={\frac {e^{-\lambda }\lambda ^{k}}{k!}}}{\displaystyle \lambda }{\displaystyle \lambda }
' def test_table_involve_after_code(self): """test table involve code, code被提取出去了,过滤掉空的和坏的table.""" @@ -161,3 +163,14 @@ def test_table_involve_after_code(self): raw_html = raw_html_path.read_text(encoding='utf-8') parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) assert html_to_element(parts[0][0]).xpath(f'.//{CCTag.CC_TABLE}')[0].text is None + + def test_table_involve_code(self): + """table involve code.""" + for test_case in TEST_CASES: + raw_html_path = base_dir.joinpath(test_case['input'][11]) + base_url = 'https://en.m.wikipedia.org/wiki/Variance' + raw_html = raw_html_path.read_text(encoding='utf-8') + parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') + content = open('assets/recognizer/table_include_code_expect.json', 'r', encoding='utf-8').read() + assert complex_table_tag[0].text == content From e7c379248180ca57384269ed030a32eef7ddd6b6 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 13:09:03 +0800 Subject: [PATCH 02/22] update extract table --- llm_web_kit/extractor/html/recognizer/table.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index b41f8834..3598aaba 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -128,10 +128,11 @@ def __is_table_nested(self, tree) -> int: # 计算祖先中的 table 数量(不包括自身),再加1表示自身层级 return len(tree.xpath('ancestor::table')) + 1 - def __extract_tables(self, ele: HtmlElement) -> list[tuple[str, str]]: + def __extract_tables(self, ele: str) -> list[tuple[str, str]]: """提取html中的table元素.""" - self.__do_extract_tables(ele) - new_html = self._element_to_html(ele) + tree = self._build_html_tree(ele) + self.__do_extract_tables(tree) + new_html = self._element_to_html(tree) lst = self.html_split_by_tags(new_html, CCTag.CC_TABLE) return lst From f0347ff6421dc53cf8906c6598b1d6f4b49e8308 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 13:24:15 +0800 Subject: [PATCH 03/22] remove table tail --- tests/llm_web_kit/extractor/html/recognizer/test_table.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 08f3492c..48c17998 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -87,8 +87,7 @@ def test_cc_simple_table(self): parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) assert len(parts) == 3 content = html_to_element(parts[1][0]).text_content() - assert content == r'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Рейтинг:<br>\n<br>\n<br>\n<br>\nРейтинг<br>5.00<br>из 5 на основе опроса<br>3<br>пользователей<br>\n<br>\n<br>\n<br>\n
Тип товара:<br>\n<br>\n<br>\n<br>\nПрепараты для омоложения<br>\n<br>\n<br>\n<br>\n
Форма:<br>\n<br>\n<br>\n<br>\nКрем<br>\n<br>\n<br>\n<br>\n
Объем:<br>\n<br>\n<br>\n<br>\n50 мл<br>\n<br>\n<br>\n<br>\n
Рецепт:<br>\n<br>\n<br>\n<br>\nОтпускается без рецепта<br>\n<br>\n<br>\n<br>\n
Способ хранения:<br>\n<br>\n<br>\n<br>\nХранить при температуре 4-20°<br>\n<br>\n<br>\n<br>\n
Примечание:<br>\n<br>\n<br>\n<br>\nБеречь от детей<br>\n<br>\n<br>\n<br>\n
Оплата:<br>\n<br>\n<br>\n<br>\nНаличными/банковской картой<br>\n<br>\n<br>\n<br>\n
Доступность в Северске:<br>\n<br>\n<br>\n<br>\nВ наличии<br>\n<br>\n<br>\n<br>\n
Доставка:<br>\n<br>\n<br>\n<br>\n2-7 Дней<br>\n<br>\n<br>\n<br>\n
Цена:<br>\n<br>\n<br>\n<br>\n84<br>₽<br>\n<br>\n<br>\n<br>\n
\n' - + assert content == r"\n\n\n\n\n\n\n\n\n\n\n\n\n
Рейтинг:Рейтинг<br>5.00<br>3
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84<br>₽
" def test_cc_complex_table(self): """cc跨行跨列的表格.""" for test_case in TEST_CASES: From 5e176944beb0aa2b34b49a3a274380856c831bdd Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 13:34:44 +0800 Subject: [PATCH 04/22] normalize line endings --- llm_web_kit/extractor/html/recognizer/table.py | 2 +- tests/llm_web_kit/extractor/html/recognizer/test_table.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 3598aaba..9d5dbb37 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -232,7 +232,7 @@ def __get_table_body(self, table_type, table_root): if elem.text: elem.text = elem.text.strip() if elem.tail: - elem.tail = elem.tail.strip() + elem.tail = None self.__simplify_td_th_content(table_root) # 迭代 diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 48c17998..9f26c523 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -87,7 +87,8 @@ def test_cc_simple_table(self): parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) assert len(parts) == 3 content = html_to_element(parts[1][0]).text_content() - assert content == r"\n\n\n\n\n\n\n\n\n\n\n\n\n
Рейтинг:Рейтинг<br>5.00<br>3
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84<br>₽
" + assert content == r'\n\n\n\n\n\n\n\n\n\n\n\n\n
Рейтинг:Рейтинг<br>5.00<br>3
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84<br>₽
' + def test_cc_complex_table(self): """cc跨行跨列的表格.""" for test_case in TEST_CASES: From c15dea1fcdda4d59bfcb5b3a8b49a37c62cc7989 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 14:02:39 +0800 Subject: [PATCH 05/22] update test case --- tests/llm_web_kit/extractor/html/recognizer/test_table.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 9f26c523..b8b67029 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -26,7 +26,8 @@ 'expected': [ ('assets/recognizer/table_to_content_list_simple_res.json'), ('assets/recognizer/table_to_content_list_complex_res.json'), - ('assets/recognizer/table_include_image_expcet.json') + ('assets/recognizer/table_include_image_expcet.json'), + ('assets/recognizer/table_include_code_expect.json') ], } ] @@ -172,5 +173,6 @@ def test_table_involve_code(self): raw_html = raw_html_path.read_text(encoding='utf-8') parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') - content = open('assets/recognizer/table_include_code_expect.json', 'r', encoding='utf-8').read() + expect_path = base_dir.joinpath(test_case['expected'][3]) + content = open(expect_path, 'r', encoding='utf-8').read() assert complex_table_tag[0].text == content From d34a8a7416f3b238b49ea81ff5eff6ee37a396b7 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 14:45:30 +0800 Subject: [PATCH 06/22] update format --- llm_web_kit/extractor/html/extractor.py | 2 +- llm_web_kit/extractor/html/recognizer/table.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index 1d3facb3..bc3fe05b 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -93,7 +93,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson: main_html, method = self._extract_main_html(raw_html, base_url, page_layout_type) parsed_html = [(main_html,raw_html)] - for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list, + for extract_func in [self._extract_table, self._extract_code, self._extract_math, self._extract_list, self._extract_image, self._extract_title, self._extract_paragraph]: parsed_html = extract_func(base_url, parsed_html, raw_html) diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 9d5dbb37..64528ea2 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -128,7 +128,7 @@ def __is_table_nested(self, tree) -> int: # 计算祖先中的 table 数量(不包括自身),再加1表示自身层级 return len(tree.xpath('ancestor::table')) + 1 - def __extract_tables(self, ele: str) -> list[tuple[str, str]]: + def __extract_tables(self, ele: str) -> list[Tuple[str, str]]: """提取html中的table元素.""" tree = self._build_html_tree(ele) self.__do_extract_tables(tree) @@ -233,7 +233,6 @@ def __get_table_body(self, table_type, table_root): elem.text = elem.text.strip() if elem.tail: elem.tail = None - self.__simplify_td_th_content(table_root) # 迭代 for child in table_root.iterchildren(): @@ -258,7 +257,7 @@ def __do_extract_tables(self, root: HtmlElement) -> None: for child in root.iterchildren(): self.__do_extract_tables(child) - def __get_attribute(self, html: str) -> tuple[bool, Any, Any]: + def __get_attribute(self, html: str) -> Tuple[bool, Any, Any]: """获取element的属性.""" ele = self._build_html_tree(html) if ele is not None and ele.tag == CCTag.CC_TABLE: From 87a24954be0bfe65f42dfd6d6df559661a02c928 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 14:48:59 +0800 Subject: [PATCH 07/22] update format --- llm_web_kit/extractor/html/recognizer/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 64528ea2..232573ea 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -128,7 +128,7 @@ def __is_table_nested(self, tree) -> int: # 计算祖先中的 table 数量(不包括自身),再加1表示自身层级 return len(tree.xpath('ancestor::table')) + 1 - def __extract_tables(self, ele: str) -> list[Tuple[str, str]]: + def __extract_tables(self, ele: str) -> List[Tuple[str, str]]: """提取html中的table元素.""" tree = self._build_html_tree(ele) self.__do_extract_tables(tree) From 98610905a5a36e778fd85631e0fa8ffb8f9d68e9 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 14:59:22 +0800 Subject: [PATCH 08/22] update format --- llm_web_kit/extractor/html/extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index bc3fe05b..1d3facb3 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -93,7 +93,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson: main_html, method = self._extract_main_html(raw_html, base_url, page_layout_type) parsed_html = [(main_html,raw_html)] - for extract_func in [self._extract_table, self._extract_code, self._extract_math, self._extract_list, + for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list, self._extract_image, self._extract_title, self._extract_paragraph]: parsed_html = extract_func(base_url, parsed_html, raw_html) From a77735f93a337c181e01cc3af3c03b2f691058b8 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 18:01:25 +0800 Subject: [PATCH 09/22] change parse order --- llm_web_kit/extractor/html/extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index 1d3facb3..bc3fe05b 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -93,7 +93,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson: main_html, method = self._extract_main_html(raw_html, base_url, page_layout_type) parsed_html = [(main_html,raw_html)] - for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list, + for extract_func in [self._extract_table, self._extract_code, self._extract_math, self._extract_list, self._extract_image, self._extract_title, self._extract_paragraph]: parsed_html = extract_func(base_url, parsed_html, raw_html) From 419b2c1024efc37ce15d864bc0d66615bcff6f53 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 21:32:30 +0800 Subject: [PATCH 10/22] add list nest level --- llm_web_kit/extractor/html/recognizer/list.py | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index 1dbea3fc..315b8ac8 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -1,5 +1,5 @@ import json -from typing import List, Tuple +from typing import List, Tuple, Any from lxml.etree import _Element as HtmlElement from overrides import override @@ -88,16 +88,16 @@ def __do_extract_list(self, root:HtmlElement) -> None: list_tag_names = ['ul', 'ol', 'dl', 'menu', 'dir'] if root.tag in list_tag_names: - is_ordered, content_list, raw_html, tail_text = self.__extract_list_element(root) + list_nest_level, is_ordered, content_list, raw_html, tail_text = self.__extract_list_element(root) text = json.dumps(content_list, ensure_ascii=False, indent=4) - cc_element = self._build_cc_element(CCTag.CC_LIST, text, tail_text, ordered=is_ordered, html=raw_html) + cc_element = self._build_cc_element(CCTag.CC_LIST, text, tail_text, ordered=is_ordered, list_nest_level=list_nest_level, html=raw_html) self._replace_element(root, cc_element) # cc_element 替换掉原来的列表元素 return for child in root.iterchildren(): self.__do_extract_list(child) - def __extract_list_element(self, ele: HtmlElement) -> Tuple[bool, list, str, str]: + def __extract_list_element(self, ele: HtmlElement) -> tuple[int, bool, list[list[list]], str, Any]: """ 提取列表元素: 假如有如下列表: @@ -135,6 +135,7 @@ def __extract_list_element(self, ele: HtmlElement) -> Tuple[bool, list, str, str (bool, str, str): 第一个元素是是否有序; 第二个元素是个python list,内部是文本和行内公式,具体格式参考list的content_list定义。第三个元素是列表原始的html内容 """ is_ordered = ele.tag in ['ol', 'dl'] + list_nest_level = self.__get_list_type(ele) tail_text = ele.tail content_list = [] raw_html = self._element_to_html(ele) @@ -144,7 +145,17 @@ def __extract_list_element(self, ele: HtmlElement) -> Tuple[bool, list, str, str text_paragraph = self.__extract_list_item_text(item) content_list.append(text_paragraph) - return is_ordered, content_list, raw_html, tail_text + return list_nest_level, is_ordered, content_list, raw_html, tail_text + + def __get_list_type(self, list_ele:HtmlElement) -> int: + """ + 获取list嵌套的类型 + """ + if list_ele.tag not in ['ul', 'ol', 'dl', 'menu', 'dir']: + return 0 + ancestor_count = list_ele.xpath('count(ancestor::ul | ancestor::ol)') + # 层级 = 祖先列表数量 + 自身(1层) + return int(ancestor_count) + 1 def __extract_list_item_text(self, root:HtmlElement) -> list[list]: """提取列表项的文本. From c40b1ead2135c737a3f3c8943b7a54eea7f09595 Mon Sep 17 00:00:00 2001 From: quyuan Date: Tue, 25 Feb 2025 21:43:47 +0800 Subject: [PATCH 11/22] fix pylint --- llm_web_kit/extractor/html/recognizer/list.py | 6 ++---- .../assets/recognizer/table_include_code_expect.json | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index 315b8ac8..d564d41e 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -1,5 +1,5 @@ import json -from typing import List, Tuple, Any +from typing import Any, List, Tuple from lxml.etree import _Element as HtmlElement from overrides import override @@ -148,9 +148,7 @@ def __extract_list_element(self, ele: HtmlElement) -> tuple[int, bool, list[list return list_nest_level, is_ordered, content_list, raw_html, tail_text def __get_list_type(self, list_ele:HtmlElement) -> int: - """ - 获取list嵌套的类型 - """ + """获取list嵌套的类型.""" if list_ele.tag not in ['ul', 'ol', 'dl', 'menu', 'dir']: return 0 ancestor_count = list_ele.xpath('count(ancestor::ul | ancestor::ol)') diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json index 15a9cf34..4f6fc9ed 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_include_code_expect.json @@ -296,4 +296,4 @@ doGet(request,response); } publicvoidinit()throwsServletException { } -}
\ No newline at end of file +} From 6c7ca2dddf0c29f772bc1e0dcd0df99ec0b9d545 Mon Sep 17 00:00:00 2001 From: quyuan Date: Thu, 27 Feb 2025 16:38:35 +0800 Subject: [PATCH 12/22] update table nest spec.md --- .../output_format/content_list_spec.md | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/docs/specification/output_format/content_list_spec.md b/docs/specification/output_format/content_list_spec.md index 5c27c663..8bbecc32 100644 --- a/docs/specification/output_format/content_list_spec.md +++ b/docs/specification/output_format/content_list_spec.md @@ -83,7 +83,8 @@ "html": "
12
", "title": "example table", "note": "数据来源于...", - "is_complex": false // 是否是复杂表格(跨行、跨列的, 默认为false + "is_complex": false // 是否是复杂表格(跨行、跨列的/嵌套表格, 默认为false, + "table_nest_level": 1 //table的嵌套层级 } }, { @@ -285,20 +286,22 @@ "html": "
12
", "title": "example table", "note": "数据来源于...", - "is_complex": false // 是否是复杂表格(跨行、跨列的, 默认为false + "is_complex": false // 是否是复杂表格(跨行、跨列的, 默认为false, + "table_nest_level": 1 //表格嵌套层级 } } ``` -| 字段 | 类型 | 描述 | 是否必须 | -| ------------------ | ------- | ---------------------------------------- | -------- | -| type | string | 值固定为table | 是 | -| bbox | array | \[x1, y1, x2, y2\] | 可选 | -| raw_content | string | 原始文本内容 | 可选 | -| content.html | string | 表格的html内容 | 是 | -| content.title | string | 表格的title属性 | 可选 | -| content.note | string | 表格的note属性 | 可选 | -| content.is_complex | boolean | 是否是复杂表格(跨行、跨列的, 默认为false | 可选 | +| 字段 | 类型 | 描述 | 是否必须 | +| ------------------------ | ------- | ------------------------------------------------- | -------- | +| type | string | 值固定为table | 是 | +| bbox | array | \[x1, y1, x2, y2\] | 可选 | +| raw_content | string | 原始文本内容 | 可选 | +| content.html | string | 表格的html内容 | 是 | +| content.title | string | 表格的title属性 | 可选 | +| content.note | string | 表格的note属性 | 可选 | +| content.is_complex | boolean | 是否是复杂表格(跨行、跨列的/嵌套表格, 默认为false | 可选 | +| content.table_nest_level | int | table嵌套层级(单个table为1,两层为2,以此类推) | 可选 | ### 列表段 From 9e1545293c7d4d6ec10362312a2adb9db6700f6b Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 3 Mar 2025 12:36:38 +0800 Subject: [PATCH 13/22] update parse order --- llm_web_kit/extractor/html/extractor.py | 2 +- .../table_involve_complex_code.html | 237 ++++++++++++++++++ .../extractor/html/recognizer/test_code.py | 2 +- .../extractor/html/recognizer/test_table.py | 17 +- 4 files changed, 254 insertions(+), 4 deletions(-) create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_complex_code.html diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index bc3fe05b..1d3facb3 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -93,7 +93,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson: main_html, method = self._extract_main_html(raw_html, base_url, page_layout_type) parsed_html = [(main_html,raw_html)] - for extract_func in [self._extract_table, self._extract_code, self._extract_math, self._extract_list, + for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list, self._extract_image, self._extract_title, self._extract_paragraph]: parsed_html = extract_func(base_url, parsed_html, raw_html) diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_complex_code.html b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_complex_code.html new file mode 100644 index 00000000..b929d7e0 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_involve_complex_code.html @@ -0,0 +1,237 @@ + + + + ClientNetworkWrapper.java (Example JavaDoc) + + + + + + + + + + + + + + +
+

+ + + + + + + + + + + + + + + + + +
FileDocCategorySizeDatePackage
ClientNetworkWrapper.javaAPI DocExample2389Thu Nov 08 00:23:44 GMT 2001com.ora.rmibook.chapter3
+

ClientNetworkWrapper

+ public class ClientNetworkWrapper extends NetworkBaseClass implements + PrinterConstants + + + + +
+ + + + +
+
+
+

+ + + + + + + + + + + + + +
Fields Summary
private String +
_serverMachine
+
+
private int +
_serverPort
+
+
+ + + + + + + + +
Constructors Summary
public ClientNetworkWrapper()
+

+ + + + +
+
+
+

+        this (DEFAULT_SERVER_NAME, DEFAULT_SERVER_PORT);
+    
Test Test Test
ABC
+DEF
TEST TEST TEST
+
+
public ClientNetworkWrapper(String + serverMachine, int serverPort) +
+

+ + + + +
+
+
+

+        _serverMachine = serverMachine;
+        _serverPort = serverPort;
+    
+
+
+ + + + + + + + + + + + + + + + +
Methods Summary
private voidreadStatusFromSocket(java.net.Socket + connection) +
+

+ + + + +
+
+
+

+        InputStream inputStream = connection.getInputStream();
+        DataInputStream dataInputStream = new DataInputStream(inputStream);
+        BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
+        boolean response = dataInputStream.readBoolean();
+
+        if (response) {
+            return;
+        }
+        PrinterException error = new PrinterException(inputStream);
+
+        throw error;
+    
+
+
public voidsendDocumentToPrinter(java.io.InputStream actualDocument) +
+

+ + + + +
+
+
+

+        sendDocumentToPrinter(actualDocument, DEFAULT_DOCUMENT_TYPE,
+            DEFAULT_PRINT_TWO_SIDED, DEFAULT_PRINT_QUALITY);
+    
+
+
public voidsendDocumentToPrinter(java.io.InputStream actualDocument, int documentType, boolean printTwoSided, + int printQuality) +
+

+ + + + +
+
+
+

+        DocumentDescription documentToSend;
+
+        try {
+            documentToSend = new DocumentDescription(actualDocument, documentType, printTwoSided, printQuality);
+        } catch (IOException e) {
+            throw new ConnectionException();
+        }
+        sendDocumentToPrinter(documentToSend);
+    
+
+
public voidsendDocumentToPrinter(DocumentDescription documentDescription) +
+

+ + + + +
+
+
+

+        Socket connection = null;
+
+        try {
+            connection = new Socket(_serverMachine, _serverPort);
+            documentDescription.writeToStream(connection.getOutputStream());
+            readStatusFromSocket(connection);
+        } catch (IOException e) {
+            e.printStackTrace();
+            throw new ConnectionException();
+        }
+        closeSocket(connection);
+    
+
+
+

+ + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_code.py b/tests/llm_web_kit/extractor/html/recognizer/test_code.py index 40f758c1..143591b1 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_code.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_code.py @@ -268,7 +268,7 @@ def test_code_rec(self): raw_html_path = base_dir.joinpath(test_case['input'][0]) base_url = test_case['input'][1] print(base_url) - raw_html = raw_html_path.read_text() + raw_html = raw_html_path.read_text(encoding="utf-8") parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) parts = [ part[0] diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index b8b67029..e92e7297 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -20,7 +20,8 @@ 'assets/recognizer/table_include_rowspan_colspan.html', 'assets/recognizer/table_involve_equation.html', 'assets/recognizer/table_include_after_code.html', - 'assets/recognizer/table_involve_code.html' + 'assets/recognizer/table_involve_code.html', + 'assets/recognizer/table_involve_complex_code.html' ), 'expected': [ @@ -175,4 +176,16 @@ def test_table_involve_code(self): complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') expect_path = base_dir.joinpath(test_case['expected'][3]) content = open(expect_path, 'r', encoding='utf-8').read() - assert complex_table_tag[0].text == content + assert complex_table_tag[0].text == content.strip("\n") + + def test_table_involve_complex_code(self): + """table involve complex code""" + for test_case in TEST_CASES: + raw_html_path = base_dir.joinpath(test_case['input'][12]) + base_url = 'https://en.m.wikipedia.org/wiki/Variance' + raw_html = raw_html_path.read_text(encoding='utf-8') + parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') + expect_path = base_dir.joinpath(test_case['expected'][3]) + content = open(expect_path, 'r', encoding='utf-8').read() + assert complex_table_tag[0].text == content.strip("\n") From 4a61728b43ce10d3230f8e54900ed39931f865e4 Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 3 Mar 2025 12:49:57 +0800 Subject: [PATCH 14/22] update parse order --- tests/llm_web_kit/extractor/html/recognizer/test_code.py | 2 +- tests/llm_web_kit/extractor/html/recognizer/test_table.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_code.py b/tests/llm_web_kit/extractor/html/recognizer/test_code.py index 143591b1..5b55ed42 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_code.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_code.py @@ -268,7 +268,7 @@ def test_code_rec(self): raw_html_path = base_dir.joinpath(test_case['input'][0]) base_url = test_case['input'][1] print(base_url) - raw_html = raw_html_path.read_text(encoding="utf-8") + raw_html = raw_html_path.read_text(encoding='utf-8') parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) parts = [ part[0] diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index e92e7297..87ccbce8 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -176,10 +176,10 @@ def test_table_involve_code(self): complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') expect_path = base_dir.joinpath(test_case['expected'][3]) content = open(expect_path, 'r', encoding='utf-8').read() - assert complex_table_tag[0].text == content.strip("\n") + assert complex_table_tag[0].text == content.strip('\n') def test_table_involve_complex_code(self): - """table involve complex code""" + """table involve complex code.""" for test_case in TEST_CASES: raw_html_path = base_dir.joinpath(test_case['input'][12]) base_url = 'https://en.m.wikipedia.org/wiki/Variance' @@ -188,4 +188,4 @@ def test_table_involve_complex_code(self): complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') expect_path = base_dir.joinpath(test_case['expected'][3]) content = open(expect_path, 'r', encoding='utf-8').read() - assert complex_table_tag[0].text == content.strip("\n") + assert complex_table_tag[0].text == content.strip('\n') From 1b0e1e92993c9ff587db304e84f9a23cc3756acb Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 3 Mar 2025 13:37:23 +0800 Subject: [PATCH 15/22] update parse order --- tests/llm_web_kit/extractor/html/recognizer/test_code.py | 2 +- tests/llm_web_kit/extractor/html/recognizer/test_table.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_code.py b/tests/llm_web_kit/extractor/html/recognizer/test_code.py index 5b55ed42..40f758c1 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_code.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_code.py @@ -268,7 +268,7 @@ def test_code_rec(self): raw_html_path = base_dir.joinpath(test_case['input'][0]) base_url = test_case['input'][1] print(base_url) - raw_html = raw_html_path.read_text(encoding='utf-8') + raw_html = raw_html_path.read_text() parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) parts = [ part[0] diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 87ccbce8..e569d340 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -178,6 +178,7 @@ def test_table_involve_code(self): content = open(expect_path, 'r', encoding='utf-8').read() assert complex_table_tag[0].text == content.strip('\n') + @unittest.skip(reason='在code模块解决了这个问题') def test_table_involve_complex_code(self): """table involve complex code.""" for test_case in TEST_CASES: From 78ca0283c79ca84f05b5bdb1f1d87b0a4eb5ddfa Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 3 Mar 2025 14:34:59 +0800 Subject: [PATCH 16/22] =?UTF-8?q?update=20list=E6=A0=87=E5=87=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../output_format/content_list_spec.md | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/docs/specification/output_format/content_list_spec.md b/docs/specification/output_format/content_list_spec.md index 8bbecc32..f04e2c85 100644 --- a/docs/specification/output_format/content_list_spec.md +++ b/docs/specification/output_format/content_list_spec.md @@ -108,7 +108,8 @@ ] ] ], - "ordered": true + "ordered": true, + "list_nest_level": 1 //list的嵌套层级 } } ], @@ -327,18 +328,20 @@ ] ] ], - "ordered": true + "ordered": true, + "list_nest_level": 1 //list嵌套层级 } } ``` -| 字段 | 类型 | 描述 | 是否必须 | -| --------------- | ------- | --------------------------------------------------- | -------- | -| type | string | 值固定为list | 是 | -| bbox | array | \[x1, y1, x2, y2\] | 可选 | -| raw_content | string | 原始文本内容 | 可选 | -| content.items | array | 列表项,每个元素是N个段落,段落里的元素是文本或公式 | 是 | -| content.ordered | boolean | 是否是有序列表 | 可选 | +| 字段 | 类型 | 描述 | 是否必须 | +| ----------------------- | ------- | --------------------------------------------------- | -------- | +| type | string | 值固定为list | 是 | +| bbox | array | \[x1, y1, x2, y2\] | 可选 | +| raw_content | string | 原始文本内容 | 可选 | +| content.items | array | 列表项,每个元素是N个段落,段落里的元素是文本或公式 | 是 | +| content.ordered | boolean | 是否是有序列表 | 可选 | +| content.list_nest_level | int | list的嵌套层级(单层list list_nest_level为1) | 可选 | items字段说明 From efcd7a21d99878c4f99d2bac3acfdeabb9d3d7f8 Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 3 Mar 2025 15:55:38 +0800 Subject: [PATCH 17/22] add table involve inline code --- .../html/table_involve_inline_code.html | 26 +++++++++++++++++++ .../good_data/html_data_input.jsonl | 1 + .../extractor/test_extractor_chain.py | 15 +++++++++++ 3 files changed, 42 insertions(+) create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_involve_inline_code.html diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_involve_inline_code.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_involve_inline_code.html new file mode 100644 index 00000000..0f927ee3 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_involve_inline_code.html @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + +
FunctionDescriptionExample
print()Prints a message to the console.print("Hello, World!")
len()Returns the length of an object.len([1, 2, 3])
range()Generates a sequence of numbers.range(1, 10)
diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl index 1efe87b6..5f08bdbf 100644 --- a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl @@ -9,3 +9,4 @@ {"track_id": "rfc-doc", "dataset_name": "test_pipeline_suit", "url": "https://www.test.com","data_source_category": "HTML", "path":"doc.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "legato_doc", "dataset_name": "test_pipeline_suit", "url": "https://www.test.com","data_source_category": "HTML", "path":"legato_docs.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "oracle_doc", "dataset_name": "test_pipeline_suit", "url": "https://docs.oracle.com/en-us/iaas/tools/java/3.57.1/com/oracle/bmc/integration/model/CustomEndpointDetails.html","data_source_category": "HTML", "path":"oracle_doc.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} +{"track_id": "table_involve_inline_code", "dataset_name": "test_table_involve_inline_code", "url": "https://docs.oracle.com/en-us/iaas/tools/java/3.57.1/com/oracle/bmc/integration/model/CustomEndpointDetails.html","data_source_category": "HTML", "path":"table_involve_inline_code.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 4972673b..a6671f4f 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -344,3 +344,18 @@ def test_oracle_doc_comment(self): result = chain.extract(input_data) main_html = result.get_content_list().to_main_html() assert 'public int hashCode()' in main_html + + def test_table_involve_inline_code(self): + """ + table里面包含行内code + Returns: + + """ + chain = ExtractSimpleFactory.create(self.config) + self.assertIsNotNone(chain) + test_data = self.data_json[11] + # Create DataJson from test data + input_data = DataJson(test_data) + result = chain.extract(input_data) + content_list = result.get_content_list()._get_data() + print(content_list) From 0776f6efa0a118bad026147d923052cb662ae3a0 Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 3 Mar 2025 16:58:41 +0800 Subject: [PATCH 18/22] add test case --- tests/llm_web_kit/extractor/test_extractor_chain.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index a6671f4f..ef596204 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -357,5 +357,5 @@ def test_table_involve_inline_code(self): # Create DataJson from test data input_data = DataJson(test_data) result = chain.extract(input_data) - content_list = result.get_content_list()._get_data() - print(content_list) + content_list = result.get_content_list()._get_data()[0][0]['content']['html'] + assert content_list == """
FunctionDescriptionExample
print()Prints a message to the console.print("Hello, World!")
len()Returns the length of an object.len([1, 2, 3])
range()Generates a sequence of numbers.range(1, 10)
""" From 3fda2a69efa527e95275d39dde6f3e26df4045fc Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 3 Mar 2025 17:07:10 +0800 Subject: [PATCH 19/22] fix test case --- tests/llm_web_kit/extractor/test_extractor_chain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index ef596204..884c9860 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -59,7 +59,7 @@ def setUp(self): for line in f: self.data_json.append(json.loads(line.strip())) - assert len(self.data_json) == 11 + assert len(self.data_json) == 12 # Config for HTML extraction self.config = { From e0196bfbb27f473cdd0c45ad2b48342d20e8754b Mon Sep 17 00:00:00 2001 From: dt-yy Date: Wed, 5 Mar 2025 16:28:59 +0800 Subject: [PATCH 20/22] add table tail --- .../extractor/html/recognizer/table.py | 35 +- .../good_data/html/table_tail_text.html | 367 ++++++++++++++++++ .../good_data/html_data_input.jsonl | 3 +- .../extractor/test_extractor_chain.py | 13 +- 4 files changed, 405 insertions(+), 13 deletions(-) create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_tail_text.html diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index fa24dd6d..6908398e 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -196,11 +196,22 @@ def __check_table_include_math_code(self, raw_html: HtmlElement): ] ele_res.extend(ccinterline_codes) else: - ele_res.extend([ - text.strip() - for text in self._build_html_tree(math_item[1]).itertext() - if text.strip() - ]) + tree = self._build_html_tree(math_item[1]) + texts = [] + for element in tree.iter(): + if element.text and element.text.strip(): + text = element.text.strip() + # 如果有tail,直接拼接到text后面 + if element.tail and element.tail.strip(): + text += element.tail.strip() + texts.append(text) + elif element.tail and element.tail.strip(): + # 如果只有tail且前面有内容,则拼接到最后一个text + if texts: + texts[-1] += element.tail.strip() + else: + texts.append(element.tail.strip()) + ele_res.extend(texts) return ele_res def __simplify_td_th_content(self, elem: HtmlElement) -> None: @@ -212,7 +223,8 @@ def __simplify_td_th_content(self, elem: HtmlElement) -> None: parse_res.extend(math_res) for item in list(elem.iterchildren()): elem.remove(item) - elem.text = '
'.join(parse_res) + if parse_res: + elem.text = '
'.join(parse_res) return for child in elem.iter('td', 'th'): self.__simplify_td_th_content(child) @@ -227,18 +239,19 @@ def __get_table_body(self, table_type, table_root): cleaned_attrs = {k: v for k, v in table_root.attrib.items() if k in allowed_attributes} table_root.attrib.clear() table_root.attrib.update(cleaned_attrs) - # text进行strip操作,tail去掉(有较多空换行) + # text进行strip操作,tail保留(部分内容留在tail中) for elem in chain([table_root], table_root.iterdescendants()): - if elem.text: + if elem.text is not None: elem.text = elem.text.strip() - if elem.tail: - elem.tail = None + if elem.tail is not None: + elem.tail = elem.tail.strip() + if not elem.tail: + elem.tail = None self.__simplify_td_th_content(table_root) # 迭代 for child in table_root.iterchildren(): if child is not None: self.__get_table_body(table_type, child) - return self._element_to_html(table_root) def __do_extract_tables(self, root: HtmlElement) -> None: diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_tail_text.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_tail_text.html new file mode 100644 index 00000000..4044b9a3 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_tail_text.html @@ -0,0 +1,367 @@ + + + + + + + + + 🇷🇺 | Show hub - Big-Empty DC++ Dchublist NMDC and ADCs хабов Huburi Хаблист + + + + + + + + + + + + + + + + + + + + + + + + +
+ +

Big-Empty

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Client + https://dchublists.com/clients/FlylinkDC_x64.exe +
StatusOnline | ID: 975
URL + https://dchublists.com/hub-975 +
Address +NMDC | dchub://big-empty.ru +
ASN + Style-Com LLC +
Failover + Not available +
NameBig-Empty
Topic + Not available +
Description + Хаб сети Arbital +
Category + Not available +
Software + PtokaX 0.5.3.0 +
Owner + Self +
Location + RU Russian Federation +
Users + 25 | 55 +
Clones0
Share + 4.39 TB | 90.60 TB +
User limit10000
Share limit0 B
Slot limit0
Hub limit0
Reliability99.04%
Checked + 2024-12-09 03:06:01 | 2021-05-07 +
Votes + +0 | -0 | 0 +
Website + Not available +
Email + Not available +
+
+

Online users

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NickShare
Darv1n1.55 TB
PtokaX0 B
1975628.43 GB
AndyDesktop0 B
Crtyujgfdscvgjh35.54 GB
DaymarixZZZ37.57 GB
Evgeniy_D76.15 GB
Julia0 B
Kuzma0 B
Larsenv0 B
MAXMED8888888864.10 GB
Qwerty_ytr_R724237.12 GB
SERG_B149.65 GB
Sculli156.92 GB
Shareaza404613.03 GB
Soliton14.68 GB
Sweaborg794.15 GB
Viktor138283179.23 GB
[fly]Fire_dU3JR10.72 GB
[fly]Monkey_QGrFy124.72 GB
[fly]Moon_x7m61.13 GB
kotbaun0 B
marcs3.62 GB
minili59.30 GB
y2b4k698df328djei3261.82 GB
+
+
+ +

Comments

+ There are no comments for this hub, you can write one here. +
+
+ + + + +