diff --git a/llm_web_kit/extractor/html/recognizer/recognizer.py b/llm_web_kit/extractor/html/recognizer/recognizer.py index d8a4b7c8..736b3637 100644 --- a/llm_web_kit/extractor/html/recognizer/recognizer.py +++ b/llm_web_kit/extractor/html/recognizer/recognizer.py @@ -5,6 +5,7 @@ from lxml.html import HtmlElement, HTMLParser from llm_web_kit.libs.html_utils import (build_cc_element, element_to_html, + element_to_html_unescaped, html_to_element, replace_element) from llm_web_kit.libs.logger import mylogger @@ -90,6 +91,10 @@ def _element_to_html(self, element: HtmlElement) -> str: """ return element_to_html(element) + def _element_to_html_entity(self, element: HtmlElement) -> str: + """将element转换成html字符串.""" + return element_to_html_unescaped(element) + def _build_cc_element(self, html_tag_name: str, text: str, tail: str, **kwargs) -> HtmlElement: """构建cctitle的html. 例如:标题1 diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 28694c7f..db9351a0 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -255,7 +255,7 @@ def __get_table_body(self, table_type, table_nest_level, table_root): for child in table_root.iterchildren(): if child is not None: self.__get_table_body(table_type, table_nest_level, child) - return self._element_to_html(table_root) + return self._element_to_html_entity(table_root) def __do_extract_tables(self, root: HtmlElement) -> None: """递归处理所有子标签.""" diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_include_entity.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_include_entity.html new file mode 100644 index 00000000..f9b20e14 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/table_include_entity.html @@ -0,0 +1,1689 @@ + + + + + trigonometry - Perfectly centered break of a perfectly aligned pool ball rack - 
Mathematics Stack Exchange + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ + + + +
+
+ +
+ +
+ + + +
+

+your communities

+ +
+ + + + +
+
+ + + + +
+
+ + +
+ + + + + +
+ + +
+ + + +
+
+ + Take the 2-minute tour + × + +
+ Mathematics Stack Exchange is a question and answer site for people studying math at any level and professionals in related fields. It's 100% free, no registration required. +
+
+ +
+ + +
+ + + +
+ + + + + + + + + + + +
+ + +
+ + up vote + 20 + down vote + + favorite +
7
+ +
+ +
+
+
+ +

This question is asked on Physics SE and MathOverflow by somebody else. I don't think it belongs there, but rather here (for reasons given there in my comments there; edit: now self-removed).

+ +
+

Imagine the beginning of a game of pool, you have 16 balls, 15 of them in a triangle <| and 1 of them being the cue ball off to the left of that triangle. Imagine that the rack (the 15 balls in a triangle) has every ball equally spaced apart and all balls touching all other appropriate balls. All balls are perfectly round. Now, imagine that the cue ball was hit along a friction free surface on the center axis for this triangle O-------<| and hits the far left ball of the rack dead center on this axis. How would the rack react? I would imagine this would be an extension of Newton's cradle and only the 5 balls on the far end would move at all. But in what way would they move? Thanks

+
+ +
+ + + + + + + +
+
share|improve this question
+
+ + +
+
+
+
+ + + + + + + + + + +
+ + + + + + + +
+    + +   +
+
+
+ Great question, spectacular answer! A related, earlier question is here: ∃ a shot in ideal pocket billiards? + –  + Joseph O'Rourke + Feb 6 '14 at 18:24 +
+
+
+ +
+
+ +
+ + +
+
+

+ 2 Answers + 2 +

+
+ +
+
+
+ + + + + + +
+ + + + + + + + + + + + +
+ + +
+ + up vote + 61 + down vote + + + + accepted +
+ +
+
+

This is it.  The perfectly centered billiards break.  Behold.

+ +

enter image description here

+ +

Setup

+ +

This break was computed in Mathematica using a numerical differential equations model. Here are a few details of the model:

+ +
    +
  • All balls are assumed to be perfectly elastic and almost perfectly rigid.
  • +
  • Each ball has a mass of 1 unit and a radius of 1 unit.
  • +
  • The cue ball has a initial speed of 10 units/sec.
  • +
  • The force between two balls is given by the formula +$$ +F \;=\; \begin{cases}0 & \text{if }d \geq 2, \\ 10^{11}(2-d)^{3/2} & \text{if }d<2,\end{cases} +$$ +where $d$ is the distance between the centers of the balls. Note that the balls overlap if and only if $d < 2$. The power of $3/2$ was suggested by Yoav Kallus on Math Overflow, because it follows Hertz's theory of non-adhesive elastic contact.
  • +
+ +

The initial speed of the cue ball is immaterial -- slowing down the cue ball is the same as slowing down time. The force constant $10^{11}$ has no real effect as long as it's large enough, although it does change the speed at which the initial collision takes place.

+ +

The Collision

+ +

For this model, the entire collision takes place in the first 0.2 milliseconds, and none of the balls overlap by more than 0.025% of their radius during the collision. (These figures are model dependent -- real billiard balls may collide faster or slower than this.)

+ +

The following animation shows the forces between the balls during the collision, with the force proportional to the area of each yellow circle. Note that the balls themselves hardly move at all during the collision, although they do accelerate quite a bit.

+ +

enter image description here

+ +

The Trajectories

+ +

The following picture shows the trajectories of the billiard balls after the collision.

+ +

enter image description here

+ +

After the collision, some of the balls are travelling considerably faster than others. The following table shows the magnitude and direction of the velocity of each ball, where $0^\circ$ indicates straight up.

+ +

$$ +\begin{array}{|c|c|c|c|c|c|c|c|c|c|c|} +\hline +\text{ball} & \text{cue} & 1 & 2,3 & 4,6 & 5 & 7,10 & 8,9 & 11,15 & 12,14 & 13 \\ +\hline +\text{angle} & 0^\circ & 0^\circ & 40.1^\circ & 43.9^\circ & 0^\circ & 82.1^\circ & 161.8^\circ & 150^\circ & 178.2^\circ & 180^\circ \\ +\hline +\text{speed} & 1.79 & 1.20 & 1.57 & 1.42 & 0.12 & 1.31 & 0.25 & 5.60 & 2.57 & 2.63 \\ +\hline +\end{array} +$$

+ +

For comparison, remember that the initial speed of the cue ball was 10 units/sec. Thus, balls 11 and 15 (the back corner balls) shoot out at more than half the speed of the original cue ball, whereas ball 5 slowly rolls upwards at less than 2% of the speed of the original cue ball.

+ +

By the way, if you add up the sum of the squares of the speeds of the balls, you get 100, since kinetic energy is conserved.

+ +

Linear and Quadratic Responses

+ +

The results of this model are dependent on the power of $3/2$ in the force law -- other force laws give other breaks. For example, we could try making the force a linear function of the overlap distance (in analogy with springs and Hooke's law), or we could try making the force proportional to the square of the overlap distance. The results are noticeably different

+ +

enter image description here enter image description here

+ +

Stiff Response

+ +

Glenn the Udderboat points out that "stiff" balls might be best approximated by a force response involving a higher power of the distance (although this isn't the usual definition of "stiffness"). Unfortunately, the calculation time in Mathematica becomes longer when the power is increased, presumably because it needs to use a smaller time step to be sufficiently accurate.

+ +

Here is a simulation involving a reasonably "stiff" force law +$$ +F \;=\; \begin{cases}0 & \text{if }d \geq 2, \\ 10^{54}(2-d)^{10} & \text{if }d<2.\end{cases} +$$

+ +

enter image description here

+ +

As you can see, the result is very similar to my first answer below. This seems like good evidence that the behavior discussed in my first answer is indeed the limiting behavior in the case where the stiffness goes to infinity.

+ +

As you might expect, most of the energy in this case is transferred very quickly at the beginning of the collision. Almost all of the energy has moved to the back corner balls in the first 0.02 milliseconds. Here is an animation of the forces:

+ +

enter image description here

+ +

After that, the corner balls and the cue ball shoot out, and the remaining balls continue to collide gently for the next millisecond or so.

+ +

While the simplicity of this behavior is appealing, I would guess that "real" billiard balls do not have such a stiff force response. Of the models listed here, the initial Hertz-based model is probably the most accurate. Qualitatively, it certainly seems the closest to an "actual" break.

+ +

Note: I have now posted the Mathematica code on my web page.

+
+ + + + + + + + + +
+
share|improve this answer
+ + + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+    + +   +
+
+
+ Very nice answer! However, it looks like the angles in the table are inconsistent with the rest of your post: the angle of ball $5$ should be $0^\circ$, and those of $7,10$ and $8,9$ should probably be swapped. It may be worthwhile to double-check the speeds, too. + –  + Rahul + Feb 1 '14 at 9:02 + +
+
+ + + + + + + +
+ 1 + +   +
+
+
+ Also, can you check if changing the form of the force function, say from $2-d$ to $(2-d)^2$, changes the results? + –  + Rahul + Feb 1 '14 at 9:03 +
+
+ + + + + + + +
+ 1 + +   +
+
+
+ @JimBelk, If changing parameters asymmetrically doesn't change the qualitative behavior of the solution, then it is usually safe. + –  + achille hui + Feb 1 '14 at 9:57 +
+
+ + + + + + + +
+ 2 + +   +
+
+
+ Aha! What this shows is that the original question is indeed underdetermined. One cannot simply treat rigid balls as the limiting case as stiffness approaches infinity, for it matters how the limit is approached (in the sense of what the shape of the stiffness function is). + –  + Rahul + Feb 1 '14 at 11:01 +
+
+ + + + + + + +
+ 1 + +   +
+
+
+ But that's not what "stiffness" means. In Hooke's law $F = k x$, the stiffness is the constant $k$ in front of the $x$, not the unwritten exponent $1$ over the $x$. Infinite stiffness in this billiards-balls scenario would be a force that rises to infinity as soon as $d<2$. All of your examples are extremely stiff; the only difference is how they approach the infinite step function. + –  + Rahul + Feb 1 '14 at 21:11 + +
+
+
+ +
+
+ + + +
+ + + + + + + + + + + + +
+ + +
+ + up vote + 5 + down vote + + + +
+ +
+
+

Note: I'm not deleting this answer since the discussion in the comments is interesting, but I no longer believe that this answer is physically realistic. See my other answer for a much better model.

+ +

Later Note: Well, it turns out that this model is realistic if you put certain hypotheses on the bouncing of the balls! See the "stiff response" section in my other answer.

+ +

The two balls in the back corners shoot away along rays parallel to the two sides of the triangle. Here is a picture showing the forces, with each force vector emanating from the point of contact.

+ +

enter image description here

+
+ + + + + + + + + +
+
share|improve this answer
+ + + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+ 1 + +   +
+
+
+ Yeah, but ball 2 (balls are numbered 0-15) touches ball 4 and 5. I think the impact will be allocated to 4 and 5 by a ratio of 2:1. (Although there might be a $\pi$ involved.) + –  + Glen The Udderboat + Jan 31 '14 at 20:58 + +
+
+ + + + + + + +
+    + +   +
+
+
+ @GlenTheUdderboat I don't think so. The force on ball 2 is entirely from its contact with ball 1. This is a normal force, so it has to be perpendicular to the tangent planes, i.e. parallel to the vector from the center of ball 1 to the center of ball 2. The forces that ball 2 exerts on balls 4 and 5 have to add up to the negative of this force vector, only the only such linear combination is that ball 5 exerts no force on ball 2, and ball 4 exerts the same magnitude of force on ball 2 as ball 1 did. + –  + Jim Belk + Jan 31 '14 at 21:06 +
+
+ + + + + + + +
+ 1 + +   +
+
+
+ You almost got me there. :) But you didn't account for ball 2 to move left(ish) after impact. Note that, because of the instantaneity, there is no physical causality or intuition to assume. + –  + Glen The Udderboat + Jan 31 '14 at 21:13 + +
+
+ + + + + + + +
+ 2 + +   +
+
+
+ I don't believe your explanation in the comment, Jim. Note that your argument applies just as well to the situation where only balls 1, 2, 4, and 5 were present; your argument would conclude that ball 5 doesn't move at all, which is false. + –  + Greg Martin + Jan 31 '14 at 21:32 +
+
+ + + + + + + +
+ 1 + +   +
+
+
+ @GlenTheUdderboat Part of what's going on with this problem is that conservation of momentum and energy do not completely determine the solution. The solution I gave certainly obeys both, because it's what would happen if the six balls not along the sides were removed. + –  + Jim Belk + Jan 31 '14 at 22:35 +
+
+
+ +
+
+ +
+ + + +

Your Answer

+ + + + + + + + + +
+ +
+
+
+ +
+
+ +
 
+ + + + + + +
+
+
+ + +
+
+
+ + + + +
+ +
+ + discard + +

+By posting your answer, you agree to the privacy policy and terms of service.

+
+
+ + + +

+Not the answer you're looking for? Browse other questions tagged or ask your own question.

+
+
+ + + +
+ + + +
+
+ + + + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl index 857adf62..3aa72f85 100644 --- a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl @@ -16,4 +16,5 @@ {"track_id": "table_include_math_p", "dataset_name": "table_include_math_p", "url": "https://math.stackexchange.com/questions/458323/is-8327-1-a-prime-number?answertab=active","data_source_category": "HTML", "path":"table_include_math_p.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "table_include_table_math", "dataset_name": "table_include_table_math", "url": "https://test","data_source_category": "HTML", "path":"table_include_table_math.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "test_clean_tags", "dataset_name": "test_pipeline_suit", "url": "https://math.stackexchange.com/questions/4082284/solving-for-vector-contained-in-a-diagonal-matrix","data_source_category": "HTML", "path":"test_clean_tags.html", "file_bytes": 1000, "page_layout_type":"forum", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} -{"track_id": "list_nest_three", "dataset_name": "list_nest_three", "url": "http://test.com","data_source_category": "HTML", "path":"list_nest_three.html", "file_bytes": 1000, "page_layout_type":"forum", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file +{"track_id": "list_nest_three", "dataset_name": "list_nest_three", "url": "http://test.com","data_source_category": "HTML", "path":"list_nest_three.html", "file_bytes": 1000, "page_layout_type":"forum", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} +{"track_id": "table_include_entity", "dataset_name": 
"table_include_entity", "url": "http://math.stackexchange.com/questions/658871/perfectly-centered-break-of-a-perfectly-aligned-pool-ball-rack?answertab=active","data_source_category": "HTML", "path":"table_include_entity.html", "file_bytes": 1000, "page_layout_type":"forum", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index afb9418f..6e91c85e 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -155,7 +155,8 @@ def test_table_involve_equation(self): raw_html = raw_html_path.read_text(encoding='utf-8') parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') - assert complex_table_tag[0].text == r'
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution${\displaystyle \Pr \,(X=k)={\binom {n}{k}}p^{k}(1-p)^{n-k}}$${\displaystyle np}$${\displaystyle np(1-p)}$
Geometric distribution${\displaystyle \Pr \,(X=k)=(1-p)^{k-1}p}$${\displaystyle {\frac {1}{p}}}$${\displaystyle {\frac {(1-p)}{p^{2}}}}$
Normal distribution${\displaystyle f\left(x\mid \mu ,\sigma ^{2}\right)={\frac {1}{\sqrt {2\pi \sigma ^{2}}}}e^{-{\frac {(x-\mu )^{2}}{2\sigma ^{2}}}}}$${\displaystyle \mu }$${\displaystyle \sigma ^{2}}$
Uniform distribution (continuous)${\displaystyle f(x\mid a,b)={\begin{cases}{\frac {1}{b-a}}&{\text{for }}a\leq x\leq b,\\[3pt]0&{\text{for }}x<a{\text{ or }}x>b\end{cases}}}$${\displaystyle {\frac {a+b}{2}}}$${\displaystyle {\frac {(b-a)^{2}}{12}}}$
Exponential distribution${\displaystyle f(x\mid \lambda )=\lambda e^{-\lambda x}}$${\displaystyle {\frac {1}{\lambda }}}$${\displaystyle {\frac {1}{\lambda ^{2}}}}$
Poisson distribution${\displaystyle f(k\mid \lambda )={\frac {e^{-\lambda }\lambda ^{k}}{k!}}}$${\displaystyle \lambda }$${\displaystyle \lambda }$
' + print(complex_table_tag[0].text) + assert complex_table_tag[0].text == r'
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution${\displaystyle \Pr \,(X=k)={\binom {n}{k}}p^{k}(1-p)^{n-k}}$${\displaystyle np}$${\displaystyle np(1-p)}$
Geometric distribution${\displaystyle \Pr \,(X=k)=(1-p)^{k-1}p}$${\displaystyle {\frac {1}{p}}}$${\displaystyle {\frac {(1-p)}{p^{2}}}}$
Normal distribution${\displaystyle f\left(x\mid \mu ,\sigma ^{2}\right)={\frac {1}{\sqrt {2\pi \sigma ^{2}}}}e^{-{\frac {(x-\mu )^{2}}{2\sigma ^{2}}}}}$${\displaystyle \mu }$${\displaystyle \sigma ^{2}}$
Uniform distribution (continuous)${\displaystyle f(x\mid a,b)={\begin{cases}{\frac {1}{b-a}}&{\text{for }}a\leq x\leq b,\\[3pt]0&{\text{for }}x<a{\text{ or }}x>b\end{cases}}}$${\displaystyle {\frac {a+b}{2}}}$${\displaystyle {\frac {(b-a)^{2}}{12}}}$
Exponential distribution${\displaystyle f(x\mid \lambda )=\lambda e^{-\lambda x}}$${\displaystyle {\frac {1}{\lambda }}}$${\displaystyle {\frac {1}{\lambda ^{2}}}}$
Poisson distribution${\displaystyle f(k\mid \lambda )={\frac {e^{-\lambda }\lambda ^{k}}{k!}}}$${\displaystyle \lambda }$${\displaystyle \lambda }$
' def test_table_involve_after_code(self): """test table involve code, code被提取出去了,过滤掉空的和坏的table.""" diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index b58b6964..0fdf2da3 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -59,7 +59,7 @@ def setUp(self): for line in f: self.data_json.append(json.loads(line.strip())) - assert len(self.data_json) == 19 + assert len(self.data_json) == 20 # Config for HTML extraction self.config = { @@ -444,3 +444,14 @@ def test_list_nest_three(self): result = chain.extract(input_data) result_content_list = result.get_content_list()._get_data() assert int(result_content_list[0][0]['content']['list_nest_level']) == 3 + + def test_table_include_entity(self): + """测试table包含实体.""" + chain = ExtractSimpleFactory.create(self.config) + self.assertIsNotNone(chain) + test_data = self.data_json[19] + input_data = DataJson(test_data) + result = chain.extract(input_data) + result_md = result.get_content_list().to_mm_md() + assert '&amp;' not in result_md + assert '&nbsp;' not in result_md