Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion llm_web_kit/libs/html_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ def html_to_element(html:str) -> HtmlElement:
element: lxml.html.HtmlElement: element
"""
parser = HTMLParser(collect_ids=False, encoding='utf-8', remove_comments=True, remove_pis=True)
root = fromstring(html, parser=parser)
# 将 HTML 字符串编码为字节类型, 兼容html中有 XML 声明(如 <?xml version="1.0" encoding="utf-8"?>)
html_bytes = html.encode('utf-8')
root = fromstring(html_bytes, parser=parser)
standalone = deepcopy(root) # 通过拷贝才能去掉自动加入的<html><body>等标签, 非常奇怪的表现。
return standalone

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,373 @@
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-gb" lang="en-gb" dir="ltr" >
<head>
<base href="http://www.chicks.org.uk/index.php" />
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<meta name="keywords" content="CHICKS Holiday, Respite, Break, Children, Kids, charity, disadvantaged, underprivileged, regional, national, local, birmingham, devon, cornwall, south west, london, east anglia, midlands, north west, north east, scotland, south east, wales, glasgow, plymouth, exeter, penzance, happy memories, make a difference, positive, life changing, smiles, free, enhancing, make a difference, see the difference, donate, get involved, events, joy, retreats, Moorland, Coastal, challenges, activities, climbing, horse riding, body boarding, high ropes, crealy, terthorne, wrigleys, ginsters, sourthern cooperative, corporate partners, community groups, trusts, foundations, income, individuals, children in need, events, major donors" />
<meta name="author" content="Megan" />
<meta name="description" content="CHICKS provides free, week long respite breaks for disadvantaged children from across the UK, who would not otherwise have time away from their difficult home lives during the year." />
<meta name="generator" content="Chicks" />
<title>CHICKS Children on the Ice - CHICKS</title>
<script src="/media/system/js/mootools-core.js" type="text/javascript"></script>
<script src="/media/system/js/core.js" type="text/javascript"></script>
<script src="/media/system/js/caption.js" type="text/javascript"></script>
<script src="/media/system/js/mootools-more.js" type="text/javascript"></script>
<script type="text/javascript">
window.addEvent('load', function() {
new JCaption('img.caption');
});
</script>

<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />

<link rel="stylesheet" href="/templates/chicks_inside/css/template.css" type="text/css" />
<link rel="stylesheet" href="/templates/chicks_inside/css/custommodule.css" type="text/css" />

<script type="text/javascript">

var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-25828996-1']);
_gaq.push(['_trackPageview']);

(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();

</script>
</head>

<body>
<div id="main">
<div class="wrapper">
<div id="content">
<div id="header">
<div id="logo"><a href="/index.php"><img src="/templates/chicks/images/chicks_logo.gif" alt="CHICKS logo" /></a></div>


<div class="toolbar"> <div class="moduletable">
<style>

#mc_embed_signup {
background:#fff;
clear:left;
font:14px Helvetica,Arial,sans-serif;
}

#mc_embed_signup input.email {
border:1px solid #CCC;
padding: 2px;
width: 100px;
}

#mc_embed_signup input.email:focus {
border:1px solid #CCC;
background-color:#FFC;
outline:none;
}

#mc_embed_signup label {
font-weight: normal;
font-size: 11px;
padding-bottom: 10px;
}

#mc-embedded-subscribe {
float: right;
margin: 2px 54px 0 0;
}

</style>

<div id="mc_embed_signup">
<form action="http://chicks.us2.list-manage.com/subscribe/post?u=1c33ef5c2e710ead11e9e9cfb&amp;id=e549bb8981" method="post" id="mc-embedded-subscribe-form" name="mc-embedded-subscribe-form" class="validate" target="_blank" novalidate>
<label for="mce-EMAIL">Sign up to CHICKS eNewsletter</label>
<input type="email" value="" name="EMAIL" class="email" id="mce-EMAIL" placeholder="email address" required>
<input type="submit" value="Go" name="subscribe" id="mc-embedded-subscribe">
</form>
</div> </div>
</div>

<div class="mainlevel">
<div class="moduletable_menu">

<ul class="menu">
<li class="item-285"><a href="/" >Home</a></li><li class="item-2"><a href="/about-us" >About Us</a></li><li class="item-3"><a href="/news-3" >News</a></li><li class="item-234"><a href="/camps" >Camps</a></li><li class="item-4"><a href="/events-home" >Events</a></li><li class="item-5"><a href="/volunteer" >Volunteer</a></li><li class="item-6"><a href="/refer-a-child-6" >Refer a Child</a></li><li class="item-7"><a href="/get-involved" >Get Involved</a></li><li class="item-8"><a href="/support-us" >Donate</a></li><li class="item-9"><a href="/contact-us" >Contact Us</a></li></ul>
</div>

</div>

</div>



<!--left column -->


<div id="leftcol">

<div class="sublevel">
<div class="moduletable_news">
<h3>News</h3>

<ul class="menu_all">
<li class="item-26"><a href="/news-26" >Latest News</a></li><li class="item-27 current active"><a href="/news-archive" >News Archive</a></li><li class="item-244"><a href="/whats-on-guide" >What's on Guide</a></li><li class="item-105 parent"><a href="/chicks-magazines" >CHICKS Magazines</a></li><li class="item-28 parent"><a href="/latest-news-articles" >Media Centre</a></li><li class="item-30"><a href="/social-media" >Social Media</a></li><li class="item-218"><a href="/enewsletter" >eNewsletter</a></li></ul>
</div>

</div>


<div class="leftmodule">

</div>


<div class="leftbanner">

<script language="JavaScript">


// Write one randomly chosen sidebar banner (an image wrapped in a link)
// into the document at the point where this script runs.
//
// images[i] and links[i] describe the same banner, so the two arrays must
// stay the same length and in the same order.
function random_imglink(){
    var images=[
        "/templates/chicks_inside/images/sidedonate.jpg",
        "/templates/chicks_inside/images/sideinvolve.jpg",
        "/templates/chicks_inside/images/sidechild.jpg",
        "/templates/chicks_inside/images/sidevolunteer.jpg"
    ]

    var links=[
        "/give-regularly",
        "/events-calendar",
        "/refer-a-child-6",
        "/volunteer"
    ]

    // Zero-based arrays let Math.floor(Math.random()*length) pick every
    // banner with equal probability. The earlier 1-based layout left slot 0
    // empty and remapped a draw of 0 to 1, which showed the first banner
    // twice as often as the others.
    var ry=Math.floor(Math.random()*images.length)
    document.write('<a href="'+links[ry]+'"><img src="'+images[ry]+'" border=0></a>')
}
random_imglink()

</script>
</div>

</div>


<!--middle column upper half-->
<div id="middlecol">

<div id="user1">
<div class="usercontent">
<div class="moduletable_ia">



<div class="articleheading">News Archive</div>

<div class="articleheadingcontent"><p>Just in case you have been on holiday or were unable to view our site at the time, we have saved our news stories here for you to view at your leisure.</p> <a class='readmore' href='/news-archive/7-news/news/78-news-archive'>
More... </a>
</div> </div>

</div>
</div>


<div id="mainbody">
<div class="usercontent">

<h2 class="contentheading">
CHICKS Children on the Ice </h2>

<div class="item-page_newsrelease">







<dl class="article-info">
<dt class="article-info-term"></dt>
<dd class="create">
Monday, 31 October 2011 15:24 </dd>
</dl>



<p><img src="/images/stories/news/Ice_Angels.jpg" alt="Monday morning fun on the rink" /></p>
<p>CHICKS Children on the Ice</p>
<div></div>
<p>The children at CHICKS have enjoyed six days of fabulous activities and special events to celebrate our 9000th child coming on camp since we began in 1992.</p>

<p>The children on Tango Camp, which started on Thursday 27th October and finishes tomorrow, Tuesday 1st November, have enjoyed horse riding, spooky Halloween games and, as an extra treat, an exclusive session on the Ice in the centre of Plymouth with the Ice Angels. For most children this was the first time they had experienced ice skating, and some took to it like ducks to water... others not so much!</p>
<p>Every child that attends a CHICKS break has a deserving story as to why they need time away from their tough home lives. On Tango camp we have children who have been neglected, witnessed domestic abuse, suffered emotional abuse and live in severe poverty.</p>
<p>Every child has had the most incredible week, watching them on the Ice you could see their confidence grow every time they did a loop around the Rink. After their six day break CHICKS will continue to make an impact as they can write to everyone they meet time and time again through the network we operate, to date we have received over 900 letters!</p>
<p>Here are a few quotes from the children at the Ice Rink. Craig "I've never been ice skating before", Kellee "I want to be an ice dancer when I'm older", Josh "This has been my favourite part of the week" and Stephanie "My three favourite things of this week are horse riding, laser quest and ice skating".</p>
<p>Every child will leave CHICKS with happy memories, new skills and new friends. They will return home knowing that someone cares and that life has more to offer them than it has done so far.</p>
<p>9000 disadvantaged children - thousands of happy childhood memories.</p>
<p> </p>

</div>

</div>
</div>




<!--middle column lower half-->






</div>

<!--right column -->


<div id="rightcol">

<div id="kitedonate">
<a href="https://mydonate.bt.com/charities/chicks"><img src="/templates/chicks_inside/images/trans.png" width="126" height="126px"></a>
</div>
<div id="kiteinvolve">
<a href="/events-calendar"><img src="/templates/chicks_inside/images/trans.png" width="126" height="126px"></a>
</div>
<div id="kitechild">
<a href="/refer-a-child-6"><img src="/templates/chicks_inside/images/trans.png" width="126" height="126px"></a>
</div>
<div id="kitevolunteer">
<a href="/volunteer"><img src="/templates/chicks_inside/images/trans.png" width="126" height="126px"></a>
</div>
<!--<div id="kitelottery">
<a href="/chicks-lottery-63"><img src="/templates/chicks_inside/images/trans.png" width="126" height="126px"></a>
</div>-->


<!-- AddThis Button BEGIN -->
<a class="addthis_button" href="http://www.addthis.com/bookmark.php?v=250&amp;username=xa-4c5ac97e11c156af"><img src="http://s7.addthis.com/static/btn/v2/lg-share-en.gif" width="125" height="16" alt="Bookmark and Share" style="border:0"/></a><script type="text/javascript" src="http://s7.addthis.com/js/250/addthis_widget.js#username=xa-4c5ac97e11c156af"></script>
<!-- AddThis Button END -->


<div class="rightmodule">
<div class="moduletable_ia">



<div class="articleheading">Change for CHICKS</div>

<div class="articleheadingcontent"><p><img alt="changeforchicksweb" src="/images/stories/get-involved/changeforchicksweb.jpg" /></p>
<p>Throughout May we will be running our Change for CHICKS campaign. We're asking all our supporters to encourage their friends, family and colleagues to donate their small change to CHICKS. <a class='readmore' href='/news-archive/12-get-involved/get-involved/569-change-for-chicks'>
More... </a></p>
</div> </div>

</div>

</div>


</div>
</div>

</div>

<!--footer section -->
<div id="footer">
<div class="wrapper">
<div class="tagline">
<img src="/templates/chicks/images/footer_tagline.gif" /></div>


<div class="minipic">
<div class="moduletable">
<div class="random-image">
<img src="/images/stories/homepage/footer/footer_5.gif" alt="footer_5.gif" width="200" height="80" /></div>
</div>

</div>


</div>

<div class="wrapper">
<div id="getconnected">
<div class="footertitletextfirst">Get Connected</div>

<div class="footerlinks"> <a href="http://www.facebook.com/chickscharity"><img src="/templates/chicks/images/fb_icon.gif" alt="Facebook" /></a>
<a href="http://twitter.com/CHICKScharity"><img src="/templates/chicks/images/twitter_icon.gif" alt="Twitter" /></a>
<a href="/news-3?format=feed&amp;type=rss"><img src="/templates/chicks/images/rss_icon.gif" alt="RSS" /></a>
<a href="http://vimeo.com/user4620456"><img src="/templates/chicks/images/vimeo_icon.gif" alt="Vimeo" /></a></div>
</div>

<div id="contactuslinks">
<div class="footertitletext">Contact Us</div>

<div class="footerlinks">
<ul>
<li><a href="/contact-us">Our Offices</a></li>
<li><a href="/vacancies-join-our-team">Vacancies</a></li>
<li><a href="/latest-news-articles/press-releases">Press Releases</a></li>
<li><a href="/latest-news-articles">Media Centre</a></li>
</ul>
</div>
</div>

<div id="quicklinks">
<div class="footertitletext">Quick Links</div>

<div class="quickfooterlinks">
<ul>
<li> <a href="/about-us">About Us</a></li>
<li> <a href="/who-we-are">Who are CHICKS?</a></li>
<li><a href="/our-retreats">Our Retreats</a></li>
<li><a href="/childrens-stories">Children's Stories</a></li>
</ul>
</div>

<div class="quickfooterlinks">
<ul>
<li><a href="/support-us">Donate Now</a></li>
<li><a href="/our-retreats">Get Involved</a></li>
<li><a href="/refer-a-child-6">Refer a Child</a></li>
<li><a href="/volunteer">Volunteer</a></li>
</ul>
</div>

</div>

<div id="generallinks">
<div class="footertitletext">General</div>

<div class="footerlinks">
<ul>
<li><a href="/index.php">Home </a></li>
<li><a href="/sitemap-x">Sitemap</a></li>
<li><a href="/privacy-statement">Privacy Statement</a></li>
<li><a href="/terms-and-conditions">Terms and Conditions</a></li>
<li><a href="/privacy-cookies">Cookies</a></li>
</ul>
</div>

</div>



<div id="copyright">
Copyright CHICKS <script language="javascript">
var d=new Date();
var y = d.getFullYear();
document.write(y);
</script> Registered Charity in England &amp; Wales (1080953) and in Scotland (SCO40536)
</div>


</div>

</div>

</body>
</html>
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@
{"track_id": "table_include_table_math", "dataset_name": "table_include_table_math", "url": "https://test","data_source_category": "HTML", "path":"table_include_table_math.html", "file_bytes": 1000, "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "test_clean_tags", "dataset_name": "test_pipeline_suit", "url": "https://math.stackexchange.com/questions/4082284/solving-for-vector-contained-in-a-diagonal-matrix","data_source_category": "HTML", "path":"test_clean_tags.html", "file_bytes": 1000, "page_layout_type":"forum", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "list_nest_three", "dataset_name": "list_nest_three", "url": "http://test.com","data_source_category": "HTML", "path":"list_nest_three.html", "file_bytes": 1000, "page_layout_type":"forum", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "table_include_entity", "dataset_name": "table_include_entity", "url": "http://math.stackexchange.com/questions/658871/perfectly-centered-break-of-a-perfectly-aligned-pool-ball-rack?answertab=active","data_source_category": "HTML", "path":"table_include_entity.html", "file_bytes": 1000, "page_layout_type":"forum", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "table_include_entity", "dataset_name": "table_include_entity", "url": "http://math.stackexchange.com/questions/658871/perfectly-centered-break-of-a-perfectly-aligned-pool-ball-rack?answertab=active","data_source_category": "HTML", "path":"table_include_entity.html", "file_bytes": 1000, "page_layout_type":"forum", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "xml_tag", "dataset_name": "test_pipeline_suit", "url": "http://www.chicks.org.uk/index.php?option=com_content&view=article&id=342%3Achicks-children-on-the-ice&Itemid=27","data_source_category": "HTML", "path":"xml_tag.html", "file_bytes": 1000, "page_layout_type":"article", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
Loading