From bfacbc24d5443899a6c7daaab801b79996b400fd Mon Sep 17 00:00:00 2001 From: "j@mailb.org" Date: Mon, 28 Mar 2011 22:47:23 +0200 Subject: [PATCH 1/3] allow iframes with frameborder as used by vid.ly, youtube and other sites --- planet/vendor/feedparser.py | 5 +++-- planet/vendor/html5lib/sanitizer.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/planet/vendor/feedparser.py b/planet/vendor/feedparser.py index 76167ced..0600c814 100755 --- a/planet/vendor/feedparser.py +++ b/planet/vendor/feedparser.py @@ -1982,6 +1982,7 @@ def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0 sValue = bNormalize and self.normalize(sValue) or sValue.strip() if (not sValue) and (iPropertyType == self.URI): if sNodeName == 'a': sValue = elmResult.get('href') + elif sNodeName == 'iframe': sValue = elmResult.get('src') elif sNodeName == 'img': sValue = elmResult.get('src') elif sNodeName == 'object': sValue = elmResult.get('data') if sValue: @@ -2339,7 +2340,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor): 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'figure', 'footer', 'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', - 'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', + 'iframe', 'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 'sub', @@ -2355,7 +2356,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor): 'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for', - 'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 
'hidefocus', + 'form', 'frame', 'frameborder', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max', diff --git a/planet/vendor/html5lib/sanitizer.py b/planet/vendor/html5lib/sanitizer.py index 05face97..d81b2943 100644 --- a/planet/vendor/html5lib/sanitizer.py +++ b/planet/vendor/html5lib/sanitizer.py @@ -13,7 +13,7 @@ class HTMLSanitizerMixin(object): 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'figure', 'footer', 'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4', - 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'keygen', 'kbd', + 'h5', 'h6', 'hr', 'i', 'iframe', 'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', 'small', 'sound', @@ -43,7 +43,7 @@ class HTMLSanitizerMixin(object): 'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', - 'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers', + 'face', 'for', 'form', 'frame', 'frameborder', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend', From 649e369f8d7b53c6e958149b3709fb8d621efb8d Mon Sep 17 00:00:00 2001 From: Chris Hostetter Date: Sat, 7 Jan 2012 18:18:03 -0800 Subject: [PATCH 2/3] coercedates plugin new plugin that coerces the 'updated' and 'published' dates on any entry already in the 
cache to match the 'updated' date already in the cache if an entry is _not_ already in the cache, coerces the date values to be the smaller (i.e. older) of the two values commit also 'fixes' spider.py to work with filters that modify the 'updated' date see: https://github.com/rubys/venus/issues/15 --- filters/coercedates.plugin | 116 +++++++++++++++++++++ planet/spider.py | 9 ++ tests/data/filter/coercedates/a-rss-1.xml | 42 ++++++++ tests/data/filter/coercedates/a-rss-2.xml | 42 ++++++++ tests/data/filter/coercedates/b-atom-1.xml | 92 ++++++++++++++++ tests/data/filter/coercedates/b-atom-2.xml | 92 ++++++++++++++++ tests/data/filter/coercedates/config.ini | 7 ++ tests/test_filter_coercedates.py | 106 +++++++++++++++++++ 8 files changed, 506 insertions(+) create mode 100644 filters/coercedates.plugin create mode 100644 tests/data/filter/coercedates/a-rss-1.xml create mode 100644 tests/data/filter/coercedates/a-rss-2.xml create mode 100644 tests/data/filter/coercedates/b-atom-1.xml create mode 100644 tests/data/filter/coercedates/b-atom-2.xml create mode 100644 tests/data/filter/coercedates/config.ini create mode 100644 tests/test_filter_coercedates.py diff --git a/filters/coercedates.plugin b/filters/coercedates.plugin new file mode 100644 index 00000000..421418e9 --- /dev/null +++ b/filters/coercedates.plugin @@ -0,0 +1,116 @@ +# If you don't want items to "move up" on your planet if the source feed +# updates them (and changes the update date to something newer than was +# originally used) you may be tempted to use the "ignore_in_feed: updated" +# option, but there are three important things to realize about doing this: +# +# * When you ignore the "updated" date, it will default to the +# "published" date -- but if there is no "published" date (very common +# in many RSS feeds) it will default to the current date+time. 
+# +# * If you purge the entire cache (perhaps because you added a filter) +# all of the "updated" dates for those items w/o a "published" date will +# be re-set to the current date+time +# +# * The "updated" date is what Venus uses to sort the list +# +# This may all seem obvious, but can be highly annoying when you deal +# with some feeds that have no "published" date and have to occasionally +# purge your cache. +# +# One solution would be to only use "ignore_in_feed: updated" on the feeds +# where you know the feed contains a "published" date for each item, and +# don't use it for feeds that only contain an "updated" date for each item +# -- but that can be tedious. +# +# So use this plugin instead +# +# This plugin will replace the "updated" and "published" dates of every item +# with whichever of the two values is the lower, unless the item is already +# in the cache, in which case it will use the "updated" date from the item in +# the cache -- making it a safe alternative to "ignore_in_feed: updated" for +# all feeds regardless of whether the items have a "published" date or not, +# and regardless of whether the ones that do have a "published" date try to +# modify it or not. +# +########################################################################### + +import sys, time, os +from xml.dom import minidom +from planet import reconstitute +from planet import config +from planet.reconstitute import date +from planet.spider import filename + +log = planet.logger + +# finds the first descendant element that matches the specified +# namespace and tag name, parses it (in canonical date format), +# returns the parsed value, and removes (all of the) element(s) +def parseAndPurgeDateElement(element, ns, tagName): + result = None + # see if we have any date(s?) 
+ kids = element.getElementsByTagNameNS(ns, tagName) + if kids: + # record the first one + result = time.strptime(kids[0].childNodes[0].nodeValue, + '%Y-%m-%dT%H:%M:%SZ') + # get rid of all of them + for trash in kids: + trash.parentNode.removeChild(trash) + return result + + +# given the identifier of an entry in the cache, fetches the +# formatted mtime of that entry (which should match the updated +# date if venus has done its job right) +# +# returns None if the entry is not in the cache +def getDateFromCache(entry): + if entry is None: + log.error("Attempted to lookup the date of 'None'") + return None + + id = entry.getElementsByTagNameNS(atomNS, 'id')[0].childNodes[0].nodeValue + if id is None: + log.error("Unable to find id in entry") + return None + + cache = os.path.join(config.cache_directory()) + file = filename(cache, id) + if os.path.exists(file): + return time.gmtime(os.stat(file).st_mtime) + return None + + +atomNS = 'http://www.w3.org/2005/Atom' +planetNS = 'http://planet.intertwingly.net/' + +# parse input stream +dom = minidom.parse(sys.stdin) + +entries = dom.getElementsByTagNameNS(atomNS, 'entry') +for e in entries: + + # get & remove our dates from the entry + updatedDate = parseAndPurgeDateElement(e, atomNS, 'updated') + pubDate = parseAndPurgeDateElement(e, atomNS, 'published') + + cacheDate = getDateFromCache(e) + + if cacheDate is not None: + mainDate = cacheDate + elif not updatedDate: + mainDate = pubDate + elif not pubDate: + mainDate = updatedDate + elif pubDate < updatedDate: + mainDate = pubDate + else: + mainDate = updatedDate + + # add back to the entry + reconstitute.date(e, 'published', mainDate) + reconstitute.date(e, 'updated', mainDate) + +# output the dom +print dom.toxml('utf-8') diff --git a/planet/spider.py b/planet/spider.py index 50d17393..c636f561 100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -235,6 +235,15 @@ def writeCache(feed_uri, feed_info, data): if os.path.exists(cache_file): os.remove(cache_file) 
continue + # re-set mtime incase filters have modified it + try: + edoc = feedparser.parse(output) + mtime = calendar.timegm(edoc.entries[0].updated_parsed) + except: + log.warning("Unable to re-set mtime on %s after running filters: ", + entry.id, + sys.exc_info()[0]) + # write out and timestamp the results write(output, cache_file, mtime) diff --git a/tests/data/filter/coercedates/a-rss-1.xml b/tests/data/filter/coercedates/a-rss-1.xml new file mode 100644 index 00000000..d2467593 --- /dev/null +++ b/tests/data/filter/coercedates/a-rss-1.xml @@ -0,0 +1,42 @@ + + + +Fake RSS Blog +http://fake.url.example.com +Fake RSS Feed For testing + +http://fake.url.example.com/feedlogo.gif +Test RSS Feed +http://fake.url.example.com + +en-us +Not Copyright 2011 Fake Feed, LLC. The contents of this headlines and excerpts feed are available for unlimited distribution. +Blogsmith http://www.blogsmith.com/ + + + Fake Title: RSS Has No Date + http://fake.url.example.com/rss-no-date + http://fake.url.example.com/rss-no-date + http://fake.url.example.com/rss-no-date#comments + + Blah Blah Blah something poinient blah blah blah

]]> +
+ http://fake.url.example.com/rss-no-date.gif + Fake Person +
+ + + + Fake Title: RSS Has Changing Date + http://fake.url.example.com/rss-changing-date + http://fake.url.example.com/rss-changing-date + http://fake.url.example.com/rss-changing-date#comments + + Blah Blah Blah something poinient blah blah blah

]]> +
+ http://fake.url.example.com/rss-changing-date.gif + Fake Person + 2011-12-01T11:00:00+00:00 +
+ +
diff --git a/tests/data/filter/coercedates/a-rss-2.xml b/tests/data/filter/coercedates/a-rss-2.xml new file mode 100644 index 00000000..82f48a2d --- /dev/null +++ b/tests/data/filter/coercedates/a-rss-2.xml @@ -0,0 +1,42 @@ + + + +Fake RSS Blog +http://fake.url.example.com +Fake RSS Feed For testing + +http://fake.url.example.com/feedlogo.gif +Test RSS Feed +http://fake.url.example.com + +en-us +Not Copyright 2011 Fake Feed, LLC. The contents of this headlines and excerpts feed are available for unlimited distribution. +Blogsmith http://www.blogsmith.com/ + + + Fake Title: RSS Has No Date + http://fake.url.example.com/rss-no-date + http://fake.url.example.com/rss-no-date + http://fake.url.example.com/rss-no-date#comments + + Blah Blah Blah something poinient blah blah blah

]]> +
+ http://fake.url.example.com/rss-no-date.gif + Fake Person +
+ + + + Fake Title: RSS Has Changing Date + http://fake.url.example.com/rss-changing-date + http://fake.url.example.com/rss-changing-date + http://fake.url.example.com/rss-changing-date#comments + + Blah Blah Blah something poinient blah blah blah

]]> +
+ http://fake.url.example.com/rss-changing-date.gif + Fake Person + 2011-12-07T11:07:07+00:00 +
+ +
diff --git a/tests/data/filter/coercedates/b-atom-1.xml b/tests/data/filter/coercedates/b-atom-1.xml new file mode 100644 index 00000000..f549b6d8 --- /dev/null +++ b/tests/data/filter/coercedates/b-atom-1.xml @@ -0,0 +1,92 @@ + + + Fake Atom Feed + Fake Atom feed for testing stuff + + 2011-12-08T00:00:28Z + + + http://fake.url.example.com/feed/atom/ + + + WordPress + + + + + + + Fake Person + http://fake.url.example.com + + <![CDATA[Atom Changing Updated Date]]> + + http://fake.url.example.com/atom-changing-updated + 2011-12-05T10:06:38Z + 2011-11-09T00:00:28Z + + + + + 0 + + + + + + + Fake Person + http://fake.url.example.com + + <![CDATA[Atom Changing Published Date]]> + + http://fake.url.example.com/atom-changing-published + 2011-12-08T02:02:28Z + + + + + 0 + + + + + + Fake Person + http://fake.url.example.com + + <![CDATA[Atom No Date]]> + + http://fake.url.example.com/atom-no-date + + + + + 0 + + + + + Fake Person + http://fake.url.example.com + + <![CDATA[Atom Update Before Published]]> + 2011-11-11T11:11:11Z + 2011-12-12T12:12:12Z + + http://fake.url.example.com/atom-update-before-pub + + + + + 0 + + + + + diff --git a/tests/data/filter/coercedates/b-atom-2.xml b/tests/data/filter/coercedates/b-atom-2.xml new file mode 100644 index 00000000..6b71301f --- /dev/null +++ b/tests/data/filter/coercedates/b-atom-2.xml @@ -0,0 +1,92 @@ + + + Fake Atom Feed + Fake Atom feed for testing stuff + + 2011-12-08T00:00:28Z + + + http://fake.url.example.com/feed/atom/ + + + WordPress + + + + + + + Fake Person + http://fake.url.example.com + + <![CDATA[Atom Changing Updated Date]]> + + http://fake.url.example.com/atom-changing-updated + 2011-12-07T07:07:37Z + 2011-11-09T00:00:28Z + + + + + 0 + + + + + + + Fake Person + http://fake.url.example.com + + <![CDATA[Atom Changing Published Date]]> + + http://fake.url.example.com/atom-changing-published + 2011-12-13T13:13:13Z + + + + + 0 + + + + + + Fake Person + http://fake.url.example.com + + <![CDATA[Atom No Date]]> + + 
http://fake.url.example.com/atom-no-date + + + + + 0 + + + + + Fake Person + http://fake.url.example.com + + <![CDATA[Atom Update Before Published]]> + 2009-09-09T09:09:09Z + 2011-12-12T12:12:12Z + + http://fake.url.example.com/atom-update-before-pub + + + + + 0 + + + + + diff --git a/tests/data/filter/coercedates/config.ini b/tests/data/filter/coercedates/config.ini new file mode 100644 index 00000000..f4223bde --- /dev/null +++ b/tests/data/filter/coercedates/config.ini @@ -0,0 +1,7 @@ +[Planet] +name = test planet +cache_directory = tests/work/coercedates/cache +cache_blasklist_directory = tests/work/coercedates/cache/blacklist + +filters: coercedates.plugin + diff --git a/tests/test_filter_coercedates.py b/tests/test_filter_coercedates.py new file mode 100644 index 00000000..04259eec --- /dev/null +++ b/tests/test_filter_coercedates.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python + +import unittest, os, glob, calendar, shutil, time +from planet.spider import filename, spiderPlanet, writeCache +from planet import feedparser, config +import planet + +workdir = 'tests/work/coercedates/cache' +testfeed = 'tests/data/filter/coercedates/%s.xml' +configfile = 'tests/data/filter/coercedates/config.ini' + +class CoerceDatesTest(unittest.TestCase): + def setUp(self): + # silence errors + self.original_logger = planet.logger + # planet.getLogger('CRITICAL',None) + + try: + os.makedirs(workdir) + except: + self.tearDown() + os.makedirs(workdir) + + def tearDown(self): + shutil.rmtree(workdir) + os.removedirs(os.path.split(workdir)[0]) + planet.logger = self.original_logger + + def spiderFeed(self, feed_uri): + feed_info = feedparser.parse('') + data = feedparser.parse(feed_uri) + writeCache(feed_uri, feed_info, data) + + # no expected_date means we don't know what it should be yet + def verify_date(self, id, expected_date = None): + + file = os.path.join(workdir, id) + + # verify that the file exists + self.assertTrue(os.path.exists(file), msg=file); + + data = 
feedparser.parse(file) + + # verify published & updated dates are in sync and match expected + + self.assertEqual(data.entries[0].updated, + data.entries[0].published) + + # verify mtime is in sync + self.assertEqual(time.gmtime(os.stat(file).st_mtime), + data.entries[0].updated_parsed) + self.assertEqual(time.gmtime(os.stat(file).st_mtime), + data.entries[0].published_parsed) + + # verify meet hardcoded expectations + if expected_date is not None: + self.assertEqual(expected_date, + data.entries[0].updated) + + return data.entries[0].updated + + def test_coerce_rss(self): + config.load(configfile) + + # load first version of RSS + self.spiderFeed(testfeed % 'a-rss-1') + + rss_no_date_expected = self.verify_date('fake.url.example.com,rss-no-date') + self.verify_date('fake.url.example.com,rss-changing-date', + u'2011-12-01T11:00:00Z') + + # parse updated RSS feed + self.spiderFeed(testfeed % 'a-rss-2') + + # verify dates haven't changed + self.verify_date('fake.url.example.com,rss-no-date', + rss_no_date_expected) + self.verify_date('fake.url.example.com,rss-changing-date', + u'2011-12-01T11:00:00Z') + + def test_coerce_atom(self): + config.load(configfile) + + # load first version of Atom + self.spiderFeed(testfeed % 'b-atom-1') + + atom_no_date_expected = self.verify_date('fake.url.example.com,atom-no-date') + self.verify_date('fake.url.example.com,atom-changing-published', + u'2011-12-08T02:02:28Z') + self.verify_date('fake.url.example.com,atom-changing-updated', + u'2011-11-09T00:00:28Z') + self.verify_date('fake.url.example.com,atom-update-before-pub', + u'2011-11-11T11:11:11Z') + + # parse updated Atom feed + self.spiderFeed(testfeed % 'b-atom-2') + + # verify dates haven't changed + self.verify_date('fake.url.example.com,atom-no-date', + atom_no_date_expected) + self.verify_date('fake.url.example.com,atom-changing-published', + u'2011-12-08T02:02:28Z') + self.verify_date('fake.url.example.com,atom-changing-updated', + u'2011-11-09T00:00:28Z') + 
self.verify_date('fake.url.example.com,atom-update-before-pub', + u'2011-11-11T11:11:11Z') From ae70a51bd822c264b67f9e2e40c287de20bc3223 Mon Sep 17 00:00:00 2001 From: Chris Hostetter Date: Sat, 7 Jan 2012 18:38:15 -0800 Subject: [PATCH 3/3] fix missing import that runtest.py was hiding --- filters/coercedates.plugin | 1 + 1 file changed, 1 insertion(+) diff --git a/filters/coercedates.plugin b/filters/coercedates.plugin index 421418e9..b5f73aac 100644 --- a/filters/coercedates.plugin +++ b/filters/coercedates.plugin @@ -36,6 +36,7 @@ import sys, time, os from xml.dom import minidom +import planet from planet import reconstitute from planet import config from planet.reconstitute import date