From c57e04072b227920c858954576e3ba2bc52429aa Mon Sep 17 00:00:00 2001
From: Daryl Tucker
Date: Fri, 4 Jul 2014 17:12:51 -0500
Subject: [PATCH 1/2] Added support for other OpenGraph-like tags

Twitter and OpenGraph handling have been split apart. OpenGraph is
respected as the default provider, while either can be selected
easily. Each provider can declare its own required attributes, as
well as the tag attribute we key off of when finding its tags.

Passing **provider** to OpenGraph() will force a provider. If any
OpenGraph tags are found, OpenGraph becomes the provider, which
forces validity to be checked against OpenGraph instead of the
original provider.

is_valid() has been modified to use the detected provider. If tags of
mixed providers are found, OpenGraph takes precedence. to_html() now
prints using the provider's name. OrderedDict from collections is
used to keep OpenGraph as the top priority.
---
 opengraph/opengraph.py | 100 +++++++++++++++++++++++++++--------------
 1 file changed, 66 insertions(+), 34 deletions(-)

diff --git a/opengraph/opengraph.py b/opengraph/opengraph.py
index 9edbdd5..bc58974 100644
--- a/opengraph/opengraph.py
+++ b/opengraph/opengraph.py
@@ -14,27 +14,46 @@ except ImportError:
     import_json = False
 
+from collections import OrderedDict
+
+
 class OpenGraph(dict):
     """
     """
-    required_attrs = ['title', 'type', 'image', 'url']
+    providers = OrderedDict([
+        ('twitter', {
+            'attr': 'name',
+            'required_attrs': ['title', 'card', 'image']
+        }),
+        ('og', {
+            'attr': 'property',
+            'required_attrs': ['title', 'type', 'image', 'url']
+        })
+        # keep on bottom to respect og precedence
+    ])
 
     def __init__(self, url=None, html=None, scrape=False, **kwargs):
         # If scrape == True, then will try to fetch missing attribtues
         # from the page's body
+        # Automatically detect provider unless user specifies with
+        # provider='provider' from providers dict
+
         self.scrape = scrape
         self._url = url
 
         for k in kwargs.keys():
             self[k] = kwargs[k]
-        
+
+        if not self.get('provider'):
+            self.provider = ''
+
         dict.__init__(self)
-        
+
         if url is not None:
             self.fetch(url)
-        
+
         if html is not None:
             self.parser(html)
 
@@ -43,48 +62,59 @@ def __setattr__(self, name, val):
 
     def __getattr__(self, name):
         return self[name]
-    
+
     def fetch(self, url):
         """
         """
         raw = urllib2.urlopen(url)
         html = raw.read()
         return self.parser(html)
-    
+
     def parser(self, html):
         """
         """
-        if not isinstance(html,BeautifulSoup):
+        if not isinstance(html, BeautifulSoup):
             doc = BeautifulSoup(html)
         else:
             doc = html
-        ogs = doc.html.head.findAll(property=re.compile(r'^og'))
-        for og in ogs:
-            if og.has_attr(u'content'):
-                self[og[u'property'][3:]]=og[u'content']
-        # Couldn't fetch all attrs from og tags, try scraping body
-        if not self.is_valid() and self.scrape:
-            for attr in self.required_attrs:
-                if not hasattr(self, attr):
-                    try:
-                        self[attr] = getattr(self, 'scrape_%s' % attr)(doc)
-                    except AttributeError:
-                        pass
-
+        for provider, details in self.providers.iteritems():
+            attribute = details.get('attr')
+            search = {attribute: re.compile(r'^%s' % (provider))}
+            ogs = doc.html.head.findAll(attrs=search)
+            for og in ogs:
+                if og.has_attr(u'content'):
+                    self[og[attribute][len(provider)+1:]] = og[u'content']
+                if not self.provider or provider == 'og':  # respect og
+                    self.provider = provider
+        # Couldn't fetch all attrs from og tags, try scraping body
+        if self.scrape and not self.is_valid():
+            for attr in self.providers.get(
+                self.provider, {}
+            ).get('required_attrs'):
+                if not hasattr(self, attr):
+                    try:
+                        self[attr] = \
+                            getattr(self, 'scrape_%s' % attr)(doc)
+                    except AttributeError:
+                        pass
+
     def is_valid(self):
-        return all([hasattr(self, attr) for attr in self.required_attrs])
-    
+        return all([
+            hasattr(self, attr) for attr in self.providers.get(
+                self.provider, {}
+            ).get('required_attrs')
+        ])
+
     def to_html(self):
         if not self.is_valid():
-            return u"<meta property=\"og:error\" content=\"og metadata is not valid\" />"
-        
+            return u"<meta property=\"%s:error\" content=\"%s metadata is not valid\" />" % (self.provider, self.provider)
+
         meta = u""
-        for key,value in self.iteritems():
-            meta += u"\n<meta property=\"og:%s\" content=\"%s\" />" %(key, value)
+        for key, value in self.iteritems():
+            meta += u"\n<meta property=\"%s:%s\" content=\"%s\" />" % (self.provider, key, value)
         meta += u"\n"
-        
+
         return meta
-    
+
     def to_json(self):
         # TODO: force unicode
         global import_json
@@ -92,16 +122,18 @@ def to_json(self):
         return "{'error':'there isn't json module'}"
 
         if not self.is_valid():
-            return json.dumps({'error':'og metadata is not valid'})
-        
+            return json.dumps({'error': 'og metadata is not valid'})
+
         return json.dumps(self)
-    
+
     def to_xml(self):
         pass
 
     def scrape_image(self, doc):
-        images = [dict(img.attrs)['src']
-            for img in doc.html.body.findAll('img')]
+        images = [
+            dict(img.attrs)['src']
+            for img in doc.html.body.findAll('img')
+        ]
         if images:
             return images[0]
 
@@ -115,4 +147,4 @@ def scrape_type(self, doc):
         return 'other'
 
     def scrape_url(self, doc):
-        return self._url
\ No newline at end of file
+        return self._url

From 4b167726ba387210c3bb387e95c0419572baadae Mon Sep 17 00:00:00 2001
From: Daryl Tucker
Date: Fri, 4 Jul 2014 18:28:31 -0500
Subject: [PATCH 2/2] Fixing bug with false positives

Default the required-attribute lookup to an empty list so an unknown
provider cannot break it, and fall back to the 'og' provider so empty
metadata is no longer reported as valid.
---
 opengraph/opengraph.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/opengraph/opengraph.py b/opengraph/opengraph.py
index bc58974..87d920f 100644
--- a/opengraph/opengraph.py
+++ b/opengraph/opengraph.py
@@ -90,18 +90,20 @@ def parser(self, html):
         if self.scrape and not self.is_valid():
             for attr in self.providers.get(
                 self.provider, {}
-            ).get('required_attrs'):
+            ).get('required_attrs', []):
                 if not hasattr(self, attr):
                     try:
                         self[attr] = \
                             getattr(self, 'scrape_%s' % attr)(doc)
                     except AttributeError:
                         pass
+        else:
+            self.provider = 'og'
 
     def is_valid(self):
         return all([
             hasattr(self, attr) for attr in self.providers.get(
                 self.provider, {}
-            ).get('required_attrs')
+            ).get('required_attrs', [])
         ])
 
     def to_html(self):
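
A quick usage sketch for reviewers trying this series out. It assumes
the patched module is importable as opengraph and targets Python 2
(matching the module's urllib2/iteritems usage); the HTML snippet,
names, and expected results are illustrative only and are not part of
the patches themselves.

# -*- coding: utf-8 -*-
# Sketch of the provider split from PATCH 1/2. Assumptions: the patched
# module is importable as `opengraph`; the HTML document below is made up.
import opengraph

html = u"""
<html><head>
<meta name="twitter:card" content="summary" />
<meta name="twitter:title" content="Example page" />
<meta name="twitter:image" content="http://example.com/a.png" />
</head><body></body></html>
"""

# Only twitter:* tags are present, so per the commit message the
# 'twitter' provider should be detected, and is_valid() should check
# its required attributes: title, card, image.
page = opengraph.OpenGraph(html=html)
print page.provider, page.is_valid()

# Passing provider= forces a provider up front. Validity is then
# checked against og's required attributes (title, type, image, url),
# so this should report False: the snippet carries no og:* tags.
forced = opengraph.OpenGraph(html=html, provider='og')
print forced.is_valid()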