diff --git a/opengraph/opengraph.py b/opengraph/opengraph.py
index 9edbdd5..87d920f 100644
--- a/opengraph/opengraph.py
+++ b/opengraph/opengraph.py
@@ -14,27 +14,46 @@
except ImportError:
import_json = False
+from collections import OrderedDict
+
+
class OpenGraph(dict):
"""
"""
- required_attrs = ['title', 'type', 'image', 'url']
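+    # Each provider names the meta attribute it matches ('name'/'property')
+    # and the attributes required for is_valid()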
+ providers = OrderedDict([
+ ('twitter', {
+ 'attr': 'name',
+ 'required_attrs': ['title', 'card', 'image']
+ }),
+ ('og', {
+ 'attr': 'property',
+ 'required_attrs': ['title', 'type', 'image', 'url']
+ })
+ # keep on bottom to respect og precedence
+ ])
def __init__(self, url=None, html=None, scrape=False, **kwargs):
         # If scrape == True, then will try to fetch missing attributes
# from the page's body
+        # Automatically detect the provider unless the user specifies one
+        # with provider='<provider>' (a key from the providers dict)
+
self.scrape = scrape
self._url = url
for k in kwargs.keys():
self[k] = kwargs[k]
-
+
+ if not self.get('provider'):
+ self.provider = ''
+
dict.__init__(self)
-
+
if url is not None:
self.fetch(url)
-
+
if html is not None:
self.parser(html)
@@ -43,48 +62,61 @@ def __setattr__(self, name, val):
def __getattr__(self, name):
return self[name]
-
+
def fetch(self, url):
"""
"""
raw = urllib2.urlopen(url)
html = raw.read()
return self.parser(html)
-
+
def parser(self, html):
"""
"""
- if not isinstance(html,BeautifulSoup):
+ if not isinstance(html, BeautifulSoup):
doc = BeautifulSoup(html)
else:
doc = html
- ogs = doc.html.head.findAll(property=re.compile(r'^og'))
- for og in ogs:
- if og.has_attr(u'content'):
- self[og[u'property'][3:]]=og[u'content']
- # Couldn't fetch all attrs from og tags, try scraping body
- if not self.is_valid() and self.scrape:
- for attr in self.required_attrs:
- if not hasattr(self, attr):
- try:
- self[attr] = getattr(self, 'scrape_%s' % attr)(doc)
- except AttributeError:
- pass
-
+ for provider, details in self.providers.iteritems():
+ attribute = details.get('attr')
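+            # builds e.g. {'property': re.compile('^og')} or {'name': re.compile('^twitter')}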
+ search = {attribute: re.compile(r'^%s' % (provider))}
+ ogs = doc.html.head.findAll(attrs=search)
+ for og in ogs:
+ if og.has_attr(u'content'):
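+                    # strip the "provider:" prefix, e.g. "og:title" -> "title"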
+ self[og[attribute][len(provider)+1:]] = og[u'content']
+                if not self.provider or provider == 'og':  # respect og precedence
+ self.provider = provider
+ # Couldn't fetch all attrs from og tags, try scraping body
+ if self.scrape and not self.is_valid():
+ for attr in self.providers.get(
+ self.provider, {}
+ ).get('required_attrs', []):
+ if not hasattr(self, attr):
+ try:
+ self[attr] = getattr(self, 'scrape_%s' % attr)(doc)
+ except AttributeError:
+ pass
+ else:
+ self.provider = 'og'
+
def is_valid(self):
- return all([hasattr(self, attr) for attr in self.required_attrs])
-
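+        # validate against the required attrs of whichever provider was detected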
+ return all([
+ hasattr(self, attr) for attr in self.providers.get(
+ self.provider, {}
+ ).get('required_attrs', [])
+ ])
+
def to_html(self):
if not self.is_valid():
- return u""
-
+ return u"" % (self.provider, self.provider)
+
meta = u""
- for key,value in self.iteritems():
- meta += u"\n" %(key, value)
+ for key, value in self.iteritems():
+ meta += u"\n" % (self.provider, key, value)
meta += u"\n"
-
+
return meta
-
+
def to_json(self):
# TODO: force unicode
global import_json
@@ -92,16 +124,18 @@ def to_json(self):
return "{'error':'there isn't json module'}"
if not self.is_valid():
- return json.dumps({'error':'og metadata is not valid'})
-
+ return json.dumps({'error': 'og metadata is not valid'})
+
return json.dumps(self)
-
+
def to_xml(self):
pass
def scrape_image(self, doc):
- images = [dict(img.attrs)['src']
- for img in doc.html.body.findAll('img')]
+ images = [
+ dict(img.attrs)['src']
+ for img in doc.html.body.findAll('img')
+ ]
if images:
return images[0]
@@ -115,4 +149,4 @@ def scrape_type(self, doc):
return 'other'
def scrape_url(self, doc):
- return self._url
\ No newline at end of file
+ return self._url
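
A minimal usage sketch of the new provider detection (hypothetical, invented HTML; assumes the module is importable as opengraph and runs under Python 2, matching the code above):

    # Hypothetical example; the page content is made up for illustration.
    from opengraph import OpenGraph

    html = u'''<html><head>
    <meta property="og:title" content="Example" />
    <meta property="og:type" content="website" />
    <meta property="og:image" content="http://example.com/a.png" />
    <meta property="og:url" content="http://example.com/" />
    </head><body></body></html>'''

    page = OpenGraph(html=html)
    print page.provider    # 'og', detected from the og-prefixed meta tags
    print page.is_valid()  # True: title, type, image and url are all set

A page exposing only Twitter Card tags (meta name="twitter:...") would match the
'twitter' entry instead, and is_valid() would check title, card and image.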