opengraph/opengraph.py: 102 changes (68 additions, 34 deletions)

@@ -14,27 +14,46 @@
except ImportError:
import_json = False

+from collections import OrderedDict


class OpenGraph(dict):
"""
"""

-    required_attrs = ['title', 'type', 'image', 'url']
+    providers = OrderedDict([
+        ('twitter', {
+            'attr': 'name',
+            'required_attrs': ['title', 'card', 'image']
+        }),
+        ('og', {
+            'attr': 'property',
+            'required_attrs': ['title', 'type', 'image', 'url']
+        })
+        # keep on bottom to respect og precedence
+    ])

def __init__(self, url=None, html=None, scrape=False, **kwargs):
        # If scrape == True, then we will try to fetch missing attributes
# from the page's body

+        # Automatically detect the provider unless the user passes
+        # provider='<name>' for one of the keys in the providers dict

self.scrape = scrape
self._url = url

for k in kwargs.keys():
self[k] = kwargs[k]


+        if not self.get('provider'):
+            self.provider = ''

dict.__init__(self)

if url is not None:
self.fetch(url)

if html is not None:
self.parser(html)
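
The providers registry added above drives detection in parser() below: entries are scanned in insertion order, and og deliberately sits last so it can override an earlier match. A minimal sketch of that precedence rule, lifted out of the class and using hypothetical match results, just to make the OrderedDict ordering visible:

    from collections import OrderedDict

    providers = OrderedDict([
        ('twitter', {'attr': 'name'}),
        ('og', {'attr': 'property'}),  # kept last so og wins when both match
    ])

    detected = ''
    for provider in providers:
        matched = True  # hypothetical: assume tags for this provider were found
        if matched and (not detected or provider == 'og'):
            detected = provider

    print(detected)  # -> 'og', even though 'twitter' matched first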

@@ -43,65 +62,80 @@ def __setattr__(self, name, val):

def __getattr__(self, name):
return self[name]

def fetch(self, url):
"""
"""
raw = urllib2.urlopen(url)
html = raw.read()
return self.parser(html)

def parser(self, html):
"""
"""
-        if not isinstance(html,BeautifulSoup):
+        if not isinstance(html, BeautifulSoup):
doc = BeautifulSoup(html)
else:
doc = html
-        ogs = doc.html.head.findAll(property=re.compile(r'^og'))
-        for og in ogs:
-            if og.has_attr(u'content'):
-                self[og[u'property'][3:]]=og[u'content']
-        # Couldn't fetch all attrs from og tags, try scraping body
-        if not self.is_valid() and self.scrape:
-            for attr in self.required_attrs:
-                if not hasattr(self, attr):
-                    try:
-                        self[attr] = getattr(self, 'scrape_%s' % attr)(doc)
-                    except AttributeError:
-                        pass
+        for provider, details in self.providers.iteritems():
+            attribute = details.get('attr')
+            search = {attribute: re.compile(r'^%s' % (provider))}
+            ogs = doc.html.head.findAll(attrs=search)
+            for og in ogs:
+                if og.has_attr(u'content'):
+                    self[og[attribute][len(provider)+1:]] = og[u'content']
+                if not self.provider or provider == 'og':  # respect og
+                    self.provider = provider
+        # Couldn't fetch all attrs from the provider's tags, try scraping body
+        if self.scrape and not self.is_valid():
+            for attr in self.providers.get(
+                self.provider, {}
+            ).get('required_attrs', []):
+                if not hasattr(self, attr):
+                    try:
+                        self[attr] = getattr(self, 'scrape_%s' % attr)(doc)
+                    except AttributeError:
+                        pass
+        else:
+            self.provider = 'og'

def is_valid(self):
-        return all([hasattr(self, attr) for attr in self.required_attrs])
+        return all([
+            hasattr(self, attr) for attr in self.providers.get(
+                self.provider, {}
+            ).get('required_attrs', [])
+        ])

def to_html(self):
if not self.is_valid():
return u"<meta property=\"og:error\" content=\"og metadata is not valid\" />"
return u"<meta property=\"%s:error\" content=\"%s metadata is not valid\" />" % (self.provider, self.provider)

meta = u""
-        for key,value in self.iteritems():
-            meta += u"\n<meta property=\"og:%s\" content=\"%s\" />" %(key, value)
+        for key, value in self.iteritems():
+            meta += u"\n<meta property=\"%s:%s\" content=\"%s\" />" % (self.provider, key, value)
meta += u"\n"

return meta

def to_json(self):
# TODO: force unicode
global import_json
if not import_json:
return "{'error':'there isn't json module'}"

if not self.is_valid():
-            return json.dumps({'error':'og metadata is not valid'})
+            return json.dumps({'error': 'og metadata is not valid'})

return json.dumps(self)

def to_xml(self):
pass

def scrape_image(self, doc):
-        images = [dict(img.attrs)['src']
-            for img in doc.html.body.findAll('img')]
+        images = [
+            dict(img.attrs)['src']
+            for img in doc.html.body.findAll('img')
+        ]

if images:
return images[0]
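
The scrape_* helpers only run when scrape=True and the chosen provider's required attributes are incomplete; scrape_image, for instance, falls back to the first <img> in the body. A hedged usage sketch with a hypothetical URL, assuming the package exposes OpenGraph at the top level:

    from opengraph import OpenGraph

    # hypothetical page that declares og:title, og:type and og:url
    # but no og:image
    page = OpenGraph(url='http://example.com/article', scrape=True)

    # the og tags alone were not valid, so parser() fell back to
    # scrape_image() and stored the first <img> src found in the body
    print(page.image)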
@@ -115,4 +149,4 @@ def scrape_type(self, doc):
return 'other'

def scrape_url(self, doc):
-        return self._url
\ No newline at end of file
+        return self._url
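
Taken together, the changes let the same entry point handle either tag family. A minimal end-to-end sketch, assuming the package exposes OpenGraph at the top level (use opengraph.opengraph otherwise):

    from opengraph import OpenGraph

    html = u"""
    <html><head>
    <meta property="og:title" content="Example" />
    <meta property="og:type" content="website" />
    <meta property="og:image" content="http://example.com/img.png" />
    <meta property="og:url" content="http://example.com/" />
    </head><body></body></html>
    """

    page = OpenGraph(html=html)
    print(page.provider)    # -> 'og'
    print(page.is_valid())  # -> True
    print(page.to_html())   # re-emits the metadata with the og: prefix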