diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3444199
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+
+boilerpipe-1.2.0-bin.tar.gz
+
+*.jar
+
+build/lib/boilerpipe/__init__.py
+
+build/lib/boilerpipe/extract/__init__.py
diff --git a/README.rst b/README.rst
index 329fc98..b3e9c3c 100644
--- a/README.rst
+++ b/README.rst
@@ -43,4 +43,6 @@ Then, to extract relevant content:
 
     extracted_html = extractor.getHTML()
 
+    extracted_title = extractor.getTitle()
+
 .. _Boilerpipe: http://code.google.com/p/boilerpipe/
diff --git a/src/boilerpipe/extract/__init__.py b/src/boilerpipe/extract/__init__.py
--- a/src/boilerpipe/extract/__init__.py
+++ b/src/boilerpipe/extract/__init__.py
@@ -1,5 +1,5 @@
 import jpype
-import urllib2
+import requests
 import socket
 import charade
 import threading
@@ -29,16 +29,19 @@ class Extractor(object):
     source = None
     data = None
     headers = {'User-Agent': 'Mozilla/5.0'}
 
     def __init__(self, extractor='DefaultExtractor', **kwargs):
         if kwargs.get('url'):
-            request = urllib2.Request(kwargs['url'], headers=self.headers)
-            connection = urllib2.urlopen(request)
-            self.data = connection.read()
-            encoding = connection.headers['content-type'].lower().split('charset=')[-1]
-            if encoding.lower() == 'text/html':
-                encoding = charade.detect(self.data)['encoding']
-            self.data = unicode(self.data, encoding)
+            response = requests.request('GET', kwargs['url'], headers=self.headers, timeout=10)
+            # urlopen raised on 4xx/5xx responses; requests does not, so fail
+            # loudly instead of extracting text from an error page.
+            response.raise_for_status()
+            # With no declared charset requests assumes ISO-8859-1 for text/*;
+            # sniff the raw bytes instead, as the urllib2 code did.
+            if 'charset=' not in response.headers.get('content-type', '').lower():
+                response.encoding = charade.detect(response.content)['encoding']
+            self.data = response.text
+
         elif kwargs.get('html'):
             self.data = kwargs['html']
             if not isinstance(self.data, unicode):
@@ -68,6 +71,10 @@ def getText(self):
     def getHTML(self):
         highlighter = HTMLHighlighter.newExtractingInstance()
         return highlighter.process(self.source, self.data)
+
+    def getTitle(self):
+        """Return whatever ``self.source.getTitle()`` reports (presumably the page title)."""
+        return self.source.getTitle()
 
     def getImages(self):
         extractor = jpype.JClass(