From 2a1ad696fdbee0ebcff43a1a7cdc3791ea26413c Mon Sep 17 00:00:00 2001 From: kanarinka Date: Tue, 3 Dec 2013 21:20:41 -0500 Subject: [PATCH 1/3] added getTitle function --- .gitignore | 8 ++++++++ src/boilerpipe/extract/__init__.py | 3 +++ 2 files changed, 11 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3444199 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ + +boilerpipe-1.2.0-bin.tar.gz + +*.jar + +build/lib/boilerpipe/__init__.py + +build/lib/boilerpipe/extract/__init__.py diff --git a/src/boilerpipe/extract/__init__.py b/src/boilerpipe/extract/__init__.py index c427209..31938db 100644 --- a/src/boilerpipe/extract/__init__.py +++ b/src/boilerpipe/extract/__init__.py @@ -68,6 +68,9 @@ def getText(self): def getHTML(self): highlighter = HTMLHighlighter.newExtractingInstance() return highlighter.process(self.source, self.data) + + def getTitle(self): + return self.source.getTitle() def getImages(self): extractor = jpype.JClass( From 22ced106f29245c24f3a9580b36eccb59e136964 Mon Sep 17 00:00:00 2001 From: kanarinka Date: Tue, 7 Jan 2014 16:54:38 -0500 Subject: [PATCH 2/3] Revised to use the Requests library instead of urllib2 urllib2 wasn't handling redirects properly, among other things --- src/boilerpipe/extract/__init__.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/boilerpipe/extract/__init__.py b/src/boilerpipe/extract/__init__.py index 31938db..72aaf36 100644 --- a/src/boilerpipe/extract/__init__.py +++ b/src/boilerpipe/extract/__init__.py @@ -1,5 +1,5 @@ import jpype -import urllib2 +import requests import socket import charade import threading @@ -29,16 +29,12 @@ class Extractor(object): source = None data = None headers = {'User-Agent': 'Mozilla/5.0'} - + def __init__(self, extractor='DefaultExtractor', **kwargs): if kwargs.get('url'): - request = urllib2.Request(kwargs['url'], headers=self.headers) - connection = urllib2.urlopen(request) - self.data = connection.read() - encoding = connection.headers['content-type'].lower().split('charset=')[-1] - if encoding.lower() == 'text/html': - encoding = charade.detect(self.data)['encoding'] - self.data = unicode(self.data, encoding) + response = requests.request('GET', kwargs['url'], headers=self.headers) + self.data = response.text + elif kwargs.get('html'): self.data = kwargs['html'] if not isinstance(self.data, unicode): From c835751bd6e1d6fb149f18218972958697986e98 Mon Sep 17 00:00:00 2001 From: kanarinka Date: Thu, 9 Jan 2014 14:44:13 -0500 Subject: [PATCH 3/3] Included timeout so that request will close connection instead of hanging --- README.rst | 2 ++ src/boilerpipe/extract/__init__.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 329fc98..b3e9c3c 100644 --- a/README.rst +++ b/README.rst @@ -43,4 +43,6 @@ Then, to extract relevant content: extracted_html = extractor.getHTML() + extracted_title = extractor.getTitle() + .. _Boilerpipe: http://code.google.com/p/boilerpipe/ diff --git a/src/boilerpipe/extract/__init__.py b/src/boilerpipe/extract/__init__.py index 72aaf36..282180c 100644 --- a/src/boilerpipe/extract/__init__.py +++ b/src/boilerpipe/extract/__init__.py @@ -32,7 +32,7 @@ class Extractor(object): def __init__(self, extractor='DefaultExtractor', **kwargs): if kwargs.get('url'): - response = requests.request('GET', kwargs['url'], headers=self.headers) + response = requests.request('GET', kwargs['url'], headers=self.headers, timeout=10) self.data = response.text elif kwargs.get('html'):