From 2a1ad696fdbee0ebcff43a1a7cdc3791ea26413c Mon Sep 17 00:00:00 2001
From: kanarinka <kanarinka@gmail.com>
Date: Tue, 3 Dec 2013 21:20:41 -0500
Subject: [PATCH 1/3] Add getTitle method and .gitignore

---
 .gitignore                         | 8 ++++++++
 src/boilerpipe/extract/__init__.py | 3 +++
 2 files changed, 11 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3444199
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+
+boilerpipe-1.2.0-bin.tar.gz
+
+*.jar
+
+build/lib/boilerpipe/__init__.py
+
+build/lib/boilerpipe/extract/__init__.py
diff --git a/src/boilerpipe/extract/__init__.py b/src/boilerpipe/extract/__init__.py
index c427209..31938db 100644
--- a/src/boilerpipe/extract/__init__.py
+++ b/src/boilerpipe/extract/__init__.py
@@ -68,6 +68,9 @@ def getText(self):
     def getHTML(self):
         highlighter = HTMLHighlighter.newExtractingInstance()
         return highlighter.process(self.source, self.data)
+
+    def getTitle(self):
+        return self.source.getTitle()
     
     def getImages(self):
         extractor = jpype.JClass(

From 22ced106f29245c24f3a9580b36eccb59e136964 Mon Sep 17 00:00:00 2001
From: kanarinka <kanarinka@gmail.com>
Date: Tue, 7 Jan 2014 16:54:38 -0500
Subject: [PATCH 2/3] Revised to use the Requests library instead of urllib2

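urllib2 wasn't handling redirects properly, among other things. This also
drops the manual charset detection (Content-Type header with a charade
fallback) and relies on requests' response.text for decoding.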
---
 src/boilerpipe/extract/__init__.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/src/boilerpipe/extract/__init__.py b/src/boilerpipe/extract/__init__.py
index 31938db..72aaf36 100644
--- a/src/boilerpipe/extract/__init__.py
+++ b/src/boilerpipe/extract/__init__.py
@@ -1,5 +1,5 @@
 import jpype
-import urllib2
+import requests
 import socket
 import charade
 import threading
@@ -29,16 +29,12 @@ class Extractor(object):
     source    = None
     data      = None
     headers   = {'User-Agent': 'Mozilla/5.0'}
-    
+
     def __init__(self, extractor='DefaultExtractor', **kwargs):
         if kwargs.get('url'):
-            request     = urllib2.Request(kwargs['url'], headers=self.headers)
-            connection  = urllib2.urlopen(request)
-            self.data   = connection.read()
-            encoding    = connection.headers['content-type'].lower().split('charset=')[-1]
-            if encoding.lower() == 'text/html':
-                encoding = charade.detect(self.data)['encoding']
-            self.data = unicode(self.data, encoding)
+            response = requests.request('GET', kwargs['url'], headers=self.headers)
+            self.data = response.text
+
         elif kwargs.get('html'):
             self.data = kwargs['html']
             if not isinstance(self.data, unicode):

From c835751bd6e1d6fb149f18218972958697986e98 Mon Sep 17 00:00:00 2001
From: kanarinka <kanarinka@gmail.com>
Date: Thu, 9 Jan 2014 14:44:13 -0500
Subject: [PATCH 3/3] Add 10-second timeout so a stalled request times out
 instead of hanging

Also document getTitle() in the README usage example.
---
 README.rst                         | 2 ++
 src/boilerpipe/extract/__init__.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.rst b/README.rst
index 329fc98..b3e9c3c 100644
--- a/README.rst
+++ b/README.rst
@@ -43,4 +43,6 @@ Then, to extract relevant content:
 	
 	extracted_html = extractor.getHTML()
 
+	extracted_title = extractor.getTitle()
+
 .. _Boilerpipe: http://code.google.com/p/boilerpipe/ 
diff --git a/src/boilerpipe/extract/__init__.py b/src/boilerpipe/extract/__init__.py
index 72aaf36..282180c 100644
--- a/src/boilerpipe/extract/__init__.py
+++ b/src/boilerpipe/extract/__init__.py
@@ -32,7 +32,7 @@ class Extractor(object):
 
     def __init__(self, extractor='DefaultExtractor', **kwargs):
         if kwargs.get('url'):
-            response = requests.request('GET', kwargs['url'], headers=self.headers)
+            response = requests.request('GET', kwargs['url'], headers=self.headers, timeout=10)
             self.data = response.text
 
         elif kwargs.get('html'):