Skip to content
12 changes: 4 additions & 8 deletions src/boilerpipe/extract/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import jpype
import urllib2
import requests
import socket
import charade
import threading
Expand Down Expand Up @@ -28,17 +29,12 @@ class Extractor(object):
extractor = None
source = None
data = None
headers = {'User-Agent': 'Mozilla/5.0'}
headers = {'User-Agent': 'Mozilla'}

def __init__(self, extractor='DefaultExtractor', **kwargs):
if kwargs.get('url'):
request = urllib2.Request(kwargs['url'], headers=self.headers)
connection = urllib2.urlopen(request)
self.data = connection.read()
encoding = connection.headers['content-type'].lower().split('charset=')[-1]
if encoding.lower() == 'text/html':
encoding = charade.detect(self.data)['encoding']
self.data = unicode(self.data, encoding)
response = requests.request('GET', kwargs['url'], headers=self.headers)
self.data = response.text
elif kwargs.get('html'):
self.data = kwargs['html']
if not isinstance(self.data, unicode):
Expand Down