diff --git a/src/boilerpipe/extract/__init__.py b/src/boilerpipe/extract/__init__.py index c427209..f1a8a4a 100644 --- a/src/boilerpipe/extract/__init__.py +++ b/src/boilerpipe/extract/__init__.py @@ -1,5 +1,6 @@ import jpype import urllib2 +import requests import socket import charade import threading @@ -28,17 +29,12 @@ class Extractor(object): extractor = None source = None data = None - headers = {'User-Agent': 'Mozilla/5.0'} + headers = {'User-Agent': 'Mozilla'} def __init__(self, extractor='DefaultExtractor', **kwargs): if kwargs.get('url'): - request = urllib2.Request(kwargs['url'], headers=self.headers) - connection = urllib2.urlopen(request) - self.data = connection.read() - encoding = connection.headers['content-type'].lower().split('charset=')[-1] - if encoding.lower() == 'text/html': - encoding = charade.detect(self.data)['encoding'] - self.data = unicode(self.data, encoding) + response = requests.request('GET', kwargs['url'], headers=self.headers) + self.data = response.text elif kwargs.get('html'): self.data = kwargs['html'] if not isinstance(self.data, unicode):