diff --git a/src/boilerpipe/extract/__init__.py b/src/boilerpipe/extract/__init__.py index 0f8a2a5..fb225f0 100644 --- a/src/boilerpipe/extract/__init__.py +++ b/src/boilerpipe/extract/__init__.py @@ -41,6 +41,16 @@ def __init__(self, extractor='DefaultExtractor', **kwargs): encoding = connection.headers['content-type'].lower().split('charset=')[-1] if encoding.lower() == 'text/html': encoding = chardet.detect(self.data)['encoding'] + try: + import gzip + import StringIO + data = StringIO.StringIO(self.data) + gzipper = gzip.GzipFile(fileobj=data) + self.data = gzipper.read() + #self.data = gzip.decompress(self.data) + except Exception as inst: + #print inst + pass try: self.data = unicode(self.data, encoding) except NameError: