Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ python-boilerpipe

A python wrapper for Boilerpipe_, an excellent Java library for boilerplate removal and fulltext extraction from HTML pages.



Configuration
=============

Expand Down Expand Up @@ -36,6 +38,8 @@ If no extractor is passed the DefaultExtractor will be used by default. Addition

from boilerpipe.extract import Extractor
extractor = Extractor(extractor='ArticleExtractor', url=your_url)

You can choose ``url``, ``html`` or ``file`` as the second argument.

Then, to extract relevant content:

Expand All @@ -46,3 +50,4 @@ Then, to extract relevant content:
extracted_html = extractor.getHTML()

.. _Boilerpipe: http://code.google.com/p/boilerpipe/

4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ def download_jars(datapath, version=boilerpipe_version):
],
author='Misja Hoebe',
author_email='misja.hoebe@gmail.com',
maintainer = 'Matthew Russell',
maintainer_email = 'ptwobrussell@gmail.com',
    maintainer = 'Matthew Russell, CaiMujia',
    maintainer_email = 'ptwobrussell@gmail.com, caimujia@gmail.com',
url = 'https://github.com/ptwobrussell/python-boilerpipe/',
classifiers=[
'Development Status :: 5 - Production/Stable',
Expand Down
64 changes: 40 additions & 24 deletions src/boilerpipe/extract/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import socket
import charade
import threading
import re

socket.setdefaulttimeout(15)
lock = threading.Lock()
Expand All @@ -18,7 +19,8 @@ class Extractor(object):
being one of the boilerpipe extractors:
- DefaultExtractor
- ArticleExtractor
- ArticleSentencesExtractor
- ArticleSentencesExtractor

- KeepEverythingExtractor
- KeepEverythingWithMinKWordsExtractor
- LargestContentExtractor
Expand All @@ -29,7 +31,7 @@ class Extractor(object):
source = None
data = None
headers = {'User-Agent': 'Mozilla/5.0'}

def __init__(self, extractor='DefaultExtractor', **kwargs):
if kwargs.get('url'):
request = urllib2.Request(kwargs['url'], headers=self.headers)
Expand All @@ -38,11 +40,34 @@ def __init__(self, extractor='DefaultExtractor', **kwargs):
encoding = connection.headers['content-type'].lower().split('charset=')[-1]
if encoding.lower() == 'text/html':
encoding = charade.detect(self.data)['encoding']
self.data = unicode(self.data, encoding)
# self.data = unicode(self.data, 'gbk')
#self.data = self.data.decode(encoding, 'ignore')
try:
self.data = unicode(self.data, charade.detect(self.data)['encoding'])
except UnicodeError:
encoding = charade.detect(self.data)['encoding']
self.data = self.data.decode(encoding, 'ignore')
elif kwargs.get('html'):
self.data = kwargs['html']
if not isinstance(self.data, unicode):
self.data = unicode(self.data, charade.detect(self.data)['encoding'])
# self.data = unicode(self.data, charade.detect(self.data)['encoding'])
try:
self.data = unicode(self.data, charade.detect(self.data)['encoding'])
except UnicodeError:
encoding = charade.detect(self.data)['encoding']
self.data = self.data.decode(encoding, 'ignore')
## Extractor(extractor='ArticleExtractor',file='/tmp/a.html')
elif kwargs.get('file'):
Path = kwargs['file']
f = open(Path, 'r')
self.data = f.read()
if not isinstance(self.data, unicode):
try:
self.data = unicode(self.data, charade.detect(self.data)['encoding'])
except UnicodeError:
encoding = charade.detect(self.data)['encoding']
self.data = self.data.decode(encoding, 'ignore')

else:
raise Exception('No text or url provided')

Expand All @@ -52,35 +77,26 @@ def __init__(self, extractor='DefaultExtractor', **kwargs):
if jpype.isThreadAttachedToJVM() == False:
jpype.attachThreadToJVM()
lock.acquire()

self.extractor = jpype.JClass(
"de.l3s.boilerpipe.extractors."+extractor).INSTANCE
finally:
lock.release()

reader = StringReader(self.data)
self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
self.extractor.process(self.source)

def getText(self):
    """Return the extracted main text content of the processed document."""
    document = self.source
    return document.getContent()


def getTitle(self):
    """Return the title detected for the processed document."""
    document = self.source
    return document.getTitle()

def getHTML(self):
    """Return the extracted content as HTML markup.

    Uses boilerpipe's extracting HTMLHighlighter against the parsed
    document and the original raw markup in ``self.data``.
    """
    return HTMLHighlighter.newExtractingInstance().process(self.source, self.data)

def getImages(self):
    """Return the images found in the document, sorted by the Java side.

    Each image is reported as a dict with the keys ``src``, ``width``,
    ``height``, ``alt`` and ``area``, taken from boilerpipe's
    ImageExtractor results.
    """
    image_extractor = jpype.JClass(
        "de.l3s.boilerpipe.sax.ImageExtractor").INSTANCE
    found = image_extractor.process(self.source, self.data)
    # Sorting happens on the Java collection before conversion.
    jpype.java.util.Collections.sort(found)
    results = []
    for img in found:
        results.append({
            'src'   : img.getSrc(),
            'width' : img.getWidth(),
            'height': img.getHeight(),
            'alt'   : img.getAlt(),
            'area'  : img.getArea()
        })
    return results

def getDate(self):
    """Return the first timestamp found in the raw document text.

    Searches ``self.data`` for a datetime of the form
    ``YYYY-MM-DD HH:MM:SS`` and returns the matched string, or
    ``None`` when the document contains no such timestamp.

    NOTE(review): the year range is intentionally kept at 1970-2019
    to preserve the original pattern's matching scope.
    """
    # Raw-string pattern; hour range fixed to 00-23 (the original
    # pattern's "2[0-4]" wrongly accepted the invalid hour 24).
    pattern = (r'(19[7-9][0-9]|20[0-1][0-9])-'
               r'(0[1-9]|1[0-2])-'
               r'([1-2][0-9]|0[1-9]|3[0-1]) '
               r'([0-1][0-9]|2[0-3]):'
               r'([0-5][0-9]):'
               r'([0-5][0-9])')
    match = re.search(pattern, self.data)
    # Return None instead of crashing with AttributeError when the
    # document has no timestamp, so callers can test the result.
    return match.group() if match else None