diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4a97d0c --- /dev/null +++ b/.gitignore @@ -0,0 +1,38 @@ +# Distribution / packaging +__pycache__ +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST +PKG-INFO + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# IDEs +.idea +.vscode +*.iml + + +# data downloaded +boilerpipe/data \ No newline at end of file diff --git a/MANIFEST b/MANIFEST deleted file mode 100644 index e339936..0000000 --- a/MANIFEST +++ /dev/null @@ -1,8 +0,0 @@ -# file GENERATED by distutils, do NOT edit -setup.cfg -setup.py -src/boilerpipe/__init__.py -src/boilerpipe/data/boilerpipe-1.2.0/boilerpipe-1.2.0.jar -src/boilerpipe/data/boilerpipe-1.2.0/lib/nekohtml-1.9.13.jar -src/boilerpipe/data/boilerpipe-1.2.0/lib/xerces-2.9.1.jar -src/boilerpipe/extract/__init__.py diff --git a/PKG-INFO b/PKG-INFO deleted file mode 100644 index 2e9e1e7..0000000 --- a/PKG-INFO +++ /dev/null @@ -1,18 +0,0 @@ -Metadata-Version: 1.0 -Name: boilerpipe3 -Version: 1.0 -Summary: A fork of boilerpipe with python 3 with a small fixes, ported from source `https://pypi.python.org/pypi/boilerpipe-py3 -Home-page: https://github.com/slaveofcode/boilerpipe3 -Author: Aditya Kresna Permana -Author-email: zeandcode@gmail.com -License: Apache 2.0 -Description: UNKNOWN -Keywords: boilerpipe, bolierpipe3 -Platform: UNKNOWN -Classifier: Development Status :: 5 - Production/Stable -Classifier: Environment :: Console -Classifier: Intended Audience :: Developers -Classifier: License :: OSI Approved :: Apache Software License -Classifier: Operating System :: OS Independent -Classifier: Programming Language :: Python :: 3.4 -Classifier: Natural Language :: English diff --git a/README.md b/README.md index 29e6d7e..ec071a5 100644 --- a/README.md +++ b/README.md @@ -5,19 +5,8 @@ Installation ============ You can install this lib directly from 
github repository by execute these command - pip install git+ssh://git@github.com/slaveofcode/boilerpipe3@master + pip install git+ssh://git@github.com/derlin/boilerpipe3@master -Or from official pypi - - pip install boilerpipe3 - -Configuration -============= - -Dependencies: -jpype, charade - -The boilerpipe jar files will get fetched and included automatically when building the package. Usage ===== @@ -35,14 +24,30 @@ The constructor takes a keyword argment ``extractor``, being one of the availabl - NumWordsRulesExtractor - CanolaExtractor -If no extractor is passed the DefaultExtractor will be used by default. Additional keyword arguments are either ``html`` for HTML text or ``url``. +If no extractor is passed the DefaultExtractor will be used by default. from boilerpipe.extract import Extractor - extractor = Extractor(extractor='ArticleExtractor', url=your_url) + extractor = Extractor(extractor='ArticleExtractor') + +Once you have an extractor instance, extract relevant content using one of `getText`, `getHTML`, `getTextBlocks`, `getImages`. Each one accepts one of the following arguments: -Then, to extract relevant content: +- `url`: the url of the page +- `html`: an html string to parse +- `processed`: the `(source, data)` returned by the method `get`. 
- extracted_text = extractor.getText() + +Example: + + extracted_text = extractor.getText(url=your_url) - extracted_html = extractor.getHTML() + extracted_html = extractor.getHTML(url=your_url) + +If you need several pieces of information from the same page, you can save some computation time by doing: + + processed = extractor.get(url=your_url) # download and process once + + text = extractor.getText(processed=processed) + text_blocks = extractor.getTextBlocks(processed=processed) + html = extractor.getHTML(processed=processed) + images = extractor.getImages(processed=processed) diff --git a/boilerpipe-1.2.1-bin.tar.gz b/boilerpipe-1.2.1-bin.tar.gz new file mode 100644 index 0000000..98a8edb Binary files /dev/null and b/boilerpipe-1.2.1-bin.tar.gz differ diff --git a/src/boilerpipe/__init__.py b/boilerpipe/__init__.py similarity index 57% rename from src/boilerpipe/__init__.py rename to boilerpipe/__init__.py index 0b0fac0..d2d2708 100644 --- a/src/boilerpipe/__init__.py +++ b/boilerpipe/__init__.py @@ -1,10 +1,13 @@ import os -import imp import jpype +from os import path if jpype.isJVMStarted() != True: jars = [] - for top, dirs, files in os.walk(imp.find_module('boilerpipe')[1]+'/data'): + data_dir = path.join(path.dirname(path.realpath(__file__)), 'data') + for top, dirs, files in os.walk(data_dir): for nm in files: jars.append(os.path.join(top, nm)) jpype.startJVM(jpype.getDefaultJVMPath(), "-Djava.class.path=%s" % os.pathsep.join(jars)) + +from .extractor import Extractor, EXTRACTORS \ No newline at end of file diff --git a/boilerpipe/extractor.py b/boilerpipe/extractor.py new file mode 100644 index 0000000..f4a558c --- /dev/null +++ b/boilerpipe/extractor.py @@ -0,0 +1,156 @@ +import jpype +import socket +import threading + +from bs4 import UnicodeDammit +import requests + +socket.setdefaulttimeout(15) +lock = threading.Lock() + +InputSource = jpype.JClass('org.xml.sax.InputSource') +StringReader = jpype.JClass('java.io.StringReader') +HTMLHighlighter = 
jpype.JClass('de.l3s.boilerpipe.sax.HTMLHighlighter') +BoilerpipeSAXInput = jpype.JClass('de.l3s.boilerpipe.sax.BoilerpipeSAXInput') + +from functools import wraps + +# suppress warning for invalid SSL certificates +requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning) + +#: Headers passed with each request +_DEFAULT_HEADERS = { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36'} + +EXTRACTORS = [ + 'DefaultExtractor', + 'ArticleExtractor', + 'ArticleSentencesExtractor', + 'KeepEverythingWithMinKWordsExtractor', # if used, don't forget to pass the kMin argument to its constructor + 'KeepEverythingExtractor', + 'LargestContentExtractor', + 'NumWordsRulesExtractor', + 'CanolaExtractor' +] + + +def thread_safe(method): + @wraps(method) + def _impl(self, *args, **kwargs): + try: + # make it thread safe, see jpype documentation for more info + if threading.activeCount() > 1: + if jpype.isThreadAttachedToJVM() is False: + jpype.attachThreadToJVM() + # lock.acquire() + return method(self, *args, **kwargs) + finally: + # lock.release() + pass + + return _impl + + +class Extractor(object): + """ + Extract text. Constructor takes 'extractor' as a keyword argument, + being one of the boilerpipe extractors: + - DefaultExtractor + - ArticleExtractor + - ArticleSentencesExtractor + - KeepEverythingExtractor + - KeepEverythingWithMinKWordsExtractor + - LargestContentExtractor + - NumWordsRulesExtractor + - CanolaExtractor + """ + extractor = None + source = None + data = None + headers = {'User-Agent': 'Mozilla/5.0'} + + @thread_safe + def __init__(self, extractor='DefaultExtractor', **kwargs): + if extractor == "KeepEverythingWithMinKWordsExtractor": + kMin = kwargs.get("kMin", 1) # set default to 1 + self.extractor = jpype.JClass( + "de.l3s.boilerpipe.extractors." 
+ extractor)(kMin) + else: + self.extractor = jpype.JClass( + "de.l3s.boilerpipe.extractors." + extractor).INSTANCE + + @thread_safe + def get(self, url=None, html=None): + return self._process(url=url, html=html) + + @thread_safe + def getText(self, url=None, html=None, processed=None): + source, data = self._process(html=html, url=url, processed=processed) + return source.getContent() + + @thread_safe + def getTextBlocks(self, url=None, html=None, processed=None): + source, data = self._process(html=html, url=url, processed=processed) + blocks = source.getTextBlocks() + results = [] + for i in range(blocks.size()): + if blocks[i].isContent(): + results.append(blocks[i].getText()) + return results + + @thread_safe + def getHTML(self, url=None, html=None, processed=None): + source, data = self._process(html=html, url=url, processed=processed) + highlighter = HTMLHighlighter.newExtractingInstance() + return highlighter.process(source, data) + + def getImages(self, url=None, html=None, processed=None): + if processed is not None: + source, data = processed + else: + source, data = self.get(url=url, html=html) + + extractor = jpype.JClass("de.l3s.boilerpipe.sax.ImageExtractor").INSTANCE + images = extractor.process(source, data) + jpype.java.util.Collections.sort(images) + # list comprehension returns + # TypeError: iter() returned non-iterator of type 'java.util.ArrayList$Itr' + # so do it the old way: + results = [] + for i in range(images.size()): + img = images[i] + results.append({ + 'src': img.getSrc(), + 'width': img.getWidth(), + 'height': img.getHeight(), + 'alt': img.getAlt(), + 'area': img.getArea() + }) + return results + + def _process(self, **kwargs): + if kwargs.get('processed'): + source, data = kwargs.get('processed') + return source, data + if kwargs.get('url'): + resp = requests.get(kwargs['url'], verify=False, stream=True, headers=_DEFAULT_HEADERS) + data = self._convert(resp.content) + elif kwargs.get('html'): + data = kwargs['html'] + if not 
isinstance(data, str): + data = self._convert(data) + else: + raise Exception('No text or url provided') + + reader = StringReader(data) + source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument() + self.extractor.process(source) + return source, data + + def _convert(self, content): + converted = UnicodeDammit(content) + if not converted.unicode_markup: + raise UnicodeDecodeError( + "Failed to detect encoding, tried [%s]", ', '.join(converted.tried_encodings)) + # print converted.original_encoding + return converted.unicode_markup diff --git a/dist/boilerpipe3-1.1.tar.gz b/dist/boilerpipe3-1.1.tar.gz deleted file mode 100644 index e7e340f..0000000 Binary files a/dist/boilerpipe3-1.1.tar.gz and /dev/null differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e89cda6 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +charade==1.0.3 +JPype1-py3==0.5.5.2 +numpy==1.14.2 +scikit-learn==0.19.1 +scipy==1.0.1 diff --git a/setup.py b/setup.py index d0f5600..fe63241 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ import tarfile from fnmatch import fnmatch from os.path import basename, exists, dirname, abspath, join -from distutils.core import setup +import setuptools try: from urllib import urlretrieve @@ -9,18 +9,19 @@ from urllib.request import urlretrieve import sys + if sys.version_info[0] < 3: print("This module can only be used with Python 3.") print("For a Python 2 version, see:\nhttps://github.com/misja/python-boilerpipe") sys.exit(1) -__version__ = '1.1' -boilerpipe_version = '1.2.0' -DATAPATH = join(abspath(dirname((__file__))), 'src/boilerpipe/data') +__version__ = '1.2' +boilerpipe_version = '1.2.1' +DATAPATH = join(abspath(dirname((__file__))), 'boilerpipe/data') def download_jars(datapath, version=boilerpipe_version): - tgz_url = 'https://github.com/slaveofcode/boilerpipe3/raw/master/boilerpipe-{0}-bin.tar.gz'.format(version) + tgz_url = 
'https://github.com/derlin/boilerpipe3/raw/master/boilerpipe-{0}-bin.tar.gz'.format(version) tgz_name = basename(tgz_url) if not exists(tgz_name): @@ -33,13 +34,17 @@ def download_jars(datapath, version=boilerpipe_version): continue tar.extract(tarinfo, datapath) + download_jars(datapath=DATAPATH) -setup( +setuptools.setup( name='boilerpipe3', version=__version__, - packages=['boilerpipe', 'boilerpipe.extract'], - package_dir={'': 'src'}, + author='Lucy Linder', + author_email='lucy.derlin@gmail.com', + url='https://github.com/derlin/boilerpipe3', + + packages=['boilerpipe'], package_data={ 'boilerpipe': [ 'data/boilerpipe-{version}/boilerpipe-{version}.jar'.format(version=boilerpipe_version), @@ -48,24 +53,23 @@ def download_jars(datapath, version=boilerpipe_version): }, install_requires=[ 'JPype1-py3', - 'charade', + 'requests', + 'beautifulsoup4', ], - author='Aditya Kresna Permana', - author_email='zeandcode@gmail.com', - maintainer = 'Aditya Kresna Permana', - maintainer_email = 'zeandcode@gmail.com', - url = 'https://github.com/slaveofcode/boilerpipe3', + classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'Environment :: Console', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: Apache Software License', - 'Operating System :: OS Independent', - 'Programming Language :: Python :: 3.4', - 'Natural Language :: English', - ], - keywords='boilerpipe', - license='Apache 2.0', + 'Development Status :: 5 - Production/Stable', + 'Environment :: Console', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3.4', + 'Natural Language :: English', + ], + + keywords='boilerpipe', + license='Apache 2.0', - description='Python interface to Boilerpipe, Boilerplate Removal and Fulltext Extraction from HTML pages with Python 3 support' + description='Python interface to Boilerpipe, Boilerplate Removal and Fulltext Extraction from 
HTML pages with Python 3 support. ' + 'Forked and improved from https://github.com/slaveofcode/boilerpipe3.' ) diff --git a/setup.py~ b/setup.py~ deleted file mode 100644 index f495fa7..0000000 --- a/setup.py~ +++ /dev/null @@ -1,65 +0,0 @@ -import tarfile -from fnmatch import fnmatch -from os.path import basename, exists, dirname, abspath, join -from distutils.core import setup - -try: - from urllib import urlretrieve -except: - from urllib.request import urlretrieve - -import sys -if sys.version_info[0] < 3: - print("This module can only be used with Python 3.") - print("For a Python 2 version, see:\nhttps://github.com/misja/python-boilerpipe") - sys.exit(1) - -__version__ = '1.2' -boilerpipe_version = '1.2.0' -DATAPATH = join(abspath(dirname((__file__))), 'src/boilerpipe/data') - - -def download_jars(datapath, version=boilerpipe_version): - tgz_name = 'boilerpipe-{0}-bin.tar.gz'.format(version) - tar = tarfile.open(tgz_name, mode='r:gz') - for tarinfo in tar.getmembers(): - if not fnmatch(tarinfo.name, '*.jar'): - continue - tar.extract(tarinfo, datapath) - -download_jars(datapath=DATAPATH) - -setup( - name='boilerpipe3', - version=__version__, - packages=['boilerpipe', 'boilerpipe.extract'], - package_dir={'': 'src'}, - package_data={ - 'boilerpipe': [ - 'data/boilerpipe-{version}/boilerpipe-{version}.jar'.format(version=boilerpipe_version), - 'data/boilerpipe-{version}/lib/*.jar'.format(version=boilerpipe_version), - ], - }, - install_requires=[ - 'JPype1-py3', - 'charade', - ], - author='Aditya Kresna Permana', - author_email='zeandcode@gmail.com', - maintainer = 'Aditya Kresna Permana', - maintainer_email = 'zeandcode@gmail.com', - url = 'https://github.com/slaveofcode/boilerpipe3', - classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'Environment :: Console', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: Apache Software License', - 'Operating System :: OS Independent', - 'Programming Language :: Python :: 3.4', - 'Natural 
Language :: English', - ], - keywords='boilerpipe', - license='Apache 2.0', - - description='Python interface to Boilerpipe, Boilerplate Removal and Fulltext Extraction from HTML pages with Python 3 support' -) diff --git a/src/boilerpipe/data/boilerpipe-1.2.0/boilerpipe-1.2.0.jar b/src/boilerpipe/data/boilerpipe-1.2.0/boilerpipe-1.2.0.jar deleted file mode 100644 index bc3cb42..0000000 Binary files a/src/boilerpipe/data/boilerpipe-1.2.0/boilerpipe-1.2.0.jar and /dev/null differ diff --git a/src/boilerpipe/data/boilerpipe-1.2.0/boilerpipe-demo-1.2.0.jar b/src/boilerpipe/data/boilerpipe-1.2.0/boilerpipe-demo-1.2.0.jar deleted file mode 100644 index c36634f..0000000 Binary files a/src/boilerpipe/data/boilerpipe-1.2.0/boilerpipe-demo-1.2.0.jar and /dev/null differ diff --git a/src/boilerpipe/data/boilerpipe-1.2.0/boilerpipe-javadoc-1.2.0.jar b/src/boilerpipe/data/boilerpipe-1.2.0/boilerpipe-javadoc-1.2.0.jar deleted file mode 100644 index 611e1f3..0000000 Binary files a/src/boilerpipe/data/boilerpipe-1.2.0/boilerpipe-javadoc-1.2.0.jar and /dev/null differ diff --git a/src/boilerpipe/data/boilerpipe-1.2.0/boilerpipe-sources-1.2.0.jar b/src/boilerpipe/data/boilerpipe-1.2.0/boilerpipe-sources-1.2.0.jar deleted file mode 100644 index 3f63759..0000000 Binary files a/src/boilerpipe/data/boilerpipe-1.2.0/boilerpipe-sources-1.2.0.jar and /dev/null differ diff --git a/src/boilerpipe/data/boilerpipe-1.2.0/lib/nekohtml-1.9.13.jar b/src/boilerpipe/data/boilerpipe-1.2.0/lib/nekohtml-1.9.13.jar deleted file mode 100644 index a8081d2..0000000 Binary files a/src/boilerpipe/data/boilerpipe-1.2.0/lib/nekohtml-1.9.13.jar and /dev/null differ diff --git a/src/boilerpipe/data/boilerpipe-1.2.0/lib/xerces-2.9.1.jar b/src/boilerpipe/data/boilerpipe-1.2.0/lib/xerces-2.9.1.jar deleted file mode 100644 index 547f563..0000000 Binary files a/src/boilerpipe/data/boilerpipe-1.2.0/lib/xerces-2.9.1.jar and /dev/null differ diff --git a/src/boilerpipe/extract/__init__.py 
b/src/boilerpipe/extract/__init__.py deleted file mode 100644 index 45f40d5..0000000 --- a/src/boilerpipe/extract/__init__.py +++ /dev/null @@ -1,91 +0,0 @@ -import jpype -try: - import urllib.request as urllib2 -except ImportError: - import urllib2 -import socket -import charade -import threading - -socket.setdefaulttimeout(15) -lock = threading.Lock() - -InputSource = jpype.JClass('org.xml.sax.InputSource') -StringReader = jpype.JClass('java.io.StringReader') -HTMLHighlighter = jpype.JClass('de.l3s.boilerpipe.sax.HTMLHighlighter') -BoilerpipeSAXInput = jpype.JClass('de.l3s.boilerpipe.sax.BoilerpipeSAXInput') - -class Extractor(object): - """ - Extract text. Constructor takes 'extractor' as a keyword argument, - being one of the boilerpipe extractors: - - DefaultExtractor - - ArticleExtractor - - ArticleSentencesExtractor - - KeepEverythingExtractor - - KeepEverythingWithMinKWordsExtractor - - LargestContentExtractor - - NumWordsRulesExtractor - - CanolaExtractor - """ - extractor = None - source = None - data = None - headers = {'User-Agent': 'Mozilla/5.0'} - - def __init__(self, extractor='DefaultExtractor', **kwargs): - if kwargs.get('url'): - request = urllib2.Request(kwargs['url'], headers=self.headers) - connection = urllib2.urlopen(request) - self.data = connection.read() - encoding = connection.headers['content-type'].lower().split('charset=')[-1] - if encoding.lower() == 'text/html': - encoding = charade.detect(self.data)['encoding'] - if encoding is None: - encoding = 'utf-8' - self.data = str(self.data, encoding) - elif kwargs.get('html'): - self.data = kwargs['html'] - if not isinstance(self.data, str): - self.data = str(self.data, charade.detect(self.data)['encoding']) - else: - raise Exception('No text or url provided') - - try: - # make it thread-safe - if threading.activeCount() > 1: - if jpype.isThreadAttachedToJVM() == False: - jpype.attachThreadToJVM() - lock.acquire() - - self.extractor = jpype.JClass( - 
"de.l3s.boilerpipe.extractors."+extractor).INSTANCE - finally: - lock.release() - - reader = StringReader(self.data) - self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument() - self.extractor.process(self.source) - - def getText(self): - return self.source.getContent() - - def getHTML(self): - highlighter = HTMLHighlighter.newExtractingInstance() - return highlighter.process(self.source, self.data) - - def getImages(self): - extractor = jpype.JClass( - "de.l3s.boilerpipe.sax.ImageExtractor").INSTANCE - images = extractor.process(self.source, self.data) - jpype.java.util.Collections.sort(images) - images = [ - { - 'src' : image.getSrc(), - 'width' : image.getWidth(), - 'height': image.getHeight(), - 'alt' : image.getAlt(), - 'area' : image.getArea() - } for image in images - ] - return images diff --git a/test.py b/test.py new file mode 100644 index 0000000..e2e226e --- /dev/null +++ b/test.py @@ -0,0 +1,22 @@ +from boilerpipe import Extractor + +urls = [ + "https://stackoverflow.com/questions/20996639/why-does-setup-py-develop-not-work", + "http://www.hyperkommunikation.ch/personen/mani_matter_texte.htm", + "https://www.freevap.ch/fr/ustensiles-diy-flacons-vides/111-flacon-diy-10ml.html", + "https://fr.wikipedia.org/wiki/Codage_des_caract%C3%A8res#Europe" +] + +extractor = Extractor() + +for url in urls: + html = extractor.getHTML(url) + processed = extractor.get(url) + print('%s : %d chars, html:%d=url:%d=proc:%d text blocks, %d images.' % ( + url, + len(extractor.getText(processed=processed)), + len(extractor.getTextBlocks(html=html)), + len(extractor.getTextBlocks(url=url)), + len(extractor.getTextBlocks(processed=processed)), + len(extractor.getImages(processed=processed)) + ))