2mh · lfurrer · Feb 25, 2015 · Feb 27, 2015 · Feb 27, 2015 · Mar 3, 2015
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+*.pyc
+src/output_bioc.xml
+src/output_bioc.json
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -1,3 +1,33 @@
+2.0
+---
+New reader and writer for BioC JSON, based on the converter by Don Comeau (https://github.com/ncbi-nlp/BioC-JSON).
+Renamed the existing, XML-based BioCReader and BioCWriter to BioCXMLReader and BioCXMLWriter, respectively.
+
+1.02.4
+------
+New method `BioCWriter.iterfragments()`: Iterate over serialised XML fragments,
+thus avoiding in-memory construction of the complete tree.
+
+1.02.3
+------
+Make `BioCWriter.tostring()` a public method.
+Avoid rebuilding the whole structure for every call to `__str__()`.
+
+1.02.2
+------
+Various minor bugfixes
+
+1.02.1
+------
+Approach PEP8, especially by making implicit relative import explicit.
+Changes by Tilia Ellendorff <ellendorff@ifi.uzh.ch>,
+Max Nanis <max@maxnanis.com>, Lenz Furrer <furrer@cl.uzh.ch>.
+
+1.02
+----
+Don't assume text element in passage must be available.
+(Thanks to Adrian van der Lek <avdl@gmx.ch>.)
+
 1.01
 ----
 Fix invalid handling of id attributes for annotation and relation tags.

diff --git a/README.md b/README.md
@@ -0,0 +1,65 @@
+# PyBioC
+
+**[PyBioC][1] is a native Python library for reading and writing BioC XML and JSON data.**
+
+More information about BioC is available at [sourceforge][2].
+
+
+## Installation
+
+Use [`pip`][3]:
+
+    pip install git+https://github.com/OntoGene/PyBioC.git
+
+For Python 3, you might have to type `pip3`.
+
+
+## Usage
+
+Two example programs, `test_read+write.py` and `stemming.py`, are shipped in the `src/` folder.
+
+- `test_read+write.py` shows the very
+basic reading and writing capability
+of the library.
+- `stemming.py` uses the Python Natural
+Language Toolkit (NLTK) library to
+process a previously read BioC XML
+file: it tokenizes the corresponding
+text, stems the tokens and transforms
+the manipulated PyBioC objects back
+into valid BioC XML format.
+
+
+## Example
+
+### Generate BioC object for export
+
+```python
+from bioc import BioCXMLWriter, BioCCollection, BioCDocument, BioCPassage
+
+writer = BioCXMLWriter()
+writer.collection = BioCCollection()
+collection = writer.collection
+collection.date = '20150301'
+collection.source = 'ngy1 corpus'
+
+document = BioCDocument()
+document.id = '123456'  # pubmed id
+
+passage = BioCPassage()
+passage.put_infon('type', 'paragraph')
+passage.offset = '0'
+passage.text = 'This is a biomedical sentence about various rare diseases.'
+document.add_passage(passage)
+
+collection.add_document(document)
+
+print(writer)
+```
+
+
+
+[1]: https://github.com/OntoGene/PyBioC
+[2]: http://bioc.sourceforge.net/
+[3]: http://pip.pypa.io/
diff --git a/README.txt b/README.txt
diff --git a/setup.py b/setup.py
@@ -0,0 +1,16 @@
+#!/usr/bin/env python
+# coding: utf8
+
+from distutils.core import setup
+
+with open('README.md') as f:
+    readme = f.read()
+
+setup(name='PyBioC',
+      version='2.0',
+      author='Hernani Marques',
+      author_email='h2m@access.uzh.ch',
+      description='Python library for working with BioC XML data',
+      long_description=readme,
+      packages=['bioc', 'bioc.meta', 'bioc.compat'],
+      package_dir={'bioc': 'src/bioc'})
diff --git a/src/bioc/__init__.py b/src/bioc/__init__.py
@@ -2,23 +2,25 @@
 # Package for interoperability in BioCreative work
 #
 
-__version__ = '1.02'
+__version__ = '1.02.4'
 
 __all__ = [
-    'BioCAnnotation', 'BioCCollection', 'BioCDocument',
-    'BioCLocation', 'BioCNode', 'BioCPassage', 'BioCRelation',
-    'BioCSentence', 'BioCReader', 'BioCWriter'
+    'BioCAnnotation', 'BioCCollection', 'BioCDocument', 'BioCLocation',
+    'BioCNode', 'BioCPassage', 'BioCRelation', 'BioCSentence',
+    'BioCXMLReader', 'BioCJSONReader', 'BioCXMLWriter', 'BioCJSONWriter'
     ]
 
 __author__ = 'Hernani Marques (h2m@access.uzh.ch)'
 
-from bioc_annotation import BioCAnnotation
-from bioc_collection import BioCCollection
-from bioc_document import BioCDocument
-from bioc_location import BioCLocation
-from bioc_node import BioCNode
-from bioc_passage import BioCPassage
-from bioc_relation import BioCRelation
-from bioc_sentence import BioCSentence
-from bioc_reader import BioCReader
-from bioc_writer import BioCWriter
+from .bioc_annotation import BioCAnnotation
+from .bioc_collection import BioCCollection
+from .bioc_document import BioCDocument
+from .bioc_location import BioCLocation
+from .bioc_node import BioCNode
+from .bioc_passage import BioCPassage
+from .bioc_relation import BioCRelation
+from .bioc_sentence import BioCSentence
+from .bioc_reader import BioCXMLReader
+from .bioc_reader import BioCJSONReader
+from .bioc_writer import BioCXMLWriter
+from .bioc_writer import BioCJSONWriter
diff --git a/src/bioc/bioc_annotation.py b/src/bioc/bioc_annotation.py
@@ -1,11 +1,12 @@
 __all__ = ['BioCAnnotation']
 
-from meta import _MetaId, _MetaInfons, _MetaText
+from .meta import _MetaId, _MetaInfons, _MetaText
+
 
 class BioCAnnotation(_MetaId, _MetaInfons, _MetaText):
 
     def __init__(self, annotation=None):
-        
+
         self.id = ''
         self.infons = dict()
         self.locations = list()

diff --git a/src/bioc/bioc_collection.py b/src/bioc/bioc_collection.py
@@ -1,12 +1,13 @@
 __all__ = ['BioCCollection']
 
-from meta import _MetaInfons, _MetaIter
-from compat import _Py2Next
+from .meta import _MetaInfons, _MetaIter
+from .compat import _Py2Next
+
 
 class BioCCollection(_Py2Next, _MetaInfons, _MetaIter):
 
     def __init__(self, collection=None):
-        
+
         self.infons = dict()
         self.source = ''
         self.date = ''
@@ -31,18 +32,18 @@ def __str__(self):
 
     def _iterdata(self):
         return self.documents
-       
+
     def clear_documents(self):
         self.documents = list()
 
     def get_document(self, doc_idx):
-        return self.documents[doc_idx] 
+        return self.documents[doc_idx]
 
     def add_document(self, document):
         self.documents.append(document)
 
     def remove_document(self, document):
-       if type(document) is int:
-           self.dcouments.remove(self.documents[document])
-       else:
-           self.documents.remove(document) # TBC
+        if isinstance(document, int):
+            self.documents.pop(document)
+        else:
+            self.documents.remove(document)  # TBC
diff --git a/src/bioc/bioc_document.py b/src/bioc/bioc_document.py
@@ -1,10 +1,10 @@
 __all__ = ['BioCDocument']
 
-from compat import _Py2Next
-from meta import _MetaId, _MetaInfons, _MetaRelations, _MetaIter
+from .compat import _Py2Next
+from .meta import _MetaId, _MetaInfons, _MetaRelations, _MetaIter
 
-class BioCDocument(_MetaId, _MetaInfons, _MetaRelations, _MetaIter,
-                   _Py2Next):
+
+class BioCDocument(_MetaId, _MetaInfons, _MetaRelations, _MetaIter, _Py2Next):
 
     def __init__(self, document=None):
 
@@ -31,7 +31,7 @@ def _iterdata(self):
         return self.passages
 
     def get_size(self):
-        return self.passages.size() # As in Java BioC
+        return sum(p.size() for p in self.passages)  # As in Java BioC
 
     def clear_passages(self):
         self.passages = list()
@@ -40,7 +40,7 @@ def add_passage(self, passage):
         self.passages.append(passage)
 
     def remove_passage(self, passage):
-        if type(passage) is int:
-            self.passages.remove(self.passages[passage])
+        if isinstance(passage, int):
+            self.passages.pop(passage)
         else:
-            self.passages.remove(passage) # TBC
+            self.passages.remove(passage)  # TBC
diff --git a/src/bioc/bioc_location.py b/src/bioc/bioc_location.py
@@ -1,17 +1,18 @@
 __all__ = ['BioCLocation']
 
-from meta import _MetaOffset
+from .meta import _MetaOffset
+
 
 class BioCLocation(_MetaOffset):
 
     def __init__(self, location=None):
-        
+
         self.offset = '-1'
         self.length = '0'
 
         if location is not None:
-             self.offset = location.offset
-             self.length = location.length 
+            self.offset = location.offset
+            self.length = location.length
 
     def __str__(self):
         s = str(self.offset) + ':' + str(self.length)

diff --git a/src/bioc/bioc_node.py b/src/bioc/bioc_node.py
@@ -1,9 +1,10 @@
 __all__ = ['BioCNode']
 
+
 class BioCNode:
 
     def __init__(self, node=None, refid=None, role=None):
-        
+
         self.refid = ''
         self.role = ''
 
@@ -17,7 +18,7 @@ def __init__(self, node=None, refid=None, role=None):
             self.role = role
 
     def __str__(self):
-         s = 'refid: ' + self.refid + '\n'
-         s += 'role: ' + self.role + '\n'
+        s = 'refid: ' + self.refid + '\n'
+        s += 'role: ' + self.role + '\n'
 
-         return s
+        return s
diff --git a/src/bioc/bioc_passage.py b/src/bioc/bioc_passage.py
@@ -1,13 +1,14 @@
 __all__ = ['BioCPassage']
 
-from meta import _MetaAnnotations, _MetaInfons, _MetaOffset, \
-                 _MetaRelations, _MetaText
+from .meta import _MetaAnnotations, _MetaInfons, _MetaOffset, \
+                  _MetaRelations, _MetaText
 
-class BioCPassage(_MetaAnnotations, _MetaOffset, _MetaText,
-                  _MetaRelations, _MetaInfons):
+
+class BioCPassage(_MetaAnnotations, _MetaOffset, _MetaText, _MetaRelations,
+                  _MetaInfons):
 
     def __init__(self, passage=None):
-        
+
         self.offset = '-1'
         self.text = ''
         self.infons = dict()
@@ -27,20 +28,19 @@ def size(self):
         return len(self.sentences)
 
     def has_sentences(self):
-        if len(self.sentences) > 0:
-            return True
+        return bool(self.sentences)
 
     def add_sentence(self, sentence):
         self.sentences.append(sentence)
 
     def sentences_iterator(self):
-        return self.sentences.iterator() # TBD
+        return self.sentences.iterator()  # TBD
 
     def clear_sentences(self):
-        self.relations = list()
+        self.sentences = list()
 
-    def remove_sentence(self, sentence): # int or obj
-        if type(sentence) is int:
-            self.sentences.remove(self.sentences[sentence])
+    def remove_sentence(self, sentence):  # int or obj
+        if isinstance(sentence, int):
+            self.sentences.pop(sentence)
         else:
             self.sentences.remove(sentence)