Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
*.pyc
src/output_bioc.xml
src/output_bioc.json
30 changes: 30 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,33 @@
2.0
---
New reader and writer for BioC JSON, based on the converter by Don Comeau (https://github.com/ncbi-nlp/BioC-JSON).
Renamed the existing, XML-based BioCReader and BioCWriter to BioCXMLReader and BioCXMLWriter, respectively.

1.02.4
------
New method `BioCWriter.iterfragments()`: Iterate over serialised XML fragments,
thus avoiding in-memory construction of the complete tree.

1.02.3
------
Make `BioCWriter.tostring()` a public method.
Avoid rebuilding the whole structure for every call to `__str__()`.

1.02.2
------
Various minor bugfixes

1.02.1
------
Move closer to PEP 8, especially by making implicit relative imports explicit.
Changes by Tilia Ellendorff <ellendorff@ifi.uzh.ch>,
Max Nanis <max@maxnanis.com>, Lenz Furrer <furrer@cl.uzh.ch>.

1.02
----
Don't assume that the text element in a passage must be available.
(Thanks to Adrian van der Lek <avdl@gmx.ch>.)

1.01
----
Fix invalid handling of id attributes for annotation and relation tags.
Expand Down
65 changes: 65 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# PyBioC

**[PyBioC][1] is a native Python library for reading and writing BioC XML and JSON data.**

More information about BioC is available at [sourceforge][2].


## Installation

Use [`pip`][3]:

pip install git+https://github.com/OntoGene/PyBioC.git

For Python 3, you might have to type `pip3`.


## Usage

Two example programs, `test_read+write.py` and `stemming.py`, are shipped in the `src/` folder.

- `test_read+write.py` shows the very
basic reading and writing capability
of the library.
- `stemming.py` uses the Python Natural
Language Toolkit (NLTK) library to
manipulate a BioC XML file read in
before; it then tokenizes the
corresponding text, does stemming on
the tokens and transforms the
manipulated PyBioC objects back to
valid BioC XML format.


## Example

### Generate BioC object for export

```python
from bioc import BioCXMLWriter, BioCCollection, BioCDocument, BioCPassage

writer = BioCXMLWriter()
writer.collection = BioCCollection()
collection = writer.collection
collection.date = '20150301'
collection.source = 'ngy1 corpus'

document = BioCDocument()
document.id = '123456' # pubmed id

passage = BioCPassage()
passage.put_infon('type', 'paragraph')
passage.offset = '0'
passage.text = 'This is a biomedical sentence about various rare diseases.'
document.add_passage(passage)

collection.add_document(document)

print(writer)
```



[1]: https://github.com/OntoGene/PyBioC
[2]: http://bioc.sourceforge.net/
[3]: http://pip.pypa.io/
15 changes: 0 additions & 15 deletions README.txt

This file was deleted.

16 changes: 16 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env python
# coding: utf8

"""Installation script for the PyBioC package.

Run from the project root (the directory containing ``README.md`` and
``src/``), since both paths below are relative to the working directory.
"""

import io
from distutils.core import setup

# Read the long description explicitly as UTF-8.  The built-in open() would
# decode with the locale's default encoding on Python 3, whereas io.open()
# accepts the `encoding` argument on both Python 2 and Python 3.
with io.open('README.md', encoding='utf-8') as f:
    readme = f.read()

setup(name='PyBioC',
      version='2.0',
      author='Hernani Marques',
      author_email='h2m@access.uzh.ch',
      description='Python library for working with BioC XML data',
      long_description=readme,
      packages=['bioc', 'bioc.meta', 'bioc.compat'],
      # The importable package lives under src/, not at the project root.
      package_dir={'bioc': 'src/bioc'})
30 changes: 16 additions & 14 deletions src/bioc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,25 @@
# Package for interoperability in BioCreative work
#

__version__ = '1.02'
__version__ = '1.02.4'

__all__ = [
'BioCAnnotation', 'BioCCollection', 'BioCDocument',
'BioCLocation', 'BioCNode', 'BioCPassage', 'BioCRelation',
'BioCSentence', 'BioCReader', 'BioCWriter'
'BioCAnnotation', 'BioCCollection', 'BioCDocument', 'BioCLocation',
'BioCNode', 'BioCPassage', 'BioCRelation', 'BioCSentence',
'BioCXMLReader', 'BioCJSONReader', 'BioCXMLWriter', 'BioCJSONWriter'
]

__author__ = 'Hernani Marques (h2m@access.uzh.ch)'

from bioc_annotation import BioCAnnotation
from bioc_collection import BioCCollection
from bioc_document import BioCDocument
from bioc_location import BioCLocation
from bioc_node import BioCNode
from bioc_passage import BioCPassage
from bioc_relation import BioCRelation
from bioc_sentence import BioCSentence
from bioc_reader import BioCReader
from bioc_writer import BioCWriter
from .bioc_annotation import BioCAnnotation
from .bioc_collection import BioCCollection
from .bioc_document import BioCDocument
from .bioc_location import BioCLocation
from .bioc_node import BioCNode
from .bioc_passage import BioCPassage
from .bioc_relation import BioCRelation
from .bioc_sentence import BioCSentence
from .bioc_reader import BioCXMLReader
from .bioc_reader import BioCJSONReader
from .bioc_writer import BioCXMLWriter
from .bioc_writer import BioCJSONWriter
5 changes: 3 additions & 2 deletions src/bioc/bioc_annotation.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
__all__ = ['BioCAnnotation']

from meta import _MetaId, _MetaInfons, _MetaText
from .meta import _MetaId, _MetaInfons, _MetaText


class BioCAnnotation(_MetaId, _MetaInfons, _MetaText):

def __init__(self, annotation=None):

self.id = ''
self.infons = dict()
self.locations = list()
Expand Down
19 changes: 10 additions & 9 deletions src/bioc/bioc_collection.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
__all__ = ['BioCCollection']

from meta import _MetaInfons, _MetaIter
from compat import _Py2Next
from .meta import _MetaInfons, _MetaIter
from .compat import _Py2Next


class BioCCollection(_Py2Next, _MetaInfons, _MetaIter):

def __init__(self, collection=None):

self.infons = dict()
self.source = ''
self.date = ''
Expand All @@ -31,18 +32,18 @@ def __str__(self):

def _iterdata(self):
return self.documents

def clear_documents(self):
self.documents = list()

def get_document(self, doc_idx):
return self.documents[doc_idx]
return self.documents[doc_idx]

def add_document(self, document):
self.documents.append(document)

def remove_document(self, document):
if type(document) is int:
self.dcouments.remove(self.documents[document])
else:
self.documents.remove(document) # TBC
if isinstance(document, int):
self.documents.pop(document)
else:
self.documents.remove(document) # TBC
16 changes: 8 additions & 8 deletions src/bioc/bioc_document.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
__all__ = ['BioCDocument']

from compat import _Py2Next
from meta import _MetaId, _MetaInfons, _MetaRelations, _MetaIter
from .compat import _Py2Next
from .meta import _MetaId, _MetaInfons, _MetaRelations, _MetaIter

class BioCDocument(_MetaId, _MetaInfons, _MetaRelations, _MetaIter,
_Py2Next):

class BioCDocument(_MetaId, _MetaInfons, _MetaRelations, _MetaIter, _Py2Next):

def __init__(self, document=None):

Expand All @@ -31,7 +31,7 @@ def _iterdata(self):
return self.passages

def get_size(self):
return self.passages.size() # As in Java BioC
return sum(p.size() for p in self.passages) # As in Java BioC

def clear_passages(self):
self.passages = list()
Expand All @@ -40,7 +40,7 @@ def add_passage(self, passage):
self.passages.append(passage)

def remove_passage(self, passage):
if type(passage) is int:
self.passages.remove(self.passages[passage])
if isinstance(passage, int):
self.passages.pop(passage)
else:
self.passages.remove(passage) # TBC
self.passages.remove(passage) # TBC
9 changes: 5 additions & 4 deletions src/bioc/bioc_location.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
__all__ = ['BioCLocation']

from meta import _MetaOffset
from .meta import _MetaOffset


class BioCLocation(_MetaOffset):

def __init__(self, location=None):

self.offset = '-1'
self.length = '0'

if location is not None:
self.offset = location.offset
self.length = location.length
self.offset = location.offset
self.length = location.length

def __str__(self):
s = str(self.offset) + ':' + str(self.length)
Expand Down
9 changes: 5 additions & 4 deletions src/bioc/bioc_node.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
__all__ = ['BioCNode']


class BioCNode:

def __init__(self, node=None, refid=None, role=None):

self.refid = ''
self.role = ''

Expand All @@ -17,7 +18,7 @@ def __init__(self, node=None, refid=None, role=None):
self.role = role

def __str__(self):
s = 'refid: ' + self.refid + '\n'
s += 'role: ' + self.role + '\n'
s = 'refid: ' + self.refid + '\n'
s += 'role: ' + self.role + '\n'

return s
return s
24 changes: 12 additions & 12 deletions src/bioc/bioc_passage.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
__all__ = ['BioCPassage']

from meta import _MetaAnnotations, _MetaInfons, _MetaOffset, \
_MetaRelations, _MetaText
from .meta import _MetaAnnotations, _MetaInfons, _MetaOffset, \
_MetaRelations, _MetaText

class BioCPassage(_MetaAnnotations, _MetaOffset, _MetaText,
_MetaRelations, _MetaInfons):

class BioCPassage(_MetaAnnotations, _MetaOffset, _MetaText, _MetaRelations,
_MetaInfons):

def __init__(self, passage=None):

self.offset = '-1'
self.text = ''
self.infons = dict()
Expand All @@ -27,20 +28,19 @@ def size(self):
return len(self.sentences)

def has_sentences(self):
if len(self.sentences) > 0:
return True
return bool(self.sentences)

def add_sentence(self, sentence):
self.sentences.append(sentence)

def sentences_iterator(self):
return self.sentences.iterator() # TBD
return self.sentences.iterator() # TBD

def clear_sentences(self):
self.relations = list()
self.sentences = list()

def remove_sentence(self, sentence): # int or obj
if type(sentence) is int:
self.sentences.remove(self.sentences[sentence])
def remove_sentence(self, sentence): # int or obj
if isinstance(sentence, int):
self.sentences.pop(sentence)
else:
self.sentences.remove(sentence)
Loading