diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6666947 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*.pyc +src/output_bioc.xml +src/output_bioc.json diff --git a/CHANGES.txt b/CHANGES.txt index 05366d4..1df55b0 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,3 +1,33 @@ +2.0 +--- +New reader and writer for BioC JSON, based on the converter by Don Comeau (https://github.com/ncbi-nlp/BioC-JSON). +Renamed the existing, XML-based BioCReader and BioCWriter to BioCXMLReader and BioCXMLWriter, respectively. + +1.02.4 +------ +New method `BioCWriter.iterfragments()`: Iterate over serialised XML fragments, +thus avoiding in-memory construction of the complete tree. + +1.02.3 +------ +Make `BioCWriter.tostring()` a public method. +Avoid rebuilding the whole structure for every call to `__str__()`. + +1.02.2 +------ +Various minor bugfixes + +1.02.1 +------ +Approach PEP8, especially by making implicit relative import explicit. +Changes by Tilia Ellendorff, +Max Nanis, Lenz Furrer. + +1.02 +---- +Don't assume text element in passage must be available. +(Thanks to Adrian van der Lek.) + 1.01 ---- Fix invalid handling of id attributes for annotation and relation tags. diff --git a/README.md b/README.md new file mode 100644 index 0000000..e002d88 --- /dev/null +++ b/README.md @@ -0,0 +1,65 @@ +# PyBioC + +**[PyBioC][1] is a native Python library for reading and writing BioC XML data.** + +More information about BioC is available at [sourceforge][2]. + + +## Installation + +Use [`pip`][3]: + + pip install git+https://github.com/OntoGene/PyBioC.git + +For Python 3, you might have to type `pip3`. + + +## Usage + +Two example programs, test_read+write.py and stemmer.py, are shipped in the `src/` folder. + +- `test_read+write.py` shows the very +basic reading and writing capability +of the library. 
+- `stemmer.py` uses the Python Natural +Language Toolkit (NLTK) library to +manipulate a BioC XML file read in +before; it then tokenizes the +corresponding text, does stemming on +the tokens and transforms the +manipulated PyBioC objects back to +valid BioC XML format. + + +## Example + +### Generate BioC object for export + +```python +from bioc import BioCXMLWriter, BioCCollection, BioCDocument, BioCPassage + +writer = BioCXMLWriter() +writer.collection = BioCCollection() +collection = writer.collection +collection.date = '20150301' +collection.source = 'ngy1 corpus' + +document = BioCDocument() +document.id = '123456' # pubmed id + +passage = BioCPassage() +passage.put_infon('type', 'paragraph') +passage.offset = '0' +passage.text = 'This is a biomedical sentence about various rare diseases.' +document.add_passage(passage) + +collection.add_document(document) + +print(writer) +``` + + + +[1]: https://github.com/OntoGene/PyBioC +[2]: http://bioc.sourceforge.net/ +[3]: http://pip.pypa.io/ diff --git a/README.txt b/README.txt deleted file mode 100644 index d9069cb..0000000 --- a/README.txt +++ /dev/null @@ -1,15 +0,0 @@ -PyBioC is a native python library to deal with BioCreative XML data, -i. e. to read from and to write to it. - -Usage: ------- -Two example programs, test_read+write.py and stemming.py are shipped in the -src/ folder. - -test_read+write.py shows the very basic reading and writing capability of the -library. - -stemming.py uses the Python Natural Language Toolkit (NLTK) library to -manipulate a BioC XML file read in before; it then tokenizes the corresponding -text, does stemming on the tokens and transforms the manipulated PyBioC -objects back to valid BioC XML format. 
diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..1f99754 --- /dev/null +++ b/setup.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python +# coding: utf8 + +from distutils.core import setup + +with open('README.md') as f: + readme = f.read() + +setup(name='PyBioC', + version='2.0', + author='Hernani Marques', + author_email='h2m@access.uzh.ch', + description='Python library for working with BioC XML data', + long_description=readme, + packages=['bioc', 'bioc.meta', 'bioc.compat'], + package_dir={'bioc': 'src/bioc'}) diff --git a/src/bioc/__init__.py b/src/bioc/__init__.py index 5df7fef..a3ab661 100644 --- a/src/bioc/__init__.py +++ b/src/bioc/__init__.py @@ -2,23 +2,25 @@ # Package for interoperability in BioCreative work # -__version__ = '1.02' +__version__ = '1.02.4' __all__ = [ - 'BioCAnnotation', 'BioCCollection', 'BioCDocument', - 'BioCLocation', 'BioCNode', 'BioCPassage', 'BioCRelation', - 'BioCSentence', 'BioCReader', 'BioCWriter' + 'BioCAnnotation', 'BioCCollection', 'BioCDocument', 'BioCLocation', + 'BioCNode', 'BioCPassage', 'BioCRelation', 'BioCSentence', + 'BioCXMLReader', 'BioCJSONReader', 'BioCXMLWriter', 'BioCJSONWriter' ] __author__ = 'Hernani Marques (h2m@access.uzh.ch)' -from bioc_annotation import BioCAnnotation -from bioc_collection import BioCCollection -from bioc_document import BioCDocument -from bioc_location import BioCLocation -from bioc_node import BioCNode -from bioc_passage import BioCPassage -from bioc_relation import BioCRelation -from bioc_sentence import BioCSentence -from bioc_reader import BioCReader -from bioc_writer import BioCWriter +from .bioc_annotation import BioCAnnotation +from .bioc_collection import BioCCollection +from .bioc_document import BioCDocument +from .bioc_location import BioCLocation +from .bioc_node import BioCNode +from .bioc_passage import BioCPassage +from .bioc_relation import BioCRelation +from .bioc_sentence import BioCSentence +from .bioc_reader import BioCXMLReader +from .bioc_reader import 
BioCJSONReader +from .bioc_writer import BioCXMLWriter +from .bioc_writer import BioCJSONWriter diff --git a/src/bioc/bioc_annotation.py b/src/bioc/bioc_annotation.py index bc6af21..41eb12a 100644 --- a/src/bioc/bioc_annotation.py +++ b/src/bioc/bioc_annotation.py @@ -1,11 +1,12 @@ __all__ = ['BioCAnnotation'] -from meta import _MetaId, _MetaInfons, _MetaText +from .meta import _MetaId, _MetaInfons, _MetaText + class BioCAnnotation(_MetaId, _MetaInfons, _MetaText): def __init__(self, annotation=None): - + self.id = '' self.infons = dict() self.locations = list() diff --git a/src/bioc/bioc_collection.py b/src/bioc/bioc_collection.py index 9abc164..5d519a1 100644 --- a/src/bioc/bioc_collection.py +++ b/src/bioc/bioc_collection.py @@ -1,12 +1,13 @@ __all__ = ['BioCCollection'] -from meta import _MetaInfons, _MetaIter -from compat import _Py2Next +from .meta import _MetaInfons, _MetaIter +from .compat import _Py2Next + class BioCCollection(_Py2Next, _MetaInfons, _MetaIter): def __init__(self, collection=None): - + self.infons = dict() self.source = '' self.date = '' @@ -31,18 +32,18 @@ def __str__(self): def _iterdata(self): return self.documents - + def clear_documents(self): self.documents = list() def get_document(self, doc_idx): - return self.documents[doc_idx] + return self.documents[doc_idx] def add_document(self, document): self.documents.append(document) def remove_document(self, document): - if type(document) is int: - self.dcouments.remove(self.documents[document]) - else: - self.documents.remove(document) # TBC + if isinstance(document, int): + self.documents.pop(document) + else: + self.documents.remove(document) # TBC diff --git a/src/bioc/bioc_document.py b/src/bioc/bioc_document.py index 8033f13..f86f3a8 100644 --- a/src/bioc/bioc_document.py +++ b/src/bioc/bioc_document.py @@ -1,10 +1,10 @@ __all__ = ['BioCDocument'] -from compat import _Py2Next -from meta import _MetaId, _MetaInfons, _MetaRelations, _MetaIter +from .compat import _Py2Next +from .meta 
import _MetaId, _MetaInfons, _MetaRelations, _MetaIter -class BioCDocument(_MetaId, _MetaInfons, _MetaRelations, _MetaIter, - _Py2Next): + +class BioCDocument(_MetaId, _MetaInfons, _MetaRelations, _MetaIter, _Py2Next): def __init__(self, document=None): @@ -31,7 +31,7 @@ def _iterdata(self): return self.passages def get_size(self): - return self.passages.size() # As in Java BioC + return sum(p.size() for p in self.passages) # As in Java BioC def clear_passages(self): self.passages = list() @@ -40,7 +40,7 @@ def add_passage(self, passage): self.passages.append(passage) def remove_passage(self, passage): - if type(passage) is int: - self.passages.remove(self.passages[passage]) + if isinstance(passage, int): + self.passages.pop(passage) else: - self.passages.remove(passage) # TBC + self.passages.remove(passage) # TBC diff --git a/src/bioc/bioc_location.py b/src/bioc/bioc_location.py index 2e574d0..ca5fa7c 100644 --- a/src/bioc/bioc_location.py +++ b/src/bioc/bioc_location.py @@ -1,17 +1,18 @@ __all__ = ['BioCLocation'] -from meta import _MetaOffset +from .meta import _MetaOffset + class BioCLocation(_MetaOffset): def __init__(self, location=None): - + self.offset = '-1' self.length = '0' if location is not None: - self.offset = location.offset - self.length = location.length + self.offset = location.offset + self.length = location.length def __str__(self): s = str(self.offset) + ':' + str(self.length) diff --git a/src/bioc/bioc_node.py b/src/bioc/bioc_node.py index a4b2526..475a86c 100644 --- a/src/bioc/bioc_node.py +++ b/src/bioc/bioc_node.py @@ -1,9 +1,10 @@ __all__ = ['BioCNode'] + class BioCNode: def __init__(self, node=None, refid=None, role=None): - + self.refid = '' self.role = '' @@ -17,7 +18,7 @@ def __init__(self, node=None, refid=None, role=None): self.role = role def __str__(self): - s = 'refid: ' + self.refid + '\n' - s += 'role: ' + self.role + '\n' + s = 'refid: ' + self.refid + '\n' + s += 'role: ' + self.role + '\n' - return s + return s diff --git 
a/src/bioc/bioc_passage.py b/src/bioc/bioc_passage.py index 4347675..f1cb0ba 100644 --- a/src/bioc/bioc_passage.py +++ b/src/bioc/bioc_passage.py @@ -1,13 +1,14 @@ __all__ = ['BioCPassage'] -from meta import _MetaAnnotations, _MetaInfons, _MetaOffset, \ - _MetaRelations, _MetaText +from .meta import _MetaAnnotations, _MetaInfons, _MetaOffset, \ + _MetaRelations, _MetaText -class BioCPassage(_MetaAnnotations, _MetaOffset, _MetaText, - _MetaRelations, _MetaInfons): + +class BioCPassage(_MetaAnnotations, _MetaOffset, _MetaText, _MetaRelations, + _MetaInfons): def __init__(self, passage=None): - + self.offset = '-1' self.text = '' self.infons = dict() @@ -27,20 +28,19 @@ def size(self): return len(self.sentences) def has_sentences(self): - if len(self.sentences) > 0: - return True + return bool(self.sentences) def add_sentence(self, sentence): self.sentences.append(sentence) def sentences_iterator(self): - return self.sentences.iterator() # TBD + return self.sentences.iterator() # TBD def clear_sentences(self): - self.relations = list() + self.sentences = list() - def remove_sentence(self, sentence): # int or obj - if type(sentence) is int: - self.sentences.remove(self.sentences[sentence]) + def remove_sentence(self, sentence): # int or obj + if isinstance(sentence, int): + self.sentences.pop(sentence) else: self.sentences.remove(sentence) diff --git a/src/bioc/bioc_reader.py b/src/bioc/bioc_reader.py index a35b9f7..022b9c0 100644 --- a/src/bioc/bioc_reader.py +++ b/src/bioc/bioc_reader.py @@ -1,40 +1,39 @@ -__all__ = ['BioCReader'] +__all__ = ['BioCXMLReader', 'BioCJSONReader'] -import StringIO +import io +import json from lxml import etree -from bioc_annotation import BioCAnnotation -from bioc_collection import BioCCollection -from bioc_document import BioCDocument -from bioc_location import BioCLocation -from bioc_passage import BioCPassage -from bioc_sentence import BioCSentence -from bioc_node import BioCNode -from bioc_relation import BioCRelation +from 
.bioc_annotation import BioCAnnotation +from .bioc_collection import BioCCollection +from .bioc_document import BioCDocument +from .bioc_location import BioCLocation +from .bioc_passage import BioCPassage +from .bioc_sentence import BioCSentence +from .bioc_node import BioCNode +from .bioc_relation import BioCRelation -class BioCReader: + +class BioCXMLReader(object): """ - This class can be used to store BioC XML files in PyBioC objects, - for further manipulation. + Reader for parsing BioC XML files into BioC* objects. """ - def __init__(self, source, dtd_valid_file=None): """ source: File path to a BioC XML input document. dtd_valid_file: File path to a BioC.dtd file. Using this optional argument ensures DTD validation. """ - self.source = source self.collection = BioCCollection() self.xml_tree = etree.parse(source) - + if dtd_valid_file is not None: dtd = etree.DTD(dtd_valid_file) - if dtd.validate(self.xml_tree) is False: - raise(Exception(dtd.error_log.filter_from_errors()[0])) - + if not dtd.validate(self.xml_tree): + raise Exception(dtd.error_log.filter_from_errors()[0]) + def read(self): """ Invoke this method in order to read in the file provided by @@ -42,25 +41,25 @@ def read(self): called the BioCReader object gets populated. 
""" self._read_collection() - + def _read_collection(self): collection_elem = self.xml_tree.xpath('/collection')[0] - + self.collection.source = collection_elem.xpath('source')[0].text self.collection.date = collection_elem.xpath('date')[0].text self.collection.key = collection_elem.xpath('key')[0].text - + infon_elem_list = collection_elem.xpath('infon') document_elem_list = collection_elem.xpath('document') - + self._read_infons(infon_elem_list, self.collection) self._read_documents(document_elem_list) - - - def _read_infons(self, infon_elem_list, infons_parent_elem): + + @staticmethod + def _read_infons(infon_elem_list, infons_parent_elem): for infon_elem in infon_elem_list: - infons_parent_elem.put_infon(self._get_infon_key(infon_elem), - infon_elem.text) + infons_parent_elem.put_infon(infon_elem.attrib['key'], + infon_elem.text) def _read_documents(self, document_elem_list): for document_elem in document_elem_list: @@ -70,8 +69,8 @@ def _read_documents(self, document_elem_list): self._read_passages(document_elem.xpath('passage'), document) self._read_relations(document_elem.xpath('relation'), - document) - + document) + self.collection.add_document(document) def _read_passages(self, passage_elem_list, document_parent_elem): @@ -79,59 +78,53 @@ def _read_passages(self, passage_elem_list, document_parent_elem): passage = BioCPassage() self._read_infons(passage_elem.xpath('infon'), passage) passage.offset = passage_elem.xpath('offset')[0].text - + # Is this BioC document with ? if len(passage_elem.xpath('sentence')) > 0: - self._read_sentences(passage_elem.xpath('sentence'), - passage) + self._read_sentences(passage_elem.xpath('sentence'), passage) else: # Is the (optional) text element available? 
- try: + try: passage.text = passage_elem.xpath('text')[0].text - except: + except (IndexError, AttributeError): pass self._read_annotations(passage_elem.xpath('annotation'), - passage) - - self._read_relations(passage_elem.xpath('relation'), - passage) - + passage) + + self._read_relations(passage_elem.xpath('relation'), passage) + document_parent_elem.add_passage(passage) - + def _read_sentences(self, sentence_elem_list, passage_parent_elem): for sentence_elem in sentence_elem_list: sentence = BioCSentence() self._read_infons(sentence_elem.xpath('infon'), sentence) sentence.offset = sentence_elem.xpath('offset')[0].text sentence.text = sentence_elem.xpath('text')[0].text - self._read_annotations(sentence_elem.xpath('annotation'), - sentence) - self._read_relations(sentence_elem.xpath('relation'), - sentence) - + self._read_annotations(sentence_elem.xpath('annotation'), sentence) + self._read_relations(sentence_elem.xpath('relation'), sentence) + passage_parent_elem.add_sentence(sentence) - - def _read_annotations(self, annotation_elem_list, - annotations_parent_elem): + + def _read_annotations(self, annotation_elem_list, annotations_parent_elem): for annotation_elem in annotation_elem_list: annotation = BioCAnnotation() # Attribute id is just #IMPLIED, not #REQUIRED if 'id' in annotation_elem.attrib: annotation.id = annotation_elem.attrib['id'] - self._read_infons(annotation_elem.xpath('infon'), - annotation) - + self._read_infons(annotation_elem.xpath('infon'), annotation) + for location_elem in annotation_elem.xpath('location'): location = BioCLocation() location.offset = location_elem.attrib['offset'] location.length = location_elem.attrib['length'] - + annotation.add_location(location) - + annotation.text = annotation_elem.xpath('text')[0].text - + annotations_parent_elem.add_annotation(annotation) - + def _read_relations(self, relation_elem_list, relations_parent_elem): for relation_elem in relation_elem_list: relation = BioCRelation() @@ -144,10 +137,39 @@ 
def _read_relations(self, relation_elem_list, relations_parent_elem): node = BioCNode() node.refid = node_elem.attrib['refid'] node.role = node_elem.attrib['role'] - + relation.add_node(node) - + relations_parent_elem.add_relation(relation) - - def _get_infon_key(self, elem): - return elem.attrib['key'] + + + +class BioCJSONReader(object): + ''' + Reader for parsing BioC JSON files into BioC* objects. + ''' + def __init__(self, source): + self.source = source + self.collection = None + + def read(self): + ''' + Read self.source and save the result to self.collection. + ''' + with io.open(self.source, encoding='utf-8') as f: + dict_ = json.load(f) + self.collection = self._read_dict(BioCCollection, dict_) + + def _read_dict(self, class_, dict_): + obj = class_() + for label, value in dict_.items(): + if label in ('offset', 'length'): + # The converter in Don Comeau's reference implementation + # converts offset and length to strings, too. + value = str(value) + elif isinstance(value, list): + # Get the class from the label (eg. "documents" -> BioCDocument). 
+ class_ = globals()['BioC{}'.format(label.rstrip('s').title())] + value = [self._read_dict(class_, d) for d in value] + setattr(obj, label, value) + return obj diff --git a/src/bioc/bioc_relation.py b/src/bioc/bioc_relation.py index e315c1d..242a505 100644 --- a/src/bioc/bioc_relation.py +++ b/src/bioc/bioc_relation.py @@ -1,13 +1,13 @@ __all__ = ['BioCRelation'] -from compat import _Py2Next -from meta import _MetaId, _MetaInfons, _MetaIter -from bioc_node import BioCNode +from .compat import _Py2Next +from .meta import _MetaId, _MetaInfons, _MetaIter + class BioCRelation(_MetaId, _MetaInfons, _Py2Next, _MetaIter): def __init__(self, relation=None): - + self.id = '' self.nodes = list() self.infons = dict() @@ -31,5 +31,5 @@ def add_node(self, node, refid=None, role=None): # Discard arg ``node'' if optional args fully provided if (refid is not None) and (role is not None): self.add_node(refid=refid, role=role) - else: # Only consider optional args if both set + else: # Only consider optional args if both set self.nodes.append(node) diff --git a/src/bioc/bioc_sentence.py b/src/bioc/bioc_sentence.py index d5d757c..a5c1e20 100644 --- a/src/bioc/bioc_sentence.py +++ b/src/bioc/bioc_sentence.py @@ -1,15 +1,15 @@ __all__ = ['BioCSentence'] -from meta import _MetaAnnotations, _MetaInfons, _MetaOffset, \ - _MetaRelations, _MetaText - +from .meta import _MetaAnnotations, _MetaInfons, _MetaOffset, \ + _MetaRelations, _MetaText -class BioCSentence(_MetaAnnotations, _MetaInfons, _MetaOffset, + +class BioCSentence(_MetaAnnotations, _MetaInfons, _MetaOffset, _MetaRelations, _MetaText): - + def __init__(self, sentence=None): - + self.offset = '-1' self.text = '' self.infons = dict() @@ -25,9 +25,9 @@ def __init__(self, sentence=None): def __str__(self): s = 'offset: ' + str(self.offset) + '\n' - s += 'infons: ' + str(self.infons) + '\n' # TBD - s += 'text: ' + str(self.text) + '\n' # TBD - s += str(self.annotations) + '\n' # TBD - s += str(self.relations) + '\n' # TBD + s += 
'infons: ' + str(self.infons) + '\n' # TBD + s += 'text: ' + str(self.text) + '\n' # TBD + s += str(self.annotations) + '\n' # TBD + s += str(self.relations) + '\n' # TBD return s diff --git a/src/bioc/bioc_writer.py b/src/bioc/bioc_writer.py index 38ebf51..aa34912 100644 --- a/src/bioc/bioc_writer.py +++ b/src/bioc/bioc_writer.py @@ -1,81 +1,178 @@ -__all__ = ['BioCWriter'] +__all__ = ['BioCXMLWriter', 'BioCJSONWriter'] + +import sys +import json from lxml.builder import E from lxml.etree import tostring -class BioCWriter: - + +# Resolve Python 2/3 differences. +if sys.version_info < (3,): + # In Py2, use codecs.open rather than io.open, because the write() method + # of the latter doesn't accept native str values (only unicode). + from codecs import open + # Since BioCXMLWriter.__str__ calls lxml.etree.tostring, it must actually + # encode the serialised dump to get native str. + STR_ENCODING = 'ascii' +else: + STR_ENCODING = 'unicode' + + +class _BioCWriter(object): + ''' + Base for BioC serializers. + ''' def __init__(self, filename=None, collection=None): - - self.root_tree = None - - self.collection = None - self.doctype = '''''' - self.doctype += '''''' + self.collection = collection self.filename = filename - - if collection is not None: - self.collection = collection - - if filename is not None: - self.filename = filename - + + def _check_for_data(self): + if self.collection is None: + raise Exception('No data available.') + + def _resolve_filename(self, filename): + if filename is None: + if self.filename is None: + raise Exception('No output file path provided.') + filename = self.filename + return filename + + +class BioCXMLWriter(_BioCWriter): + ''' + XML serializer for BioC objects. + ''' + + doctype = "" + + def __init__(self, filename=None, collection=None): + super(BioCXMLWriter, self).__init__(filename, collection) + self.root_tree = None + def __str__(self): - """ A BioCWriter object can be printed as string. 
""" - self._check_for_data() - + A BioCWriter object can be printed as string. + """ + return self.tostring(encoding=STR_ENCODING) + + def tostring(self, encoding='UTF-8'): + ''' + Serialize the collection to BioC XML. + + Return an encoded string (a bytes object in Python 3), + unless encoding is "unicode", in which case a decoded + string is returned (a unicode object in Python 2). + ''' self.build() - s = tostring(self.root_tree, - pretty_print=True, - doctype=self.doctype) - + + xml_declaration = self._binary_encoding(encoding) + s = tostring(self.root_tree, + encoding=encoding, + pretty_print=True, + xml_declaration=xml_declaration, + doctype=self.doctype) + return s - - def _check_for_data(self): - if self.collection is None: - raise(Exception('No data available.')) - + + @staticmethod + def _binary_encoding(codec): + ''' + Is this actually a binary encoding? + + The etree.tostring method accepts an encoding + parameter value "unicode" or str/unicode, + in which case the returned serialisation is a + decoded unicode string rather than an encoded + byte string. + ''' + return not callable(codec) and codec != 'unicode' + def write(self, filename=None): - """ Use this method to write the data in the PyBioC objects - to disk. - - filename: Output file path (optional argument; filename - provided by __init__ used otherwise.) """ - if filename is not None: - self.filename = filename - - if self.filename is None: - raise(Exception('No output file path provided.')) - - f = open(self.filename, 'w') - f.write(self.__str__()) - + Write the data in the PyBioC objects to disk. + + filename: Output file path (optional argument; + filename provided through __init__ used + otherwise.) + """ + filename = self._resolve_filename(filename) + + with open(filename, 'wb') as f: + f.write(self.tostring(encoding='UTF-8')) + + def iterfragments(self, encoding='UTF-8'): + ''' + Iterate over serialised XML fragments. 
+ + Use this for large collections, as it avoids building + the whole tree in memory. + ''' + self._check_for_data() + + # Temporarily remove the document nodes and the root tree. + documents = self.collection.documents + previous_tree = self.root_tree + self.collection.documents = () + self.root_tree = None + + # Construct and serialise the collection-level nodes. + # Split them into a head and tail portion. + shell = self.tostring(encoding) + tail = u'\n' + BOM = '' + if self._binary_encoding(encoding): + BOM = ''.encode(encoding) + tail = tail.encode(encoding).lstrip(BOM) + head = shell[:-len(tail)] + + # Yield fragment by fragment. + yield head + + step_parent = E('collection') + for doc in documents: + self._build_documents([doc], step_parent) + frag = tostring(step_parent[0], + encoding=encoding, + pretty_print=True, + xml_declaration=False) + step_parent.clear() + yield frag.lstrip(BOM) + + yield tail + + # Restore the collection object and the root tree. + self.collection.documents = documents + self.root_tree = previous_tree + def build(self): - self._build_collection() - + ''' + Create an Element tree in memory. 
+ ''' + self._check_for_data() + if self.root_tree is None: + self._build_collection() + def _build_collection(self): - self.root_tree = E('collection', - E('source'), E('date'), E('key')) + self.root_tree = E('collection', E('source'), E('date'), E('key')) self.root_tree.xpath('source')[0].text = self.collection.source self.root_tree.xpath('date')[0].text = self.collection.date - self.root_tree.xpath('key')[0].text = self.collection.key + self.root_tree.xpath('key')[0].text = self.collection.key collection_elem = self.root_tree.xpath('/collection')[0] # infon* self._build_infons(self.collection.infons, collection_elem) # document+ - self._build_documents(self.collection.documents, - collection_elem) - - def _build_infons(self, infons_dict, infons_parent_elem): + self._build_documents(self.collection.documents, collection_elem) + + @staticmethod + def _build_infons(infons_dict, infons_parent_elem): for infon_key, infon_val in infons_dict.items(): infons_parent_elem.append(E('infon')) infon_elem = infons_parent_elem.xpath('infon')[-1] - + infon_elem.attrib['key'] = infon_key infon_elem.text = infon_val - + def _build_documents(self, documents_list, collection_parent_elem): for document in documents_list: collection_parent_elem.append(E('document', E('id'))) @@ -89,7 +186,7 @@ def _build_documents(self, documents_list, collection_parent_elem): self._build_passages(document.passages, document_elem) # relation* self._build_relations(document.relations, document_elem) - + def _build_passages(self, passages_list, document_parent_elem): for passage in passages_list: document_parent_elem.append(E('passage')) @@ -106,11 +203,11 @@ def _build_passages(self, passages_list, document_parent_elem): # text?, annotation* passage_elem.append(E('text')) passage_elem.xpath('text')[0].text = passage.text - self._build_annotations(passage.annotations, + self._build_annotations(passage.annotations, passage_elem) # relation* self._build_relations(passage.relations, passage_elem) - + def 
_build_relations(self, relations_list, relations_parent_elem): for relation in relations_list: relations_parent_elem.append(E('relation')) @@ -126,9 +223,8 @@ def _build_relations(self, relations_list, relations_parent_elem): # id (just #IMPLIED) if len(relation.id) > 0: relation_elem.attrib['id'] = relation.id - - def _build_annotations(self, annotations_list, - annotations_parent_elem): + + def _build_annotations(self, annotations_list, annotations_parent_elem): for annotation in annotations_list: annotations_parent_elem.append(E('annotation')) annotation_elem = \ @@ -168,3 +264,66 @@ def _build_sentences(self, sentences_list, passage_parent_elem): self._build_annotations(sentence.annotations, sentence_elem) # relation* self._build_relations(sentence.relations, sentence_elem) + + +class BioCJSONWriter(_BioCWriter): + ''' + JSON serializer for BioC objects. + ''' + def __init__(self, filename=None, collection=None): + super(BioCJSONWriter, self).__init__(filename, collection) + self.root_dict = None + + def __str__(self): + return str(self.tostring()) + + def tostring(self, **kwargs): + ''' + Dump serialized BioC JSON to a string. + ''' + self.build() + return json.dumps(self.root_dict, **kwargs) + + def write(self, filename=None, **kwargs): + ''' + Write serialised BioC JSON to disk. + ''' + self.build() + filename = self._resolve_filename(filename) + with open(filename, 'w', encoding='utf-8') as f: + json.dump(self.root_dict, f, **kwargs) + + def iterfragments(self, **kwargs): + ''' + Iterate over chunks of serialised BioC JSON. + + This method still creates an entire copy of the + structure in memory. + ''' + self.build() + for chunk in json.JSONEncoder(**kwargs).iterencode(self.root_dict): + yield chunk + + def build(self): + ''' + Construct a nested dictionary in memory. 
+ ''' + self._check_for_data() + if self.root_dict is None: + self.root_dict = self._build_dict(self.collection) + + def _build_dict(self, obj): + # Note: + # Unlike the DTD, Don Comeau's reference implementation of a BioC JSON + # converter does not enforce mutual exclusion of either sentences + # or text + annotations inside passage elements. + dict_ = {} + for label, value in obj.__dict__.items(): + if label == 'text' and value is None: + value = '' # avoid None/null + elif label in ('offset', 'length'): + value = int(value) + elif isinstance(value, list): + value = [self._build_dict(c) for c in value] + dict_[label] = value + return dict_ diff --git a/src/bioc/compat/__init__.py b/src/bioc/compat/__init__.py index 176d401..58d14fb 100644 --- a/src/bioc/compat/__init__.py +++ b/src/bioc/compat/__init__.py @@ -2,4 +2,4 @@ __author__ = 'Hernani Marques (h2m@access.uzh.ch)' -from _py2_next import _Py2Next +from ._py2_next import _Py2Next diff --git a/src/bioc/compat/_py2_next.py b/src/bioc/compat/_py2_next.py index 745018e..7578301 100644 --- a/src/bioc/compat/_py2_next.py +++ b/src/bioc/compat/_py2_next.py @@ -1,5 +1,7 @@ __all__ = [] + class _Py2Next: - def __next__(self): - self.next() + + def __next__(self): + self.next() diff --git a/src/bioc/meta/__init__.py b/src/bioc/meta/__init__.py index d87c4f9..afbce03 100644 --- a/src/bioc/meta/__init__.py +++ b/src/bioc/meta/__init__.py @@ -2,6 +2,6 @@ __author__ = 'Hernani Marques (h2m@access.uzh.ch)' -from _bioc_meta import _MetaAnnotations, _MetaInfons, _MetaOffset, \ - _MetaRelations, _MetaText, _MetaId -from _iter import _MetaIter +from ._bioc_meta import _MetaAnnotations, _MetaInfons, _MetaOffset, \ + _MetaRelations, _MetaText, _MetaId +from ._iter import _MetaIter diff --git a/src/bioc/meta/_bioc_meta.py b/src/bioc/meta/_bioc_meta.py index 268cfbe..bb0b95b 100644 --- a/src/bioc/meta/_bioc_meta.py +++ b/src/bioc/meta/_bioc_meta.py @@ -1,10 +1,11 @@ __all__ = [] + class _MetaAnnotations: annotations = list() def 
annotation_iterator(self): - return self.annotations.iterator() # TBD + return self.annotations.iterator() # TBD def clear_annotations(self): self.annotations = list() @@ -12,32 +13,35 @@ def clear_annotations(self): def add_annotation(self, annotation): self.annotations.append(annotation) - def remove_annotation(self, annotation): # Can be int or obj - if type(annotation) is int: + def remove_annotation(self, annotation): # Can be int or obj + if isinstance(annotation, int): self.annotations.remove(self.annotations[annotation]) else: - self.annotations.remove(annotation) # TBC + self.annotations.remove(annotation) # TBC + class _MetaInfons: infons = dict() def put_infon(self, key, val): - self.infons[key] = val + self.infons[key] = val def remove_infon(self, key): - del(self.infons[key]) + del(self.infons[key]) def clear_infons(self): self.infons = dict() + class _MetaOffset: offset = '-1' + class _MetaRelations: relations = list() def relation_iterator(self): - return self.relations.iterator() # TBD + return self.relations.iterator() # TBD def clear_relations(self): self.relations = list() @@ -45,14 +49,16 @@ def clear_relations(self): def add_relation(self, relation): self.relations.append(relation) - def remove_relation(self, relation): # Can be int or obj - if type(relation) is int: + def remove_relation(self, relation): # Can be int or obj + if isinstance(relation, int): self.relations.remove(self.relations[relation]) else: - self.relations.remove(relation) # TBC + self.relations.remove(relation) # TBC + class _MetaText: text = '' + class _MetaId: id = '' diff --git a/src/bioc/meta/_iter.py b/src/bioc/meta/_iter.py index 1927634..e5a0dd6 100644 --- a/src/bioc/meta/_iter.py +++ b/src/bioc/meta/_iter.py @@ -1,5 +1,6 @@ __all__ = [] + class _MetaIter: def __iter__(self): diff --git a/src/stemmer.py b/src/stemmer.py index 6eb1d25..2ee21f6 100755 --- a/src/stemmer.py +++ b/src/stemmer.py @@ -1,61 +1,62 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- +# -*- 
coding: utf-8 -*- # h2m@access.uzh.ch -from os import curdir, sep +import os import sys from nltk.tokenize import wordpunct_tokenize from nltk import PorterStemmer from bioc import BioCAnnotation -from bioc import BioCReader -from bioc import BioCWriter +from bioc import BioCXMLReader +from bioc import BioCXMLWriter + +BIOC_IN = os.path.join('..', 'test_input', 'example_input.xml') +BIOC_OUT = os.path.join('..', 'test_input', 'example_input_stemmed.xml') +DTD_FILE = os.path.join('..', 'test_input', 'BioC.dtd') -BIOC_IN = '..' + sep + 'test_input' + sep + 'example_input.xml' -BIOC_OUT = 'example_input_stemmed.xml' -DTD_FILE = '..' + sep + 'BioC.dtd' def main(): # Use file defined by BIOC_IN as default if no other provided bioc_in = BIOC_IN if len(sys.argv) >= 2: bioc_in = sys.argv[1] - - # A BioCReader object is put in place to hold the example BioC XML + + # A BioCXMLReader object is put in place to hold the example BioC XML # document - bioc_reader = BioCReader(bioc_in, dtd_valid_file=DTD_FILE) - - # A BioCWRiter object is prepared to write out the annotated data - bioc_writer = BioCWriter(BIOC_OUT) - + bioc_reader = BioCXMLReader(bioc_in, dtd_valid_file=DTD_FILE) + + # A BioCXMLWRiter object is prepared to write out the annotated data + bioc_writer = BioCXMLWriter(BIOC_OUT) + # The NLTK porter stemmer is used for stemming stemmer = PorterStemmer() - + # The example input file given above (by BIOC_IN) is fed into - # a BioCReader object; validation is done by the BioC DTD + # a BioCXMLReader object; validation is done by the BioC DTD bioc_reader.read() - + # Pass over basic data bioc_writer.collection = bioc_reader.collection - + # Get documents to manipulate documents = bioc_writer.collection.documents - + # Go through each document annotation_id = 0 for document in documents: - + # Go through each passage of the document for passage in document: # Stem all the tokens found - stems = [stemmer.stem(token) for + stems = [stemmer.stem(token) for token in 
wordpunct_tokenize(passage.text)] # Add an anotation showing the stemmed version, in the # given order for stem in stems: annotation_id += 1 - + # For each token an annotation is created, providing # the surface form of a 'stemmed token'. # (The annotations are collectively added following @@ -63,16 +64,17 @@ def main(): bioc_annotation = BioCAnnotation() bioc_annotation.text = stem bioc_annotation.id = str(annotation_id) - bioc_annotation.put_infon('surface form', + bioc_annotation.put_infon('surface form', 'stemmed token') passage.add_annotation(bioc_annotation) - + # Print file to screen w/o trailing newline # (Can be redirected into a file, e. g output_bioc.xml) sys.stdout.write(str(bioc_writer)) - + # Write to disk bioc_writer.write() - -if __name__ == '__main__': + + +if __name__ == '__main__': main() diff --git a/src/test_read+write.py b/src/test_read+write.py index 963f5c8..9d358c6 100755 --- a/src/test_read+write.py +++ b/src/test_read+write.py @@ -1,24 +1,33 @@ #!/usr/bin/env python -from bioc import BioCReader -from bioc import BioCWriter +from bioc import BioCXMLReader +from bioc import BioCJSONReader +from bioc import BioCXMLWriter +from bioc import BioCJSONWriter -test_file = '../test_input/bcIVLearningCorpus.xml' +test_xml = '../test_input/example_input.xml' +test_json = '../test_input/example_input.json' dtd_file = '../test_input/BioC.dtd' + def main(): - bioc_reader = BioCReader(test_file, dtd_valid_file=dtd_file) - bioc_reader.read() - ''' - sentences = bioc_reader.collection.documents[0].passages[0].sentences - for sentence in sentences: - print sentence.offset - ''' - - bioc_writer = BioCWriter('output_bioc.xml') - bioc_writer.collection = bioc_reader.collection - bioc_writer.write() - print(bioc_writer) - -if __name__ == '__main__': + # Read XML, write JSON. 
+ xml_reader = BioCXMLReader(test_xml, dtd_valid_file=dtd_file) + xml_reader.read() + + json_writer = BioCJSONWriter() + json_writer.collection = xml_reader.collection + json_writer.write('output_bioc.json', indent=2) + print(json_writer) + + # Read JSON, write XML. + json_reader = BioCJSONReader(test_json) + json_reader.read() + + xml_writer = BioCXMLWriter('output_bioc.xml', xml_reader.collection) + xml_writer.write() + print(xml_writer) + + +if __name__ == '__main__': main() diff --git a/BioC.dtd b/test_input/BioC.dtd similarity index 100% rename from BioC.dtd rename to test_input/BioC.dtd diff --git a/test_input/example_bioc.xml b/test_input/example_bioc.xml new file mode 100644 index 0000000..ed63890 --- /dev/null +++ b/test_input/example_bioc.xml @@ -0,0 +1,16 @@ + + + + Example + 1999-Jan-1 + PubTator.key + + 20085714 + + + + + + + + \ No newline at end of file diff --git a/test_input/example_input.json b/test_input/example_input.json new file mode 100644 index 0000000..eb968fd --- /dev/null +++ b/test_input/example_input.json @@ -0,0 +1,35 @@ +{ + "key": "ctdBCIVLearningDataSet.key", + "source": "PUBMED", + "date": "20130422", + "documents": [ + { + "infons": {}, + "relations": [], + "passages": [ + { + "offset": 0, + "sentences": [], + "annotations": [], + "relations": [], + "text": "Possible role of valvular serotonin 5-HT(2B) receptors in the cardiopathy associated with fenfluramine.", + "infons": { + "type": "title" + } + }, + { + "offset": 104, + "sentences": [], + "annotations": [], + "relations": [], + "text": "Dexfenfluramine was approved in the United States for long-term use as an appetite suppressant until it was reported to be associated with valvular heart disease. The valvular changes (myofibroblast proliferation) are histopathologically indistinguishable from those observed in carcinoid disease or after long-term exposure to 5-hydroxytryptamine (5-HT)(2)-preferring ergot drugs (ergotamine, methysergide). 
5-HT(2) receptor stimulation is known to cause fibroblast mitogenesis, which could contribute to this lesion. To elucidate the mechanism of \"fen-phen\"-associated valvular lesions, we examined the interaction of fenfluramine and its metabolite norfenfluramine with 5-HT(2) receptor subtypes and examined the expression of these receptors in human and porcine heart valves. Fenfluramine binds weakly to 5-HT(2A), 5-HT(2B), and 5-HT(2C) receptors. In contrast, norfenfluramine exhibited high affinity for 5-HT(2B) and 5-HT(2C) receptors and more moderate affinity for 5-HT(2A) receptors. In cells expressing recombinant 5-HT(2B) receptors, norfenfluramine potently stimulated the hydrolysis of inositol phosphates, increased intracellular Ca(2+), and activated the mitogen-activated protein kinase cascade, the latter of which has been linked to mitogenic actions of the 5-HT(2B) receptor. The level of 5-HT(2B) and 5-HT(2A) receptor transcripts in heart valves was at least 300-fold higher than the levels of 5-HT(2C) receptor transcript, which were barely detectable. We propose that preferential stimulation of valvular 5-HT(2B) receptors by norfenfluramine, ergot drugs, or 5-HT released from carcinoid tumors (with or without accompanying 5-HT(2A) receptor activation) may contribute to valvular fibroplasia in humans.", + "infons": { + "type": "abstract" + } + } + ], + "id": "10617681" + } + ], + "infons": {} +}