From a391e9f21808cdf954e3aa70cc857c923d73502a Mon Sep 17 00:00:00 2001 From: Max Nanis Date: Wed, 25 Feb 2015 14:06:51 -0800 Subject: [PATCH 01/21] PyBioC to follow PyFlakes & pep8 standards - Flake8 until reasonably okay - gitignore for repo --- .gitignore | 1 + src/bioc/bioc_annotation.py | 3 +- src/bioc/bioc_collection.py | 15 +++---- src/bioc/bioc_document.py | 8 ++-- src/bioc/bioc_location.py | 7 ++-- src/bioc/bioc_node.py | 9 +++-- src/bioc/bioc_passage.py | 15 +++---- src/bioc/bioc_reader.py | 76 +++++++++++++++++------------------- src/bioc/bioc_relation.py | 6 +-- src/bioc/bioc_sentence.py | 20 +++++----- src/bioc/bioc_writer.py | 60 +++++++++++++--------------- src/bioc/compat/_py2_next.py | 6 ++- src/bioc/meta/__init__.py | 3 +- src/bioc/meta/_bioc_meta.py | 22 +++++++---- src/bioc/meta/_iter.py | 1 + src/stemmer.py | 35 +++++++++-------- src/test_read+write.py | 3 +- 17 files changed, 149 insertions(+), 141 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0d20b64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.pyc diff --git a/src/bioc/bioc_annotation.py b/src/bioc/bioc_annotation.py index bc6af21..d0feaca 100644 --- a/src/bioc/bioc_annotation.py +++ b/src/bioc/bioc_annotation.py @@ -2,10 +2,11 @@ from meta import _MetaId, _MetaInfons, _MetaText + class BioCAnnotation(_MetaId, _MetaInfons, _MetaText): def __init__(self, annotation=None): - + self.id = '' self.infons = dict() self.locations = list() diff --git a/src/bioc/bioc_collection.py b/src/bioc/bioc_collection.py index 9abc164..28aa009 100644 --- a/src/bioc/bioc_collection.py +++ b/src/bioc/bioc_collection.py @@ -3,10 +3,11 @@ from meta import _MetaInfons, _MetaIter from compat import _Py2Next + class BioCCollection(_Py2Next, _MetaInfons, _MetaIter): def __init__(self, collection=None): - + self.infons = dict() self.source = '' self.date = '' @@ -31,18 +32,18 @@ def __str__(self): def _iterdata(self): return self.documents - + def clear_documents(self): self.documents = list() def get_document(self, doc_idx): - return self.documents[doc_idx] + return self.documents[doc_idx] def add_document(self, document): self.documents.append(document) def remove_document(self, document): - if type(document) is int: - self.dcouments.remove(self.documents[document]) - else: - self.documents.remove(document) # TBC + if type(document) is int: + self.dcouments.remove(self.documents[document]) + else: + self.documents.remove(document) # TBC diff --git a/src/bioc/bioc_document.py b/src/bioc/bioc_document.py index 8033f13..c9d89ca 100644 --- a/src/bioc/bioc_document.py +++ b/src/bioc/bioc_document.py @@ -3,8 +3,8 @@ from compat import _Py2Next from meta import _MetaId, _MetaInfons, _MetaRelations, _MetaIter -class BioCDocument(_MetaId, _MetaInfons, _MetaRelations, _MetaIter, - _Py2Next): + +class BioCDocument(_MetaId, _MetaInfons, _MetaRelations, _MetaIter, _Py2Next): def __init__(self, document=None): @@ -31,7 +31,7 @@ def _iterdata(self): return self.passages def get_size(self): - return self.passages.size() # As in Java BioC + return self.passages.size() # As in Java BioC def clear_passages(self): self.passages = list() @@ -43,4 +43,4 @@ def remove_passage(self, passage): if type(passage) is int: self.passages.remove(self.passages[passage]) else: - self.passages.remove(passage) # TBC + self.passages.remove(passage) # TBC diff --git a/src/bioc/bioc_location.py b/src/bioc/bioc_location.py index 2e574d0..20b7e17 100644 --- a/src/bioc/bioc_location.py +++ b/src/bioc/bioc_location.py @@ -2,16 +2,17 @@ from meta import _MetaOffset + class BioCLocation(_MetaOffset): def __init__(self, location=None): - + self.offset = '-1' self.length = '0' if location is not None: - self.offset = location.offset - self.length = location.length + self.offset = location.offset + self.length = location.length def __str__(self): s = str(self.offset) + ':' + str(self.length) diff --git a/src/bioc/bioc_node.py b/src/bioc/bioc_node.py index a4b2526..475a86c 100644 --- a/src/bioc/bioc_node.py +++ b/src/bioc/bioc_node.py @@ -1,9 +1,10 @@ __all__ = ['BioCNode'] + class BioCNode: def __init__(self, node=None, refid=None, role=None): - + self.refid = '' self.role = '' @@ -17,7 +18,7 @@ def __init__(self, node=None, refid=None, role=None): self.role = role def __str__(self): - s = 'refid: ' + self.refid + '\n' - s += 'role: ' + self.role + '\n' + s = 'refid: ' + self.refid + '\n' + s += 'role: ' + self.role + '\n' - return s + return s diff --git a/src/bioc/bioc_passage.py b/src/bioc/bioc_passage.py index 4347675..412c07d 100644 --- a/src/bioc/bioc_passage.py +++ b/src/bioc/bioc_passage.py @@ -1,13 +1,14 @@ __all__ = ['BioCPassage'] -from meta import _MetaAnnotations, _MetaInfons, _MetaOffset, \ - _MetaRelations, _MetaText +from meta import _MetaAnnotations, _MetaInfons, _MetaOffset, _MetaRelations, \ + _MetaText -class BioCPassage(_MetaAnnotations, _MetaOffset, _MetaText, - _MetaRelations, _MetaInfons): + +class BioCPassage(_MetaAnnotations, _MetaOffset, _MetaText, _MetaRelations, + _MetaInfons): def __init__(self, passage=None): - + self.offset = '-1' self.text = '' self.infons = dict() @@ -34,12 +35,12 @@ def add_sentence(self, sentence): self.sentences.append(sentence) def sentences_iterator(self): - return self.sentences.iterator() # TBD + return self.sentences.iterator() # TBD def clear_sentences(self): self.relations = list() - def remove_sentence(self, sentence): # int or obj + def remove_sentence(self, sentence): # int or obj if type(sentence) is int: self.sentences.remove(self.sentences[sentence]) else: diff --git a/src/bioc/bioc_reader.py b/src/bioc/bioc_reader.py index a35b9f7..d285a6d 100644 --- a/src/bioc/bioc_reader.py +++ b/src/bioc/bioc_reader.py @@ -13,9 +13,10 @@ from bioc_node import BioCNode from bioc_relation import BioCRelation + class BioCReader: """ - This class can be used to store BioC XML files in PyBioC objects, + This class can be used to store BioC XML files in PyBioC objects, for further manipulation. """ @@ -25,16 +26,16 @@ def __init__(self, source, dtd_valid_file=None): dtd_valid_file: File path to a BioC.dtd file. Using this optional argument ensures DTD validation. """ - + self.source = source self.collection = BioCCollection() self.xml_tree = etree.parse(source) - + if dtd_valid_file is not None: dtd = etree.DTD(dtd_valid_file) if dtd.validate(self.xml_tree) is False: raise(Exception(dtd.error_log.filter_from_errors()[0])) - + def read(self): """ Invoke this method in order to read in the file provided by @@ -42,25 +43,24 @@ def read(self): called the BioCReader object gets populated. """ self._read_collection() - + def _read_collection(self): collection_elem = self.xml_tree.xpath('/collection')[0] - + self.collection.source = collection_elem.xpath('source')[0].text self.collection.date = collection_elem.xpath('date')[0].text self.collection.key = collection_elem.xpath('key')[0].text - + infon_elem_list = collection_elem.xpath('infon') document_elem_list = collection_elem.xpath('document') - + self._read_infons(infon_elem_list, self.collection) self._read_documents(document_elem_list) - - + def _read_infons(self, infon_elem_list, infons_parent_elem): for infon_elem in infon_elem_list: infons_parent_elem.put_infon(self._get_infon_key(infon_elem), - infon_elem.text) + infon_elem.text) def _read_documents(self, document_elem_list): for document_elem in document_elem_list: @@ -70,8 +70,8 @@ def _read_documents(self, document_elem_list): self._read_passages(document_elem.xpath('passage'), document) self._read_relations(document_elem.xpath('relation'), - document) - + document) + self.collection.add_document(document) def _read_passages(self, passage_elem_list, document_parent_elem): @@ -79,59 +79,53 @@ def _read_passages(self, passage_elem_list, document_parent_elem): passage = BioCPassage() self._read_infons(passage_elem.xpath('infon'), passage) passage.offset = passage_elem.xpath('offset')[0].text - + # Is this BioC document with ? if len(passage_elem.xpath('sentence')) > 0: - self._read_sentences(passage_elem.xpath('sentence'), - passage) + self._read_sentences(passage_elem.xpath('sentence'), passage) else: # Is the (optional) text element available? - try: + try: passage.text = passage_elem.xpath('text')[0].text except: pass self._read_annotations(passage_elem.xpath('annotation'), - passage) - - self._read_relations(passage_elem.xpath('relation'), - passage) - + passage) + + self._read_relations(passage_elem.xpath('relation'), passage) + document_parent_elem.add_passage(passage) - + def _read_sentences(self, sentence_elem_list, passage_parent_elem): for sentence_elem in sentence_elem_list: sentence = BioCSentence() self._read_infons(sentence_elem.xpath('infon'), sentence) sentence.offset = sentence_elem.xpath('offset')[0].text sentence.text = sentence_elem.xpath('text')[0].text - self._read_annotations(sentence_elem.xpath('annotation'), - sentence) - self._read_relations(sentence_elem.xpath('relation'), - sentence) - + self._read_annotations(sentence_elem.xpath('annotation'), sentence) + self._read_relations(sentence_elem.xpath('relation'), sentence) + passage_parent_elem.add_sentence(sentence) - - def _read_annotations(self, annotation_elem_list, - annotations_parent_elem): + + def _read_annotations(self, annotation_elem_list, annotations_parent_elem): for annotation_elem in annotation_elem_list: annotation = BioCAnnotation() # Attribute id is just #IMPLIED, not #REQUIRED if 'id' in annotation_elem.attrib: annotation.id = annotation_elem.attrib['id'] - self._read_infons(annotation_elem.xpath('infon'), - annotation) - + self._read_infons(annotation_elem.xpath('infon'), annotation) + for location_elem in annotation_elem.xpath('location'): location = BioCLocation() location.offset = location_elem.attrib['offset'] location.length = location_elem.attrib['length'] - + annotation.add_location(location) - + annotation.text = annotation_elem.xpath('text')[0].text - + annotations_parent_elem.add_annotation(annotation) - + def _read_relations(self, relation_elem_list, relations_parent_elem): for relation_elem in relation_elem_list: relation = BioCRelation() @@ -144,10 +138,10 @@ def _read_relations(self, relation_elem_list, relations_parent_elem): node = BioCNode() node.refid = node_elem.attrib['refid'] node.role = node_elem.attrib['role'] - + relation.add_node(node) - + relations_parent_elem.add_relation(relation) - + def _get_infon_key(self, elem): return elem.attrib['key'] diff --git a/src/bioc/bioc_relation.py b/src/bioc/bioc_relation.py index e315c1d..9baf929 100644 --- a/src/bioc/bioc_relation.py +++ b/src/bioc/bioc_relation.py @@ -2,12 +2,12 @@ from compat import _Py2Next from meta import _MetaId, _MetaInfons, _MetaIter -from bioc_node import BioCNode + class BioCRelation(_MetaId, _MetaInfons, _Py2Next, _MetaIter): def __init__(self, relation=None): - + self.id = '' self.nodes = list() self.infons = dict() @@ -31,5 +31,5 @@ def add_node(self, node, refid=None, role=None): # Discard arg ``node'' if optional args fully provided if (refid is not None) and (role is not None): self.add_node(refid=refid, role=role) - else: # Only consider optional args if both set + else: # Only consider optional args if both set self.nodes.append(node) diff --git a/src/bioc/bioc_sentence.py b/src/bioc/bioc_sentence.py index d5d757c..5d70549 100644 --- a/src/bioc/bioc_sentence.py +++ b/src/bioc/bioc_sentence.py @@ -1,15 +1,15 @@ __all__ = ['BioCSentence'] -from meta import _MetaAnnotations, _MetaInfons, _MetaOffset, \ - _MetaRelations, _MetaText - +from meta import _MetaAnnotations, _MetaInfons, _MetaOffset, _MetaRelations, \ + _MetaText -class BioCSentence(_MetaAnnotations, _MetaInfons, _MetaOffset, + +class BioCSentence(_MetaAnnotations, _MetaInfons, _MetaOffset, _MetaRelations, _MetaText): - + def __init__(self, sentence=None): - + self.offset = '-1' self.text = '' self.infons = dict() @@ -25,9 +25,9 @@ def __init__(self, sentence=None): def __str__(self): s = 'offset: ' + str(self.offset) + '\n' - s += 'infons: ' + str(self.infons) + '\n' # TBD - s += 'text: ' + str(self.text) + '\n' # TBD - s += str(self.annotations) + '\n' # TBD - s += str(self.relations) + '\n' # TBD + s += 'infons: ' + str(self.infons) + '\n' # TBD + s += 'text: ' + str(self.text) + '\n' # TBD + s += str(self.annotations) + '\n' # TBD + s += str(self.relations) + '\n' # TBD return s diff --git a/src/bioc/bioc_writer.py b/src/bioc/bioc_writer.py index 38ebf51..d053f7d 100644 --- a/src/bioc/bioc_writer.py +++ b/src/bioc/bioc_writer.py @@ -3,79 +3,76 @@ from lxml.builder import E from lxml.etree import tostring + class BioCWriter: - + def __init__(self, filename=None, collection=None): - + self.root_tree = None - + self.collection = None self.doctype = '''''' self.doctype += '''''' self.filename = filename - + if collection is not None: self.collection = collection - + if filename is not None: self.filename = filename - + def __str__(self): """ A BioCWriter object can be printed as string. """ self._check_for_data() - + self.build() - s = tostring(self.root_tree, - pretty_print=True, - doctype=self.doctype) - + s = tostring(self.root_tree, pretty_print=True, doctype=self.doctype) + return s - + def _check_for_data(self): if self.collection is None: raise(Exception('No data available.')) - + def write(self, filename=None): """ Use this method to write the data in the PyBioC objects to disk. - + filename: Output file path (optional argument; filename provided by __init__ used otherwise.) """ if filename is not None: self.filename = filename - + if self.filename is None: raise(Exception('No output file path provided.')) - + f = open(self.filename, 'w') f.write(self.__str__()) - + def build(self): self._build_collection() - + def _build_collection(self): - self.root_tree = E('collection', - E('source'), E('date'), E('key')) + self.root_tree = E('collection', E('source'), E('date'), E('key')) self.root_tree.xpath('source')[0].text = self.collection.source self.root_tree.xpath('date')[0].text = self.collection.date - self.root_tree.xpath('key')[0].text = self.collection.key + self.root_tree.xpath('key')[0].text = self.collection.key collection_elem = self.root_tree.xpath('/collection')[0] # infon* self._build_infons(self.collection.infons, collection_elem) # document+ - self._build_documents(self.collection.documents, - collection_elem) - + self._build_documents(self.collection.documents, collection_elem) + def _build_infons(self, infons_dict, infons_parent_elem): for infon_key, infon_val in infons_dict.items(): infons_parent_elem.append(E('infon')) infon_elem = infons_parent_elem.xpath('infon')[-1] - + infon_elem.attrib['key'] = infon_key infon_elem.text = infon_val - + def _build_documents(self, documents_list, collection_parent_elem): for document in documents_list: collection_parent_elem.append(E('document', E('id'))) @@ -89,7 +86,7 @@ def _build_documents(self, documents_list, collection_parent_elem): self._build_passages(document.passages, document_elem) # relation* self._build_relations(document.relations, document_elem) - + def _build_passages(self, passages_list, document_parent_elem): for passage in passages_list: document_parent_elem.append(E('passage')) @@ -106,11 +103,11 @@ def _build_passages(self, passages_list, document_parent_elem): # text?, annotation* passage_elem.append(E('text')) passage_elem.xpath('text')[0].text = passage.text - self._build_annotations(passage.annotations, + self._build_annotations(passage.annotations, passage_elem) # relation* self._build_relations(passage.relations, passage_elem) - + def _build_relations(self, relations_list, relations_parent_elem): for relation in relations_list: relations_parent_elem.append(E('relation')) @@ -126,9 +123,8 @@ def _build_relations(self, relations_list, relations_parent_elem): # id (just #IMPLIED) if len(relation.id) > 0: relation_elem.attrib['id'] = relation.id - - def _build_annotations(self, annotations_list, - annotations_parent_elem): + + def _build_annotations(self, annotations_list, annotations_parent_elem): for annotation in annotations_list: annotations_parent_elem.append(E('annotation')) annotation_elem = \ diff --git a/src/bioc/compat/_py2_next.py b/src/bioc/compat/_py2_next.py index 745018e..7578301 100644 --- a/src/bioc/compat/_py2_next.py +++ b/src/bioc/compat/_py2_next.py @@ -1,5 +1,7 @@ __all__ = [] + class _Py2Next: - def __next__(self): - self.next() + + def __next__(self): + self.next() diff --git a/src/bioc/meta/__init__.py b/src/bioc/meta/__init__.py index d87c4f9..10d3143 100644 --- a/src/bioc/meta/__init__.py +++ b/src/bioc/meta/__init__.py @@ -3,5 +3,6 @@ __author__ = 'Hernani Marques (h2m@access.uzh.ch)' from _bioc_meta import _MetaAnnotations, _MetaInfons, _MetaOffset, \ - _MetaRelations, _MetaText, _MetaId + _MetaRelations, _MetaText, _MetaId + from _iter import _MetaIter diff --git a/src/bioc/meta/_bioc_meta.py b/src/bioc/meta/_bioc_meta.py index 268cfbe..0e26e25 100644 --- a/src/bioc/meta/_bioc_meta.py +++ b/src/bioc/meta/_bioc_meta.py @@ -1,10 +1,11 @@ __all__ = [] + class _MetaAnnotations: annotations = list() def annotation_iterator(self): - return self.annotations.iterator() # TBD + return self.annotations.iterator() # TBD def clear_annotations(self): self.annotations = list() @@ -12,32 +13,35 @@ def clear_annotations(self): def add_annotation(self, annotation): self.annotations.append(annotation) - def remove_annotation(self, annotation): # Can be int or obj + def remove_annotation(self, annotation): # Can be int or obj if type(annotation) is int: self.annotations.remove(self.annotations[annotation]) else: - self.annotations.remove(annotation) # TBC + self.annotations.remove(annotation) # TBC + class _MetaInfons: infons = dict() def put_infon(self, key, val): - self.infons[key] = val + self.infons[key] = val def remove_infon(self, key): - del(self.infons[key]) + del(self.infons[key]) def clear_infons(self): self.infons = dict() + class _MetaOffset: offset = '-1' + class _MetaRelations: relations = list() def relation_iterator(self): - return self.relations.iterator() # TBD + return self.relations.iterator() # TBD def clear_relations(self): self.relations = list() @@ -45,14 +49,16 @@ def clear_relations(self): def add_relation(self, relation): self.relations.append(relation) - def remove_relation(self, relation): # Can be int or obj + def remove_relation(self, relation): # Can be int or obj if type(relation) is int: self.relations.remove(self.relations[relation]) else: - self.relations.remove(relation) # TBC + self.relations.remove(relation) # TBC + class _MetaText: text = '' + class _MetaId: id = '' diff --git a/src/bioc/meta/_iter.py b/src/bioc/meta/_iter.py index 1927634..e5a0dd6 100644 --- a/src/bioc/meta/_iter.py +++ b/src/bioc/meta/_iter.py @@ -1,5 +1,6 @@ __all__ = [] + class _MetaIter: def __iter__(self): diff --git a/src/stemmer.py b/src/stemmer.py index 6eb1d25..e2d6e5a 100755 --- a/src/stemmer.py +++ b/src/stemmer.py @@ -1,8 +1,8 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- +# -*- coding: utf-8 -*- # h2m@access.uzh.ch -from os import curdir, sep +from os import sep import sys from nltk.tokenize import wordpunct_tokenize @@ -16,46 +16,47 @@ BIOC_OUT = 'example_input_stemmed.xml' DTD_FILE = '..' + sep + 'BioC.dtd' + def main(): # Use file defined by BIOC_IN as default if no other provided bioc_in = BIOC_IN if len(sys.argv) >= 2: bioc_in = sys.argv[1] - + # A BioCReader object is put in place to hold the example BioC XML # document bioc_reader = BioCReader(bioc_in, dtd_valid_file=DTD_FILE) - + # A BioCWRiter object is prepared to write out the annotated data bioc_writer = BioCWriter(BIOC_OUT) - + # The NLTK porter stemmer is used for stemming stemmer = PorterStemmer() - + # The example input file given above (by BIOC_IN) is fed into # a BioCReader object; validation is done by the BioC DTD bioc_reader.read() - + # Pass over basic data bioc_writer.collection = bioc_reader.collection - + # Get documents to manipulate documents = bioc_writer.collection.documents - + # Go through each document annotation_id = 0 for document in documents: - + # Go through each passage of the document for passage in document: # Stem all the tokens found - stems = [stemmer.stem(token) for + stems = [stemmer.stem(token) for token in wordpunct_tokenize(passage.text)] # Add an anotation showing the stemmed version, in the # given order for stem in stems: annotation_id += 1 - + # For each token an annotation is created, providing # the surface form of a 'stemmed token'. # (The annotations are collectively added following @@ -63,16 +64,16 @@ def main(): bioc_annotation = BioCAnnotation() bioc_annotation.text = stem bioc_annotation.id = str(annotation_id) - bioc_annotation.put_infon('surface form', + bioc_annotation.put_infon('surface form', 'stemmed token') passage.add_annotation(bioc_annotation) - + # Print file to screen w/o trailing newline # (Can be redirected into a file, e. g output_bioc.xml) sys.stdout.write(str(bioc_writer)) - + # Write to disk bioc_writer.write() - -if __name__ == '__main__': + +if __name__ == '__main__': main() diff --git a/src/test_read+write.py b/src/test_read+write.py index 963f5c8..d165ff2 100755 --- a/src/test_read+write.py +++ b/src/test_read+write.py @@ -6,6 +6,7 @@ test_file = '../test_input/bcIVLearningCorpus.xml' dtd_file = '../test_input/BioC.dtd' + def main(): bioc_reader = BioCReader(test_file, dtd_valid_file=dtd_file) bioc_reader.read() @@ -20,5 +21,5 @@ def main(): bioc_writer.write() print(bioc_writer) -if __name__ == '__main__': +if __name__ == '__main__': main() From e977849567f8c87e4598febf10efc9ba62352d19 Mon Sep 17 00:00:00 2001 From: Max Nanis Date: Fri, 27 Feb 2015 14:12:54 -0800 Subject: [PATCH 02/21] [small] update to correct paths --- src/stemmer.py | 2 +- src/test_read+write.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/stemmer.py b/src/stemmer.py index e2d6e5a..d2f33fb 100755 --- a/src/stemmer.py +++ b/src/stemmer.py @@ -13,7 +13,7 @@ from bioc import BioCWriter BIOC_IN = '..' + sep + 'test_input' + sep + 'example_input.xml' -BIOC_OUT = 'example_input_stemmed.xml' +BIOC_OUT = '..' + sep + 'test_input' + sep + 'example_input_stemmed.xml' DTD_FILE = '..' + sep + 'BioC.dtd' diff --git a/src/test_read+write.py b/src/test_read+write.py index d165ff2..c0bdc62 100755 --- a/src/test_read+write.py +++ b/src/test_read+write.py @@ -3,8 +3,8 @@ from bioc import BioCReader from bioc import BioCWriter -test_file = '../test_input/bcIVLearningCorpus.xml' -dtd_file = '../test_input/BioC.dtd' +test_file = '../test_input/everything-sentence.xml' +dtd_file = '../BioC.dtd' def main(): From a93c488b22192953003b980e6191e46935b92ddc Mon Sep 17 00:00:00 2001 From: Max Nanis Date: Fri, 27 Feb 2015 14:25:06 -0800 Subject: [PATCH 03/21] [small] readme --- README.md | 28 ++++++++++++++++++++++++++++ README.txt | 15 --------------- 2 files changed, 28 insertions(+), 15 deletions(-) create mode 100644 README.md delete mode 100644 README.txt diff --git a/README.md b/README.md new file mode 100644 index 0000000..c67cecb --- /dev/null +++ b/README.md @@ -0,0 +1,28 @@ +#PyBioC + +**[PyBioC][1] is a native python library to deal with BioCreative XML data, +i. e. to read from and to write to it.** + +More information about BioC available [online][2]. + +---------- + +##Usage: + +Two example programs, test_read+write.py and stemming.py are shipped in the `src/` folder. + +- `test_read+write.py` shows the very +basic reading and writing capability +of the library. +- `stemming.py` uses the Python Natural +Language Toolkit (NLTK) library to +manipulate a BioC XML file read in +before; it then tokenizes the +corresponding text, does stemming on +the tokens and transforms the +manipulated PyBioC objects back to +valid BioC XML format. + + +[1]: http://bioc.sourceforge.net/ +[2]: http://bioc.sourceforge.net/ \ No newline at end of file diff --git a/README.txt b/README.txt deleted file mode 100644 index d9069cb..0000000 --- a/README.txt +++ /dev/null @@ -1,15 +0,0 @@ -PyBioC is a native python library to deal with BioCreative XML data, -i. e. to read from and to write to it. - -Usage: ------- -Two example programs, test_read+write.py and stemming.py are shipped in the -src/ folder. - -test_read+write.py shows the very basic reading and writing capability of the -library. - -stemming.py uses the Python Natural Language Toolkit (NLTK) library to -manipulate a BioC XML file read in before; it then tokenizes the corresponding -text, does stemming on the tokens and transforms the manipulated PyBioC -objects back to valid BioC XML format. From 992799d79e3e841c0100e1d7358ddcd7050faab7 Mon Sep 17 00:00:00 2001 From: Max Nanis Date: Mon, 2 Mar 2015 16:06:38 -0800 Subject: [PATCH 04/21] Example for BioC obj >> export (no file) --- .gitignore | 1 + README.md | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 0d20b64..d5770e0 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ *.pyc +src/output_bioc.xml diff --git a/README.md b/README.md index c67cecb..1fca9ef 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Two example programs, test_read+write.py and stemming.py are shipped in the `src basic reading and writing capability of the library. - `stemming.py` uses the Python Natural -Language Toolkit (NLTK) library to +Language Toolkit (NLTK) library to manipulate a BioC XML file read in before; it then tokenizes the corresponding text, does stemming on @@ -23,6 +23,34 @@ the tokens and transforms the manipulated PyBioC objects back to valid BioC XML format. +##Example: + +### Generate BioC object for export + +```python +from bioc import BioCWriter, BioCCollection, BioCDocument, BioCPassage + +writer = BioCWriter() +writer.collection = BioCCollection() +collection = writer.collection +collection.date = '20150301' +collection.source = 'ngy1 corpus' + +document = BioCDocument() +document.id = '123456' # pubmed id + +passage = BioCPassage() +passage.put_infon('type', 'paragraph') +passage.offset = '0' +passage.text = 'This is a biomedical sentence about various rare diseases.' +document.add_passage(passage) + +collection.add_document(document) + +print writer +``` + + [1]: http://bioc.sourceforge.net/ -[2]: http://bioc.sourceforge.net/ \ No newline at end of file +[2]: http://bioc.sourceforge.net/ From f84904364109ea789ac2130116c4344ee80eb4f1 Mon Sep 17 00:00:00 2001 From: Lenz Date: Fri, 29 Jan 2016 11:24:50 +0100 Subject: [PATCH 05/21] more test files --- test_input/example_bioc.xml | 16 ++++++++++ test_input/output_bioc.xml | 60 +++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) create mode 100644 test_input/example_bioc.xml create mode 100644 test_input/output_bioc.xml diff --git a/test_input/example_bioc.xml b/test_input/example_bioc.xml new file mode 100644 index 0000000..ed63890 --- /dev/null +++ b/test_input/example_bioc.xml @@ -0,0 +1,16 @@ + + + + Example + 1999-Jan-1 + PubTator.key + + 20085714 + + + + + + + + \ No newline at end of file diff --git a/test_input/output_bioc.xml b/test_input/output_bioc.xml new file mode 100644 index 0000000..365ff80 --- /dev/null +++ b/test_input/output_bioc.xml @@ -0,0 +1,60 @@ + + + + + + + 000000 + + + 0 + By expressing E-cadherin subdomains, we show that the growth-suppressive effect of E-cadherin required the presence of its cytoplasmic beta-catenin interaction domain and/or correlated strictly with the ability to negatively interfere with beta-catenin transcriptional activity. + + 1499 + Protein + CTNNB1 + p(HGNC:CTNNB1) + + beta-catenin + + + 999 + Protein + CDH1 + p(HGNC:CDH1) + + E-cadherin + + + 1499 + Protein + CTNNB1 + p(HGNC:CTNNB1) + + beta-catenin + + + decreases + + negatively interfere + + + complex(a51,a52) + + + + + transcriptionalActivity + tscript(a53) + + + + directlyDecreases + r51 directlyDecreases r52 + + + + + + + From a0fe375f7934100506fab48b2a60d870503f9ffe Mon Sep 17 00:00:00 2001 From: Lenz Date: Fri, 29 Jan 2016 11:28:31 +0100 Subject: [PATCH 06/21] python 2to3 encoding stuff --- src/bioc/bioc_writer.py | 76 ++++++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 36 deletions(-) diff --git a/src/bioc/bioc_writer.py b/src/bioc/bioc_writer.py index 38ebf51..c6e3e7e 100644 --- a/src/bioc/bioc_writer.py +++ b/src/bioc/bioc_writer.py @@ -4,78 +4,82 @@ from lxml.etree import tostring class BioCWriter: - + def __init__(self, filename=None, collection=None): - + self.root_tree = None - + self.collection = None self.doctype = '''''' self.doctype += '''''' self.filename = filename - + if collection is not None: self.collection = collection - + if filename is not None: self.filename = filename - + def __str__(self): """ A BioCWriter object can be printed as string. """ self._check_for_data() - + self.build() - s = tostring(self.root_tree, - pretty_print=True, - doctype=self.doctype) - + + # Use `encoding=str` to avoid returning a byte object. + # Encode the string when writing to file. + # Set the encoding declaration by using self.doctype. + s = tostring(self.root_tree, + encoding=str, + pretty_print=True, + doctype=self.doctype) + return s - + def _check_for_data(self): if self.collection is None: raise(Exception('No data available.')) - + def write(self, filename=None): """ Use this method to write the data in the PyBioC objects to disk. - + filename: Output file path (optional argument; filename provided by __init__ used otherwise.) """ - if filename is not None: - self.filename = filename - - if self.filename is None: - raise(Exception('No output file path provided.')) - - f = open(self.filename, 'w') - f.write(self.__str__()) - + if filename is None: + if self.filename is None: + raise(Exception('No output file path provided.')) + filename = self.filename + + with open(filename, 'w', encoding='utf-8') as f: + f.write(str(self)) + def build(self): self._build_collection() - + def _build_collection(self): - self.root_tree = E('collection', + self.root_tree = E('collection', E('source'), E('date'), E('key')) self.root_tree.xpath('source')[0].text = self.collection.source self.root_tree.xpath('date')[0].text = self.collection.date - self.root_tree.xpath('key')[0].text = self.collection.key + self.root_tree.xpath('key')[0].text = self.collection.key collection_elem = self.root_tree.xpath('/collection')[0] # infon* self._build_infons(self.collection.infons, collection_elem) # document+ - self._build_documents(self.collection.documents, + self._build_documents(self.collection.documents, collection_elem) - + def _build_infons(self, infons_dict, infons_parent_elem): for infon_key, infon_val in infons_dict.items(): infons_parent_elem.append(E('infon')) infon_elem = infons_parent_elem.xpath('infon')[-1] - + infon_elem.attrib['key'] = infon_key infon_elem.text = infon_val - + def _build_documents(self, documents_list, collection_parent_elem): for document in documents_list: collection_parent_elem.append(E('document', E('id'))) @@ -89,7 +93,7 @@ def _build_documents(self, documents_list, collection_parent_elem): self._build_passages(document.passages, document_elem) # relation* self._build_relations(document.relations, document_elem) - + def _build_passages(self, passages_list, document_parent_elem): for passage in passages_list: document_parent_elem.append(E('passage')) @@ -106,11 +110,11 @@ def _build_passages(self, passages_list, document_parent_elem): # text?, annotation* passage_elem.append(E('text')) passage_elem.xpath('text')[0].text = passage.text - self._build_annotations(passage.annotations, + self._build_annotations(passage.annotations, passage_elem) # relation* self._build_relations(passage.relations, passage_elem) - + def _build_relations(self, relations_list, relations_parent_elem): for relation in relations_list: relations_parent_elem.append(E('relation')) @@ -126,9 +130,9 @@ def _build_relations(self, relations_list, relations_parent_elem): # id (just #IMPLIED) if len(relation.id) > 0: relation_elem.attrib['id'] = relation.id - - def _build_annotations(self, annotations_list, - annotations_parent_elem): + + def _build_annotations(self, annotations_list, + annotations_parent_elem): for annotation in annotations_list: annotations_parent_elem.append(E('annotation')) annotation_elem = \ From 0dae8b37b0b7d7f760b8992d7550838f6dbc1667 Mon Sep 17 00:00:00 2001 From: Lenz Date: Fri, 29 Jan 2016 11:28:49 +0100 Subject: [PATCH 07/21] relative imports everywhere --- src/bioc/__init__.py | 20 ++++++++++---------- src/bioc/bioc_annotation.py | 2 +- src/bioc/bioc_collection.py | 4 ++-- src/bioc/bioc_document.py | 4 ++-- src/bioc/bioc_location.py | 2 +- src/bioc/bioc_passage.py | 4 ++-- src/bioc/bioc_reader.py | 20 ++++++++++---------- src/bioc/bioc_relation.py | 6 +++--- src/bioc/bioc_sentence.py | 4 ++-- src/bioc/compat/__init__.py | 2 +- src/bioc/meta/__init__.py | 6 +++--- src/test_read+write.py | 2 +- 12 files changed, 38 insertions(+), 38 deletions(-) diff --git a/src/bioc/__init__.py b/src/bioc/__init__.py index 5df7fef..0be6a46 100644 --- a/src/bioc/__init__.py +++ b/src/bioc/__init__.py @@ -12,13 +12,13 @@ __author__ = 'Hernani Marques (h2m@access.uzh.ch)' -from bioc_annotation import BioCAnnotation -from bioc_collection import BioCCollection -from bioc_document import BioCDocument -from bioc_location import BioCLocation -from bioc_node import BioCNode -from bioc_passage import BioCPassage -from bioc_relation import BioCRelation -from bioc_sentence import BioCSentence -from bioc_reader import BioCReader -from bioc_writer import BioCWriter +from .bioc_annotation import BioCAnnotation +from .bioc_collection import BioCCollection +from .bioc_document import BioCDocument +from .bioc_location import BioCLocation +from .bioc_node import BioCNode +from .bioc_passage import BioCPassage +from .bioc_relation import BioCRelation +from .bioc_sentence import BioCSentence +from .bioc_reader import BioCReader +from .bioc_writer import BioCWriter diff --git a/src/bioc/bioc_annotation.py b/src/bioc/bioc_annotation.py index bc6af21..e941fc3 100644 --- a/src/bioc/bioc_annotation.py +++ b/src/bioc/bioc_annotation.py @@ -1,6 +1,6 @@ __all__ = ['BioCAnnotation'] -from meta import _MetaId, _MetaInfons, _MetaText +from .meta import _MetaId, _MetaInfons, _MetaText class BioCAnnotation(_MetaId, _MetaInfons, _MetaText): diff --git a/src/bioc/bioc_collection.py b/src/bioc/bioc_collection.py index 9abc164..66e29fc 100644 --- a/src/bioc/bioc_collection.py +++ b/src/bioc/bioc_collection.py @@ -1,7 +1,7 @@ __all__ = ['BioCCollection'] -from meta import _MetaInfons, _MetaIter -from compat import _Py2Next +from .meta import _MetaInfons, _MetaIter +from .compat import _Py2Next class BioCCollection(_Py2Next, _MetaInfons, _MetaIter): diff --git a/src/bioc/bioc_document.py b/src/bioc/bioc_document.py index 8033f13..f2dffae 100644 --- a/src/bioc/bioc_document.py +++ b/src/bioc/bioc_document.py @@ -1,7 +1,7 @@ __all__ = ['BioCDocument'] -from compat import _Py2Next -from meta import _MetaId, _MetaInfons, _MetaRelations, _MetaIter +from .compat import _Py2Next +from .meta import _MetaId, _MetaInfons, _MetaRelations, _MetaIter class BioCDocument(_MetaId, _MetaInfons, _MetaRelations, _MetaIter, _Py2Next): diff --git a/src/bioc/bioc_location.py b/src/bioc/bioc_location.py index 2e574d0..a3d5169 100644 --- a/src/bioc/bioc_location.py +++ b/src/bioc/bioc_location.py @@ -1,6 +1,6 @@ __all__ = ['BioCLocation'] -from meta import _MetaOffset +from .meta import _MetaOffset class BioCLocation(_MetaOffset): diff --git a/src/bioc/bioc_passage.py b/src/bioc/bioc_passage.py index 4347675..f5e1223 100644 --- a/src/bioc/bioc_passage.py +++ b/src/bioc/bioc_passage.py @@ -1,7 +1,7 @@ __all__ = ['BioCPassage'] -from meta import _MetaAnnotations, _MetaInfons, _MetaOffset, \ - _MetaRelations, _MetaText +from .meta import _MetaAnnotations, _MetaInfons, _MetaOffset, \ + _MetaRelations, _MetaText class BioCPassage(_MetaAnnotations, _MetaOffset, _MetaText, _MetaRelations, _MetaInfons): diff --git a/src/bioc/bioc_reader.py b/src/bioc/bioc_reader.py index a35b9f7..effc9ed 100644 --- a/src/bioc/bioc_reader.py +++ b/src/bioc/bioc_reader.py @@ -1,17 +1,17 @@ __all__ = ['BioCReader'] -import StringIO +from io import StringIO from lxml import etree -from bioc_annotation import BioCAnnotation -from bioc_collection import BioCCollection -from bioc_document import BioCDocument -from bioc_location import BioCLocation -from bioc_passage import BioCPassage -from bioc_sentence import BioCSentence -from bioc_node import BioCNode -from bioc_relation import BioCRelation +from .bioc_annotation import BioCAnnotation +from .bioc_collection import BioCCollection +from .bioc_document import BioCDocument +from .bioc_location import BioCLocation +from .bioc_passage import BioCPassage +from .bioc_sentence import BioCSentence +from .bioc_node import BioCNode +from .bioc_relation import BioCRelation class BioCReader: """ @@ -86,7 +86,7 @@ def _read_passages(self, passage_elem_list, document_parent_elem): passage) else: # Is the (optional) text element available? - try: + try: passage.text = passage_elem.xpath('text')[0].text except: pass diff --git a/src/bioc/bioc_relation.py b/src/bioc/bioc_relation.py index e315c1d..f47dea7 100644 --- a/src/bioc/bioc_relation.py +++ b/src/bioc/bioc_relation.py @@ -1,8 +1,8 @@ __all__ = ['BioCRelation'] -from compat import _Py2Next -from meta import _MetaId, _MetaInfons, _MetaIter -from bioc_node import BioCNode +from .compat import _Py2Next +from .meta import _MetaId, _MetaInfons, _MetaIter +from .bioc_node import BioCNode class BioCRelation(_MetaId, _MetaInfons, _Py2Next, _MetaIter): diff --git a/src/bioc/bioc_sentence.py b/src/bioc/bioc_sentence.py index d5d757c..1adf4eb 100644 --- a/src/bioc/bioc_sentence.py +++ b/src/bioc/bioc_sentence.py @@ -1,8 +1,8 @@ __all__ = ['BioCSentence'] -from meta import _MetaAnnotations, _MetaInfons, _MetaOffset, \ - _MetaRelations, _MetaText +from .meta import _MetaAnnotations, _MetaInfons, _MetaOffset, \ + _MetaRelations, _MetaText class BioCSentence(_MetaAnnotations, _MetaInfons, _MetaOffset, diff --git a/src/bioc/compat/__init__.py b/src/bioc/compat/__init__.py index 176d401..58d14fb 100644 --- a/src/bioc/compat/__init__.py +++ b/src/bioc/compat/__init__.py @@ -2,4 +2,4 @@ __author__ = 'Hernani Marques (h2m@access.uzh.ch)' -from _py2_next import _Py2Next +from ._py2_next import _Py2Next diff --git a/src/bioc/meta/__init__.py b/src/bioc/meta/__init__.py index d87c4f9..afbce03 100644 --- a/src/bioc/meta/__init__.py +++ b/src/bioc/meta/__init__.py @@ -2,6 +2,6 @@ __author__ = 'Hernani Marques (h2m@access.uzh.ch)' -from _bioc_meta import _MetaAnnotations, _MetaInfons, _MetaOffset, \ - _MetaRelations, _MetaText, _MetaId -from _iter import _MetaIter +from ._bioc_meta import _MetaAnnotations, _MetaInfons, _MetaOffset, \ + _MetaRelations, _MetaText, _MetaId +from ._iter import _MetaIter diff --git a/src/test_read+write.py b/src/test_read+write.py index 963f5c8..8d0b832 100755 --- a/src/test_read+write.py +++ b/src/test_read+write.py @@ -3,7 +3,7 @@ from bioc import BioCReader from bioc import BioCWriter -test_file = '../test_input/bcIVLearningCorpus.xml' +test_file = '../test_input/example_input.xml' dtd_file = '../test_input/BioC.dtd' def main(): From 3f79f5cc86de251e9ddc538b12c1382195bd1ea8 Mon Sep 17 00:00:00 2001 From: Lenz Date: Fri, 29 Jan 2016 12:30:04 +0100 Subject: [PATCH 08/21] move DTD to the test files --- src/stemmer.py | 38 ++++++++++----------- BioC.dtd => test_input/BioC.dtd | 0 test_input/output_bioc.xml | 60 --------------------------------- 3 files changed, 19 insertions(+), 79 deletions(-) rename BioC.dtd => test_input/BioC.dtd (100%) delete mode 100644 test_input/output_bioc.xml diff --git a/src/stemmer.py b/src/stemmer.py index 6eb1d25..61c7798 100755 --- a/src/stemmer.py +++ b/src/stemmer.py @@ -1,8 +1,8 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- +# -*- coding: utf-8 -*- # h2m@access.uzh.ch -from os import curdir, sep +import os import sys from nltk.tokenize import wordpunct_tokenize @@ -12,50 +12,50 @@ from bioc import BioCReader from bioc import BioCWriter -BIOC_IN = '..' + sep + 'test_input' + sep + 'example_input.xml' -BIOC_OUT = 'example_input_stemmed.xml' -DTD_FILE = '..' + sep + 'BioC.dtd' +BIOC_IN = os.path.join('..', 'test_input', 'example_input.xml') +BIOC_OUT = os.path.join('example_input_stemmed.xml') +DTD_FILE = os.path.join('..', 'test_input', 'BioC.dtd') def main(): # Use file defined by BIOC_IN as default if no other provided bioc_in = BIOC_IN if len(sys.argv) >= 2: bioc_in = sys.argv[1] - + # A BioCReader object is put in place to hold the example BioC XML # document bioc_reader = BioCReader(bioc_in, dtd_valid_file=DTD_FILE) - + # A BioCWRiter object is prepared to write out the annotated data bioc_writer = BioCWriter(BIOC_OUT) - + # The NLTK porter stemmer is used for stemming stemmer = PorterStemmer() - + # The example input file given above (by BIOC_IN) is fed into # a BioCReader object; validation is done by the BioC DTD bioc_reader.read() - + # Pass over basic data bioc_writer.collection = bioc_reader.collection - + # Get documents to manipulate documents = bioc_writer.collection.documents - + # Go through each document annotation_id = 0 for document in documents: - + # Go through each passage of the document for passage in document: # Stem all the tokens found - stems = [stemmer.stem(token) for + stems = [stemmer.stem(token) for token in wordpunct_tokenize(passage.text)] # Add an anotation showing the stemmed version, in the # given order for stem in stems: annotation_id += 1 - + # For each token an annotation is created, providing # the surface form of a 'stemmed token'. # (The annotations are collectively added following @@ -63,16 +63,16 @@ def main(): bioc_annotation = BioCAnnotation() bioc_annotation.text = stem bioc_annotation.id = str(annotation_id) - bioc_annotation.put_infon('surface form', + bioc_annotation.put_infon('surface form', 'stemmed token') passage.add_annotation(bioc_annotation) - + # Print file to screen w/o trailing newline # (Can be redirected into a file, e. g output_bioc.xml) sys.stdout.write(str(bioc_writer)) - + # Write to disk bioc_writer.write() - + if __name__ == '__main__': main() diff --git a/BioC.dtd b/test_input/BioC.dtd similarity index 100% rename from BioC.dtd rename to test_input/BioC.dtd diff --git a/test_input/output_bioc.xml b/test_input/output_bioc.xml deleted file mode 100644 index 365ff80..0000000 --- a/test_input/output_bioc.xml +++ /dev/null @@ -1,60 +0,0 @@ - - - - - - - 000000 - - - 0 - By expressing E-cadherin subdomains, we show that the growth-suppressive effect of E-cadherin required the presence of its cytoplasmic beta-catenin interaction domain and/or correlated strictly with the ability to negatively interfere with beta-catenin transcriptional activity. - - 1499 - Protein - CTNNB1 - p(HGNC:CTNNB1) - - beta-catenin - - - 999 - Protein - CDH1 - p(HGNC:CDH1) - - E-cadherin - - - 1499 - Protein - CTNNB1 - p(HGNC:CTNNB1) - - beta-catenin - - - decreases - - negatively interfere - - - complex(a51,a52) - - - - - transcriptionalActivity - tscript(a53) - - - - directlyDecreases - r51 directlyDecreases r52 - - - - - - - From 848657078fd5874818165b172eb1ef9c908f09b1 Mon Sep 17 00:00:00 2001 From: Lenz Date: Fri, 29 Jan 2016 16:02:10 +0100 Subject: [PATCH 09/21] restore Py2 compatibility --- CHANGES.txt | 11 +++++++++++ src/bioc/__init__.py | 2 +- src/bioc/bioc_writer.py | 12 ++++++------ 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 05366d4..1a8cdc5 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,3 +1,14 @@ +1.02-lf.1 +--------- +Approach PEP8, especially by making implicit relative import explicit. +Changes by Tilia Ellendorff , +Max Nanis , Lenz Furrer . + +1.02 +---- +Don't assume text element in passage must be avilable. +(Thanks to Adrian van der Lek .) + 1.01 ---- Fix invalid handling of id attributes for annotation and relation tags. diff --git a/src/bioc/__init__.py b/src/bioc/__init__.py index 0be6a46..d4b3bca 100644 --- a/src/bioc/__init__.py +++ b/src/bioc/__init__.py @@ -2,7 +2,7 @@ # Package for interoperability in BioCreative work # -__version__ = '1.02' +__version__ = '1.02-lf.1' __all__ = [ 'BioCAnnotation', 'BioCCollection', 'BioCDocument', diff --git a/src/bioc/bioc_writer.py b/src/bioc/bioc_writer.py index 49db42c..c0b0749 100644 --- a/src/bioc/bioc_writer.py +++ b/src/bioc/bioc_writer.py @@ -24,15 +24,15 @@ def __init__(self, filename=None, collection=None): def __str__(self): """ A BioCWriter object can be printed as string. """ + return self._tostring(encoding='unicode') + + def _tostring(self, encoding='UTF-8'): self._check_for_data() self.build() - # Use `encoding=str` to avoid returning a byte object. - # Encode the string when writing to file. - # Set the encoding declaration by using self.doctype. s = tostring(self.root_tree, - encoding=str, + encoding=encoding, pretty_print=True, doctype=self.doctype) @@ -54,8 +54,8 @@ def write(self, filename=None): raise(Exception('No output file path provided.')) filename = self.filename - with open(filename, 'w', encoding='utf-8') as f: - f.write(str(self)) + with open(filename, 'wb') as f: + f.write(self._tostring(encoding='UTF-8')) def build(self): self._build_collection() From 9eba4e1e617702ca21808e1344b39d1bfbce9140 Mon Sep 17 00:00:00 2001 From: Lenz Date: Fri, 29 Jan 2016 16:29:23 +0100 Subject: [PATCH 10/21] add a setup script --- setup.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 setup.py diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..692b55f --- /dev/null +++ b/setup.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python +# coding: utf8 + +from setuptools import setup, find_packages + +with open('README.md') as f: + readme = f.read() + +setup(name='PyBioC', + version='1.02-lf.1', + author='Hernani Marques', + author_email='h2m@access.uzh.ch', + description='Python library to deal with BioCreative XML data', + long_description=readme, + packages=find_packages()) From a10338ab43d85660529787828c84e9b040728f13 Mon Sep 17 00:00:00 2001 From: Lenz Date: Fri, 29 Jan 2016 17:19:03 +0100 Subject: [PATCH 11/21] bugfix in setup --- setup.py | 7 ++++--- src/bioc/__init__.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 692b55f..1b91ca6 100644 --- a/setup.py +++ b/setup.py @@ -1,15 +1,16 @@ #!/usr/bin/env python # coding: utf8 -from setuptools import setup, find_packages +from distutils.core import setup with open('README.md') as f: readme = f.read() setup(name='PyBioC', - version='1.02-lf.1', + version='1.02.1', author='Hernani Marques', author_email='h2m@access.uzh.ch', description='Python library to deal with BioCreative XML data', long_description=readme, - packages=find_packages()) + packages=['bioc', 'bioc.meta', 'bioc.compat'], + package_dir={'bioc': 'src/bioc'}) diff --git a/src/bioc/__init__.py b/src/bioc/__init__.py index d4b3bca..bbdfd79 100644 --- a/src/bioc/__init__.py +++ b/src/bioc/__init__.py @@ -2,7 +2,7 @@ # Package for interoperability in BioCreative work # -__version__ = '1.02-lf.1' +__version__ = '1.02.1' __all__ = [ 'BioCAnnotation', 'BioCCollection', 'BioCDocument', From fb240308b336ca0e6f092c5e1f4a310d3e33fffb Mon Sep 17 00:00:00 2001 From: Lenz Date: Mon, 8 Feb 2016 15:34:08 +0100 Subject: [PATCH 12/21] fix version number --- CHANGES.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.txt b/CHANGES.txt index 1a8cdc5..3470c01 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,4 @@ -1.02-lf.1 +1.02.1 --------- Approach PEP8, especially by making implicit relative import explicit. Changes by Tilia Ellendorff , From c191a8dd94e6ffe4c784d8d7fa2e3e839aeeeb51 Mon Sep 17 00:00:00 2001 From: Lenz Date: Thu, 25 Feb 2016 09:19:56 +0100 Subject: [PATCH 13/21] various small bugfixes --- CHANGES.txt | 4 ++++ src/bioc/__init__.py | 2 +- src/bioc/bioc_collection.py | 2 +- src/bioc/bioc_document.py | 4 ++-- src/bioc/bioc_passage.py | 7 +++---- src/bioc/bioc_reader.py | 12 +++++------- src/bioc/bioc_writer.py | 11 ++++++++++- 7 files changed, 26 insertions(+), 16 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 3470c01..ca5fca6 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,3 +1,7 @@ +1.02.2 +--------- +Various minor bugfixes + 1.02.1 --------- Approach PEP8, especially by making implicit relative import explicit. diff --git a/src/bioc/__init__.py b/src/bioc/__init__.py index bbdfd79..fbdf596 100644 --- a/src/bioc/__init__.py +++ b/src/bioc/__init__.py @@ -2,7 +2,7 @@ # Package for interoperability in BioCreative work # -__version__ = '1.02.1' +__version__ = '1.02.2' __all__ = [ 'BioCAnnotation', 'BioCCollection', 'BioCDocument', diff --git a/src/bioc/bioc_collection.py b/src/bioc/bioc_collection.py index 807858e..5d519a1 100644 --- a/src/bioc/bioc_collection.py +++ b/src/bioc/bioc_collection.py @@ -44,6 +44,6 @@ def add_document(self, document): def remove_document(self, document): if isinstance(document, int): - self.dcouments.remove(self.documents[document]) + self.documents.pop(document) else: self.documents.remove(document) # TBC diff --git a/src/bioc/bioc_document.py b/src/bioc/bioc_document.py index db05505..f86f3a8 100644 --- a/src/bioc/bioc_document.py +++ b/src/bioc/bioc_document.py @@ -31,7 +31,7 @@ def _iterdata(self): return self.passages def get_size(self): - return self.passages.size() # As in Java BioC + return sum(p.size() for p in self.passages) # As in Java BioC def clear_passages(self): self.passages = list() @@ -41,6 +41,6 @@ def add_passage(self, passage): def remove_passage(self, passage): if isinstance(passage, int): - self.passages.remove(self.passages[passage]) + self.passages.pop(passage) else: self.passages.remove(passage) # TBC diff --git a/src/bioc/bioc_passage.py b/src/bioc/bioc_passage.py index 1b961df..f1cb0ba 100644 --- a/src/bioc/bioc_passage.py +++ b/src/bioc/bioc_passage.py @@ -28,8 +28,7 @@ def size(self): return len(self.sentences) def has_sentences(self): - if len(self.sentences) > 0: - return True + return bool(self.sentences) def add_sentence(self, sentence): self.sentences.append(sentence) @@ -38,10 +37,10 @@ def sentences_iterator(self): return self.sentences.iterator() # TBD def clear_sentences(self): - self.relations = list() + self.sentences = list() def remove_sentence(self, sentence): # int or obj if isinstance(sentence, int): - self.sentences.remove(self.sentences[sentence]) + self.sentences.pop(sentence) else: self.sentences.remove(sentence) diff --git a/src/bioc/bioc_reader.py b/src/bioc/bioc_reader.py index adc46c6..75bd86f 100644 --- a/src/bioc/bioc_reader.py +++ b/src/bioc/bioc_reader.py @@ -33,8 +33,8 @@ def __init__(self, source, dtd_valid_file=None): if dtd_valid_file is not None: dtd = etree.DTD(dtd_valid_file) - if dtd.validate(self.xml_tree) is False: - raise(Exception(dtd.error_log.filter_from_errors()[0])) + if not dtd.validate(self.xml_tree): + raise Exception(dtd.error_log.filter_from_errors()[0]) def read(self): """ @@ -57,9 +57,10 @@ def _read_collection(self): self._read_infons(infon_elem_list, self.collection) self._read_documents(document_elem_list) - def _read_infons(self, infon_elem_list, infons_parent_elem): + @staticmethod + def _read_infons(infon_elem_list, infons_parent_elem): for infon_elem in infon_elem_list: - infons_parent_elem.put_infon(self._get_infon_key(infon_elem), + infons_parent_elem.put_infon(infon_elem.attrib['key'], infon_elem.text) def _read_documents(self, document_elem_list): @@ -142,6 +143,3 @@ def _read_relations(self, relation_elem_list, relations_parent_elem): relation.add_node(node) relations_parent_elem.add_relation(relation) - - def _get_infon_key(self, elem): - return elem.attrib['key'] diff --git a/src/bioc/bioc_writer.py b/src/bioc/bioc_writer.py index c0b0749..9e7cb23 100644 --- a/src/bioc/bioc_writer.py +++ b/src/bioc/bioc_writer.py @@ -1,9 +1,18 @@ __all__ = ['BioCWriter'] +import sys + from lxml.builder import E from lxml.etree import tostring +# Resolve Python 2/3 difference regarding the special method __str__(). +if sys.version_info < (3,): + STR_ENCODING = 'ascii' +else: + STR_ENCODING = 'unicode' + + class BioCWriter: def __init__(self, filename=None, collection=None): @@ -24,7 +33,7 @@ def __init__(self, filename=None, collection=None): def __str__(self): """ A BioCWriter object can be printed as string. """ - return self._tostring(encoding='unicode') + return self._tostring(encoding=STR_ENCODING) def _tostring(self, encoding='UTF-8'): self._check_for_data() From db11634ec04b4f738f82328517c8792cd9644b6a Mon Sep 17 00:00:00 2001 From: Lenz Date: Sat, 27 Feb 2016 23:37:25 +0100 Subject: [PATCH 14/21] make BioCWriter.tostring() public --- src/bioc/bioc_writer.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/bioc/bioc_writer.py b/src/bioc/bioc_writer.py index 9e7cb23..51d4655 100644 --- a/src/bioc/bioc_writer.py +++ b/src/bioc/bioc_writer.py @@ -33,9 +33,16 @@ def __init__(self, filename=None, collection=None): def __str__(self): """ A BioCWriter object can be printed as string. """ - return self._tostring(encoding=STR_ENCODING) + return self.tostring(encoding=STR_ENCODING) - def _tostring(self, encoding='UTF-8'): + def tostring(self, encoding='UTF-8'): + ''' + Serialize the collection to BioC XML. + + Return an encoded string (a bytes object in Python 3), + unless encoding is "unicode", in which case a decoded + string is returned (a unicode object in Python 2). + ''' self._check_for_data() self.build() @@ -64,7 +71,7 @@ def write(self, filename=None): filename = self.filename with open(filename, 'wb') as f: - f.write(self._tostring(encoding='UTF-8')) + f.write(self.tostring(encoding='UTF-8')) def build(self): self._build_collection() From bed11b7f5aaa20cc04379e04b8346e08a966971e Mon Sep 17 00:00:00 2001 From: Lenz Date: Sat, 27 Feb 2016 23:38:27 +0100 Subject: [PATCH 15/21] avoid repeated structure creation in BioCWriter --- CHANGES.txt | 9 +++++++-- setup.py | 2 +- src/bioc/__init__.py | 2 +- src/bioc/bioc_writer.py | 7 ++++--- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index ca5fca6..e3b0097 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,9 +1,14 @@ +1.02.3 +------ +Make `BioCWriter.tostring()` a public method. +Avoid rebuilding the whole structure for every call to `__str__()`. + 1.02.2 ---------- +------ Various minor bugfixes 1.02.1 ---------- +------ Approach PEP8, especially by making implicit relative import explicit. Changes by Tilia Ellendorff , Max Nanis , Lenz Furrer . diff --git a/setup.py b/setup.py index 1b91ca6..22445d3 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ readme = f.read() setup(name='PyBioC', - version='1.02.1', + version='1.02.3', author='Hernani Marques', author_email='h2m@access.uzh.ch', description='Python library to deal with BioCreative XML data', diff --git a/src/bioc/__init__.py b/src/bioc/__init__.py index fbdf596..e9cab99 100644 --- a/src/bioc/__init__.py +++ b/src/bioc/__init__.py @@ -2,7 +2,7 @@ # Package for interoperability in BioCreative work # -__version__ = '1.02.2' +__version__ = '1.02.3' __all__ = [ 'BioCAnnotation', 'BioCCollection', 'BioCDocument', diff --git a/src/bioc/bioc_writer.py b/src/bioc/bioc_writer.py index 51d4655..5a2cc44 100644 --- a/src/bioc/bioc_writer.py +++ b/src/bioc/bioc_writer.py @@ -56,7 +56,7 @@ def tostring(self, encoding='UTF-8'): def _check_for_data(self): if self.collection is None: - raise(Exception('No data available.')) + raise Exception('No data available.') def write(self, filename=None): """ Use this method to write the data in the PyBioC objects @@ -67,14 +67,15 @@ def write(self, filename=None): """ if filename is None: if self.filename is None: - raise(Exception('No output file path provided.')) + raise Exception('No output file path provided.') filename = self.filename with open(filename, 'wb') as f: f.write(self.tostring(encoding='UTF-8')) def build(self): - self._build_collection() + if self.root_tree is None: + self._build_collection() def _build_collection(self): self.root_tree = E('collection', E('source'), E('date'), E('key')) From a21bd181ce3b89de791db4bb7db6ca1c48167648 Mon Sep 17 00:00:00 2001 From: Lenz Date: Wed, 4 May 2016 18:13:32 +0200 Subject: [PATCH 16/21] new method BioCWriter.iterfragments() --- CHANGES.txt | 5 +++++ setup.py | 2 +- src/bioc/__init__.py | 2 +- src/bioc/bioc_writer.py | 40 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 47 insertions(+), 2 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index e3b0097..294af88 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,3 +1,8 @@ +1.02.4 +------ +New method `BioCWriter.iterfragments()`: Iterate over serialised XML fragments, +thus avoiding in-memory construction of the complete tree. + 1.02.3 ------ Make `BioCWriter.tostring()` a public method. diff --git a/setup.py b/setup.py index 22445d3..755785f 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ readme = f.read() setup(name='PyBioC', - version='1.02.3', + version='1.02.4', author='Hernani Marques', author_email='h2m@access.uzh.ch', description='Python library to deal with BioCreative XML data', diff --git a/src/bioc/__init__.py b/src/bioc/__init__.py index e9cab99..c240595 100644 --- a/src/bioc/__init__.py +++ b/src/bioc/__init__.py @@ -2,7 +2,7 @@ # Package for interoperability in BioCreative work # -__version__ = '1.02.3' +__version__ = '1.02.4' __all__ = [ 'BioCAnnotation', 'BioCCollection', 'BioCDocument', diff --git a/src/bioc/bioc_writer.py b/src/bioc/bioc_writer.py index 5a2cc44..de9fd7e 100644 --- a/src/bioc/bioc_writer.py +++ b/src/bioc/bioc_writer.py @@ -73,6 +73,46 @@ def write(self, filename=None): with open(filename, 'wb') as f: f.write(self.tostring(encoding='UTF-8')) + def iterfragments(self, encoding='UTF-8'): + ''' + Iterate over serialised XML fragments. + + Use this for large collections, as it avoids building + the whole tree in memory. + ''' + # Temporarily remove the document nodes and reset the root tree. + documents = self.collection.documents + self.collection.documents = () + self.root_tree = None + + # Construct and serialise the collection-level nodes. + # Split them into a head and tail portion. + shell = self.tostring(encoding) + tail = u'\n' + BOM = ''.encode(encoding) + if encoding != 'unicode': + tail = tail.encode(encoding).lstrip(BOM) + head = shell[:-len(tail)] + + # Yield fragment by fragment. + yield head + + step_parent = E('collection') + for doc in documents: + self._build_documents([doc], step_parent) + frag = tostring(step_parent[0], + encoding=encoding, + pretty_print=True, + xml_declaration=False) + step_parent.clear() + yield frag.lstrip(BOM) + + yield tail + + # Restore the collection object and reset the root tree again. + self.collection.documents = documents + self.root_tree = None + def build(self): if self.root_tree is None: self._build_collection() From f85334282dc95cc1bdfe22d7d40ad3702d503c42 Mon Sep 17 00:00:00 2001 From: Lenz Date: Mon, 8 Jan 2018 10:49:09 +0100 Subject: [PATCH 17/21] fix README formatting --- README.md | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 1fca9ef..0cce42f 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,11 @@ -#PyBioC +# PyBioC -**[PyBioC][1] is a native python library to deal with BioCreative XML data, -i. e. to read from and to write to it.** +**[PyBioC][1] is a native Python library for reading and writing BioC XML data.** -More information about BioC available [online][2]. +More information about BioC is available [online][2]. ----------- -##Usage: +## Usage Two example programs, test_read+write.py and stemming.py are shipped in the `src/` folder. @@ -23,7 +21,8 @@ the tokens and transforms the manipulated PyBioC objects back to valid BioC XML format. -##Example: + +## Example ### Generate BioC object for export From 5dc67d0b387cd2333825c2d28c75d3fccbefe40b Mon Sep 17 00:00:00 2001 From: Lenz Date: Mon, 8 Jan 2018 11:06:55 +0100 Subject: [PATCH 18/21] README: installation note --- README.md | 12 +++++++++++- setup.py | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0cce42f..5dd0150 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,15 @@ More information about BioC is available [online][2]. +## Installation + +Use [`pip`][3]: + + pip install git+https://github.com/OntoGene/PyBioC.git + +For Python 3, you might have to type `pip3`. + + ## Usage Two example programs, test_read+write.py and stemming.py are shipped in the `src/` folder. @@ -51,5 +60,6 @@ print writer -[1]: http://bioc.sourceforge.net/ +[1]: https://github.com/OntoGene/PyBioC [2]: http://bioc.sourceforge.net/ +[3]: http://pip.pypa.io/ diff --git a/setup.py b/setup.py index 755785f..514b394 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ version='1.02.4', author='Hernani Marques', author_email='h2m@access.uzh.ch', - description='Python library to deal with BioCreative XML data', + description='Python library for working with BioC XML data', long_description=readme, packages=['bioc', 'bioc.meta', 'bioc.compat'], package_dir={'bioc': 'src/bioc'}) From 7492b80b6bf982b5aff8c3f373dd7764d142b6d7 Mon Sep 17 00:00:00 2001 From: Lenz Date: Wed, 10 Jan 2018 14:29:21 +0100 Subject: [PATCH 19/21] bugfix: accept encoding="unicode" in BioCWriter.iterfragments() with Python 3 --- src/bioc/bioc_reader.py | 7 ++--- src/bioc/bioc_writer.py | 67 +++++++++++++++++++++++++++-------------- 2 files changed, 46 insertions(+), 28 deletions(-) diff --git a/src/bioc/bioc_reader.py b/src/bioc/bioc_reader.py index 75bd86f..d84a277 100644 --- a/src/bioc/bioc_reader.py +++ b/src/bioc/bioc_reader.py @@ -1,7 +1,5 @@ __all__ = ['BioCReader'] -from io import StringIO - from lxml import etree from .bioc_annotation import BioCAnnotation @@ -14,7 +12,7 @@ from .bioc_relation import BioCRelation -class BioCReader: +class BioCReader(object): """ This class can be used to store BioC XML files in PyBioC objects, for further manipulation. @@ -26,7 +24,6 @@ def __init__(self, source, dtd_valid_file=None): dtd_valid_file: File path to a BioC.dtd file. Using this optional argument ensures DTD validation. """ - self.source = source self.collection = BioCCollection() self.xml_tree = etree.parse(source) @@ -88,7 +85,7 @@ def _read_passages(self, passage_elem_list, document_parent_elem): # Is the (optional) text element available? try: passage.text = passage_elem.xpath('text')[0].text - except: + except (IndexError, AttributeError): pass self._read_annotations(passage_elem.xpath('annotation'), passage) diff --git a/src/bioc/bioc_writer.py b/src/bioc/bioc_writer.py index de9fd7e..1fede58 100644 --- a/src/bioc/bioc_writer.py +++ b/src/bioc/bioc_writer.py @@ -13,25 +13,23 @@ STR_ENCODING = 'unicode' -class BioCWriter: +class BioCWriter(object): + ''' + XML serializer for BioC objects. + ''' + + doctype = "" def __init__(self, filename=None, collection=None): self.root_tree = None - self.collection = None - self.doctype = '''''' - self.doctype += '''''' + self.collection = collection self.filename = filename - if collection is not None: - self.collection = collection - - if filename is not None: - self.filename = filename - def __str__(self): - """ A BioCWriter object can be printed as string. + """ + A BioCWriter object can be printed as string. """ return self.tostring(encoding=STR_ENCODING) @@ -43,13 +41,13 @@ def tostring(self, encoding='UTF-8'): unless encoding is "unicode", in which case a decoded string is returned (a unicode object in Python 2). ''' - self._check_for_data() - self.build() + xml_declaration = self._binary_encoding(encoding) s = tostring(self.root_tree, encoding=encoding, pretty_print=True, + xml_declaration=xml_declaration, doctype=self.doctype) return s @@ -58,12 +56,26 @@ def _check_for_data(self): if self.collection is None: raise Exception('No data available.') + @staticmethod + def _binary_encoding(codec): + ''' + Is this actually a binary encoding? + + The etree.tostring method accepts an encoding + parameter value "unicode" or str/unicode, + in which case the returned serialisation is a + decoded unicode string rather than an encoded + byte string. + ''' + return not callable(codec) and codec != 'unicode' + def write(self, filename=None): - """ Use this method to write the data in the PyBioC objects - to disk. + """ + Write the data in the PyBioC objects to disk. - filename: Output file path (optional argument; filename - provided by __init__ used otherwise.) + filename: Output file path (optional argument; + filename provided through __init__ used + otherwise.) """ if filename is None: if self.filename is None: @@ -80,8 +92,11 @@ def iterfragments(self, encoding='UTF-8'): Use this for large collections, as it avoids building the whole tree in memory. ''' - # Temporarily remove the document nodes and reset the root tree. + self._check_for_data() + + # Temporarily remove the document nodes and the root tree. documents = self.collection.documents + previous_tree = self.root_tree self.collection.documents = () self.root_tree = None @@ -89,8 +104,9 @@ def iterfragments(self, encoding='UTF-8'): # Split them into a head and tail portion. shell = self.tostring(encoding) tail = u'\n' - BOM = ''.encode(encoding) - if encoding != 'unicode': + BOM = '' + if self._binary_encoding(encoding): + BOM = ''.encode(encoding) tail = tail.encode(encoding).lstrip(BOM) head = shell[:-len(tail)] @@ -109,11 +125,15 @@ def iterfragments(self, encoding='UTF-8'): yield tail - # Restore the collection object and reset the root tree again. + # Restore the collection object and the root tree. self.collection.documents = documents - self.root_tree = None + self.root_tree = previous_tree def build(self): + ''' + Create an Element tree in memory. + ''' + self._check_for_data() if self.root_tree is None: self._build_collection() @@ -128,7 +148,8 @@ def _build_collection(self): # document+ self._build_documents(self.collection.documents, collection_elem) - def _build_infons(self, infons_dict, infons_parent_elem): + @staticmethod + def _build_infons(infons_dict, infons_parent_elem): for infon_key, infon_val in infons_dict.items(): infons_parent_elem.append(E('infon')) infon_elem = infons_parent_elem.xpath('infon')[-1] From 8a9031622cb533eb739dfcf83e60f999b16f1732 Mon Sep 17 00:00:00 2001 From: Lenz Date: Wed, 10 Jan 2018 15:12:41 +0100 Subject: [PATCH 20/21] JSON serialization --- .gitignore | 1 + setup.py | 2 +- src/bioc/__init__.py | 5 +- src/bioc/bioc_writer.py | 109 ++++++++++++++++++++++++++++++++++------ src/stemmer.py | 6 +-- src/test_read+write.py | 16 ++++-- 6 files changed, 113 insertions(+), 26 deletions(-) diff --git a/.gitignore b/.gitignore index d5770e0..6666947 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ *.pyc src/output_bioc.xml +src/output_bioc.json diff --git a/setup.py b/setup.py index 514b394..1f99754 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ readme = f.read() setup(name='PyBioC', - version='1.02.4', + version='2.0', author='Hernani Marques', author_email='h2m@access.uzh.ch', description='Python library for working with BioC XML data', diff --git a/src/bioc/__init__.py b/src/bioc/__init__.py index c240595..c2506fa 100644 --- a/src/bioc/__init__.py +++ b/src/bioc/__init__.py @@ -7,7 +7,7 @@ __all__ = [ 'BioCAnnotation', 'BioCCollection', 'BioCDocument', 'BioCLocation', 'BioCNode', 'BioCPassage', 'BioCRelation', - 'BioCSentence', 'BioCReader', 'BioCWriter' + 'BioCSentence', 'BioCReader', 'BioCXMLWriter', 'BioCJSONWriter' ] __author__ = 'Hernani Marques (h2m@access.uzh.ch)' @@ -21,4 +21,5 @@ from .bioc_relation import BioCRelation from .bioc_sentence import BioCSentence from .bioc_reader import BioCReader -from .bioc_writer import BioCWriter +from .bioc_writer import BioCXMLWriter +from .bioc_writer import BioCJSONWriter diff --git a/src/bioc/bioc_writer.py b/src/bioc/bioc_writer.py index 1fede58..aa34912 100644 --- a/src/bioc/bioc_writer.py +++ b/src/bioc/bioc_writer.py @@ -1,19 +1,45 @@ -__all__ = ['BioCWriter'] +__all__ = ['BioCXMLWriter', 'BioCJSONWriter'] import sys +import json from lxml.builder import E from lxml.etree import tostring -# Resolve Python 2/3 difference regarding the special method __str__(). +# Resolve Python 2/3 differences. if sys.version_info < (3,): + # In Py2, use codecs.open rather than io.open, because the write() method + # of the latter doesn't accept native str values (only unicode). + from codecs import open + # Since BioCXMLWriter.__str__ calls lxml.etree.tostring, it must actually + # encode the serialised dump to get native str. STR_ENCODING = 'ascii' else: STR_ENCODING = 'unicode' -class BioCWriter(object): +class _BioCWriter(object): + ''' + Base for BioC serializers. + ''' + def __init__(self, filename=None, collection=None): + self.collection = collection + self.filename = filename + + def _check_for_data(self): + if self.collection is None: + raise Exception('No data available.') + + def _resolve_filename(self, filename): + if filename is None: + if self.filename is None: + raise Exception('No output file path provided.') + filename = self.filename + return filename + + +class BioCXMLWriter(_BioCWriter): ''' XML serializer for BioC objects. ''' @@ -21,12 +47,9 @@ class BioCWriter(object): doctype = "" def __init__(self, filename=None, collection=None): - + super(BioCXMLWriter, self).__init__(filename, collection) self.root_tree = None - self.collection = collection - self.filename = filename - def __str__(self): """ A BioCWriter object can be printed as string. @@ -52,10 +75,6 @@ def tostring(self, encoding='UTF-8'): return s - def _check_for_data(self): - if self.collection is None: - raise Exception('No data available.') - @staticmethod def _binary_encoding(codec): ''' @@ -77,10 +96,7 @@ def write(self, filename=None): filename provided through __init__ used otherwise.) """ - if filename is None: - if self.filename is None: - raise Exception('No output file path provided.') - filename = self.filename + filename = self._resolve_filename(filename) with open(filename, 'wb') as f: f.write(self.tostring(encoding='UTF-8')) @@ -248,3 +264,66 @@ def _build_sentences(self, sentences_list, passage_parent_elem): self._build_annotations(sentence.annotations, sentence_elem) # relation* self._build_relations(sentence.relations, sentence_elem) + + +class BioCJSONWriter(_BioCWriter): + ''' + JSON serializer for BioC objects. + ''' + def __init__(self, filename=None, collection=None): + super(BioCJSONWriter, self).__init__(filename, collection) + self.root_dict = None + + def __str__(self): + return str(self.tostring()) + + def tostring(self, **kwargs): + ''' + Dump serialized BioC JSON to a string. + ''' + self.build() + return json.dumps(self.root_dict, **kwargs) + + def write(self, filename=None, **kwargs): + ''' + Write serialised BioC JSON to disk. + ''' + self.build() + filename = self._resolve_filename(filename) + with open(filename, 'w', encoding='utf-8') as f: + json.dump(self.root_dict, f, **kwargs) + + def iterfragments(self, **kwargs): + ''' + Iterate over chunks of serialised BioC JSON. + + This method still creates an entire copy of the + structure in memory. + ''' + self.build() + for chunk in json.JSONEncoder(**kwargs).iterencode(self.root_dict): + yield chunk + + def build(self): + ''' + Construct a nested dictionary in memory. + ''' + self._check_for_data() + if self.root_dict is None: + self.root_dict = self._build_dict(self.collection) + + def _build_dict(self, obj): + # Note: + # Unlike the DTD, Don Comeau's reference implementation of a BioC JSON + # converter does not enforce mutual exclusion of either sentences + # or text + annotations inside passage elements. + dict_ = {} + for label, value in obj.__dict__.items(): + if label == 'text' and value is None: + value = '' # avoid None/null + elif label in ('offset', 'length'): + value = int(value) + elif isinstance(value, list): + value = [self._build_dict(c) for c in value] + dict_[label] = value + return dict_ diff --git a/src/stemmer.py b/src/stemmer.py index 6e14971..a692e2c 100755 --- a/src/stemmer.py +++ b/src/stemmer.py @@ -10,7 +10,7 @@ from bioc import BioCAnnotation from bioc import BioCReader -from bioc import BioCWriter +from bioc import BioCXMLWriter BIOC_IN = os.path.join('..', 'test_input', 'example_input.xml') BIOC_OUT = os.path.join('..', 'test_input', 'example_input_stemmed.xml') @@ -27,8 +27,8 @@ def main(): # document bioc_reader = BioCReader(bioc_in, dtd_valid_file=DTD_FILE) - # A BioCWRiter object is prepared to write out the annotated data - bioc_writer = BioCWriter(BIOC_OUT) + # A BioCXMLWRiter object is prepared to write out the annotated data + bioc_writer = BioCXMLWriter(BIOC_OUT) # The NLTK porter stemmer is used for stemming stemmer = PorterStemmer() diff --git a/src/test_read+write.py b/src/test_read+write.py index c37ad7c..d88bcb3 100755 --- a/src/test_read+write.py +++ b/src/test_read+write.py @@ -1,7 +1,8 @@ #!/usr/bin/env python from bioc import BioCReader -from bioc import BioCWriter +from bioc import BioCXMLWriter +from bioc import BioCJSONWriter test_file = '../test_input/example_input.xml' dtd_file = '../test_input/BioC.dtd' @@ -16,10 +17,15 @@ def main(): print sentence.offset ''' - bioc_writer = BioCWriter('output_bioc.xml') - bioc_writer.collection = bioc_reader.collection - bioc_writer.write() - print(bioc_writer) + bioc_xml_writer = BioCXMLWriter('output_bioc.xml', bioc_reader.collection) + bioc_xml_writer.write() + print(bioc_xml_writer) + + bioc_json_writer = BioCJSONWriter() + bioc_json_writer.collection = bioc_reader.collection + bioc_json_writer.write('output_bioc.json', indent=2) + print(bioc_json_writer) + if __name__ == '__main__': main() From aca46fe944ad51078096093b87b851ee93b07759 Mon Sep 17 00:00:00 2001 From: Lenz Date: Wed, 10 Jan 2018 17:46:30 +0100 Subject: [PATCH 21/21] JSON reader --- CHANGES.txt | 5 ++++ README.md | 6 ++--- src/bioc/__init__.py | 9 ++++---- src/bioc/bioc_reader.py | 43 +++++++++++++++++++++++++++++++---- src/stemmer.py | 8 +++---- src/test_read+write.py | 38 ++++++++++++++++--------------- test_input/example_input.json | 35 ++++++++++++++++++++++++++++ 7 files changed, 110 insertions(+), 34 deletions(-) create mode 100644 test_input/example_input.json diff --git a/CHANGES.txt b/CHANGES.txt index 294af88..1df55b0 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,3 +1,8 @@ +2.0 +--- +New reader and writer for BioC JSON, based on the converter by Don Comeau (https://github.com/ncbi-nlp/BioC-JSON). +Renamed the existing, XML-based BioCReader and BioCWriter to BioCXMLReader and BioCXMLWriter, respectively. + 1.02.4 ------ New method `BioCWriter.iterfragments()`: Iterate over serialised XML fragments, diff --git a/README.md b/README.md index 5dd0150..e002d88 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ **[PyBioC][1] is a native Python library for reading and writing BioC XML data.** -More information about BioC is available [online][2]. +More information about BioC is available at [sourceforge][2]. ## Installation @@ -36,9 +36,9 @@ valid BioC XML format. ### Generate BioC object for export ```python -from bioc import BioCWriter, BioCCollection, BioCDocument, BioCPassage +from bioc import BioCXMLWriter, BioCCollection, BioCDocument, BioCPassage -writer = BioCWriter() +writer = BioCXMLWriter() writer.collection = BioCCollection() collection = writer.collection collection.date = '20150301' diff --git a/src/bioc/__init__.py b/src/bioc/__init__.py index c2506fa..a3ab661 100644 --- a/src/bioc/__init__.py +++ b/src/bioc/__init__.py @@ -5,9 +5,9 @@ __version__ = '1.02.4' __all__ = [ - 'BioCAnnotation', 'BioCCollection', 'BioCDocument', - 'BioCLocation', 'BioCNode', 'BioCPassage', 'BioCRelation', - 'BioCSentence', 'BioCReader', 'BioCXMLWriter', 'BioCJSONWriter' + 'BioCAnnotation', 'BioCCollection', 'BioCDocument', 'BioCLocation', + 'BioCNode', 'BioCPassage', 'BioCRelation', 'BioCSentence', + 'BioCXMLReader', 'BioCJSONReader', 'BioCXMLWriter', 'BioCJSONWriter' ] __author__ = 'Hernani Marques (h2m@access.uzh.ch)' @@ -20,6 +20,7 @@ from .bioc_passage import BioCPassage from .bioc_relation import BioCRelation from .bioc_sentence import BioCSentence -from .bioc_reader import BioCReader +from .bioc_reader import BioCXMLReader +from .bioc_reader import BioCJSONReader from .bioc_writer import BioCXMLWriter from .bioc_writer import BioCJSONWriter diff --git a/src/bioc/bioc_reader.py b/src/bioc/bioc_reader.py index d84a277..022b9c0 100644 --- a/src/bioc/bioc_reader.py +++ b/src/bioc/bioc_reader.py @@ -1,4 +1,7 @@ -__all__ = ['BioCReader'] +__all__ = ['BioCXMLReader', 'BioCJSONReader'] + +import io +import json from lxml import etree @@ -12,12 +15,10 @@ from .bioc_relation import BioCRelation -class BioCReader(object): +class BioCXMLReader(object): """ - This class can be used to store BioC XML files in PyBioC objects, - for further manipulation. + Reader for parsing BioC XML files into BioC* objects. """ - def __init__(self, source, dtd_valid_file=None): """ source: File path to a BioC XML input document. @@ -140,3 +141,35 @@ def _read_relations(self, relation_elem_list, relations_parent_elem): relation.add_node(node) relations_parent_elem.add_relation(relation) + + + +class BioCJSONReader(object): + ''' + Reader for parsing BioC JSON files into BioC* objects. + ''' + def __init__(self, source): + self.source = source + self.collection = None + + def read(self): + ''' + Read self.source and save the result to self.collection. + ''' + with io.open(self.source, encoding='utf-8') as f: + dict_ = json.load(f) + self.collection = self._read_dict(BioCCollection, dict_) + + def _read_dict(self, class_, dict_): + obj = class_() + for label, value in dict_.items(): + if label in ('offset', 'length'): + # The converter in Don Comeau's reference implementation + # converts offset and length to strings, too. + value = str(value) + elif isinstance(value, list): + # Get the class from the label (eg. "documents" -> BioCDocument). + class_ = globals()['BioC{}'.format(label.rstrip('s').title())] + value = [self._read_dict(class_, d) for d in value] + setattr(obj, label, value) + return obj diff --git a/src/stemmer.py b/src/stemmer.py index a692e2c..2ee21f6 100755 --- a/src/stemmer.py +++ b/src/stemmer.py @@ -9,7 +9,7 @@ from nltk import PorterStemmer from bioc import BioCAnnotation -from bioc import BioCReader +from bioc import BioCXMLReader from bioc import BioCXMLWriter BIOC_IN = os.path.join('..', 'test_input', 'example_input.xml') @@ -23,9 +23,9 @@ def main(): if len(sys.argv) >= 2: bioc_in = sys.argv[1] - # A BioCReader object is put in place to hold the example BioC XML + # A BioCXMLReader object is put in place to hold the example BioC XML # document - bioc_reader = BioCReader(bioc_in, dtd_valid_file=DTD_FILE) + bioc_reader = BioCXMLReader(bioc_in, dtd_valid_file=DTD_FILE) # A BioCXMLWRiter object is prepared to write out the annotated data bioc_writer = BioCXMLWriter(BIOC_OUT) @@ -34,7 +34,7 @@ def main(): stemmer = PorterStemmer() # The example input file given above (by BIOC_IN) is fed into - # a BioCReader object; validation is done by the BioC DTD + # a BioCXMLReader object; validation is done by the BioC DTD bioc_reader.read() # Pass over basic data diff --git a/src/test_read+write.py b/src/test_read+write.py index d88bcb3..9d358c6 100755 --- a/src/test_read+write.py +++ b/src/test_read+write.py @@ -1,30 +1,32 @@ #!/usr/bin/env python -from bioc import BioCReader +from bioc import BioCXMLReader +from bioc import BioCJSONReader from bioc import BioCXMLWriter from bioc import BioCJSONWriter -test_file = '../test_input/example_input.xml' +test_xml = '../test_input/example_input.xml' +test_json = '../test_input/example_input.json' dtd_file = '../test_input/BioC.dtd' def main(): - bioc_reader = BioCReader(test_file, dtd_valid_file=dtd_file) - bioc_reader.read() - ''' - sentences = bioc_reader.collection.documents[0].passages[0].sentences - for sentence in sentences: - print sentence.offset - ''' - - bioc_xml_writer = BioCXMLWriter('output_bioc.xml', bioc_reader.collection) - bioc_xml_writer.write() - print(bioc_xml_writer) - - bioc_json_writer = BioCJSONWriter() - bioc_json_writer.collection = bioc_reader.collection - bioc_json_writer.write('output_bioc.json', indent=2) - print(bioc_json_writer) + # Read XML, write JSON. + xml_reader = BioCXMLReader(test_xml, dtd_valid_file=dtd_file) + xml_reader.read() + + json_writer = BioCJSONWriter() + json_writer.collection = xml_reader.collection + json_writer.write('output_bioc.json', indent=2) + print(json_writer) + + # Read JSON, write XML. + json_reader = BioCJSONReader(test_json) + json_reader.read() + + xml_writer = BioCXMLWriter('output_bioc.xml', xml_reader.collection) + xml_writer.write() + print(xml_writer) if __name__ == '__main__': diff --git a/test_input/example_input.json b/test_input/example_input.json new file mode 100644 index 0000000..eb968fd --- /dev/null +++ b/test_input/example_input.json @@ -0,0 +1,35 @@ +{ + "key": "ctdBCIVLearningDataSet.key", + "source": "PUBMED", + "date": "20130422", + "documents": [ + { + "infons": {}, + "relations": [], + "passages": [ + { + "offset": 0, + "sentences": [], + "annotations": [], + "relations": [], + "text": "Possible role of valvular serotonin 5-HT(2B) receptors in the cardiopathy associated with fenfluramine.", + "infons": { + "type": "title" + } + }, + { + "offset": 104, + "sentences": [], + "annotations": [], + "relations": [], + "text": "Dexfenfluramine was approved in the United States for long-term use as an appetite suppressant until it was reported to be associated with valvular heart disease. The valvular changes (myofibroblast proliferation) are histopathologically indistinguishable from those observed in carcinoid disease or after long-term exposure to 5-hydroxytryptamine (5-HT)(2)-preferring ergot drugs (ergotamine, methysergide). 5-HT(2) receptor stimulation is known to cause fibroblast mitogenesis, which could contribute to this lesion. To elucidate the mechanism of \"fen-phen\"-associated valvular lesions, we examined the interaction of fenfluramine and its metabolite norfenfluramine with 5-HT(2) receptor subtypes and examined the expression of these receptors in human and porcine heart valves. Fenfluramine binds weakly to 5-HT(2A), 5-HT(2B), and 5-HT(2C) receptors. In contrast, norfenfluramine exhibited high affinity for 5-HT(2B) and 5-HT(2C) receptors and more moderate affinity for 5-HT(2A) receptors. In cells expressing recombinant 5-HT(2B) receptors, norfenfluramine potently stimulated the hydrolysis of inositol phosphates, increased intracellular Ca(2+), and activated the mitogen-activated protein kinase cascade, the latter of which has been linked to mitogenic actions of the 5-HT(2B) receptor. The level of 5-HT(2B) and 5-HT(2A) receptor transcripts in heart valves was at least 300-fold higher than the levels of 5-HT(2C) receptor transcript, which were barely detectable. We propose that preferential stimulation of valvular 5-HT(2B) receptors by norfenfluramine, ergot drugs, or 5-HT released from carcinoid tumors (with or without accompanying 5-HT(2A) receptor activation) may contribute to valvular fibroplasia in humans.", + "infons": { + "type": "abstract" + } + } + ], + "id": "10617681" + } + ], + "infons": {} +}