From e8b9e4f9f7582e958eda48af44ce67b9cf92c6cb Mon Sep 17 00:00:00 2001 From: Jean-Philippe Goldman Date: Thu, 31 Oct 2019 11:10:26 +0100 Subject: [PATCH 1/3] -can handle discontinuated entity annotations -ignore attribute annotations (lines starting with A) --- src/bratutils/agreement.py | 51 ++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/src/bratutils/agreement.py b/src/bratutils/agreement.py index 97e275e..eb1af8e 100644 --- a/src/bratutils/agreement.py +++ b/src/bratutils/agreement.py @@ -266,8 +266,10 @@ def __init__(self, a): :type: str """ self.text = None - self.start_idx = None - self.end_idx = None + self.frag = None # fragment, usually one pair of indexes, but annotation can be + # composed of several fragments (aka discontinuated annotations) + self.start_idx = None # start index of first fragment + self.end_idx = None # end index of last fragment self.tag_name = None self.partial_match = None @@ -276,7 +278,7 @@ def __init__(self, a): self.border_status = False self.border_match = None - self.text, self.tag_name, self.start_idx, self.end_idx = \ + self.text, self.tag_name, self.frag, self.start_idx, self.end_idx = \ self._parse_annotation(a) @staticmethod @@ -284,10 +286,14 @@ def _parse_annotation(a): items = a.split("\t") text = items[2].strip("\n").strip(" ") subitems = items[1].split(" ") - tag_name = subitems[0] - start_idx = int(subitems[1]) - end_idx = int(subitems[2]) - return text, tag_name, start_idx, end_idx + tag_name = subitems.pop(0) + subitems = " ".join(subitems).split(";") + for idx in subitems: + start_idx, end_idx = idx.split(" ") + frag.append((int(start_idx), int(end_idx))) + start_idx = frag[0][0] + end_idx = frag[len(frag)][1] + return text, tag_name, start_idx, end_idx, frag def reset_markers(self): """Resets the comparison marker attributes to default values. The @@ -370,8 +376,9 @@ def coincides_with(self, parallel_ann): :return: True if objects coincide :rtype: bool """ - return (self.start_idx == parallel_ann.start_idx and - self.end_idx == parallel_ann.end_idx) + + return self.frag == parallel_ann.frag + def contains_ann(self, other_ann): """Checks if this object's annotation contains another object's @@ -381,8 +388,15 @@ def contains_ann(self, other_ann): :return: True if this annotaion contains the other annotation :rtype: bool """ - return (other_ann.start_idx >= self.start_idx and - other_ann.end_idx <= self.end_idx) + contained_fragments = [False] * len(other_ann) + for i in range(len(other_ann)): + for j in range(len(self)): + if other_ann.idx[i][0] >= self[j][0] and \ + other_ann.frag[i][1] <= self.frag[j][1]: + contained_fragments[i] = True + # return True if all fragments of other_ann are contained in self + return contained_fragments == [True] * len(other_ann) + def is_contained_by(self, parallel_ann): """Checks if this annotation is contained by a parallel annotation. @@ -391,8 +405,8 @@ def is_contained_by(self, parallel_ann): :return: True if contained in `parallel_ann` :rtype: bool """ - return (parallel_ann.start_idx <= self.start_idx and - parallel_ann.end_idx >= self.end_idx) + return contains_ann(parallel_ann, self) + def is_partial_to(self, parallel_ann): """Returns `True` if the annotation is a partial match to the parallel @@ -404,8 +418,9 @@ def is_partial_to(self, parallel_ann): :param parallel_ann: :return: """ - return (self.start_idx > parallel_ann.start_idx and - self.end_idx == parallel_ann.end_idx and + # TODO really dive into frag (for now, we check start of first fragment and end of last fragment) + return (self.start_idx > parallel_ann.end_idx and + self.start_idx == parallel_ann.end_idx and self.tag_name == parallel_ann.tag_name) def get_same_anns(self, parallel_anns): @@ -640,13 +655,13 @@ def __init__(self, fp=None, ann_list=None): self.basename = "" if fp: self.basename = os.path.basename(fp) - with open(fp) as doc: + with open(fp, encoding='utf-8') as doc: for line in doc: - if not line.startswith("#"): + if not line.startswith("#") and not line.startswith("A"): # ignoring Attributes self.tags.append(Annotation(line)) elif ann_list: for line in ann_list: - if not line.startswith("#"): + if not line.startswith("#") and not line.startswith("A"): self.tags.append(Annotation(line)) else: self.tags = [] From 9e77d73972dc406355e8e366608e29489bebeff2 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Goldman Date: Thu, 31 Oct 2019 14:33:52 +0100 Subject: [PATCH 2/3] cleaned some bugs --- src/bratutils/agreement.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/bratutils/agreement.py b/src/bratutils/agreement.py index eb1af8e..3acb75e 100644 --- a/src/bratutils/agreement.py +++ b/src/bratutils/agreement.py @@ -278,7 +278,7 @@ def __init__(self, a): self.border_status = False self.border_match = None - self.text, self.tag_name, self.frag, self.start_idx, self.end_idx = \ + self.text, self.tag_name, self.start_idx, self.end_idx, self.frag = \ self._parse_annotation(a) @staticmethod @@ -288,11 +288,12 @@ def _parse_annotation(a): subitems = items[1].split(" ") tag_name = subitems.pop(0) subitems = " ".join(subitems).split(";") + frag = [] for idx in subitems: start_idx, end_idx = idx.split(" ") frag.append((int(start_idx), int(end_idx))) start_idx = frag[0][0] - end_idx = frag[len(frag)][1] + end_idx = frag[len(frag)-1][1] return text, tag_name, start_idx, end_idx, frag def reset_markers(self): @@ -388,14 +389,14 @@ def contains_ann(self, other_ann): :return: True if this annotaion contains the other annotation :rtype: bool """ - contained_fragments = [False] * len(other_ann) - for i in range(len(other_ann)): - for j in range(len(self)): - if other_ann.idx[i][0] >= self[j][0] and \ + contained_fragments = [False] * len(other_ann.frag) + for i in range(len(other_ann.frag)): + for j in range(len(self.frag)): + if other_ann.frag[i][0] >= self.frag[j][0] and \ other_ann.frag[i][1] <= self.frag[j][1]: contained_fragments[i] = True # return True if all fragments of other_ann are contained in self - return contained_fragments == [True] * len(other_ann) + return contained_fragments == [True] * len(other_ann.frag) def is_contained_by(self, parallel_ann): @@ -405,7 +406,7 @@ def is_contained_by(self, parallel_ann): :return: True if contained in `parallel_ann` :rtype: bool """ - return contains_ann(parallel_ann, self) + return self.contains_ann(parallel_ann) def is_partial_to(self, parallel_ann): @@ -566,12 +567,12 @@ def __eq__(self, ann): self.tag_name == ann.tag_name) def __str__(self): - atts = [self.tag_name, str(self.start_idx), str(self.end_idx), + atts = [self.tag_name, str(self.start_idx), str(self.end_idx), str(self.frag), self.text] return " ".join(atts) def __repr__(self): - atts = [self.tag_name, str(self.start_idx), str(self.end_idx), + atts = [self.tag_name, str(self.start_idx), str(self.end_idx), str(self.frag), self.text] return " ".join(atts) From 56716685392b47f0647dca92d7d94031eb2d37e4 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Goldman Date: Thu, 31 Oct 2019 14:43:43 +0100 Subject: [PATCH 3/3] added self.frag in __eq__ for a complete equivalence --- src/bratutils/agreement.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/bratutils/agreement.py b/src/bratutils/agreement.py index 3acb75e..1b891bc 100644 --- a/src/bratutils/agreement.py +++ b/src/bratutils/agreement.py @@ -564,6 +564,7 @@ def __eq__(self, ann): return (self.text == ann.text and self.start_idx == ann.start_idx and self.end_idx == ann.end_idx and + self.frag == ann.frag and self.tag_name == ann.tag_name) def __str__(self):