From 30ce9369b7f187c540ed364539452fba0a77a75a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 1 Mar 2024 22:29:06 +0100 Subject: [PATCH 01/41] Make MultiTok..Annotator notice changes in the trie --- docdeid/process/annotator.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 60689df..9a5a2c5 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -168,19 +168,31 @@ def _init_lookup_structures( self._trie.add_item(texts) start_token = texts[0] - + # Apply the "matching pipeline" to the start token -- the same + # normalization that was applied to the tokens inside + # `add_item` already, or when building the trie in the (trie is + # not None) case, too. for string_modifier in self._matching_pipeline: start_token = string_modifier.process(start_token) self._start_words.add(start_token) + @property + def start_words(self): + # If the trie has been modified (added to) since we computed + # _start_words, + if len(self._start_words) != len(self._trie.children): + # Recompute _start_words. + self._start_words = set(self._trie.children) + return self._start_words + def annotate(self, doc: Document) -> list[Annotation]: tokens = doc.get_tokens() start_tokens = sorted( tokens.token_lookup( - self._start_words, matching_pipeline=self._matching_pipeline + self.start_words, matching_pipeline=self._matching_pipeline ), key=lambda token: token.start_char, ) From e12d0d06160892d9ea5e12c00c0edf5139bc769b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 1 Mar 2024 23:33:11 +0100 Subject: [PATCH 02/41] Provide `LowercaseTail` string modifier It's like titlecasing but it touches the word only if it was originally uppercase. There might be better names for it... 
--- docdeid/str/__init__.py | 1 + docdeid/str/processor.py | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/docdeid/str/__init__.py b/docdeid/str/__init__.py index 7e6db61..2eba56a 100644 --- a/docdeid/str/__init__.py +++ b/docdeid/str/__init__.py @@ -1,6 +1,7 @@ from .processor import ( FilterByLength, LowercaseString, + LowercaseTail, RemoveNonAsciiCharacters, ReplaceNonAsciiCharacters, ReplaceValue, diff --git a/docdeid/str/processor.py b/docdeid/str/processor.py index b1023a5..1fa4869 100644 --- a/docdeid/str/processor.py +++ b/docdeid/str/processor.py @@ -74,6 +74,26 @@ def process(self, item: str) -> str: return item.casefold() +_WORD_RX = re.compile('\\w+', re.U) + + +class LowercaseTail(StringModifier): + """Lowercases the tail of words.""" + + @staticmethod + def _process_word_match(m: re.Match) -> str: + word = m.group(0) + if word.isupper(): + # FIXME Is there a language-independent way to properly titlecase? + if word.startswith('IJ'): + return word[0:2] + word[2:].lower() + return word[0] + word[1:].lower() + return word + + def process(self, item: str) -> str: + return _WORD_RX.sub(LowercaseTail._process_word_match, item) + + class StripString(StringModifier): """ Strip string (whitespaces, tabs, newlines, etc. 
From 82aab5febe73c7c8b1fd24f6395c1ffdd56f3f82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 4 Mar 2024 13:44:26 +0100 Subject: [PATCH 03/41] Enable specifying the lang for titlecasing --- docdeid/str/processor.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docdeid/str/processor.py b/docdeid/str/processor.py index 1fa4869..10126ce 100644 --- a/docdeid/str/processor.py +++ b/docdeid/str/processor.py @@ -80,18 +80,19 @@ def process(self, item: str) -> str: class LowercaseTail(StringModifier): """Lowercases the tail of words.""" - @staticmethod - def _process_word_match(m: re.Match) -> str: + def __init__(self, lang='nl'): + self._lang = lang + + def _process_word_match(self, m: re.Match) -> str: word = m.group(0) if word.isupper(): - # FIXME Is there a language-independent way to properly titlecase? - if word.startswith('IJ'): + if self._lang == 'nl' and word.startswith('IJ'): return word[0:2] + word[2:].lower() return word[0] + word[1:].lower() return word def process(self, item: str) -> str: - return _WORD_RX.sub(LowercaseTail._process_word_match, item) + return _WORD_RX.sub(self._process_word_match, item) class StripString(StringModifier): From 156e201897d681f2200cd39ca335ed06bb7451b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 4 Mar 2024 23:20:02 +0100 Subject: [PATCH 04/41] Minimize data fixtures for tests --- docdeid/docdeid.iml | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 docdeid/docdeid.iml diff --git a/docdeid/docdeid.iml b/docdeid/docdeid.iml new file mode 100644 index 0000000..35fdd4f --- /dev/null +++ b/docdeid/docdeid.iml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file From c7b4c896f1ed3718a118d70cae5156d7b481b1ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 6 Mar 2024 12:10:04 +0100 Subject: [PATCH 05/41] Log annotated text after every processor --- docdeid/process/doc_processor.py | 4 ++++ 
docdeid/utils.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/docdeid/process/doc_processor.py b/docdeid/process/doc_processor.py index 6db05f7..2085418 100644 --- a/docdeid/process/doc_processor.py +++ b/docdeid/process/doc_processor.py @@ -1,8 +1,10 @@ +import logging from abc import ABC, abstractmethod from collections import OrderedDict from typing import Iterator, Optional, Union from docdeid.document import Document +from docdeid.utils import annotate_doc class DocProcessor(ABC): # pylint: disable=R0903 @@ -143,6 +145,8 @@ def process( elif isinstance(proc, DocProcessorGroup): proc.process(doc, enabled=enabled, disabled=disabled) + logging.debug("after %s: %s", name, annotate_doc(doc)) + def __iter__(self) -> Iterator: return iter(self._processors.items()) diff --git a/docdeid/utils.py b/docdeid/utils.py index 1d3cf7c..f134689 100644 --- a/docdeid/utils.py +++ b/docdeid/utils.py @@ -1,3 +1,5 @@ +from collections import defaultdict + from frozendict import frozendict from docdeid.document import Document @@ -32,3 +34,32 @@ def annotate_intext(doc: Document) -> str: ) return text + + +def annotate_doc(doc: Document) -> str: + """\ + Adds XML-like markup for annotations into the text of a document. + + Handles also nested mentions and in a way also overlapping mentions, even + though this kind of markup cannot really represent them. 
+ """ + annos_from_shortest = sorted( + doc.annotations, + key=lambda anno: anno.end_char - anno.start_char) + idx_to_anno_starts = defaultdict(list) + idx_to_anno_ends = defaultdict(list) + for anno in annos_from_shortest: + idx_to_anno_starts[anno.start_char].append(anno) + idx_to_anno_ends[anno.end_char].append(anno) + markup_indices = sorted(set(idx_to_anno_starts).union(idx_to_anno_ends)) + chunks = list() + last_idx = 0 + for idx in markup_indices: + chunks.append(doc.text[last_idx:idx]) + for ending_anno in idx_to_anno_ends[idx]: + chunks.append(f'') + for starting_anno in reversed(idx_to_anno_starts[idx]): + chunks.append(f'<{starting_anno.tag.upper()}>') + last_idx = idx + chunks.append(doc.text[last_idx:]) + return ''.join(chunks) From 4459c14de13691949b64a3d58fb439ae56968abd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 6 Mar 2024 15:40:05 +0100 Subject: [PATCH 06/41] Update documentation slightly --- docdeid/ds/lookup.py | 2 +- docdeid/process/annotator.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docdeid/ds/lookup.py b/docdeid/ds/lookup.py index 4df8bdc..2d9d270 100644 --- a/docdeid/ds/lookup.py +++ b/docdeid/ds/lookup.py @@ -140,7 +140,7 @@ def add_items_from_self( ) -> None: """ Add items from self (this items of this :class:`.LookupSet`). This can be used - to do a transformation or replacment of the items. + to do a transformation or replacement of the items. Args: cleaning_pipeline: A cleaning pipeline applied to the items of this set. diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 9a5a2c5..60c92b5 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -15,7 +15,7 @@ class Annotator(DocProcessor, ABC): """ Abstract class for annotators, which are responsible for generating annotations from - a given document. Instatiations should implement the annotate method. + a given document. Instantiations should implement the annotate method. 
Args: tag: The tag to use in the annotations. @@ -119,7 +119,7 @@ class MultiTokenLookupAnnotator(Annotator): or should process from left to right. Raises: - RunTimeError, when an incorrect combination of `lookup_values`, + RuntimeError, when an incorrect combination of `lookup_values`, `matching_pipeline` and `trie` is supplied. """ @@ -149,7 +149,7 @@ def __init__( else: raise RuntimeError( - "Please provide either looup_values and a tokenizer, or a trie." + "Please provide either lookup_values and a tokenizer, or a trie." ) self.overlapping = overlapping From 810b8b3bee0967ce5460b8bd2d0c5c4c13153def Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 6 Mar 2024 22:02:09 +0100 Subject: [PATCH 07/41] Expose `Document.token_lists` as a property --- docdeid/document.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/docdeid/document.py b/docdeid/document.py index dd515ce..9d4ccf7 100644 --- a/docdeid/document.py +++ b/docdeid/document.py @@ -1,5 +1,8 @@ +from collections.abc import Mapping from typing import Any, Optional +from frozendict import frozendict + from docdeid.annotation import AnnotationSet from docdeid.tokenizer import Tokenizer, TokenList @@ -74,7 +77,8 @@ def __init__( ) -> None: self._text = text - self._tokenizers = tokenizers + self._tokenizers = (None if tokenizers is None else + frozendict(tokenizers)) self.metadata = MetaData(metadata) """The :class:`.MetaData` of this :class:`.Document`, that can be interacted @@ -94,6 +98,14 @@ def text(self) -> str: """ return self._text + @property + def tokenizers(self) -> Optional[Mapping[str, Tokenizer]]: + return self._tokenizers + + @property + def token_lists(self) -> Mapping[str, TokenList]: + return self._token_lists + def get_tokens(self, tokenizer_name: str = "default") -> TokenList: """ Get the tokens corresponding to the input text, for a specific tokenizer. 
From 50026967db5e7b78e819e56d9d0c8ee3ec1bdfa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Thu, 7 Mar 2024 14:45:39 +0100 Subject: [PATCH 08/41] (Almost) automatically format code --- docdeid/document.py | 3 +-- docdeid/str/processor.py | 6 +++--- docdeid/utils.py | 16 ++++++++-------- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/docdeid/document.py b/docdeid/document.py index 9d4ccf7..fade167 100644 --- a/docdeid/document.py +++ b/docdeid/document.py @@ -77,8 +77,7 @@ def __init__( ) -> None: self._text = text - self._tokenizers = (None if tokenizers is None else - frozendict(tokenizers)) + self._tokenizers = None if tokenizers is None else frozendict(tokenizers) self.metadata = MetaData(metadata) """The :class:`.MetaData` of this :class:`.Document`, that can be interacted diff --git a/docdeid/str/processor.py b/docdeid/str/processor.py index 10126ce..32e5a51 100644 --- a/docdeid/str/processor.py +++ b/docdeid/str/processor.py @@ -74,19 +74,19 @@ def process(self, item: str) -> str: return item.casefold() -_WORD_RX = re.compile('\\w+', re.U) +_WORD_RX = re.compile("\\w+", re.U) class LowercaseTail(StringModifier): """Lowercases the tail of words.""" - def __init__(self, lang='nl'): + def __init__(self, lang="nl"): self._lang = lang def _process_word_match(self, m: re.Match) -> str: word = m.group(0) if word.isupper(): - if self._lang == 'nl' and word.startswith('IJ'): + if self._lang == "nl" and word.startswith("IJ"): return word[0:2] + word[2:].lower() return word[0] + word[1:].lower() return word diff --git a/docdeid/utils.py b/docdeid/utils.py index f134689..ef46071 100644 --- a/docdeid/utils.py +++ b/docdeid/utils.py @@ -37,15 +37,15 @@ def annotate_intext(doc: Document) -> str: def annotate_doc(doc: Document) -> str: - """\ + """ Adds XML-like markup for annotations into the text of a document. 
- Handles also nested mentions and in a way also overlapping mentions, even - though this kind of markup cannot really represent them. + Handles also nested mentions and in a way also overlapping mentions, even though + this kind of markup cannot really represent them. """ annos_from_shortest = sorted( - doc.annotations, - key=lambda anno: anno.end_char - anno.start_char) + doc.annotations, key=lambda anno: anno.end_char - anno.start_char + ) idx_to_anno_starts = defaultdict(list) idx_to_anno_ends = defaultdict(list) for anno in annos_from_shortest: @@ -57,9 +57,9 @@ def annotate_doc(doc: Document) -> str: for idx in markup_indices: chunks.append(doc.text[last_idx:idx]) for ending_anno in idx_to_anno_ends[idx]: - chunks.append(f'') + chunks.append(f"") for starting_anno in reversed(idx_to_anno_starts[idx]): - chunks.append(f'<{starting_anno.tag.upper()}>') + chunks.append(f"<{starting_anno.tag.upper()}>") last_idx = idx chunks.append(doc.text[last_idx:]) - return ''.join(chunks) + return "".join(chunks) From 7d2d8668fa9cea78669a6d4848fb11a510bb3868 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Thu, 7 Mar 2024 17:08:17 +0100 Subject: [PATCH 09/41] Simplify `MultiTokenLookupAnnotator`... ...as required by pylint. 
--- docdeid/document.py | 3 ++ docdeid/ds/lookup.py | 7 ++-- docdeid/process/annotator.py | 57 +++++----------------------- docdeid/str/processor.py | 6 +-- docdeid/tokenizer.py | 28 ++++++++++++-- docdeid/utils.py | 2 +- tests/pipeline/test_pipeline.py | 16 +++++--- tests/unit/process/test_annotator.py | 34 ++++++++--------- 8 files changed, 72 insertions(+), 81 deletions(-) diff --git a/docdeid/document.py b/docdeid/document.py index fade167..c7b6c20 100644 --- a/docdeid/document.py +++ b/docdeid/document.py @@ -99,10 +99,13 @@ def text(self) -> str: @property def tokenizers(self) -> Optional[Mapping[str, Tokenizer]]: + """Available tokenizers indexed by their name.""" return self._tokenizers @property def token_lists(self) -> Mapping[str, TokenList]: + """Lists of tokens of the document, indexed by the name of the corresponding + tokenizer.""" return self._token_lists def get_tokens(self, tokenizer_name: str = "default") -> TokenList: diff --git a/docdeid/ds/lookup.py b/docdeid/ds/lookup.py index 2d9d270..f0daa49 100644 --- a/docdeid/ds/lookup.py +++ b/docdeid/ds/lookup.py @@ -2,6 +2,7 @@ import codecs import itertools +from collections.abc import Sequence from typing import Iterable, Iterator, Optional, Union from docdeid.ds.ds import Datastructure @@ -265,7 +266,7 @@ def __init__(self, *args, **kwargs) -> None: self.children: dict[str, LookupTrie] = {} self.is_terminal = False - def add_item(self, item: list[str]) -> None: + def add_item(self, item: Sequence[str]) -> None: """ Add an item, i.e. a list of strings, to this Trie. @@ -285,7 +286,7 @@ def add_item(self, item: list[str]) -> None: self.children[head].add_item(tail) - def __contains__(self, item: list[str]) -> bool: + def __contains__(self, item: Sequence[str]) -> bool: """ Whether the trie contains the item. Respects the matching pipeline. 
@@ -304,7 +305,7 @@ def __contains__(self, item: list[str]) -> bool: return (head in self.children) and tail in self.children[head] def longest_matching_prefix( - self, item: list[str], start_i: int = 0 + self, item: Sequence[str], start_i: int = 0 ) -> Union[list[str], None]: """ Finds the longest matching prefix of a list of strings. This is used to find the diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 60c92b5..d1e31a0 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -9,7 +9,7 @@ from docdeid.pattern import TokenPattern from docdeid.process.doc_processor import DocProcessor from docdeid.str.processor import StringModifier -from docdeid.tokenizer import Token, Tokenizer +from docdeid.tokenizer import Token class Annotator(DocProcessor, ABC): @@ -126,59 +126,20 @@ class MultiTokenLookupAnnotator(Annotator): def __init__( self, *args, - lookup_values: Optional[Iterable[str]] = None, - matching_pipeline: Optional[list[StringModifier]] = None, - tokenizer: Optional[Tokenizer] = None, - trie: Optional[LookupTrie] = None, + trie: LookupTrie, overlapping: bool = False, **kwargs, ) -> None: - self._start_words: set[str] = set() - - if (trie is not None) and (lookup_values is None) and (tokenizer is None): - - self._trie = trie - self._matching_pipeline = trie.matching_pipeline or [] - self._start_words = set(trie.children.keys()) - - elif (trie is None) and (lookup_values is not None) and (tokenizer is not None): - self._matching_pipeline = matching_pipeline or [] - self._trie = LookupTrie(matching_pipeline=matching_pipeline) - self._init_lookup_structures(lookup_values, tokenizer) - - else: - raise RuntimeError( - "Please provide either lookup_values and a tokenizer, or a trie." 
- ) - - self.overlapping = overlapping + self._trie = trie + self._overlapping = overlapping + self._start_words = set(trie.children) super().__init__(*args, **kwargs) - def _init_lookup_structures( - self, lookup_values: Iterable[str], tokenizer: Tokenizer - ) -> None: - - for val in lookup_values: - - texts = [token.text for token in tokenizer.tokenize(val)] - - if len(texts) > 0: - self._trie.add_item(texts) - - start_token = texts[0] - # Apply the "matching pipeline" to the start token -- the same - # normalization that was applied to the tokens inside - # `add_item` already, or when building the trie in the (trie is - # not None) case, too. - for string_modifier in self._matching_pipeline: - start_token = string_modifier.process(start_token) - - self._start_words.add(start_token) - @property - def start_words(self): + def start_words(self) -> set[str]: + """First words of phrases detected by this annotator.""" # If the trie has been modified (added to) since we computed # _start_words, if len(self._start_words) != len(self._trie.children): @@ -192,7 +153,7 @@ def annotate(self, doc: Document) -> list[Annotation]: start_tokens = sorted( tokens.token_lookup( - self.start_words, matching_pipeline=self._matching_pipeline + self.start_words, matching_pipeline=self._trie.matching_pipeline ), key=lambda token: token.start_char, ) @@ -230,7 +191,7 @@ def annotate(self, doc: Document) -> list[Annotation]: ) ) - if not self.overlapping: + if not self._overlapping: min_i = i + len(longest_matching_prefix) # skip ahead return annotations diff --git a/docdeid/str/processor.py b/docdeid/str/processor.py index 32e5a51..44ee468 100644 --- a/docdeid/str/processor.py +++ b/docdeid/str/processor.py @@ -80,11 +80,11 @@ def process(self, item: str) -> str: class LowercaseTail(StringModifier): """Lowercases the tail of words.""" - def __init__(self, lang="nl"): + def __init__(self, lang: str = "nl") -> None: self._lang = lang - def _process_word_match(self, m: re.Match) -> str: - 
word = m.group(0) + def _process_word_match(self, match: re.Match) -> str: + word = match.group(0) if word.isupper(): if self._lang == "nl" and word.startswith("IJ"): return word[0:2] + word[2:].lower() diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py index 8813caf..c69197c 100644 --- a/docdeid/tokenizer.py +++ b/docdeid/tokenizer.py @@ -1,10 +1,12 @@ from __future__ import annotations import re +import sys from abc import ABC, abstractmethod from collections import defaultdict +from collections.abc import Sequence from dataclasses import dataclass, field -from typing import Iterator, Literal, Optional +from typing import Iterator, Literal, Optional, SupportsIndex, overload from docdeid.str import StringModifier @@ -130,7 +132,7 @@ def __len__(self) -> int: return len(self.text) -class TokenList: +class TokenList(Sequence[Token]): """ Contains a sequence of tokens, along with some lookup logic. @@ -248,9 +250,29 @@ def __len__(self) -> int: return len(self._tokens) + @overload def __getitem__(self, index: int) -> Token: + ... - return self._tokens[index] + @overload + def __getitem__(self, indexes: slice) -> Sequence[Token]: + ... + + def __getitem__(self, item): + return self._tokens[item] + + def index( + self, + __token: Token, + __start: SupportsIndex = 0, + __stop: SupportsIndex = sys.maxsize, + ) -> int: + try: + return self._token_index[__token] + except KeyError: + # Raise a plain ValueError, just like list.index. 
+ # pylint: disable=W0707 + raise ValueError(f"'{__token}' is not in TokenList") def __eq__(self, other: object) -> bool: """ diff --git a/docdeid/utils.py b/docdeid/utils.py index ef46071..a1fcdd7 100644 --- a/docdeid/utils.py +++ b/docdeid/utils.py @@ -52,7 +52,7 @@ def annotate_doc(doc: Document) -> str: idx_to_anno_starts[anno.start_char].append(anno) idx_to_anno_ends[anno.end_char].append(anno) markup_indices = sorted(set(idx_to_anno_starts).union(idx_to_anno_ends)) - chunks = list() + chunks = [] last_idx = 0 for idx in markup_indices: chunks.append(doc.text[last_idx:idx]) diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 62422e7..d4e0ba7 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -2,6 +2,7 @@ from docdeid.annotation import Annotation, AnnotationSet from docdeid.deidentifier import DocDeid +from docdeid.ds import LookupTrie from docdeid.process.annotator import ( MultiTokenLookupAnnotator, SingleTokenLookupAnnotator, @@ -49,11 +50,12 @@ def test_multipe_annotators(self, long_text): "name_annotator", SingleTokenLookupAnnotator(lookup_values=["Bob"], tag="name"), ) + loc_trie = LookupTrie() + loc_trie.add_item("the United States of America".split()) deidentifier.processors.add_processor( "location_annotator", MultiTokenLookupAnnotator( - lookup_values=["the United States of America"], - tokenizer=tokenizer, + trie=loc_trie, tag="location", ), ) @@ -86,11 +88,12 @@ def test_enabled(self, long_text): "name_annotator", SingleTokenLookupAnnotator(lookup_values=["Bob"], tag="name"), ) + loc_trie = LookupTrie() + loc_trie.add_item("the United States of America".split()) deidentifier.processors.add_processor( "location_annotator", MultiTokenLookupAnnotator( - lookup_values=["the United States of America"], - tokenizer=tokenizer, + trie=loc_trie, tag="location", ), ) @@ -124,11 +127,12 @@ def test_disabled(self, long_text): "name_annotator", SingleTokenLookupAnnotator(lookup_values=["Bob"], 
tag="name"), ) + loc_trie = LookupTrie() + loc_trie.add_item("the United States of America".split()) deidentifier.processors.add_processor( "location_annotator", MultiTokenLookupAnnotator( - lookup_values=["the United States of America"], - tokenizer=tokenizer, + trie=loc_trie, tag="location", ), ) diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index a71dc54..a94e05c 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -4,6 +4,7 @@ import docdeid.ds from docdeid.annotation import Annotation from docdeid.document import Document +from docdeid.ds import LookupTrie from docdeid.pattern import TokenPattern from docdeid.process.annotator import ( MultiTokenLookupAnnotator, @@ -12,7 +13,6 @@ TokenPatternAnnotator, ) from docdeid.str.processor import LowercaseString -from docdeid.tokenizer import WordBoundaryTokenizer class TestSingleTokenLookupAnnotator: @@ -55,11 +55,10 @@ def test_single_token_with_matching_pipeline(self, long_text, long_tokenlist): class TestMultiTokenLookupAnnotator: def test_multi_token(self, long_text, long_tokenlist): doc = Document(long_text) - annotator = MultiTokenLookupAnnotator( - lookup_values=["my name", "my wife"], - tokenizer=WordBoundaryTokenizer(), - tag="prefix", - ) + my_trie = LookupTrie() + my_trie.add_item(("my", " ", "name")) + my_trie.add_item(("my", " ", "wife")) + annotator = MultiTokenLookupAnnotator(trie=my_trie, tag="prefix") expected_annotations = [ Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"), ] @@ -73,12 +72,10 @@ def test_multi_token(self, long_text, long_tokenlist): def test_multi_token_with_matching_pipeline(self, long_text, long_tokenlist): doc = Document(long_text) - annotator = MultiTokenLookupAnnotator( - lookup_values=["my name", "my wife"], - tokenizer=WordBoundaryTokenizer(), - matching_pipeline=[LowercaseString()], - tag="prefix", - ) + my_trie = LookupTrie(matching_pipeline=[LowercaseString()]) + 
my_trie.add_item(("my", " ", "name")) + my_trie.add_item(("my", " ", "wife")) + annotator = MultiTokenLookupAnnotator(trie=my_trie, tag="prefix") expected_annotations = [ Annotation(text="My name", start_char=0, end_char=7, tag="prefix"), Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"), @@ -93,9 +90,11 @@ def test_multi_token_lookup_with_overlap(self, long_text, long_tokenlist): doc = Document(long_text) + dr_trie = LookupTrie() + dr_trie.add_item(("dr", ". ", "John")) + dr_trie.add_item(("John", " ", "Smith")) annotator = MultiTokenLookupAnnotator( - lookup_values=["dr. John", "John Smith"], - tokenizer=WordBoundaryTokenizer(), + trie=dr_trie, tag="prefix", overlapping=True, ) @@ -114,9 +113,11 @@ def test_multi_token_lookup_no_overlap(self, long_text, long_tokenlist): doc = Document(long_text) + dr_trie = LookupTrie() + dr_trie.add_item(("dr", ". ", "John")) + dr_trie.add_item(("John", " ", "Smith")) annotator = MultiTokenLookupAnnotator( - lookup_values=["dr. John", "John Smith"], - tokenizer=WordBoundaryTokenizer(), + trie=dr_trie, tag="prefix", overlapping=False, ) @@ -137,7 +138,6 @@ def test_multi_token_lookup_with_trie(self, long_text, long_tokenlist): trie = docdeid.ds.LookupTrie(matching_pipeline=[LowercaseString()]) trie.add_item(["my", " ", "name"]) trie.add_item(["my", " ", "wife"]) - annotator = MultiTokenLookupAnnotator( trie=trie, tag="prefix", From 762866aaa141120bcd408559604d93a6cc76666c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 8 Mar 2024 12:42:11 +0100 Subject: [PATCH 10/41] Update the `MultiTok...Annotator` docstring --- docdeid/process/annotator.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index d1e31a0..f97b3cc 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -102,25 +102,13 @@ def annotate(self, doc: Document) -> list[Annotation]: class 
MultiTokenLookupAnnotator(Annotator): """ - Matches lookup values against tokens, where the ``lookup_values`` may themselves be - sequences. + Annotates entity mentions by looking them up in a `LookupTrie`. Args: - lookup_values: An iterable of strings, that should be matched. These are - tokenized internally. - matching_pipeline: An optional pipeline that can be used for matching - (e.g. lowercasing). This has no specific impact on matching performance, - other than overhead for applying the pipeline to each string. - tokenizer: A tokenizer that is used to create the sequence patterns from - ``lookup_values``. - trie: A trie that is used for matching, rather than a combination of - `lookup_values` and a `matching_pipeline` (cannot be used simultaneously). - overlapping: Whether the annotator should match overlapping sequences, - or should process from left to right. - - Raises: - RuntimeError, when an incorrect combination of `lookup_values`, - `matching_pipeline` and `trie` is supplied. + trie: The `LookupTrie` containing all entity mentions that should be annotated. + overlapping: Whether overlapping phrases are to be returned. + *args, **kwargs: Passed through to the `Annotator` constructor (which accepts + the arguments `tag` and `priority`). 
""" def __init__( From 1ae6846d921a5fffb9ca179780867cbfc8f8796a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 8 Mar 2024 13:24:17 +0100 Subject: [PATCH 11/41] Test user additions to the lookup trie --- tests/unit/process/test_annotator.py | 29 ++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index a94e05c..7b27196 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -13,6 +13,7 @@ TokenPatternAnnotator, ) from docdeid.str.processor import LowercaseString +from docdeid.tokenizer import SpaceSplitTokenizer class TestSingleTokenLookupAnnotator: @@ -153,6 +154,34 @@ def test_multi_token_lookup_with_trie(self, long_text, long_tokenlist): assert annotations == expected_annotations + def test_trie_modified(self, long_text): + # The user of Deduce may want to amend the resources shipped with Deduce. + # Loading those happens in the Deduce initializer, which also constructs + # annotators according to the configuration. + + # Run the interesting portions of Deduce initialization. + doc = Document(long_text,tokenizers={"default": SpaceSplitTokenizer()}) + trie = docdeid.ds.LookupTrie() + # Yeah, the comma in "Smith," seems off... but then again, WordBoundaryTokenizer + # considers whitespace to be tokens. There is no good choice. + trie.add_item(("John", "Smith,")) + annotator = MultiTokenLookupAnnotator(trie=trie, tag="name") + + # Let's add our own resources. + trie.add_item(("jane", "Keith-Lucas")) + # ...including phrases with a potential to confuse the algorithm. + trie.add_item(("jane", "joplane")) + trie.add_item(("dr.", "John", "Hopkin")) + trie.add_item(("Smith,", "please")) + + # Expect also our phrases to be detected. 
+ want = [ + Annotation(text="John Smith,", start_char=15, end_char=26, tag="name"), + Annotation(text="jane Keith-Lucas", start_char=47, end_char=63, tag="name"), + ] + got = annotator.annotate(doc) + assert got == want + class TestRegexpAnnotator: def test_regexp_annotator(self, long_text): From ae1f93ea60ba5d4769cf09ead11cd28f68b1eb73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 8 Mar 2024 16:19:11 +0100 Subject: [PATCH 12/41] Test the `tokenizers` and `token_lists` props --- tests/unit/test_document.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py index 05e3e3c..c37ada0 100644 --- a/tests/unit/test_document.py +++ b/tests/unit/test_document.py @@ -84,8 +84,13 @@ def test_get_tokens_multiple_tokenizers(self, short_tokens): tokenizer1, "tokenize", return_value=short_tokens ), patch.object(tokenizer2, "_split_text", return_value=[]): + assert set(doc.tokenizers.keys()) == {"tokenizer_1", "tokenizer_2"} assert doc.get_tokens(tokenizer_name="tokenizer_1") == short_tokens assert doc.get_tokens(tokenizer_name="tokenizer_2") == TokenList([]) + assert doc.token_lists == { + "tokenizer_1": short_tokens, + "tokenizer_2": TokenList([]), + } def test_metadata(self): text = "Hello I'm Bob" From d415f5177a7f8594a157a404f23258950597935e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 8 Mar 2024 16:22:11 +0100 Subject: [PATCH 13/41] Remove and ignore the IDEA project file --- .gitignore | 5 ++++- docdeid/docdeid.iml | 9 --------- 2 files changed, 4 insertions(+), 10 deletions(-) delete mode 100644 docdeid/docdeid.iml diff --git a/.gitignore b/.gitignore index 7fec20a..32cd942 100644 --- a/.gitignore +++ b/.gitignore @@ -130,4 +130,7 @@ dmypy.json # Pyre type checker .pyre/ -/.idea/* \ No newline at end of file +/.idea/* + +# IDEs +*.iml diff --git a/docdeid/docdeid.iml b/docdeid/docdeid.iml deleted file mode 100644 index 35fdd4f..0000000 --- 
a/docdeid/docdeid.iml +++ /dev/null @@ -1,9 +0,0 @@ - - - - - - - - - \ No newline at end of file From d8e8ed36c9efb9c945dfea94ac6414fdf973278c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 8 Mar 2024 16:28:01 +0100 Subject: [PATCH 14/41] Annotate docs for logging only if level is DEBUG --- docdeid/process/doc_processor.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docdeid/process/doc_processor.py b/docdeid/process/doc_processor.py index 2085418..3bc8556 100644 --- a/docdeid/process/doc_processor.py +++ b/docdeid/process/doc_processor.py @@ -6,6 +6,8 @@ from docdeid.document import Document from docdeid.utils import annotate_doc +_ROOT_LOGGER = logging.getLogger() + class DocProcessor(ABC): # pylint: disable=R0903 """Something that processes a document.""" @@ -145,7 +147,8 @@ def process( elif isinstance(proc, DocProcessorGroup): proc.process(doc, enabled=enabled, disabled=disabled) - logging.debug("after %s: %s", name, annotate_doc(doc)) + if _ROOT_LOGGER.isEnabledFor(logging.DEBUG): + logging.debug("after %s: %s", name, annotate_doc(doc)) def __iter__(self) -> Iterator: From 03fc99dec6df5b2b126112bdc90643966443e15c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 8 Mar 2024 18:10:47 +0100 Subject: [PATCH 15/41] Cosmetics --- docdeid/annotation.py | 12 +++++++++--- docdeid/process/annotation_processor.py | 4 ++-- tests/unit/process/test_annotator.py | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/docdeid/annotation.py b/docdeid/annotation.py index 60fd533..e7d7d7d 100644 --- a/docdeid/annotation.py +++ b/docdeid/annotation.py @@ -100,7 +100,7 @@ def get_sort_key( val = getattr(self, attr, UNKNOWN_ATTR_DEFAULT) - if callbacks is not None and (attr in callbacks): + if callbacks is not None and attr in callbacks: val = callbacks[attr](val) sort_key.append(val) @@ -150,14 +150,20 @@ def sorted( A RunTimeError, if the callbacks are not provided as a frozen dict. 
""" - if callbacks is not None and not isinstance(callbacks, frozendict): + # Not liked by Mypy, even though + # https://docs.python.org/3/library/stdtypes.html#types-union + # says the "X | Y" notation is equivalent to `typing.Union[X, Y]` and the + # docstring of `typing.Optional` says it's equivalent to + # `typing.Union[None, _]`: + # if not isinstance(callbacks, Optional[frozendict]): + if not isinstance(callbacks, frozendict | None): raise RuntimeError( "Please provide the callbacks as a frozen dict, e.g. " "frozendict.frozendict(end_char=lambda x: -x)" ) return sorted( - list(self), + self, key=lambda x: x.get_sort_key( by=by, callbacks=callbacks, deterministic=deterministic ), diff --git a/docdeid/process/annotation_processor.py b/docdeid/process/annotation_processor.py index 0b3c277..41e9507 100644 --- a/docdeid/process/annotation_processor.py +++ b/docdeid/process/annotation_processor.py @@ -60,7 +60,7 @@ def __init__( @staticmethod def _zero_runs(arr: npt.NDArray) -> npt.NDArray: """ - Finds al zero runs in a numpy array. + Finds all zero runs in a numpy array. Source: https://stackoverflow.com/questions/24885092/ finding-the-consecutive-zeros-in-a-numpy-array @@ -68,7 +68,7 @@ def _zero_runs(arr: npt.NDArray) -> npt.NDArray: arr: The input array. Returns: - A (num_zero_runs, 2)-dim array, containing the start and end indeces + A (num_zero_runs, 2)-dim array, containing the start and end indices of the zero runs. Examples: diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index 7b27196..6304f6f 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -160,7 +160,7 @@ def test_trie_modified(self, long_text): # annotators according to the configuration. # Run the interesting portions of Deduce initialization. 
- doc = Document(long_text,tokenizers={"default": SpaceSplitTokenizer()}) + doc = Document(long_text, tokenizers={"default": SpaceSplitTokenizer()}) trie = docdeid.ds.LookupTrie() # Yeah, the comma in "Smith," seems off... but then again, WordBoundaryTokenizer # considers whitespace to be tokens. There is no good choice. From 5d188cd494ec7020aeb2a0520563a483edb229d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 11 Mar 2024 12:48:21 +0100 Subject: [PATCH 16/41] Support whitespace trimming in `WordBoundaryTokenizer` --- docdeid/tokenizer.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py index c69197c..7826c39 100644 --- a/docdeid/tokenizer.py +++ b/docdeid/tokenizer.py @@ -355,11 +355,17 @@ def _split_text(self, text: str) -> list[Token]: class WordBoundaryTokenizer(Tokenizer): # pylint: disable=R0903 """ - Tokenizes based on word boundary. + Tokenizes based on word boundary. Sequences of non-alphanumeric characters are + also represented as tokens. - Whitespaces and similar characters are included as tokens. + Args: + keep_blanks: Keep whitespace in tokens, and whitespace-only tokens? 
""" + def __init__(self, keep_blanks=True): + super().__init__() + self._trim = not keep_blanks + def _split_text(self, text: str) -> list[Token]: tokens = [] matches = [*re.finditer(r"\b", text)] @@ -369,9 +375,21 @@ def _split_text(self, text: str) -> list[Token]: start_char = start_match.span(0)[0] end_char = end_match.span(0)[0] + if self._trim: + word = text[start_char:end_char] + orig_length = len(word) + word = word.rstrip() + end_char -= orig_length - len(word) + word = word.lstrip() + start_char = end_char - len(word) + if not word: + continue + else: + word = text[start_char:end_char] + tokens.append( Token( - text=text[start_char:end_char], + text=word, start_char=start_char, end_char=end_char, ) From 6ea9b744db40b3c97d9dd543ca9536b189f3062f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 11 Mar 2024 12:47:56 +0100 Subject: [PATCH 17/41] Move `SequenceTokenizer` to Docdeid This is needed so as to reduce the number of arguments for the `_match_sequence` method and creates a cleaner inheritance hierarchy between annotators, too. --- docdeid/annotation.py | 2 +- docdeid/process/__init__.py | 3 +- docdeid/process/annotator.py | 339 ++++++++++++++++++++++++++- tests/unit/process/test_annotator.py | 113 ++++++++- 4 files changed, 451 insertions(+), 6 deletions(-) diff --git a/docdeid/annotation.py b/docdeid/annotation.py index e7d7d7d..142a223 100644 --- a/docdeid/annotation.py +++ b/docdeid/annotation.py @@ -46,7 +46,7 @@ class Annotation: # pylint: disable=R0902 Should only be used when the annotation ends on a token boundary. 
""" - length: int = field(init=False) + length: int = field(init=False, compare=False) """The number of characters of the annotation text.""" _key_cache: dict = field(default_factory=dict, repr=False, compare=False) diff --git a/docdeid/process/__init__.py b/docdeid/process/__init__.py index 79387f1..6113db6 100644 --- a/docdeid/process/__init__.py +++ b/docdeid/process/__init__.py @@ -4,11 +4,12 @@ OverlapResolver, ) from .annotator import ( + _DIRECTION_MAP, # FIXME Stop using this. Annotator, MultiTokenLookupAnnotator, RegexpAnnotator, + SequenceAnnotator, SingleTokenLookupAnnotator, - TokenPatternAnnotator, ) from .doc_processor import DocProcessor, DocProcessorGroup from .redactor import RedactAllText, Redactor, SimpleRedactor diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index f97b3cc..b6c51b7 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -1,15 +1,61 @@ import re +import warnings from abc import ABC, abstractmethod -from typing import Iterable, Optional, Union +from collections import defaultdict +from dataclasses import dataclass +from typing import Iterable, Optional, Union, Literal, Mapping import docdeid.str from docdeid.annotation import Annotation from docdeid.document import Document +from docdeid.tokenizer import Token, TokenList +from docdeid.ds import DsCollection from docdeid.ds.lookup import LookupSet, LookupTrie from docdeid.pattern import TokenPattern from docdeid.process.doc_processor import DocProcessor from docdeid.str.processor import StringModifier -from docdeid.tokenizer import Token + + +_DIRECTION_MAP = { + "left": { + "attr": "previous", + "order": reversed, + "start_token": lambda annotation: annotation.start_token, + }, + "right": { + "attr": "next", + "order": lambda pattern: pattern, + "start_token": lambda annotation: annotation.end_token, + }, +} + + +@dataclass +class SimpleTokenPattern: + """A pattern for a token (and possibly its annotation, too).""" + func: 
Literal["equal", "re_match", "is_initial", "is_initials", "like_name", + "lookup", "neg_lookup", "tag"] + pattern: str + + +@dataclass +class NestedTokenPattern: + """Coordination of token patterns.""" + func: Literal["and", "or"] + pattern: list[TokenPattern] + + +TokenPattern = Union[SimpleTokenPattern, NestedTokenPattern] + + +@dataclass +class SequencePattern: + """ + Pattern for matching a sequence of tokens. + """ + direction: Literal["left", "right"] + skip: set[str] + pattern: list[TokenPattern] class Annotator(DocProcessor, ABC): @@ -46,6 +92,76 @@ def annotate(self, doc: Document) -> list[Annotation]: A list of annotations. """ + # FIXME This doesn't really belong here. Maybe to TokenList, rather. + @staticmethod + def _get_chained_token(token: Token, attr: str, skip: set[str]) -> Optional[Token]: + while True: + token = getattr(token, attr)() + + if token is None or token.text not in skip: + break + + return token + + def _match_sequence( + self, + doc: Document, + seq_pattern: SequencePattern, + start_token: Token, + annos_by_token: defaultdict[Token, Iterable[Annotation]], + ds: Optional[DsCollection] + ) -> Optional[Annotation]: + """ + Matches a token sequence pattern at `start_token`. + + Args: + doc: The document. + seq_pattern: The pattern to match. + start_token: The start token to match. + annos_by_token: Map from tokens to annotations covering it. + ds: Lookup dictionaries available. + + Returns: + An Annotation if matching is possible, None otherwise. + """ + + direction = seq_pattern.direction + # FIXME Avoid the dependency loop. 
+ attr = _DIRECTION_MAP[direction]["attr"] + pattern = _DIRECTION_MAP[direction]["order"](seq_pattern.pattern) + + current_token = start_token + end_token = start_token + + for pattern_position in pattern: + if current_token is None or not _PatternPositionMatcher.match( + token_pattern=pattern_position, + token=current_token, + annos=annos_by_token[current_token], + ds=ds, + metadata=doc.metadata, + ): + return None + + end_token = current_token + current_token = SequenceAnnotator._get_chained_token( + current_token, attr, seq_pattern.skip + ) + + start_token, end_token = _DIRECTION_MAP[direction]["order"]( + (start_token, end_token) + ) + + return Annotation( + text=doc.text[start_token.start_char : end_token.end_char], + start_char=start_token.start_char, + end_char=end_token.end_char, + tag=self.tag, + priority=self.priority, + start_token=start_token, + end_token=end_token, + ) + class SingleTokenLookupAnnotator(Annotator): """ @@ -309,3 +425,222 @@ def annotate(self, doc: Document) -> list[Annotation]: ) return annotations + + +class _PatternPositionMatcher: # pylint: disable=R0903 + """Checks if a token matches against a single pattern.""" + + @classmethod + def match(cls, token_pattern: dict | TokenPattern, **kwargs) -> bool: # pylint: + # disable=R0911 + """ + Matches a pattern position (a dict with one key). Other information should be + presented as kwargs. + + Args: + token_pattern: A dictionary with a single key, e.g. {'is_initial': True} + kwargs: Any other information, like the token or ds + + Returns: + True if the pattern position matches, false otherwise. 
+ """ + + if isinstance(token_pattern, dict): + return cls.match(as_token_pattern(token_pattern), **kwargs) + + func = token_pattern.func + value = token_pattern.pattern + + if func == "equal": + return kwargs.get("token").text == value + if func == "re_match": + return re.match(value, kwargs.get("token").text) is not None + if func == "is_initial": + + warnings.warn( + "is_initial matcher pattern is deprecated and will be removed " + "in a future version", + DeprecationWarning, + ) + + return ( + ( + len(kwargs.get("token").text) == 1 + and kwargs.get("token").text[0].isupper() + ) + or kwargs.get("token").text in {"Ch", "Chr", "Ph", "Th"} + ) == value + if func == "is_initials": + return ( + len(kwargs.get("token").text) <= 4 + and kwargs.get("token").text.isupper() + ) == value + if func == "like_name": + return ( + len(kwargs.get("token").text) >= 3 + and kwargs.get("token").text.istitle() + and not any(ch.isdigit() for ch in kwargs.get("token").text) + ) == value + if func == "lookup": + return cls._lookup(value, **kwargs) + if func == "neg_lookup": + return not cls._lookup(value, **kwargs) + if func == "tag": + annos = kwargs.get("annos", ()) + return any(anno.tag == value for anno in annos) + if func == "and": + return all(cls.match(x, **kwargs) for x in value) + if func == "or": + return any(cls.match(x, **kwargs) for x in value) + + raise NotImplementedError(f"No known logic for pattern {func}") + + @classmethod + def _lookup(cls, ent_type: str, **kwargs) -> bool: + token = kwargs.get("token").text + if "." 
in ent_type: + meta_key, meta_attr = ent_type.split(".", 1) + try: + meta_val = getattr(kwargs["metadata"][meta_key], meta_attr) + except (TypeError, KeyError, AttributeError): + return False + else: + return ( + token == meta_val + if isinstance(meta_val, str) + else token in meta_val + ) + else: # pylint: disable=R1705 + return token in kwargs.get("ds")[ent_type] + + +def as_token_pattern(pat_dict: dict) -> TokenPattern: + if len(pat_dict) != 1: + raise ValueError( + f"Cannot parse a token pattern which doesn't have exactly 1 key: " + f"{pat_dict}." + ) + func, value = next(iter(pat_dict.items())) + if func in ("and", "or"): + return NestedTokenPattern(func, list(map(as_token_pattern, value))) + return SimpleTokenPattern(func, value) + + +class SequenceAnnotator(Annotator): + """ + Annotates based on token patterns, which should be provided as a list of dicts. Each + position in the list denotes a token position, e.g.: [{'is_initial': True}, + {'like_name': True}] matches sequences of two tokens, where the first one is an + initial, and the second one is like a name. + + Arguments: + pattern: The pattern + ds: Lookup dictionaries. Those referenced by the pattern should be + LookupSets. (Don't ask why.) + skip: Any string values that should be skipped in matching (e.g. periods) + """ + + def __init__( + self, + pattern: list[dict], + *args, + ds: Optional[DsCollection] = None, + skip: Optional[list[str]] = None, + **kwargs, + ) -> None: + self.pattern = pattern + self.dicts = ds + self.skip = set(skip or []) + + self._start_words = None + self._matching_pipeline = None + + if len(self.pattern) > 0 and "lookup" in self.pattern[0]: + + if self.ds is None: + raise RuntimeError( + "Created pattern with lookup in TokenPatternAnnotator, but " + "no lookup structures provided." + ) + + lookup_list = self.ds[self.pattern[0]["lookup"]] + + # FIXME This doesn't work correctly for multiple ([{"lookup":"prefix"}, + # {"lookup":"interfix"}]) and nested patterns ("or", "and"). 
+ if not isinstance(lookup_list, LookupSet): + raise ValueError( + f"Expected a LookupSet, but got a " f"{type(lookup_list)}." + ) + + # FIXME This doesn't work correctly for multiple ([{"lookup":"prefix"}, + # {"lookup":"interfix"}]) and nested patterns ("or", "and"). + self._start_words = lookup_list.items() + # FIXME This doesn't work correctly for multiple ([{"lookup":"prefix"}, + # {"lookup":"interfix"}]) and nested patterns ("or", "and"). + self._matching_pipeline = lookup_list.matching_pipeline + + self._seq_pattern = SequencePattern("right", + set(skip or ()), + list(map(as_token_pattern, pattern))) + + super().__init__(*args, **kwargs) + + def annotate(self, doc: Document) -> list[Annotation]: + """ + Annotate the document, by matching the pattern against all tokens. + + Args: + doc: The document being processed. + + Returns: + A list of Annotation. + """ + + annotations = [] + + tokens = doc.get_tokens() + + if self._start_words is not None: + tokens = tokens.token_lookup( + lookup_values=self._start_words, + matching_pipeline=self._matching_pipeline, + ) + + annos_by_token = SequenceAnnotator._index_by_token( + doc.annotations, doc.token_lists + ) + + for token in tokens: + + annotation = self._match_sequence(doc, + self._seq_pattern, + token, + annos_by_token, + self.ds) + + if annotation is not None: + annotations.append(annotation) + + return annotations + + # TODO Test. + @classmethod + def _index_by_token( + cls, + annotations: Iterable[Annotation], + token_lists: Mapping[str, TokenList], + ) -> defaultdict[Token, set[Annotation]]: + """Assigns existing annotations to tokens.""" + annos_by_token = defaultdict(set) + for token_list in token_lists.values(): + # TODO Improve efficiency, simplify. 
+ for anno in annotations: + found_first = False + for token in token_list: + if anno.start_char < token.end_char: + found_first = True + if token.start_char >= anno.end_char: + break + if found_first: + annos_by_token[token].add(anno) + return annos_by_token diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index 6304f6f..aca7230 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -1,19 +1,25 @@ import re +from collections import defaultdict from unittest.mock import patch +import pytest + import docdeid.ds from docdeid.annotation import Annotation from docdeid.document import Document -from docdeid.ds import LookupTrie +from docdeid.ds import LookupTrie, DsCollection, LookupSet from docdeid.pattern import TokenPattern from docdeid.process.annotator import ( + as_token_pattern, MultiTokenLookupAnnotator, RegexpAnnotator, + SequenceAnnotator, + SequencePattern, SingleTokenLookupAnnotator, TokenPatternAnnotator, ) from docdeid.str.processor import LowercaseString -from docdeid.tokenizer import SpaceSplitTokenizer +from docdeid.tokenizer import SpaceSplitTokenizer, WordBoundaryTokenizer class TestSingleTokenLookupAnnotator: @@ -292,3 +298,106 @@ def test_multi_pattern(self, long_text, long_tokens_linked, multi_pattern): annotations = annotator.annotate(doc) assert annotations == expected_annotations + + +class TestSequenceAnnotator: + @pytest.fixture + def ds(self): + ds = DsCollection() + + first_names = ["Andries", "pieter", "Aziz", "Bernard"] + surnames = ["Meijer", "Smit", "Bakker", "Heerma"] + + ds["first_names"] = LookupSet() + ds["first_names"].add_items_from_iterable(items=first_names) + + ds["surnames"] = LookupSet() + ds["surnames"].add_items_from_iterable(items=surnames) + + return ds + + @pytest.fixture + def pattern_doc(self): + return Document( + text="De man heet Andries Meijer-Heerma, voornaam Andries.", + tokenizers={"default": WordBoundaryTokenizer(False)} + ) + + def 
test_match_sequence(self, pattern_doc, ds): + pattern = [{"lookup": "first_names"}, {"like_name": True}] + + tpa = SequenceAnnotator(pattern=[], ds=ds, tag="_") + + assert tpa._match_sequence( + pattern_doc, + SequencePattern("right", set(), list(map(as_token_pattern, pattern))), + start_token=pattern_doc.get_tokens()[3], + annos_by_token=defaultdict(list), + ds=ds, + ) == Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") + assert ( + tpa._match_sequence( + pattern_doc, + SequencePattern("right", set(), list(map(as_token_pattern, pattern))), + start_token=pattern_doc.get_tokens()[7], + annos_by_token=defaultdict(list), + ds=ds, + ) + is None + ) + + def test_match_sequence_left(self, pattern_doc, ds): + pattern = [{"lookup": "first_names"}, {"like_name": True}] + + tpa = SequenceAnnotator(pattern=[], ds=ds, tag="_") + + assert tpa._match_sequence( + pattern_doc, + SequencePattern("left", set(), list(map(as_token_pattern, pattern))), + start_token=pattern_doc.get_tokens()[4], + annos_by_token=defaultdict(list), + ds=ds, + ) == Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") + + assert ( + tpa._match_sequence( + pattern_doc, + SequencePattern("left", set(), list(map(as_token_pattern, pattern))), + start_token=pattern_doc.get_tokens()[8], + annos_by_token=defaultdict(list), + ds=ds, + ) + is None + ) + + def test_match_sequence_skip(self, pattern_doc, ds): + pattern = [{"lookup": "surnames"}, {"like_name": True}] + + tpa = SequenceAnnotator(pattern=[], ds=ds, tag="_") + + assert tpa._match_sequence( + pattern_doc, + SequencePattern("right", {"-"}, list(map(as_token_pattern, pattern))), + start_token=pattern_doc.get_tokens()[4], + annos_by_token=defaultdict(list), + ds=ds, + ) == Annotation(text="Meijer-Heerma", start_char=20, end_char=33, tag="_") + assert ( + tpa._match_sequence( + pattern_doc, + SequencePattern("right", set(), list(map(as_token_pattern, pattern))), + start_token=pattern_doc.get_tokens()[4], + 
annos_by_token=defaultdict(list), + ds=ds, + ) + is None + ) + + def test_annotate(self, pattern_doc, ds): + pattern = [{"lookup": "first_names"}, {"like_name": True}] + + tpa = SequenceAnnotator(pattern=pattern, ds=ds, tag="_") + + assert tpa.annotate(pattern_doc) == [ + Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") + ] From 4110a53b15ee5cfa72d2601be426b15542734dc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 11 Mar 2024 17:55:11 +0100 Subject: [PATCH 18/41] Format code --- docdeid/process/__init__.py | 2 +- docdeid/process/annotator.py | 55 +++++++++++++++++----------- docdeid/tokenizer.py | 6 +-- tests/unit/process/test_annotator.py | 54 +++++++++++++-------------- 4 files changed, 65 insertions(+), 52 deletions(-) diff --git a/docdeid/process/__init__.py b/docdeid/process/__init__.py index 6113db6..1333cee 100644 --- a/docdeid/process/__init__.py +++ b/docdeid/process/__init__.py @@ -3,8 +3,8 @@ MergeAdjacentAnnotations, OverlapResolver, ) +from .annotator import _DIRECTION_MAP # FIXME Stop using this. from .annotator import ( - _DIRECTION_MAP, # FIXME Stop using this. 
Annotator, MultiTokenLookupAnnotator, RegexpAnnotator, diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index b6c51b7..911e4fb 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -3,18 +3,17 @@ from abc import ABC, abstractmethod from collections import defaultdict from dataclasses import dataclass -from typing import Iterable, Optional, Union, Literal, Mapping +from typing import Iterable, Literal, Mapping, Optional, Union import docdeid.str from docdeid.annotation import Annotation from docdeid.document import Document -from docdeid.tokenizer import Token, TokenList from docdeid.ds import DsCollection from docdeid.ds.lookup import LookupSet, LookupTrie from docdeid.pattern import TokenPattern from docdeid.process.doc_processor import DocProcessor from docdeid.str.processor import StringModifier - +from docdeid.tokenizer import Token, TokenList _DIRECTION_MAP = { "left": { @@ -33,14 +32,24 @@ @dataclass class SimpleTokenPattern: """A pattern for a token (and possibly its annotation, too).""" - func: Literal["equal", "re_match", "is_initial", "is_initials", "like_name", - "lookup", "neg_lookup", "tag"] + + func: Literal[ + "equal", + "re_match", + "is_initial", + "is_initials", + "like_name", + "lookup", + "neg_lookup", + "tag", + ] pattern: str @dataclass class NestedTokenPattern: """Coordination of token patterns.""" + func: Literal["and", "or"] pattern: list[TokenPattern] @@ -50,9 +59,8 @@ class NestedTokenPattern: @dataclass class SequencePattern: - """ - Pattern for matching a sequence of tokens. 
- """ + """Pattern for matching a sequence of tokens.""" + direction: Literal["left", "right"] skip: set[str] pattern: list[TokenPattern] @@ -109,7 +117,7 @@ def _match_sequence( seq_pattern: SequencePattern, start_token: Token, annos_by_token: defaultdict[Token, Iterable[Annotation]], - ds: Optional[DsCollection] + ds: Optional[DsCollection], ) -> Optional[Annotation]: """ Matches a token sequence pattern at `start_token`. @@ -515,6 +523,13 @@ def _lookup(cls, ent_type: str, **kwargs) -> bool: def as_token_pattern(pat_dict: dict) -> TokenPattern: + """ + Converts the JSON dictionary representation of token patterns into a + `TokenPattern` instance. + + Args: + pat_dict: the JSON representation of the pattern + """ if len(pat_dict) != 1: raise ValueError( f"Cannot parse a token pattern which doesn't have exactly 1 key: " @@ -535,8 +550,8 @@ class SequenceAnnotator(Annotator): Arguments: pattern: The pattern - ds: Lookup dictionaries. Those referenced by the pattern should be - LookupSets. (Don't ask why.) + ds: Lookup dictionaries. Those referenced by the pattern should be LookupSets. + (Don't ask why.) skip: Any string values that should be skipped in matching (e.g. periods) """ @@ -557,13 +572,13 @@ def __init__( if len(self.pattern) > 0 and "lookup" in self.pattern[0]: - if self.ds is None: + if self.dicts is None: raise RuntimeError( "Created pattern with lookup in TokenPatternAnnotator, but " "no lookup structures provided." ) - lookup_list = self.ds[self.pattern[0]["lookup"]] + lookup_list = self.dicts[self.pattern[0]["lookup"]] # FIXME This doesn't work correctly for multiple ([{"lookup":"prefix"}, # {"lookup":"interfix"}]) and nested patterns ("or", "and"). @@ -579,9 +594,9 @@ def __init__( # {"lookup":"interfix"}]) and nested patterns ("or", "and"). 
self._matching_pipeline = lookup_list.matching_pipeline - self._seq_pattern = SequencePattern("right", - set(skip or ()), - list(map(as_token_pattern, pattern))) + self._seq_pattern = SequencePattern( + "right", set(skip or ()), list(map(as_token_pattern, pattern)) + ) super().__init__(*args, **kwargs) @@ -612,11 +627,9 @@ def annotate(self, doc: Document) -> list[Annotation]: for token in tokens: - annotation = self._match_sequence(doc, - self._seq_pattern, - token, - annos_by_token, - self.ds) + annotation = self._match_sequence( + doc, self._seq_pattern, token, annos_by_token, self.dicts + ) if annotation is not None: annotations.append(annotation) diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py index 7826c39..e2eae5d 100644 --- a/docdeid/tokenizer.py +++ b/docdeid/tokenizer.py @@ -355,14 +355,14 @@ def _split_text(self, text: str) -> list[Token]: class WordBoundaryTokenizer(Tokenizer): # pylint: disable=R0903 """ - Tokenizes based on word boundary. Sequences of non-alphanumeric characters are - also represented as tokens. + Tokenizes based on word boundary. Sequences of non-alphanumeric characters are also + represented as tokens. Args: keep_blanks: Keep whitespace in tokens, and whitespace-only tokens? 
""" - def __init__(self, keep_blanks=True): + def __init__(self, keep_blanks: bool = True) -> None: super().__init__() self._trim = not keep_blanks diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index aca7230..ab150ef 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -7,16 +7,16 @@ import docdeid.ds from docdeid.annotation import Annotation from docdeid.document import Document -from docdeid.ds import LookupTrie, DsCollection, LookupSet +from docdeid.ds import DsCollection, LookupSet, LookupTrie from docdeid.pattern import TokenPattern from docdeid.process.annotator import ( - as_token_pattern, MultiTokenLookupAnnotator, RegexpAnnotator, SequenceAnnotator, SequencePattern, SingleTokenLookupAnnotator, TokenPatternAnnotator, + as_token_pattern, ) from docdeid.str.processor import LowercaseString from docdeid.tokenizer import SpaceSplitTokenizer, WordBoundaryTokenizer @@ -320,7 +320,7 @@ def ds(self): def pattern_doc(self): return Document( text="De man heet Andries Meijer-Heerma, voornaam Andries.", - tokenizers={"default": WordBoundaryTokenizer(False)} + tokenizers={"default": WordBoundaryTokenizer(False)}, ) def test_match_sequence(self, pattern_doc, ds): @@ -336,14 +336,14 @@ def test_match_sequence(self, pattern_doc, ds): ds=ds, ) == Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") assert ( - tpa._match_sequence( - pattern_doc, - SequencePattern("right", set(), list(map(as_token_pattern, pattern))), - start_token=pattern_doc.get_tokens()[7], - annos_by_token=defaultdict(list), - ds=ds, - ) - is None + tpa._match_sequence( + pattern_doc, + SequencePattern("right", set(), list(map(as_token_pattern, pattern))), + start_token=pattern_doc.get_tokens()[7], + annos_by_token=defaultdict(list), + ds=ds, + ) + is None ) def test_match_sequence_left(self, pattern_doc, ds): @@ -360,14 +360,14 @@ def test_match_sequence_left(self, pattern_doc, ds): ) == 
Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") assert ( - tpa._match_sequence( - pattern_doc, - SequencePattern("left", set(), list(map(as_token_pattern, pattern))), - start_token=pattern_doc.get_tokens()[8], - annos_by_token=defaultdict(list), - ds=ds, - ) - is None + tpa._match_sequence( + pattern_doc, + SequencePattern("left", set(), list(map(as_token_pattern, pattern))), + start_token=pattern_doc.get_tokens()[8], + annos_by_token=defaultdict(list), + ds=ds, + ) + is None ) def test_match_sequence_skip(self, pattern_doc, ds): @@ -383,14 +383,14 @@ def test_match_sequence_skip(self, pattern_doc, ds): ds=ds, ) == Annotation(text="Meijer-Heerma", start_char=20, end_char=33, tag="_") assert ( - tpa._match_sequence( - pattern_doc, - SequencePattern("right", set(), list(map(as_token_pattern, pattern))), - start_token=pattern_doc.get_tokens()[4], - annos_by_token=defaultdict(list), - ds=ds, - ) - is None + tpa._match_sequence( + pattern_doc, + SequencePattern("right", set(), list(map(as_token_pattern, pattern))), + start_token=pattern_doc.get_tokens()[4], + annos_by_token=defaultdict(list), + ds=ds, + ) + is None ) def test_annotate(self, pattern_doc, ds): From df73e54730d2ed3aa8b3df94d0b8803e17c6e309 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 11 Mar 2024 19:57:59 +0100 Subject: [PATCH 19/41] Replace `_DIRECTION_MAP` with an enum --- docdeid/direction.py | 41 ++++++++++++ docdeid/process/__init__.py | 1 - docdeid/process/annotator.py | 94 +++++++++++----------------- docdeid/tokenizer.py | 32 +++++++++- tests/unit/process/test_annotator.py | 25 ++++++-- 5 files changed, 128 insertions(+), 65 deletions(-) create mode 100644 docdeid/direction.py diff --git a/docdeid/direction.py b/docdeid/direction.py new file mode 100644 index 0000000..6c4b104 --- /dev/null +++ b/docdeid/direction.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +from enum import IntEnum +from typing import Iterable, Sequence, TypeVar + +T 
= TypeVar('T') + + +class Direction(IntEnum): + """Direction in text -- either left or right.""" + LEFT = -1 + RIGHT = 1 + + @property + def opposite(self) -> Direction: + """The opposite direction to this.""" + return Direction(-self) + + @staticmethod + def from_string(val: str) -> Direction: + """ + Parses a Direction from a string, which must be either 'left' or 'right' after + lowercasing. + """ + norm = val.lower() + if norm == "left": + return Direction.LEFT + if norm == "right": + return Direction.RIGHT + raise ValueError("Invalid direction: '%s'".format(val)) + + def iter(self, seq: Sequence[T]) -> Iterable[T]: + """ + Returns an iterator over the given sequence that traverses it in this direction. + + Args: + seq: sequence to iterate over + """ + if self is Direction.RIGHT: + return seq + return reversed(seq) diff --git a/docdeid/process/__init__.py b/docdeid/process/__init__.py index 1333cee..d0f040f 100644 --- a/docdeid/process/__init__.py +++ b/docdeid/process/__init__.py @@ -3,7 +3,6 @@ MergeAdjacentAnnotations, OverlapResolver, ) -from .annotator import _DIRECTION_MAP # FIXME Stop using this. 
from .annotator import ( Annotator, MultiTokenLookupAnnotator, diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 911e4fb..690c44e 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import re import warnings from abc import ABC, abstractmethod @@ -7,6 +9,7 @@ import docdeid.str from docdeid.annotation import Annotation +from docdeid.direction import Direction from docdeid.document import Document from docdeid.ds import DsCollection from docdeid.ds.lookup import LookupSet, LookupTrie @@ -15,19 +18,6 @@ from docdeid.str.processor import StringModifier from docdeid.tokenizer import Token, TokenList -_DIRECTION_MAP = { - "left": { - "attr": "previous", - "order": reversed, - "start_token": lambda annotation: annotation.start_token, - }, - "right": { - "attr": "next", - "order": lambda pattern: pattern, - "start_token": lambda annotation: annotation.end_token, - }, -} - @dataclass class SimpleTokenPattern: @@ -61,7 +51,7 @@ class NestedTokenPattern: class SequencePattern: """Pattern for matching a sequence of tokens.""" - direction: Literal["left", "right"] + direction: Direction skip: set[str] pattern: list[TokenPattern] @@ -100,17 +90,6 @@ def annotate(self, doc: Document) -> list[Annotation]: A list of annotations. """ - # FIXME This doesn't really belong here. Maybe to TokenList, rather. - @staticmethod - def _get_chained_token(token: Token, attr: str, skip: set[str]) -> Optional[Token]: - while True: - token = getattr(token, attr)() - - if token is None or token.text not in skip: - break - - return token - def _match_sequence( self, doc: Document, @@ -133,42 +112,41 @@ def _match_sequence( An Annotation if matching is possible, None otherwise. """ - direction = seq_pattern.direction - # FIXME Avoid the dependency loop. 
- attr = _DIRECTION_MAP[direction]["attr"] - pattern = _DIRECTION_MAP[direction]["order"](seq_pattern.pattern) + dir_ = seq_pattern.direction - current_token = start_token - end_token = start_token + tokens = (token for token in start_token.iter_to(dir_) + if token.text not in seq_pattern.skip) + # Iterate the token patterns in the direction corresponding to the surface + # order it's supposed to match (i.e. "left" means "iterate patterns from the + # end"). + tok_patterns = dir_.iter(seq_pattern.pattern) - for pattern_position in pattern: - if current_token is None or not _PatternPositionMatcher.match( - token_pattern=pattern_position, - token=current_token, - annos=annos_by_token[current_token], - ds=ds, - metadata=doc.metadata, + num_matched = 0 + end_token = start_token + for tok_pattern, end_token in zip(tok_patterns, tokens): + if _PatternPositionMatcher.match( + token_pattern=tok_pattern, + token=end_token, + annos=annos_by_token[end_token], + ds=ds, + metadata=doc.metadata, ): - return None - - end_token = current_token - current_token = SequenceAnnotator._get_chained_token( - current_token, attr, seq_pattern.skip - ) + num_matched += 1 + else: + break - start_token, end_token = _DIRECTION_MAP[direction]["order"]( - (start_token, end_token) - ) + if num_matched == len(seq_pattern.pattern): + left_token, right_token = dir_.iter((start_token, end_token)) - return Annotation( - text=doc.text[start_token.start_char : end_token.end_char], - start_char=start_token.start_char, - end_char=end_token.end_char, - tag=self.tag, - priority=self.priority, - start_token=start_token, - end_token=end_token, - ) + return Annotation( + text=doc.text[left_token.start_char : right_token.end_char], + start_char=left_token.start_char, + end_char=right_token.end_char, + tag=self.tag, + priority=self.priority, + start_token=left_token, + end_token=right_token, + ) class SingleTokenLookupAnnotator(Annotator): @@ -595,7 +573,9 @@ def __init__( self._matching_pipeline = 
lookup_list.matching_pipeline self._seq_pattern = SequencePattern( - "right", set(skip or ()), list(map(as_token_pattern, pattern)) + Direction.RIGHT, + set(skip or ()), + list(map(as_token_pattern, pattern)) ) super().__init__(*args, **kwargs) diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py index e2eae5d..249a0ab 100644 --- a/docdeid/tokenizer.py +++ b/docdeid/tokenizer.py @@ -6,8 +6,9 @@ from collections import defaultdict from collections.abc import Sequence from dataclasses import dataclass, field -from typing import Iterator, Literal, Optional, SupportsIndex, overload +from typing import Iterator, Literal, Optional, SupportsIndex, overload, Generator +from docdeid.direction import Direction from docdeid.str import StringModifier @@ -122,6 +123,35 @@ def next(self, num: int = 1) -> Optional[Token]: """ return self._get_linked_token(num=num, attr="_next_token") + def get_nth(self, + num: int = 1, + dir_: Direction = Direction.RIGHT, + ) -> Optional[Token]: + """ + Finds the _n_-th token to the left or right. + + Args: + num: number of tokens to move + dir_: direction to go + """ + if num < 0: + return self.get_nth(-num, dir_.opposite) + return self.next(num) if dir_ is Direction.RIGHT else self.previous(num) + + def iter_to(self, + dir_: Direction = Direction.RIGHT, + ) -> Generator[Token, None, None]: + """ + Iterates linked tokens in the specified direction. + + Args: + dir_: direction to go + """ + token = self + while token is not None: + yield token + token = token.next() if dir_ is Direction.RIGHT else token.previous() + def __len__(self) -> int: """ The length of the text. 
diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index ab150ef..6bcb3ec 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -6,6 +6,7 @@ import docdeid.ds from docdeid.annotation import Annotation +from docdeid.direction import Direction from docdeid.document import Document from docdeid.ds import DsCollection, LookupSet, LookupTrie from docdeid.pattern import TokenPattern @@ -330,7 +331,9 @@ def test_match_sequence(self, pattern_doc, ds): assert tpa._match_sequence( pattern_doc, - SequencePattern("right", set(), list(map(as_token_pattern, pattern))), + SequencePattern(Direction.RIGHT, + set(), + list(map(as_token_pattern, pattern))), start_token=pattern_doc.get_tokens()[3], annos_by_token=defaultdict(list), ds=ds, @@ -338,7 +341,9 @@ def test_match_sequence(self, pattern_doc, ds): assert ( tpa._match_sequence( pattern_doc, - SequencePattern("right", set(), list(map(as_token_pattern, pattern))), + SequencePattern(Direction.RIGHT, + set(), + list(map(as_token_pattern, pattern))), start_token=pattern_doc.get_tokens()[7], annos_by_token=defaultdict(list), ds=ds, @@ -353,7 +358,9 @@ def test_match_sequence_left(self, pattern_doc, ds): assert tpa._match_sequence( pattern_doc, - SequencePattern("left", set(), list(map(as_token_pattern, pattern))), + SequencePattern(Direction.LEFT, + set(), + list(map(as_token_pattern, pattern))), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), ds=ds, @@ -362,7 +369,9 @@ def test_match_sequence_left(self, pattern_doc, ds): assert ( tpa._match_sequence( pattern_doc, - SequencePattern("left", set(), list(map(as_token_pattern, pattern))), + SequencePattern(Direction.LEFT, + set(), + list(map(as_token_pattern, pattern))), start_token=pattern_doc.get_tokens()[8], annos_by_token=defaultdict(list), ds=ds, @@ -377,7 +386,9 @@ def test_match_sequence_skip(self, pattern_doc, ds): assert tpa._match_sequence( pattern_doc, - 
SequencePattern("right", {"-"}, list(map(as_token_pattern, pattern))), + SequencePattern(Direction.RIGHT, + {"-"}, + list(map(as_token_pattern, pattern))), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), ds=ds, @@ -385,7 +396,9 @@ def test_match_sequence_skip(self, pattern_doc, ds): assert ( tpa._match_sequence( pattern_doc, - SequencePattern("right", set(), list(map(as_token_pattern, pattern))), + SequencePattern(Direction.RIGHT, + set(), + list(map(as_token_pattern, pattern))), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), ds=ds, From 99163d6700b43f34392d79222ea08417b8745c97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 11 Mar 2024 22:16:11 +0100 Subject: [PATCH 20/41] Improve and test `annos_by_token()` --- docdeid/annotation.py | 45 ++++++++++++++++++++++++ docdeid/document.py | 5 +++ docdeid/process/annotator.py | 26 +------------- tests/unit/test_annotation.py | 65 ++++++++++++++++++++++++++++++++++- 4 files changed, 115 insertions(+), 26 deletions(-) diff --git a/docdeid/annotation.py b/docdeid/annotation.py index 142a223..7f4f538 100644 --- a/docdeid/annotation.py +++ b/docdeid/annotation.py @@ -1,3 +1,4 @@ +from collections import defaultdict from dataclasses import dataclass, field from typing import Any, Callable, Optional @@ -126,6 +127,10 @@ class AnnotationSet(set[Annotation]): It extends the builtin ``set``. """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._annos_by_tokenizers_by_token = {} + def sorted( self, by: tuple, # pylint: disable=C0103 @@ -185,3 +190,43 @@ def has_overlap(self) -> bool: return True return False + + def annos_by_token(self, doc: "Document") -> defaultdict[Token, set[Annotation]]: + """ + Returns a mapping from document tokens to annotations. 
+ + Args: + doc: document whose tokens are to be linked + """ + # We key the token->annotations cache only by the set of tokenizers where it + # actually (obviously) depends also on the document. However, it's assumed + # that an AnnotationSet is always bound only to one document. + tokenizers = frozenset(doc.token_lists) + if tokenizers not in self._annos_by_tokenizers_by_token: + annos_by_token = defaultdict(set) + for token_list in doc.token_lists.values(): + if not token_list: + continue + cur_tok_idx = 0 + tok = token_list[cur_tok_idx] + for anno in self.sorted(by=("start_char", )): + try: + # Iterate over tokens till we reach the annotation. + while tok.end_char < anno.start_char: + cur_tok_idx += 1 + tok = token_list[cur_tok_idx] + except IndexError: + break + else: + # Iterate over tokens in the annotation till we reach the end + # of it or the end of the tokens. + anno_tok_idx = cur_tok_idx + anno_tok = tok + while anno_tok.start_char < anno.end_char: + annos_by_token[anno_tok].add(anno) + if anno_tok_idx == len(token_list) - 1: + break + anno_tok_idx += 1 + anno_tok = token_list[anno_tok_idx] + self._annos_by_tokenizers_by_token[tokenizers] = annos_by_token + return self._annos_by_tokenizers_by_token[tokenizers] diff --git a/docdeid/document.py b/docdeid/document.py index c7b6c20..aa3f1a2 100644 --- a/docdeid/document.py +++ b/docdeid/document.py @@ -100,12 +100,17 @@ def text(self) -> str: @property def tokenizers(self) -> Optional[Mapping[str, Tokenizer]]: """Available tokenizers indexed by their name.""" + if self._tokenizers is None: + raise RuntimeError("No tokenizers initialized.") return self._tokenizers @property def token_lists(self) -> Mapping[str, TokenList]: """Lists of tokens of the document, indexed by the name of the corresponding tokenizer.""" + for tokker_name in set(self.tokenizers) - set(self._token_lists): + tokker = self._tokenizers[tokker_name] + self._token_lists[tokker_name] = tokker.tokenize(self._text) return self._token_lists 
def get_tokens(self, tokenizer_name: str = "default") -> TokenList: diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 690c44e..38a826b 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -601,9 +601,7 @@ def annotate(self, doc: Document) -> list[Annotation]: matching_pipeline=self._matching_pipeline, ) - annos_by_token = SequenceAnnotator._index_by_token( - doc.annotations, doc.token_lists - ) + annos_by_token = doc.annotations.annos_by_token(doc) for token in tokens: @@ -615,25 +613,3 @@ def annotate(self, doc: Document) -> list[Annotation]: annotations.append(annotation) return annotations - - # TODO Test. - @classmethod - def _index_by_token( - cls, - annotations: Iterable[Annotation], - token_lists: Mapping[str, TokenList], - ) -> defaultdict[Token, set[Annotation]]: - """Assigns existing annotations to tokens.""" - annos_by_token = defaultdict(set) - for token_list in token_lists.values(): - # TODO Improve efficiency, simplify. 
- for anno in annotations: - found_first = False - for token in token_list: - if anno.start_char < token.end_char: - found_first = True - if token.start_char >= anno.end_char: - break - if found_first: - annos_by_token[token].add(anno) - return annos_by_token diff --git a/tests/unit/test_annotation.py b/tests/unit/test_annotation.py index fe4f785..7e32ca4 100644 --- a/tests/unit/test_annotation.py +++ b/tests/unit/test_annotation.py @@ -1,8 +1,11 @@ +import re + import pytest from frozendict import frozendict +from docdeid import Document from docdeid.annotation import Annotation, AnnotationSet -from docdeid.tokenizer import Token +from docdeid.tokenizer import Token, WordBoundaryTokenizer, Tokenizer class TestAnnotation: @@ -157,3 +160,63 @@ def test_get_annotations_sorted_no_frozendict(self, annotations): _ = annotation_set.sorted( by=("priority", "length"), callbacks=dict(length=lambda x: -x) ) + + def test_annos_by_token(self, annotations): + doc = Document("1 2 3 1 2 3 hum Hello hum I'm Bob - said Cindy", + tokenizers={"default": WordBoundaryTokenizer(False)}) + aset = AnnotationSet([ + a1 := Annotation("Hello", 16, 21, "word"), + a2 := Annotation("I", 26, 27, "ltr"), + a3 := Annotation("I'm", 26, 29, "words"), + a4 := Annotation("Bob", 30, 33, "name"), + a5 := Annotation("I'm Bob", 26, 33, "stmt"), + ]) + + # import pydevd_pycharm + # pydevd_pycharm.settrace() + + got = aset.annos_by_token(doc) + + want = { + Token("Hello", 16, 21): {a1}, + Token("I", 26, 27): {a2, a3, a5}, + Token("'", 27, 28): {a3, a5}, + Token("m", 28, 29): {a3, a5}, + Token("Bob", 30, 33): {a4, a5}, + } + + assert got == want + + def test_annos_by_token_2(self, annotations): + class HumTokenizer(Tokenizer): + """Extracts each "hum" word and the following word as a token.""" + def _split_text(self, text: str) -> list[Token]: + return [ + Token(match.group(0), match.start(), match.end()) + for match in re.finditer("\\bhum\\s+\\w+", text) + ] + + doc = Document("1 2 3 1 2 3 hum Hello hum I'm 
Bob - said Cindy", + tokenizers={"default": WordBoundaryTokenizer(False), + "for_fun": HumTokenizer()}) + aset = AnnotationSet([ + a1 := Annotation("Hello", 16, 21, "word"), + a2 := Annotation("I", 26, 27, "ltr"), + a3 := Annotation("I'm", 26, 29, "words"), + a4 := Annotation("Bob", 30, 33, "name"), + a5 := Annotation("I'm Bob", 26, 33, "stmt"), + ]) + + got = aset.annos_by_token(doc) + + want = { + Token("Hello", 16, 21): {a1}, + Token("I", 26, 27): {a2, a3, a5}, + Token("'", 27, 28): {a3, a5}, + Token("m", 28, 29): {a3, a5}, + Token("Bob", 30, 33): {a4, a5}, + Token("hum Hello", 12, 21): {a1}, + Token("hum I", 22, 27): {a2, a3, a5}, + } + + assert got == want From c7ba5bc8890ab7b1dbeb11ad83e3da75f88042fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Tue, 12 Mar 2024 12:34:24 +0100 Subject: [PATCH 21/41] Drop `Token.get_nth`, simplify `Token.iter_to` --- docdeid/tokenizer.py | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py index 249a0ab..f055503 100644 --- a/docdeid/tokenizer.py +++ b/docdeid/tokenizer.py @@ -123,24 +123,8 @@ def next(self, num: int = 1) -> Optional[Token]: """ return self._get_linked_token(num=num, attr="_next_token") - def get_nth(self, - num: int = 1, - dir_: Direction = Direction.RIGHT, - ) -> Optional[Token]: - """ - Finds the _n_-th token to the left or right. - - Args: - num: number of tokens to move - dir_: direction to go - """ - if num < 0: - return self.get_nth(-num, dir_.opposite) - return self.next(num) if dir_ is Direction.RIGHT else self.previous(num) - - def iter_to(self, - dir_: Direction = Direction.RIGHT, - ) -> Generator[Token, None, None]: + def iter_to(self, dir_: Direction = Direction.RIGHT, + ) -> Generator[Token, None, None]: """ Iterates linked tokens in the specified direction. 
@@ -150,7 +134,8 @@ def iter_to(self, token = self while token is not None: yield token - token = token.next() if dir_ is Direction.RIGHT else token.previous() + token = (token._next_token if dir_ is Direction.RIGHT else + token._previous_token) def __len__(self) -> int: """ From c80e2adda4239fc6d033884d701ec51f975d211d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Tue, 12 Mar 2024 22:03:28 +0100 Subject: [PATCH 22/41] Format code --- docdeid/annotation.py | 2 +- docdeid/direction.py | 9 +++-- docdeid/process/annotator.py | 25 +++++++------- docdeid/tokenizer.py | 13 ++++--- tests/unit/process/test_annotator.py | 36 ++++++++++---------- tests/unit/test_annotation.py | 51 +++++++++++++++++----------- 6 files changed, 75 insertions(+), 61 deletions(-) diff --git a/docdeid/annotation.py b/docdeid/annotation.py index 7f4f538..6689365 100644 --- a/docdeid/annotation.py +++ b/docdeid/annotation.py @@ -209,7 +209,7 @@ def annos_by_token(self, doc: "Document") -> defaultdict[Token, set[Annotation]] continue cur_tok_idx = 0 tok = token_list[cur_tok_idx] - for anno in self.sorted(by=("start_char", )): + for anno in self.sorted(by=("start_char",)): try: # Iterate over tokens till we reach the annotation. while tok.end_char < anno.start_char: diff --git a/docdeid/direction.py b/docdeid/direction.py index 6c4b104..a6eaeb5 100644 --- a/docdeid/direction.py +++ b/docdeid/direction.py @@ -3,11 +3,12 @@ from enum import IntEnum from typing import Iterable, Sequence, TypeVar -T = TypeVar('T') +T = TypeVar("T") class Direction(IntEnum): """Direction in text -- either left or right.""" + LEFT = -1 RIGHT = 1 @@ -18,10 +19,8 @@ def opposite(self) -> Direction: @staticmethod def from_string(val: str) -> Direction: - """ - Parses a Direction from a string, which must be either 'left' or 'right' after - lowercasing. 
- """ + """Parses a Direction from a string, which must be either 'left' or 'right' + after lowercasing.""" norm = val.lower() if norm == "left": return Direction.LEFT diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 38a826b..67fc169 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -114,8 +114,11 @@ def _match_sequence( dir_ = seq_pattern.direction - tokens = (token for token in start_token.iter_to(dir_) - if token.text not in seq_pattern.skip) + tokens = ( + token + for token in start_token.iter_to(dir_) + if token.text not in seq_pattern.skip + ) # Iterate the token patterns in the direction corresponding to the surface # order it's supposed to match (i.e. "left" means "iterate patterns from the # end"). @@ -125,11 +128,11 @@ def _match_sequence( end_token = start_token for tok_pattern, end_token in zip(tok_patterns, tokens): if _PatternPositionMatcher.match( - token_pattern=tok_pattern, - token=end_token, - annos=annos_by_token[end_token], - ds=ds, - metadata=doc.metadata, + token_pattern=tok_pattern, + token=end_token, + annos=annos_by_token[end_token], + ds=ds, + metadata=doc.metadata, ): num_matched += 1 else: @@ -502,8 +505,8 @@ def _lookup(cls, ent_type: str, **kwargs) -> bool: def as_token_pattern(pat_dict: dict) -> TokenPattern: """ - Converts the JSON dictionary representation of token patterns into a - `TokenPattern` instance. + Converts the JSON dictionary representation of token patterns into a `TokenPattern` + instance. 
Args: pat_dict: the JSON representation of the pattern @@ -573,9 +576,7 @@ def __init__( self._matching_pipeline = lookup_list.matching_pipeline self._seq_pattern = SequencePattern( - Direction.RIGHT, - set(skip or ()), - list(map(as_token_pattern, pattern)) + Direction.RIGHT, set(skip or ()), list(map(as_token_pattern, pattern)) ) super().__init__(*args, **kwargs) diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py index f055503..d5c4efa 100644 --- a/docdeid/tokenizer.py +++ b/docdeid/tokenizer.py @@ -6,7 +6,7 @@ from collections import defaultdict from collections.abc import Sequence from dataclasses import dataclass, field -from typing import Iterator, Literal, Optional, SupportsIndex, overload, Generator +from typing import Generator, Iterator, Literal, Optional, SupportsIndex, overload from docdeid.direction import Direction from docdeid.str import StringModifier @@ -123,8 +123,10 @@ def next(self, num: int = 1) -> Optional[Token]: """ return self._get_linked_token(num=num, attr="_next_token") - def iter_to(self, dir_: Direction = Direction.RIGHT, - ) -> Generator[Token, None, None]: + def iter_to( + self, + dir_: Direction = Direction.RIGHT, + ) -> Generator[Token, None, None]: """ Iterates linked tokens in the specified direction. 
@@ -134,8 +136,9 @@ def iter_to(self, dir_: Direction = Direction.RIGHT, token = self while token is not None: yield token - token = (token._next_token if dir_ is Direction.RIGHT else - token._previous_token) + token = ( + token._next_token if dir_ is Direction.RIGHT else token._previous_token + ) def __len__(self) -> int: """ diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index 6bcb3ec..efe7a6a 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -331,9 +331,9 @@ def test_match_sequence(self, pattern_doc, ds): assert tpa._match_sequence( pattern_doc, - SequencePattern(Direction.RIGHT, - set(), - list(map(as_token_pattern, pattern))), + SequencePattern( + Direction.RIGHT, set(), list(map(as_token_pattern, pattern)) + ), start_token=pattern_doc.get_tokens()[3], annos_by_token=defaultdict(list), ds=ds, @@ -341,9 +341,9 @@ def test_match_sequence(self, pattern_doc, ds): assert ( tpa._match_sequence( pattern_doc, - SequencePattern(Direction.RIGHT, - set(), - list(map(as_token_pattern, pattern))), + SequencePattern( + Direction.RIGHT, set(), list(map(as_token_pattern, pattern)) + ), start_token=pattern_doc.get_tokens()[7], annos_by_token=defaultdict(list), ds=ds, @@ -358,9 +358,9 @@ def test_match_sequence_left(self, pattern_doc, ds): assert tpa._match_sequence( pattern_doc, - SequencePattern(Direction.LEFT, - set(), - list(map(as_token_pattern, pattern))), + SequencePattern( + Direction.LEFT, set(), list(map(as_token_pattern, pattern)) + ), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), ds=ds, @@ -369,9 +369,9 @@ def test_match_sequence_left(self, pattern_doc, ds): assert ( tpa._match_sequence( pattern_doc, - SequencePattern(Direction.LEFT, - set(), - list(map(as_token_pattern, pattern))), + SequencePattern( + Direction.LEFT, set(), list(map(as_token_pattern, pattern)) + ), start_token=pattern_doc.get_tokens()[8], annos_by_token=defaultdict(list), ds=ds, @@ 
-386,9 +386,9 @@ def test_match_sequence_skip(self, pattern_doc, ds): assert tpa._match_sequence( pattern_doc, - SequencePattern(Direction.RIGHT, - {"-"}, - list(map(as_token_pattern, pattern))), + SequencePattern( + Direction.RIGHT, {"-"}, list(map(as_token_pattern, pattern)) + ), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), ds=ds, @@ -396,9 +396,9 @@ def test_match_sequence_skip(self, pattern_doc, ds): assert ( tpa._match_sequence( pattern_doc, - SequencePattern(Direction.RIGHT, - set(), - list(map(as_token_pattern, pattern))), + SequencePattern( + Direction.RIGHT, set(), list(map(as_token_pattern, pattern)) + ), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), ds=ds, diff --git a/tests/unit/test_annotation.py b/tests/unit/test_annotation.py index 7e32ca4..2ed882f 100644 --- a/tests/unit/test_annotation.py +++ b/tests/unit/test_annotation.py @@ -5,7 +5,7 @@ from docdeid import Document from docdeid.annotation import Annotation, AnnotationSet -from docdeid.tokenizer import Token, WordBoundaryTokenizer, Tokenizer +from docdeid.tokenizer import Token, Tokenizer, WordBoundaryTokenizer class TestAnnotation: @@ -162,15 +162,19 @@ def test_get_annotations_sorted_no_frozendict(self, annotations): ) def test_annos_by_token(self, annotations): - doc = Document("1 2 3 1 2 3 hum Hello hum I'm Bob - said Cindy", - tokenizers={"default": WordBoundaryTokenizer(False)}) - aset = AnnotationSet([ - a1 := Annotation("Hello", 16, 21, "word"), - a2 := Annotation("I", 26, 27, "ltr"), - a3 := Annotation("I'm", 26, 29, "words"), - a4 := Annotation("Bob", 30, 33, "name"), - a5 := Annotation("I'm Bob", 26, 33, "stmt"), - ]) + doc = Document( + "1 2 3 1 2 3 hum Hello hum I'm Bob - said Cindy", + tokenizers={"default": WordBoundaryTokenizer(False)}, + ) + aset = AnnotationSet( + [ + a1 := Annotation("Hello", 16, 21, "word"), + a2 := Annotation("I", 26, 27, "ltr"), + a3 := Annotation("I'm", 26, 29, "words"), + a4 := Annotation("Bob", 
30, 33, "name"), + a5 := Annotation("I'm Bob", 26, 33, "stmt"), + ] + ) # import pydevd_pycharm # pydevd_pycharm.settrace() @@ -190,22 +194,29 @@ def test_annos_by_token(self, annotations): def test_annos_by_token_2(self, annotations): class HumTokenizer(Tokenizer): """Extracts each "hum" word and the following word as a token.""" + def _split_text(self, text: str) -> list[Token]: return [ Token(match.group(0), match.start(), match.end()) for match in re.finditer("\\bhum\\s+\\w+", text) ] - doc = Document("1 2 3 1 2 3 hum Hello hum I'm Bob - said Cindy", - tokenizers={"default": WordBoundaryTokenizer(False), - "for_fun": HumTokenizer()}) - aset = AnnotationSet([ - a1 := Annotation("Hello", 16, 21, "word"), - a2 := Annotation("I", 26, 27, "ltr"), - a3 := Annotation("I'm", 26, 29, "words"), - a4 := Annotation("Bob", 30, 33, "name"), - a5 := Annotation("I'm Bob", 26, 33, "stmt"), - ]) + doc = Document( + "1 2 3 1 2 3 hum Hello hum I'm Bob - said Cindy", + tokenizers={ + "default": WordBoundaryTokenizer(False), + "for_fun": HumTokenizer(), + }, + ) + aset = AnnotationSet( + [ + a1 := Annotation("Hello", 16, 21, "word"), + a2 := Annotation("I", 26, 27, "ltr"), + a3 := Annotation("I'm", 26, 29, "words"), + a4 := Annotation("Bob", 30, 33, "name"), + a5 := Annotation("I'm Bob", 26, 33, "stmt"), + ] + ) got = aset.annos_by_token(doc) From 40fcd62d46c2df3791ce4111f4b872a6bcf3f8eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Tue, 12 Mar 2024 22:16:15 +0100 Subject: [PATCH 23/41] Test and fix `Direction` --- docdeid/direction.py | 2 +- tests/unit/test_direction.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 tests/unit/test_direction.py diff --git a/docdeid/direction.py b/docdeid/direction.py index a6eaeb5..15cdbe8 100644 --- a/docdeid/direction.py +++ b/docdeid/direction.py @@ -26,7 +26,7 @@ def from_string(val: str) -> Direction: return Direction.LEFT if norm == "right": return Direction.RIGHT 
- raise ValueError("Invalid direction: '%s'".format(val)) + raise ValueError("Invalid direction: '{}'".format(val)) def iter(self, seq: Sequence[T]) -> Iterable[T]: """ diff --git a/tests/unit/test_direction.py b/tests/unit/test_direction.py new file mode 100644 index 0000000..07537b1 --- /dev/null +++ b/tests/unit/test_direction.py @@ -0,0 +1,30 @@ +import pytest + +from docdeid.direction import Direction + + +class TestDirection: + def test_basics(self): + assert Direction.LEFT != Direction.RIGHT + assert Direction.LEFT.opposite == Direction.RIGHT + assert Direction.RIGHT.opposite == Direction.LEFT + + def test_parsing(self): + assert Direction.from_string("left") == Direction.LEFT + assert Direction.from_string("Left") == Direction.LEFT + assert Direction.from_string("LEFT") == Direction.LEFT + assert Direction.from_string("right") == Direction.RIGHT + assert Direction.from_string("Right") == Direction.RIGHT + assert Direction.from_string("RIGHT") == Direction.RIGHT + + def test_parsing_failure(self): + with pytest.raises(ValueError, match="Invalid direction: 'down'"): + Direction.from_string("down") + with pytest.raises(ValueError, match="Invalid direction: ' left'"): + Direction.from_string(" left") + + def test_iteration(self): + assert list(Direction.RIGHT.iter([])) == [] + assert list(Direction.LEFT.iter([])) == [] + assert list(Direction.RIGHT.iter([1, 2, "three"])) == [1, 2, "three"] + assert list(Direction.LEFT.iter([1, 2, "three"])) == ["three", 2, 1] From 15b864890a062e74052fbee7bc55594bb9b5a5e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Tue, 12 Mar 2024 22:26:38 +0100 Subject: [PATCH 24/41] Fix Flake8-reported errors --- docdeid/annotation.py | 8 ++++++-- docdeid/process/annotator.py | 20 ++++++++++---------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/docdeid/annotation.py b/docdeid/annotation.py index 6689365..5afcbc8 100644 --- a/docdeid/annotation.py +++ b/docdeid/annotation.py @@ -4,6 +4,7 @@ from 
frozendict import frozendict +import docdeid from docdeid.tokenizer import Token UNKNOWN_ATTR_DEFAULT: Any = 0 @@ -127,7 +128,7 @@ class AnnotationSet(set[Annotation]): It extends the builtin ``set``. """ - def __init__(self, *args, **kwargs): + def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self._annos_by_tokenizers_by_token = {} @@ -191,7 +192,10 @@ def has_overlap(self) -> bool: return False - def annos_by_token(self, doc: "Document") -> defaultdict[Token, set[Annotation]]: + def annos_by_token( + self, + doc: "docdeid.document.Document", + ) -> defaultdict[Token, set[Annotation]]: """ Returns a mapping from document tokens to annotations. diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 67fc169..258b032 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -5,7 +5,7 @@ from abc import ABC, abstractmethod from collections import defaultdict from dataclasses import dataclass -from typing import Iterable, Literal, Mapping, Optional, Union +from typing import Iterable, Literal, Optional, Union import docdeid.str from docdeid.annotation import Annotation @@ -16,7 +16,7 @@ from docdeid.pattern import TokenPattern from docdeid.process.doc_processor import DocProcessor from docdeid.str.processor import StringModifier -from docdeid.tokenizer import Token, TokenList +from docdeid.tokenizer import Token @dataclass @@ -41,10 +41,10 @@ class NestedTokenPattern: """Coordination of token patterns.""" func: Literal["and", "or"] - pattern: list[TokenPattern] + pattern: list[TokenPatternFromCfg] -TokenPattern = Union[SimpleTokenPattern, NestedTokenPattern] +TokenPatternFromCfg = Union[SimpleTokenPattern, NestedTokenPattern] @dataclass @@ -53,7 +53,7 @@ class SequencePattern: direction: Direction skip: set[str] - pattern: list[TokenPattern] + pattern: list[TokenPatternFromCfg] class Annotator(DocProcessor, ABC): @@ -420,8 +420,8 @@ class _PatternPositionMatcher: # pylint: disable=R0903 
"""Checks if a token matches against a single pattern.""" @classmethod - def match(cls, token_pattern: dict | TokenPattern, **kwargs) -> bool: # pylint: - # disable=R0911 + def match(cls, token_pattern: dict | TokenPatternFromCfg, **kwargs) -> bool: + # pylint: disable=R0911 """ Matches a pattern position (a dict with one key). Other information should be presented as kwargs. @@ -503,10 +503,10 @@ def _lookup(cls, ent_type: str, **kwargs) -> bool: return token in kwargs.get("ds")[ent_type] -def as_token_pattern(pat_dict: dict) -> TokenPattern: +def as_token_pattern(pat_dict: dict) -> TokenPatternFromCfg: """ - Converts the JSON dictionary representation of token patterns into a `TokenPattern` - instance. + Converts the JSON dictionary representation of token patterns into a + `TokenPatternFromCfg` instance. Args: pat_dict: the JSON representation of the pattern From ebdefa4e4dae4f0e16acff67411bff9f6df12df6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Tue, 12 Mar 2024 22:48:14 +0100 Subject: [PATCH 25/41] Address most non-Mypy lint issues --- Makefile | 8 +++--- docdeid/annotation.py | 24 +++++++++--------- docdeid/direction.py | 2 +- docdeid/process/annotator.py | 37 +++++++++++++--------------- docdeid/tokenizer.py | 4 +-- tests/unit/process/test_annotator.py | 12 ++++----- 6 files changed, 42 insertions(+), 45 deletions(-) diff --git a/Makefile b/Makefile index 82129f5..7312987 100644 --- a/Makefile +++ b/Makefile @@ -4,9 +4,11 @@ format: python -m docformatter . lint: - python -m flake8 . 
- python -m pylint docdeid/ - python -m mypy docdeid/ + { python -m flake8 .; fret=$$?; }; \ + { python -m pylint docdeid/; pret=$$?; }; \ + { python -m mypy docdeid/; mret=$$?; }; \ + echo "flake8: $$fret, pylint: $$pret, mypy: $$mret"; \ + [ $$fret,$$pret,$$mret = "0,0,0" ] build-docs: sphinx-apidoc --module-first --force --templatedir=docs/templates -o docs/source/api docdeid diff --git a/docdeid/annotation.py b/docdeid/annotation.py index 5afcbc8..562b917 100644 --- a/docdeid/annotation.py +++ b/docdeid/annotation.py @@ -4,7 +4,6 @@ from frozendict import frozendict -import docdeid from docdeid.tokenizer import Token UNKNOWN_ATTR_DEFAULT: Any = 0 @@ -192,6 +191,8 @@ def has_overlap(self) -> bool: return False + import docdeid # needed to type-annotate the `doc` argument below + def annos_by_token( self, doc: "docdeid.document.Document", @@ -221,16 +222,15 @@ def annos_by_token( tok = token_list[cur_tok_idx] except IndexError: break - else: - # Iterate over tokens in the annotation till we reach the end - # of it or the end of the tokens. - anno_tok_idx = cur_tok_idx - anno_tok = tok - while anno_tok.start_char < anno.end_char: - annos_by_token[anno_tok].add(anno) - if anno_tok_idx == len(token_list) - 1: - break - anno_tok_idx += 1 - anno_tok = token_list[anno_tok_idx] + # Iterate over tokens in the annotation till we reach the end + # of it or the end of the tokens. 
+ anno_tok_idx = cur_tok_idx + anno_tok = tok + while anno_tok.start_char < anno.end_char: + annos_by_token[anno_tok].add(anno) + if anno_tok_idx == len(token_list) - 1: + break + anno_tok_idx += 1 + anno_tok = token_list[anno_tok_idx] self._annos_by_tokenizers_by_token[tokenizers] = annos_by_token return self._annos_by_tokenizers_by_token[tokenizers] diff --git a/docdeid/direction.py b/docdeid/direction.py index 15cdbe8..7083459 100644 --- a/docdeid/direction.py +++ b/docdeid/direction.py @@ -26,7 +26,7 @@ def from_string(val: str) -> Direction: return Direction.LEFT if norm == "right": return Direction.RIGHT - raise ValueError("Invalid direction: '{}'".format(val)) + raise ValueError(f"Invalid direction: '{val}'") def iter(self, seq: Sequence[T]) -> Iterable[T]: """ diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 258b032..3bb4ca9 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -96,7 +96,7 @@ def _match_sequence( seq_pattern: SequencePattern, start_token: Token, annos_by_token: defaultdict[Token, Iterable[Annotation]], - ds: Optional[DsCollection], + dicts: Optional[DsCollection], ) -> Optional[Annotation]: """ Matches a token sequence pattern at `start_token`. @@ -106,7 +106,7 @@ def _match_sequence( seq_pattern: The pattern to match. start_token: The start token to match. annos_by_token: Map from tokens to annotations covering it. - ds: Lookup dictionaries available. + dicts: Lookup dictionaries available. Returns: An Annotation if matching is possible, None otherwise. 
@@ -131,25 +131,27 @@ def _match_sequence( token_pattern=tok_pattern, token=end_token, annos=annos_by_token[end_token], - ds=ds, + ds=dicts, metadata=doc.metadata, ): num_matched += 1 else: break - if num_matched == len(seq_pattern.pattern): - left_token, right_token = dir_.iter((start_token, end_token)) + if num_matched != len(seq_pattern.pattern): + return None - return Annotation( - text=doc.text[left_token.start_char : right_token.end_char], - start_char=left_token.start_char, - end_char=right_token.end_char, - tag=self.tag, - priority=self.priority, - start_token=left_token, - end_token=right_token, - ) + left_token, right_token = dir_.iter((start_token, end_token)) + + return Annotation( + text=doc.text[left_token.start_char : right_token.end_char], + start_char=left_token.start_char, + end_char=right_token.end_char, + tag=self.tag, + priority=self.priority, + start_token=left_token, + end_token=right_token, + ) class SingleTokenLookupAnnotator(Annotator): @@ -493,12 +495,7 @@ def _lookup(cls, ent_type: str, **kwargs) -> bool: meta_val = getattr(kwargs["metadata"][meta_key], meta_attr) except (TypeError, KeyError, AttributeError): return False - else: - return ( - token == meta_val - if isinstance(meta_val, str) - else token in meta_val - ) + return token == meta_val if isinstance(meta_val, str) else token in meta_val else: # pylint: disable=R1705 return token in kwargs.get("ds")[ent_type] diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py index d5c4efa..d1cf97f 100644 --- a/docdeid/tokenizer.py +++ b/docdeid/tokenizer.py @@ -136,9 +136,7 @@ def iter_to( token = self while token is not None: yield token - token = ( - token._next_token if dir_ is Direction.RIGHT else token._previous_token - ) + token = token.next() if dir_ is Direction.RIGHT else token.previous() def __len__(self) -> int: """ diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index efe7a6a..1ee7952 100644 --- a/tests/unit/process/test_annotator.py 
+++ b/tests/unit/process/test_annotator.py @@ -336,7 +336,7 @@ def test_match_sequence(self, pattern_doc, ds): ), start_token=pattern_doc.get_tokens()[3], annos_by_token=defaultdict(list), - ds=ds, + dicts=ds, ) == Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") assert ( tpa._match_sequence( @@ -346,7 +346,7 @@ def test_match_sequence(self, pattern_doc, ds): ), start_token=pattern_doc.get_tokens()[7], annos_by_token=defaultdict(list), - ds=ds, + dicts=ds, ) is None ) @@ -363,7 +363,7 @@ def test_match_sequence_left(self, pattern_doc, ds): ), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), - ds=ds, + dicts=ds, ) == Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") assert ( @@ -374,7 +374,7 @@ def test_match_sequence_left(self, pattern_doc, ds): ), start_token=pattern_doc.get_tokens()[8], annos_by_token=defaultdict(list), - ds=ds, + dicts=ds, ) is None ) @@ -391,7 +391,7 @@ def test_match_sequence_skip(self, pattern_doc, ds): ), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), - ds=ds, + dicts=ds, ) == Annotation(text="Meijer-Heerma", start_char=20, end_char=33, tag="_") assert ( tpa._match_sequence( @@ -401,7 +401,7 @@ def test_match_sequence_skip(self, pattern_doc, ds): ), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), - ds=ds, + dicts=ds, ) is None ) From 4a082b8e4080ef63203623ae50a2f1ecfc345f24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Tue, 12 Mar 2024 23:09:16 +0100 Subject: [PATCH 26/41] Address easy and valid Mypy issues --- docdeid/annotation.py | 5 ++++- docdeid/document.py | 4 ++-- docdeid/process/annotator.py | 30 ++++++++++++++---------------- docdeid/tokenizer.py | 2 +- 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/docdeid/annotation.py b/docdeid/annotation.py index 562b917..95bd38d 100644 --- a/docdeid/annotation.py +++ b/docdeid/annotation.py @@ -129,7 +129,10 @@ class 
AnnotationSet(set[Annotation]): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - self._annos_by_tokenizers_by_token = {} + # Ugh, this feels like Java 9. (For sake of Mypy:) + self._annos_by_tokenizers_by_token: dict[ + frozenset[str], defaultdict[Token, set[Annotation]] + ] = {} def sorted( self, diff --git a/docdeid/document.py b/docdeid/document.py index aa3f1a2..274b66e 100644 --- a/docdeid/document.py +++ b/docdeid/document.py @@ -98,7 +98,7 @@ def text(self) -> str: return self._text @property - def tokenizers(self) -> Optional[Mapping[str, Tokenizer]]: + def tokenizers(self) -> Mapping[str, Tokenizer]: """Available tokenizers indexed by their name.""" if self._tokenizers is None: raise RuntimeError("No tokenizers initialized.") @@ -109,7 +109,7 @@ def token_lists(self) -> Mapping[str, TokenList]: """Lists of tokens of the document, indexed by the name of the corresponding tokenizer.""" for tokker_name in set(self.tokenizers) - set(self._token_lists): - tokker = self._tokenizers[tokker_name] + tokker = self.tokenizers[tokker_name] self._token_lists[tokker_name] = tokker.tokenize(self._text) return self._token_lists diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 3bb4ca9..9df1e57 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -443,9 +443,9 @@ def match(cls, token_pattern: dict | TokenPatternFromCfg, **kwargs) -> bool: value = token_pattern.pattern if func == "equal": - return kwargs.get("token").text == value + return kwargs["token"].text == value if func == "re_match": - return re.match(value, kwargs.get("token").text) is not None + return re.match(value, kwargs["token"].text) is not None if func == "is_initial": warnings.warn( @@ -455,22 +455,18 @@ def match(cls, token_pattern: dict | TokenPatternFromCfg, **kwargs) -> bool: ) return ( - ( - len(kwargs.get("token").text) == 1 - and kwargs.get("token").text[0].isupper() - ) - or kwargs.get("token").text in {"Ch", 
"Chr", "Ph", "Th"} + (len(kwargs["token"].text) == 1 and kwargs["token"].text[0].isupper()) + or kwargs["token"].text in {"Ch", "Chr", "Ph", "Th"} ) == value if func == "is_initials": return ( - len(kwargs.get("token").text) <= 4 - and kwargs.get("token").text.isupper() + len(kwargs["token"].text) <= 4 and kwargs["token"].text.isupper() ) == value if func == "like_name": return ( - len(kwargs.get("token").text) >= 3 - and kwargs.get("token").text.istitle() - and not any(ch.isdigit() for ch in kwargs.get("token").text) + len(kwargs["token"].text) >= 3 + and kwargs["token"].text.istitle() + and not any(ch.isdigit() for ch in kwargs["token"].text) ) == value if func == "lookup": return cls._lookup(value, **kwargs) @@ -488,7 +484,7 @@ def match(cls, token_pattern: dict | TokenPatternFromCfg, **kwargs) -> bool: @classmethod def _lookup(cls, ent_type: str, **kwargs) -> bool: - token = kwargs.get("token").text + token = kwargs["token"].text if "." in ent_type: meta_key, meta_attr = ent_type.split(".", 1) try: @@ -497,7 +493,7 @@ def _lookup(cls, ent_type: str, **kwargs) -> bool: return False return token == meta_val if isinstance(meta_val, str) else token in meta_val else: # pylint: disable=R1705 - return token in kwargs.get("ds")[ent_type] + return token in kwargs["ds"][ent_type] def as_token_pattern(pat_dict: dict) -> TokenPatternFromCfg: @@ -591,13 +587,15 @@ def annotate(self, doc: Document) -> list[Annotation]: annotations = [] - tokens = doc.get_tokens() + token_list = doc.get_tokens() if self._start_words is not None: - tokens = tokens.token_lookup( + tokens: Iterable[Token] = token_list.token_lookup( lookup_values=self._start_words, matching_pipeline=self._matching_pipeline, ) + else: + tokens = token_list # ...to make Mypy happy. 
annos_by_token = doc.annotations.annos_by_token(doc) diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py index d1cf97f..9f8d66b 100644 --- a/docdeid/tokenizer.py +++ b/docdeid/tokenizer.py @@ -133,7 +133,7 @@ def iter_to( Args: dir_: direction to go """ - token = self + token: Optional[Token] = self while token is not None: yield token token = token.next() if dir_ is Direction.RIGHT else token.previous() From 3319df118c4768907ffaea9cd1778b0c475d4d59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 12 Jul 2024 18:33:42 +0200 Subject: [PATCH 27/41] Add a test for keep_blanks=False in WBTokenizer --- docdeid/tokenizer.py | 3 +-- tests/unit/test_tokenizer.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py index 9f8d66b..39b2bb4 100644 --- a/docdeid/tokenizer.py +++ b/docdeid/tokenizer.py @@ -393,9 +393,8 @@ def _split_text(self, text: str) -> list[Token]: if self._trim: word = text[start_char:end_char] - orig_length = len(word) word = word.rstrip() - end_char -= orig_length - len(word) + end_char = start_char + len(word) word = word.lstrip() start_char = end_char - len(word) if not word: diff --git a/tests/unit/test_tokenizer.py b/tests/unit/test_tokenizer.py index ce118ce..463b1cd 100644 --- a/tests/unit/test_tokenizer.py +++ b/tests/unit/test_tokenizer.py @@ -215,3 +215,17 @@ def test_word_boundary_tokenizer(self): tokens = tokenizer._split_text(text) assert tokens == expected_tokens + + def test_trimming(self): + text = "Jane Keith-Lucas" + tokenizer = WordBoundaryTokenizer(keep_blanks=False) + expected_tokens = [ + Token(text="Jane", start_char=0, end_char=4), + Token(text="Keith", start_char=5, end_char=10), + Token(text="-", start_char=10, end_char=11), + Token(text="Lucas", start_char=11, end_char=16), + ] + + tokens = tokenizer._split_text(text) + + assert tokens == expected_tokens From 1afb16ff23dc3857ff81459d076f990135fa046f Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 12 Jul 2024 18:40:12 +0200 Subject: [PATCH 28/41] Document how to run tests better + cosmetics --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 32cd942..d1dc9dd 100644 --- a/.gitignore +++ b/.gitignore @@ -134,3 +134,6 @@ dmypy.json # IDEs *.iml + +# misc +*~ From 53db956db05dc9f8006ad0c7fb0900356fe180bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Tue, 7 Jan 2025 17:28:46 +0100 Subject: [PATCH 29/41] Drop the `Document.token_lists` property --- docdeid/annotation.py | 5 +++-- docdeid/document.py | 9 --------- tests/unit/test_document.py | 4 ---- 3 files changed, 3 insertions(+), 15 deletions(-) diff --git a/docdeid/annotation.py b/docdeid/annotation.py index 95bd38d..a658f3b 100644 --- a/docdeid/annotation.py +++ b/docdeid/annotation.py @@ -209,10 +209,11 @@ def annos_by_token( # We key the token->annotations cache only by the set of tokenizers where it # actually (obviously) depends also on the document. However, it's assumed # that an AnnotationSet is always bound only to one document. 
- tokenizers = frozenset(doc.token_lists) + tokenizers = frozenset(doc.tokenizers) if tokenizers not in self._annos_by_tokenizers_by_token: annos_by_token = defaultdict(set) - for token_list in doc.token_lists.values(): + for tokenizer in tokenizers: + token_list = doc.get_tokens(tokenizer) if not token_list: continue cur_tok_idx = 0 diff --git a/docdeid/document.py b/docdeid/document.py index 274b66e..843bce3 100644 --- a/docdeid/document.py +++ b/docdeid/document.py @@ -104,15 +104,6 @@ def tokenizers(self) -> Mapping[str, Tokenizer]: raise RuntimeError("No tokenizers initialized.") return self._tokenizers - @property - def token_lists(self) -> Mapping[str, TokenList]: - """Lists of tokens of the document, indexed by the name of the corresponding - tokenizer.""" - for tokker_name in set(self.tokenizers) - set(self._token_lists): - tokker = self.tokenizers[tokker_name] - self._token_lists[tokker_name] = tokker.tokenize(self._text) - return self._token_lists - def get_tokens(self, tokenizer_name: str = "default") -> TokenList: """ Get the tokens corresponding to the input text, for a specific tokenizer. diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py index c37ada0..e63bd94 100644 --- a/tests/unit/test_document.py +++ b/tests/unit/test_document.py @@ -87,10 +87,6 @@ def test_get_tokens_multiple_tokenizers(self, short_tokens): assert set(doc.tokenizers.keys()) == {"tokenizer_1", "tokenizer_2"} assert doc.get_tokens(tokenizer_name="tokenizer_1") == short_tokens assert doc.get_tokens(tokenizer_name="tokenizer_2") == TokenList([]) - assert doc.token_lists == { - "tokenizer_1": short_tokens, - "tokenizer_2": TokenList([]), - } def test_metadata(self): text = "Hello I'm Bob" From 230c507dde2a959adfea178841027ff642c84bc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 8 Jan 2025 10:15:33 +0100 Subject: [PATCH 30/41] Avoid "|" for union types This syntax is not supported in Python 3.9. 
--- docdeid/annotation.py | 8 +------- docdeid/process/annotator.py | 2 +- docdeid/process/doc_processor.py | 2 +- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/docdeid/annotation.py b/docdeid/annotation.py index a658f3b..4e6b29e 100644 --- a/docdeid/annotation.py +++ b/docdeid/annotation.py @@ -158,13 +158,7 @@ def sorted( A RunTimeError, if the callbacks are not provided as a frozen dict. """ - # Not liked by Mypy, even though - # https://docs.python.org/3/library/stdtypes.html#types-union - # says the "X | Y" notation is equivalent to `typing.Union[X, Y]` and the - # docstring of `typing.Optional` says it's equivalent to - # `typing.Union[None, _]`: - # if not isinstance(callbacks, Optional[frozendict]): - if not isinstance(callbacks, frozendict | None): + if not isinstance(callbacks, (type(None), frozendict)): raise RuntimeError( "Please provide the callbacks as a frozen dict, e.g. " "frozendict.frozendict(end_char=lambda x: -x)" diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 9df1e57..227267b 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -422,7 +422,7 @@ class _PatternPositionMatcher: # pylint: disable=R0903 """Checks if a token matches against a single pattern.""" @classmethod - def match(cls, token_pattern: dict | TokenPatternFromCfg, **kwargs) -> bool: + def match(cls, token_pattern: Union[dict, TokenPatternFromCfg], **kwargs) -> bool: # pylint: disable=R0911 """ Matches a pattern position (a dict with one key). 
Other information should be diff --git a/docdeid/process/doc_processor.py b/docdeid/process/doc_processor.py index 3bc8556..1e12115 100644 --- a/docdeid/process/doc_processor.py +++ b/docdeid/process/doc_processor.py @@ -32,7 +32,7 @@ class DocProcessorGroup: def __init__(self) -> None: self._processors: OrderedDict[ - str, Union[DocProcessor | DocProcessorGroup] + str, Union[DocProcessor, DocProcessorGroup] ] = OrderedDict() def get_names(self, recursive: bool = True) -> list[str]: From 25cbcfd2491bdfe253dea55bee86a2fe233c5eff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 8 Jan 2025 14:18:58 +0100 Subject: [PATCH 31/41] Move `annos_by_token` to `Document` --- docdeid/annotation.py | 50 ------------------------- docdeid/document.py | 70 ++++++++++++++++++++++++++++++++++- docdeid/process/annotator.py | 2 +- tests/unit/test_annotation.py | 4 +- 4 files changed, 71 insertions(+), 55 deletions(-) diff --git a/docdeid/annotation.py b/docdeid/annotation.py index 4e6b29e..a52fa0c 100644 --- a/docdeid/annotation.py +++ b/docdeid/annotation.py @@ -1,4 +1,3 @@ -from collections import defaultdict from dataclasses import dataclass, field from typing import Any, Callable, Optional @@ -129,10 +128,6 @@ class AnnotationSet(set[Annotation]): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - # Ugh, this feels like Java 9. (For sake of Mypy:) - self._annos_by_tokenizers_by_token: dict[ - frozenset[str], defaultdict[Token, set[Annotation]] - ] = {} def sorted( self, @@ -187,48 +182,3 @@ def has_overlap(self) -> bool: return True return False - - import docdeid # needed to type-annotate the `doc` argument below - - def annos_by_token( - self, - doc: "docdeid.document.Document", - ) -> defaultdict[Token, set[Annotation]]: - """ - Returns a mapping from document tokens to annotations. 
- - Args: - doc: document whose tokens are to be linked - """ - # We key the token->annotations cache only by the set of tokenizers where it - # actually (obviously) depends also on the document. However, it's assumed - # that an AnnotationSet is always bound only to one document. - tokenizers = frozenset(doc.tokenizers) - if tokenizers not in self._annos_by_tokenizers_by_token: - annos_by_token = defaultdict(set) - for tokenizer in tokenizers: - token_list = doc.get_tokens(tokenizer) - if not token_list: - continue - cur_tok_idx = 0 - tok = token_list[cur_tok_idx] - for anno in self.sorted(by=("start_char",)): - try: - # Iterate over tokens till we reach the annotation. - while tok.end_char < anno.start_char: - cur_tok_idx += 1 - tok = token_list[cur_tok_idx] - except IndexError: - break - # Iterate over tokens in the annotation till we reach the end - # of it or the end of the tokens. - anno_tok_idx = cur_tok_idx - anno_tok = tok - while anno_tok.start_char < anno.end_char: - annos_by_token[anno_tok].add(anno) - if anno_tok_idx == len(token_list) - 1: - break - anno_tok_idx += 1 - anno_tok = token_list[anno_tok_idx] - self._annos_by_tokenizers_by_token[tokenizers] = annos_by_token - return self._annos_by_tokenizers_by_token[tokenizers] diff --git a/docdeid/document.py b/docdeid/document.py index 843bce3..b7a8444 100644 --- a/docdeid/document.py +++ b/docdeid/document.py @@ -1,10 +1,12 @@ +from collections import defaultdict from collections.abc import Mapping +from dataclasses import dataclass from typing import Any, Optional from frozendict import frozendict -from docdeid.annotation import AnnotationSet -from docdeid.tokenizer import Tokenizer, TokenList +from docdeid.annotation import Annotation, AnnotationSet +from docdeid.tokenizer import Token, Tokenizer, TokenList class MetaData: @@ -69,6 +71,12 @@ class Document: Will be stored in a :class:`.MetaData` object. 
""" + @dataclass + class AnnosByToken: + """A cache entry associating an `AnnotationSet` with a token->annos map.""" + anno_set: AnnotationSet + value: defaultdict[Token, set[Annotation]] + def __init__( self, text: str, @@ -78,6 +86,8 @@ def __init__( self._text = text self._tokenizers = None if tokenizers is None else frozendict(tokenizers) + self._default_annos_by_token = Document.AnnosByToken(None, None) + self._tmp_annos_by_token = Document.AnnosByToken(None, None) self.metadata = MetaData(metadata) """The :class:`.MetaData` of this :class:`.Document`, that can be interacted @@ -156,6 +166,62 @@ def annotations(self, annotations: AnnotationSet) -> None: """ self._annotations = annotations + def annos_by_token( + self, + annos: AnnotationSet = None, + ) -> defaultdict[Token, set[Annotation]]: + """ + Returns a mapping from document tokens to annotations. + + Args: + annos: annotations for this document to index by token (default: current + annotations of this `Document`) + """ + + # Fill the default arg value. + if annos is None: + eff_annos = self._annotations + cache = self._default_annos_by_token + else: + eff_annos = annos + cache = self._tmp_annos_by_token + + # Try to use a cached response. + if eff_annos == cache.anno_set: + return cache.value + + # Compute the return value. + annos_by_token = defaultdict(set) + for tokenizer in self.tokenizers: + token_list = self.get_tokens(tokenizer) + if not token_list: + continue + cur_tok_idx = 0 + tok = token_list[cur_tok_idx] + for anno in eff_annos.sorted(by=("start_char",)): + try: + # Iterate over tokens till we reach the annotation. + while tok.end_char < anno.start_char: + cur_tok_idx += 1 + tok = token_list[cur_tok_idx] + except IndexError: + break + # Iterate over tokens in the annotation till we reach the end + # of it or the end of the tokens. 
+ anno_tok_idx = cur_tok_idx + anno_tok = tok + while anno_tok.start_char < anno.end_char: + annos_by_token[anno_tok].add(anno) + if anno_tok_idx == len(token_list) - 1: + break + anno_tok_idx += 1 + anno_tok = token_list[anno_tok_idx] + + # Cache the value before returning. + cache.anno_set = eff_annos + cache.value = annos_by_token + return annos_by_token + @property def deidentified_text(self) -> Optional[str]: """ diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 227267b..f74ab92 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -597,7 +597,7 @@ def annotate(self, doc: Document) -> list[Annotation]: else: tokens = token_list # ...to make Mypy happy. - annos_by_token = doc.annotations.annos_by_token(doc) + annos_by_token = doc.annos_by_token() for token in tokens: diff --git a/tests/unit/test_annotation.py b/tests/unit/test_annotation.py index 2ed882f..072979f 100644 --- a/tests/unit/test_annotation.py +++ b/tests/unit/test_annotation.py @@ -179,7 +179,7 @@ def test_annos_by_token(self, annotations): # import pydevd_pycharm # pydevd_pycharm.settrace() - got = aset.annos_by_token(doc) + got = doc.annos_by_token(aset) want = { Token("Hello", 16, 21): {a1}, @@ -218,7 +218,7 @@ def _split_text(self, text: str) -> list[Token]: ] ) - got = aset.annos_by_token(doc) + got = doc.annos_by_token(aset) want = { Token("Hello", 16, 21): {a1}, From 36eb1e3beaa3ed05a3d3e9bf895bcf700ed8a641 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 8 Jan 2025 14:27:49 +0100 Subject: [PATCH 32/41] Simplify `Direction.from_string` --- docdeid/direction.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/docdeid/direction.py b/docdeid/direction.py index 7083459..7c53507 100644 --- a/docdeid/direction.py +++ b/docdeid/direction.py @@ -19,14 +19,11 @@ def opposite(self) -> Direction: @staticmethod def from_string(val: str) -> Direction: - """Parses a Direction from a string, which 
must be either 'left' or 'right' - after lowercasing.""" - norm = val.lower() - if norm == "left": - return Direction.LEFT - if norm == "right": - return Direction.RIGHT - raise ValueError(f"Invalid direction: '{val}'") + """Parses a Direction from a string (case insensitive).""" + try: + return Direction[val.upper()] + except KeyError as key_error: + raise ValueError(f"Invalid direction: '{val}'") from key_error def iter(self, seq: Sequence[T]) -> Iterable[T]: """ From 573deffa3ac5705e0d4bebab286b01f3c0566fe5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 8 Jan 2025 15:48:43 +0100 Subject: [PATCH 33/41] Rename `SequenceAnnotator.dicts` to `ds` --- docdeid/process/annotator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index f74ab92..926344f 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -538,7 +538,7 @@ def __init__( **kwargs, ) -> None: self.pattern = pattern - self.dicts = ds + self.ds = ds self.skip = set(skip or []) self._start_words = None @@ -546,13 +546,13 @@ def __init__( if len(self.pattern) > 0 and "lookup" in self.pattern[0]: - if self.dicts is None: + if self.ds is None: raise RuntimeError( "Created pattern with lookup in TokenPatternAnnotator, but " "no lookup structures provided." ) - lookup_list = self.dicts[self.pattern[0]["lookup"]] + lookup_list = self.ds[self.pattern[0]["lookup"]] # FIXME This doesn't work correctly for multiple ([{"lookup":"prefix"}, # {"lookup":"interfix"}]) and nested patterns ("or", "and"). 
@@ -602,7 +602,7 @@ def annotate(self, doc: Document) -> list[Annotation]: for token in tokens: annotation = self._match_sequence( - doc, self._seq_pattern, token, annos_by_token, self.dicts + doc, self._seq_pattern, token, annos_by_token, self.ds ) if annotation is not None: From a2704c5d8a0b70175090146f9dc7f1e7509baa45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 8 Jan 2025 16:33:17 +0100 Subject: [PATCH 34/41] Replace `list(map(f, xs))` with list comprehension --- docdeid/process/annotator.py | 7 +++---- tests/unit/process/test_annotator.py | 12 ++++++------ 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 926344f..bc699c5 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -511,7 +511,7 @@ def as_token_pattern(pat_dict: dict) -> TokenPatternFromCfg: ) func, value = next(iter(pat_dict.items())) if func in ("and", "or"): - return NestedTokenPattern(func, list(map(as_token_pattern, value))) + return NestedTokenPattern(func, [as_token_pattern(it) for it in value]) return SimpleTokenPattern(func, value) @@ -539,7 +539,6 @@ def __init__( ) -> None: self.pattern = pattern self.ds = ds - self.skip = set(skip or []) self._start_words = None self._matching_pipeline = None @@ -558,7 +557,7 @@ def __init__( # {"lookup":"interfix"}]) and nested patterns ("or", "and"). if not isinstance(lookup_list, LookupSet): raise ValueError( - f"Expected a LookupSet, but got a " f"{type(lookup_list)}." + f"Expected a LookupSet, but got a {type(lookup_list)}." 
) # FIXME This doesn't work correctly for multiple ([{"lookup":"prefix"}, @@ -569,7 +568,7 @@ def __init__( self._matching_pipeline = lookup_list.matching_pipeline self._seq_pattern = SequencePattern( - Direction.RIGHT, set(skip or ()), list(map(as_token_pattern, pattern)) + Direction.RIGHT, set(skip or ()), [as_token_pattern(it) for it in pattern] ) super().__init__(*args, **kwargs) diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index 1ee7952..fb4ec23 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -332,7 +332,7 @@ def test_match_sequence(self, pattern_doc, ds): assert tpa._match_sequence( pattern_doc, SequencePattern( - Direction.RIGHT, set(), list(map(as_token_pattern, pattern)) + Direction.RIGHT, set(), [as_token_pattern(it) for it in pattern] ), start_token=pattern_doc.get_tokens()[3], annos_by_token=defaultdict(list), @@ -342,7 +342,7 @@ def test_match_sequence(self, pattern_doc, ds): tpa._match_sequence( pattern_doc, SequencePattern( - Direction.RIGHT, set(), list(map(as_token_pattern, pattern)) + Direction.RIGHT, set(), [as_token_pattern(it) for it in pattern] ), start_token=pattern_doc.get_tokens()[7], annos_by_token=defaultdict(list), @@ -359,7 +359,7 @@ def test_match_sequence_left(self, pattern_doc, ds): assert tpa._match_sequence( pattern_doc, SequencePattern( - Direction.LEFT, set(), list(map(as_token_pattern, pattern)) + Direction.LEFT, set(), [as_token_pattern(it) for it in pattern] ), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), @@ -370,7 +370,7 @@ def test_match_sequence_left(self, pattern_doc, ds): tpa._match_sequence( pattern_doc, SequencePattern( - Direction.LEFT, set(), list(map(as_token_pattern, pattern)) + Direction.LEFT, set(), [as_token_pattern(it) for it in pattern] ), start_token=pattern_doc.get_tokens()[8], annos_by_token=defaultdict(list), @@ -387,7 +387,7 @@ def test_match_sequence_skip(self, pattern_doc, ds): assert 
tpa._match_sequence( pattern_doc, SequencePattern( - Direction.RIGHT, {"-"}, list(map(as_token_pattern, pattern)) + Direction.RIGHT, {"-"}, [as_token_pattern(it) for it in pattern] ), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), @@ -397,7 +397,7 @@ def test_match_sequence_skip(self, pattern_doc, ds): tpa._match_sequence( pattern_doc, SequencePattern( - Direction.RIGHT, set(), list(map(as_token_pattern, pattern)) + Direction.RIGHT, set(), [as_token_pattern(it) for it in pattern] ), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), From 3ca37aa6b1a8aef34d48c1220fd5c2b195377abd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 8 Jan 2025 18:02:48 +0100 Subject: [PATCH 35/41] Re-add `MultiTokenLookupAnnotator` accepting a `LookupSet` --- docdeid/process/__init__.py | 1 + docdeid/process/annotator.py | 71 +++++++++++++++++++++++++++++++----- 2 files changed, 63 insertions(+), 9 deletions(-) diff --git a/docdeid/process/__init__.py b/docdeid/process/__init__.py index d0f040f..6a40be3 100644 --- a/docdeid/process/__init__.py +++ b/docdeid/process/__init__.py @@ -6,6 +6,7 @@ from .annotator import ( Annotator, MultiTokenLookupAnnotator, + MultiTokenTrieAnnotator, RegexpAnnotator, SequenceAnnotator, SingleTokenLookupAnnotator, diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index bc699c5..88c1b67 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -16,7 +16,7 @@ from docdeid.pattern import TokenPattern from docdeid.process.doc_processor import DocProcessor from docdeid.str.processor import StringModifier -from docdeid.tokenizer import Token +from docdeid.tokenizer import Token, Tokenizer @dataclass @@ -207,7 +207,7 @@ def annotate(self, doc: Document) -> list[Annotation]: return self._tokens_to_annotations(annotate_tokens) -class MultiTokenLookupAnnotator(Annotator): +class MultiTokenTrieAnnotator(Annotator): """ Annotates entity mentions by 
looking them up in a `LookupTrie`. @@ -219,11 +219,11 @@ class MultiTokenLookupAnnotator(Annotator): """ def __init__( - self, - *args, - trie: LookupTrie, - overlapping: bool = False, - **kwargs, + self, + *args, + trie: LookupTrie, + overlapping: bool = False, + **kwargs, ) -> None: self._trie = trie @@ -235,8 +235,7 @@ def __init__( @property def start_words(self) -> set[str]: """First words of phrases detected by this annotator.""" - # If the trie has been modified (added to) since we computed - # _start_words, + # If the trie has been modified (added to) since we computed _start_words, if len(self._start_words) != len(self._trie.children): # Recompute _start_words. self._start_words = set(self._trie.children) @@ -292,6 +291,60 @@ def annotate(self, doc: Document) -> list[Annotation]: return annotations +class MultiTokenLookupAnnotator(MultiTokenTrieAnnotator): + """ + Annotates entity mentions by looking them up in a `LookupTrie` or + a collection of phrases. This is a thin wrapper for + class:`MultiTokenTrieAnnotator` that additionally handles non-trie lookup + structures by building tries out of them and delegating to the parent class. + + Args: + lookup_values: An iterable of phrases that should be matched. These are + tokenized using ``tokenizer``. + matching_pipeline: An optional pipeline that can be used for matching + (e.g. lowercasing). This has no specific impact on matching performance, + other than overhead for applying the pipeline to each string. + tokenizer: A tokenizer that is used to create the sequence patterns from + ``lookup_values``. + trie: A `LookupTrie` containing all entity mentions that should be + annotated. Specifying this is mutually exclusive with specifying + ``lookup_values`` and ``tokenizer``. + overlapping: Whether overlapping phrases are to be returned. + *args, **kwargs: Passed through to the `Annotator` constructor (which accepts + the arguments `tag` and `priority`). 
+
+    Raises:
+        RuntimeError, when an incorrect combination of `lookup_values`,
+        `matching_pipeline` and `trie` is supplied.
+    """
+
+    def __init__(
+        self,
+        *args,
+        lookup_values: Optional[Iterable[str]] = None,
+        matching_pipeline: Optional[list[StringModifier]] = None,
+        tokenizer: Optional[Tokenizer] = None,
+        trie: Optional[LookupTrie] = None,
+        overlapping: bool = False,
+        **kwargs,
+    ) -> None:
+
+        if (trie is not None) and (lookup_values is None) and (tokenizer is None):
+            eff_trie = trie
+
+        elif (trie is None) and (lookup_values is not None) and (tokenizer is not None):
+            eff_trie = LookupTrie(matching_pipeline=matching_pipeline)
+            for phrase in filter(None, map(tokenizer.tokenize, lookup_values)):
+                eff_trie.add_item([token.text for token in phrase])
+
+        else:
+            raise RuntimeError(
+                "Please provide either lookup_values and a tokenizer, or a trie."
+            )
+
+        super().__init__(*args, trie=eff_trie, overlapping=overlapping, **kwargs)
+
+
 class RegexpAnnotator(Annotator):
     """
     Create annotations based on regular expression patterns. 
Note that these patterns do From 68f4afba4f72f3d00de7441c5ce60e8d2e226355 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Thu, 9 Jan 2025 13:38:54 +0100 Subject: [PATCH 36/41] Add a test for matching multi-word phrases --- tests/unit/process/test_annotator.py | 42 +++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index fb4ec23..466064a 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -306,8 +306,10 @@ class TestSequenceAnnotator: def ds(self): ds = DsCollection() - first_names = ["Andries", "pieter", "Aziz", "Bernard"] + first_names = ["Andries", "pieter", "Aziz", "Bernard", "Won Jung"] surnames = ["Meijer", "Smit", "Bakker", "Heerma"] + interfixes = ["v/d"] + interfixed_surnames = ["Heck"] ds["first_names"] = LookupSet() ds["first_names"].add_items_from_iterable(items=first_names) @@ -315,6 +317,12 @@ def ds(self): ds["surnames"] = LookupSet() ds["surnames"].add_items_from_iterable(items=surnames) + ds["interfixes"] = LookupSet() + ds["interfixes"].add_items_from_iterable(items=interfixes) + + ds["interfixed_surnames"] = LookupSet() + ds["interfixed_surnames"].add_items_from_iterable(items=interfixed_surnames) + return ds @pytest.fixture @@ -324,6 +332,20 @@ def pattern_doc(self): tokenizers={"default": WordBoundaryTokenizer(False)}, ) + @pytest.fixture + def interfixed_doc(self): + return Document( + text="De man heet v/d Heck.", + tokenizers={"default": WordBoundaryTokenizer(False)}, + ) + + @pytest.fixture + def korean_doc(self): + return Document( + text="De mevrouw heet Won Jung Meijer-Heerma.", + tokenizers={"default": WordBoundaryTokenizer(False)}, + ) + def test_match_sequence(self, pattern_doc, ds): pattern = [{"lookup": "first_names"}, {"like_name": True}] @@ -414,3 +436,21 @@ def test_annotate(self, pattern_doc, ds): assert tpa.annotate(pattern_doc) == [ 
Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") ] + + def test_annotate_multiword(self, interfixed_doc, korean_doc, ds): + # XXX This tests functionality (matching multiple tokens with one member of + # the "pattern" list) which is not supported as per the SequenceAnnotator + # docstring, nonetheless is exercised by the packaged base_config.json (most + # notably in the case of the interfix_with_name annotator). + + inter_pattern = [{"lookup": "interfixes"}, {"lookup": "interfixed_surnames"}] + ipa = SequenceAnnotator(pattern=inter_pattern, ds=ds, tag="_") + assert ipa.annotate(interfixed_doc) == [ + Annotation(text="v/d Heck", start_char=12, end_char=20, tag="_") + ] + + pattern = [{"lookup": "first_names"}, {"like_name": True}] + kpa = SequenceAnnotator(pattern=pattern, ds=ds, tag="_") + assert kpa.annotate(korean_doc) == [ + Annotation(text="Won Jung Meijer", start_char=16, end_char=31, tag="_") + ] From fb3cbd84d34b5cb292d4e3c1d3fa788e6ed4a8fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Thu, 9 Jan 2025 14:19:47 +0100 Subject: [PATCH 37/41] Try to support multi-word matching in SequenceAnnotator --- docdeid/process/annotator.py | 34 ++++++++++++++++------------ docdeid/tokenizer.py | 9 ++++++++ tests/unit/process/test_annotator.py | 6 +++-- 3 files changed, 32 insertions(+), 17 deletions(-) diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 88c1b67..193a57a 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -16,7 +16,7 @@ from docdeid.pattern import TokenPattern from docdeid.process.doc_processor import DocProcessor from docdeid.str.processor import StringModifier -from docdeid.tokenizer import Token, Tokenizer +from docdeid.tokenizer import Token, Tokenizer, DummyTokenizer @dataclass @@ -571,15 +571,18 @@ def as_token_pattern(pat_dict: dict) -> TokenPatternFromCfg: class SequenceAnnotator(Annotator): """ Annotates based on token patterns, which should be 
provided as a list of dicts. Each - position in the list denotes a token position, e.g.: [{'is_initial': True}, - {'like_name': True}] matches sequences of two tokens, where the first one is an - initial, and the second one is like a name. + position in the list corresponds to a token or a token sequence. For example: + ``[{'is_initial': True}, {'like_name': True}]`` matches sequences of two tokens + where the first one is an initial and the second one looks like a name. Arguments: pattern: The pattern ds: Lookup dictionaries. Those referenced by the pattern should be LookupSets. (Don't ask why.) skip: Any string values that should be skipped in matching (e.g. periods) + tokenizer: A tokenizer called to determine the first word to look for each + phrase in ``lookup_values``. If none is provided, phrases in + ``lookup_values`` are all assumed to be a single word. """ def __init__( @@ -588,15 +591,17 @@ def __init__( *args, ds: Optional[DsCollection] = None, skip: Optional[list[str]] = None, + tokenizer: Optional[Tokenizer] = None, **kwargs, ) -> None: self.pattern = pattern self.ds = ds self._start_words = None - self._matching_pipeline = None + self._start_matching_pipeline = None - if len(self.pattern) > 0 and "lookup" in self.pattern[0]: + # If the first token pattern is lookup, determine the possible starting words. + if self.pattern and "lookup" in self.pattern[0]: if self.ds is None: raise RuntimeError( @@ -606,19 +611,18 @@ def __init__( lookup_list = self.ds[self.pattern[0]["lookup"]] - # FIXME This doesn't work correctly for multiple ([{"lookup":"prefix"}, - # {"lookup":"interfix"}]) and nested patterns ("or", "and"). if not isinstance(lookup_list, LookupSet): raise ValueError( f"Expected a LookupSet, but got a {type(lookup_list)}." ) - # FIXME This doesn't work correctly for multiple ([{"lookup":"prefix"}, - # {"lookup":"interfix"}]) and nested patterns ("or", "and"). 
- self._start_words = lookup_list.items() - # FIXME This doesn't work correctly for multiple ([{"lookup":"prefix"}, - # {"lookup":"interfix"}]) and nested patterns ("or", "and"). - self._matching_pipeline = lookup_list.matching_pipeline + eff_tokenizer = tokenizer or DummyTokenizer() + self._start_words = { + phrase[0].text + for phrase in filter(None, map(eff_tokenizer.tokenize, + lookup_list.items())) + } + self._start_matching_pipeline = lookup_list.matching_pipeline self._seq_pattern = SequencePattern( Direction.RIGHT, set(skip or ()), [as_token_pattern(it) for it in pattern] @@ -644,7 +648,7 @@ def annotate(self, doc: Document) -> list[Annotation]: if self._start_words is not None: tokens: Iterable[Token] = token_list.token_lookup( lookup_values=self._start_words, - matching_pipeline=self._matching_pipeline, + matching_pipeline=self._start_matching_pipeline, ) else: tokens = token_list # ...to make Mypy happy. diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py index 39b2bb4..a889ac7 100644 --- a/docdeid/tokenizer.py +++ b/docdeid/tokenizer.py @@ -355,6 +355,15 @@ def tokenize(self, text: str) -> TokenList: return TokenList(tokens, link_tokens=self.link_tokens) +class DummyTokenizer(Tokenizer): # pylint: disable=R0903 + """ + Treats any given string as a single token. + """ + + def _split_text(self, text: str) -> list[Token]: + return [Token(text=text, start_char=0, end_char=len(text))] + + class SpaceSplitTokenizer(Tokenizer): # pylint: disable=R0903 """ Tokenizes based on splitting on whitespaces. diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index 466064a..3fcb773 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -444,13 +444,15 @@ def test_annotate_multiword(self, interfixed_doc, korean_doc, ds): # notably in the case of the interfix_with_name annotator). 
inter_pattern = [{"lookup": "interfixes"}, {"lookup": "interfixed_surnames"}] - ipa = SequenceAnnotator(pattern=inter_pattern, ds=ds, tag="_") + ipa = SequenceAnnotator(pattern=inter_pattern, ds=ds, + tokenizer=WordBoundaryTokenizer(False), tag="_") assert ipa.annotate(interfixed_doc) == [ Annotation(text="v/d Heck", start_char=12, end_char=20, tag="_") ] pattern = [{"lookup": "first_names"}, {"like_name": True}] - kpa = SequenceAnnotator(pattern=pattern, ds=ds, tag="_") + kpa = SequenceAnnotator(pattern=pattern, ds=ds, + tokenizer=WordBoundaryTokenizer(False), tag="_") assert kpa.annotate(korean_doc) == [ Annotation(text="Won Jung Meijer", start_char=16, end_char=31, tag="_") ] From 0c04a784178cd20cac7dc3ed22a2ba7dad35cb80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Thu, 9 Jan 2025 14:32:27 +0100 Subject: [PATCH 38/41] Give up multi-word matching in SequenceAnnotator --- docdeid/process/annotator.py | 19 +++++++------------ tests/unit/process/test_annotator.py | 17 ++++++++++------- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 193a57a..6e014e8 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -16,7 +16,7 @@ from docdeid.pattern import TokenPattern from docdeid.process.doc_processor import DocProcessor from docdeid.str.processor import StringModifier -from docdeid.tokenizer import Token, Tokenizer, DummyTokenizer +from docdeid.tokenizer import Token, Tokenizer @dataclass @@ -571,7 +571,7 @@ def as_token_pattern(pat_dict: dict) -> TokenPatternFromCfg: class SequenceAnnotator(Annotator): """ Annotates based on token patterns, which should be provided as a list of dicts. Each - position in the list corresponds to a token or a token sequence. For example: + position in the list corresponds to a token. 
For example: ``[{'is_initial': True}, {'like_name': True}]`` matches sequences of two tokens where the first one is an initial and the second one looks like a name. @@ -580,9 +580,6 @@ class SequenceAnnotator(Annotator): ds: Lookup dictionaries. Those referenced by the pattern should be LookupSets. (Don't ask why.) skip: Any string values that should be skipped in matching (e.g. periods) - tokenizer: A tokenizer called to determine the first word to look for each - phrase in ``lookup_values``. If none is provided, phrases in - ``lookup_values`` are all assumed to be a single word. """ def __init__( @@ -591,7 +588,6 @@ def __init__( *args, ds: Optional[DsCollection] = None, skip: Optional[list[str]] = None, - tokenizer: Optional[Tokenizer] = None, **kwargs, ) -> None: self.pattern = pattern @@ -616,12 +612,11 @@ def __init__( f"Expected a LookupSet, but got a {type(lookup_list)}." ) - eff_tokenizer = tokenizer or DummyTokenizer() - self._start_words = { - phrase[0].text - for phrase in filter(None, map(eff_tokenizer.tokenize, - lookup_list.items())) - } + # XXX We assume the items of the lookup list are all single words. This + # is not always the case but just splitting the phrases wouldn't help + # because the "lookup" token matcher assumes matching against a single + # token. + self._start_words = lookup_list.items() self._start_matching_pipeline = lookup_list.matching_pipeline self._seq_pattern = SequencePattern( diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index 3fcb773..3b6e0d0 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -437,22 +437,25 @@ def test_annotate(self, pattern_doc, ds): Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") ] + @pytest.mark.xfail(reason="The lookup token pattern only ever matches a single " + "token and the SequenceAnnotator docstring accordingly " + "rules the case of multiple tokens per pattern out of " + "scope. 
Yet, the packaged base_config.json seems to " + "rely on such multi-word matches, most notably in the " + "case of the interfix_with_name annotator.") def test_annotate_multiword(self, interfixed_doc, korean_doc, ds): - # XXX This tests functionality (matching multiple tokens with one member of - # the "pattern" list) which is not supported as per the SequenceAnnotator - # docstring, nonetheless is exercised by the packaged base_config.json (most - # notably in the case of the interfix_with_name annotator). - inter_pattern = [{"lookup": "interfixes"}, {"lookup": "interfixed_surnames"}] ipa = SequenceAnnotator(pattern=inter_pattern, ds=ds, - tokenizer=WordBoundaryTokenizer(False), tag="_") + # tokenizer=WordBoundaryTokenizer(False), + tag="_") assert ipa.annotate(interfixed_doc) == [ Annotation(text="v/d Heck", start_char=12, end_char=20, tag="_") ] pattern = [{"lookup": "first_names"}, {"like_name": True}] kpa = SequenceAnnotator(pattern=pattern, ds=ds, - tokenizer=WordBoundaryTokenizer(False), tag="_") + # tokenizer=WordBoundaryTokenizer(False), + tag="_") assert kpa.annotate(korean_doc) == [ Annotation(text="Won Jung Meijer", start_char=16, end_char=31, tag="_") ] From 82c52fc6686d6cf34672cf6da96324641b6cd8c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Thu, 9 Jan 2025 21:34:34 +0100 Subject: [PATCH 39/41] Move seq pattern validation to a new method --- docdeid/process/annotator.py | 48 +++++++++++------ docdeid/utils.py | 28 ++++++++++ tests/unit/process/test_annotator.py | 78 ++++++++++++++++++++++------ 3 files changed, 122 insertions(+), 32 deletions(-) diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 6e014e8..ee82334 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -2,6 +2,7 @@ import re import warnings + from abc import ABC, abstractmethod from collections import defaultdict from dataclasses import dataclass @@ -17,6 +18,7 @@ from docdeid.process.doc_processor import 
DocProcessor from docdeid.str.processor import StringModifier from docdeid.tokenizer import Token, Tokenizer +from docdeid.utils import leaf_items @dataclass @@ -56,6 +58,7 @@ class SequencePattern: pattern: list[TokenPatternFromCfg] + class Annotator(DocProcessor, ABC): """ Abstract class for annotators, which are responsible for generating annotations from @@ -596,28 +599,16 @@ def __init__( self._start_words = None self._start_matching_pipeline = None - # If the first token pattern is lookup, determine the possible starting words. - if self.pattern and "lookup" in self.pattern[0]: - - if self.ds is None: - raise RuntimeError( - "Created pattern with lookup in TokenPatternAnnotator, but " - "no lookup structures provided." - ) - - lookup_list = self.ds[self.pattern[0]["lookup"]] - - if not isinstance(lookup_list, LookupSet): - raise ValueError( - f"Expected a LookupSet, but got a {type(lookup_list)}." - ) + SequenceAnnotator.validate_pattern(pattern, ds) + # If the first token pattern is lookup, determine the possible starting words. + if start_ent_type := pattern[0].get("lookup"): # XXX We assume the items of the lookup list are all single words. This # is not always the case but just splitting the phrases wouldn't help # because the "lookup" token matcher assumes matching against a single # token. 
- self._start_words = lookup_list.items() - self._start_matching_pipeline = lookup_list.matching_pipeline + self._start_words = ds[start_ent_type].items() + self._start_matching_pipeline = ds[start_ent_type].matching_pipeline self._seq_pattern = SequencePattern( Direction.RIGHT, set(skip or ()), [as_token_pattern(it) for it in pattern] @@ -625,6 +616,29 @@ def __init__( super().__init__(*args, **kwargs) + @classmethod + def validate_pattern(cls, pattern, ds): + if not pattern: + raise ValueError(f"Sequence pattern is missing or empty: {pattern}.") + + referenced_ents = {match_val + for tok_pattern in pattern + for func, match_val in leaf_items(tok_pattern) + if func.endswith("lookup")} + if referenced_ents and ds is None: + raise ValueError("Pattern relies on entity lookups but no lookup " + "structures were provided.") + + if missing := referenced_ents - set(ds or ()): + raise ValueError("Unknown lookup entity types: {}." + .format(", ".join(sorted(missing)))) + + if start_ent_type := pattern[0].get("lookup"): + if not isinstance(ds[start_ent_type], LookupSet): + raise ValueError('If the first token pattern is lookup, it must be ' + f'backed by a LookupSet, but "{start_ent_type}" is ' + f'backed by a {type(ds[start_ent_type]).__name__}.') + def annotate(self, doc: Document) -> list[Annotation]: """ Annotate the document, by matching the pattern against all tokens. 
diff --git a/docdeid/utils.py b/docdeid/utils.py index a1fcdd7..3c57a20 100644 --- a/docdeid/utils.py +++ b/docdeid/utils.py @@ -1,10 +1,38 @@ from collections import defaultdict +from collections.abc import Generator, Iterable, Iterator, Mapping +from typing import Any, Optional from frozendict import frozendict from docdeid.document import Document +def leaf_items(json_struct: Mapping) -> Iterator[tuple]: + """ + Generates all `(key, value)` items that appear as leaves of the potentially deeply + nested JSON-like structure `json_struct`, where being a leaf item means that + `key` is associated with a `value` in a dict and `value` is of an atomic type + (such as a `str` but not list-like or map-like). + + :param json_struct: nested structure to iterate + :return: generator of leaf `(key, value)` items + """ + return __leaf_items(json_struct, None) + + +def __leaf_items(obj: Any, par_key: Optional[str]) -> Generator[tuple, None, None]: + if isinstance(obj, Mapping): + for key, val in obj.items(): + for item in __leaf_items(val, key): + yield item + elif isinstance(obj, Iterable) and not isinstance(obj, (bytes, str)): + for member in obj: + for item in __leaf_items(member, None): + yield item + elif par_key is not None: + yield par_key, obj + + def annotate_intext(doc: Document) -> str: """ Annotate intext, which can be useful to compare the annotations of two different diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index 3b6e0d0..d71296b 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -320,8 +320,10 @@ def ds(self): ds["interfixes"] = LookupSet() ds["interfixes"].add_items_from_iterable(items=interfixes) - ds["interfixed_surnames"] = LookupSet() - ds["interfixed_surnames"].add_items_from_iterable(items=interfixed_surnames) + trie = LookupTrie() + for phrase in interfixed_surnames: + trie.add_item(phrase.split()) + ds["interfixed_surnames"] = trie return ds @@ -346,16 +348,60 
@@ def korean_doc(self): tokenizers={"default": WordBoundaryTokenizer(False)}, ) + def test_validation(self, ds): + with pytest.raises(ValueError) as exc_info: + tpa = SequenceAnnotator(pattern=[], ds=ds, tag="_") + assert "missing or empty" in str(exc_info) + + # Lookup structures are not required if there are no lookup token patterns. + tpa = SequenceAnnotator(pattern=[{"like_name": True}], tag="_") + assert True + + with pytest.raises(ValueError) as exc_info: + tpa = SequenceAnnotator(pattern=[{"lookup": "undefined_entity"}], tag="_") + assert "no lookup structures were provided" in str(exc_info) + + with pytest.raises(ValueError) as exc_info: + tpa = SequenceAnnotator(pattern=[{"lookup": "undefined_entity"}], + ds=ds, + tag="_") + assert "Unknown lookup entity types: undefined_entity." in str(exc_info) + + with pytest.raises(ValueError) as exc_info: + tpa = SequenceAnnotator( + pattern=[{"or": [{"lookup": "undefined_entity"}, + {"lookup": "another_entity"}]}], + ds=ds, + tag="_") + assert ("Unknown lookup entity types: another_entity, undefined_entity." + in str(exc_info)) + + with pytest.raises(ValueError) as exc_info: + tpa = SequenceAnnotator( + pattern=[{"or": [{"lookup": "interfixes"}, + {"and": [{"lookup": "first_names"}, + {"lookup": "alien_entity"}]}]}, + {"lookup": "another_entity"}], + ds=ds, + tag="_") + assert ("Unknown lookup entity types: alien_entity, another_entity." 
+ in str(exc_info)) + + with pytest.raises(ValueError) as exc_info: + tpa = SequenceAnnotator( + pattern=[{"lookup": "interfixed_surnames"}], + ds=ds, + tag="_") + assert ("is backed by a LookupTrie" in str(exc_info)) + def test_match_sequence(self, pattern_doc, ds): pattern = [{"lookup": "first_names"}, {"like_name": True}] - tpa = SequenceAnnotator(pattern=[], ds=ds, tag="_") + tpa = SequenceAnnotator(pattern=pattern, ds=ds, tag="_") assert tpa._match_sequence( pattern_doc, - SequencePattern( - Direction.RIGHT, set(), [as_token_pattern(it) for it in pattern] - ), + tpa._seq_pattern, start_token=pattern_doc.get_tokens()[3], annos_by_token=defaultdict(list), dicts=ds, @@ -363,9 +409,7 @@ def test_match_sequence(self, pattern_doc, ds): assert ( tpa._match_sequence( pattern_doc, - SequencePattern( - Direction.RIGHT, set(), [as_token_pattern(it) for it in pattern] - ), + tpa._seq_pattern, start_token=pattern_doc.get_tokens()[7], annos_by_token=defaultdict(list), dicts=ds, @@ -374,9 +418,14 @@ def test_match_sequence(self, pattern_doc, ds): ) def test_match_sequence_left(self, pattern_doc, ds): + """ + Matching is always performed in the direction left-to-right by + SequenceAnnotator proper but the same method is also called by + ContextAnnotator in Deduce, where matching may proceed also right-to-left. 
+ """ pattern = [{"lookup": "first_names"}, {"like_name": True}] - tpa = SequenceAnnotator(pattern=[], ds=ds, tag="_") + tpa = SequenceAnnotator(pattern=pattern, ds=ds, tag="_") assert tpa._match_sequence( pattern_doc, @@ -404,13 +453,12 @@ def test_match_sequence_left(self, pattern_doc, ds): def test_match_sequence_skip(self, pattern_doc, ds): pattern = [{"lookup": "surnames"}, {"like_name": True}] - tpa = SequenceAnnotator(pattern=[], ds=ds, tag="_") + tpa = SequenceAnnotator(pattern=pattern, ds=ds, tag="_") + tpa_skipping = SequenceAnnotator(pattern=pattern, ds=ds, skip=["-"], tag="_") - assert tpa._match_sequence( + assert tpa_skipping._match_sequence( pattern_doc, - SequencePattern( - Direction.RIGHT, {"-"}, [as_token_pattern(it) for it in pattern] - ), + tpa_skipping._seq_pattern, start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), dicts=ds, From 9dcc4f08cd7f8c115816e252d54a2f399d224e74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Thu, 9 Jan 2025 21:35:12 +0100 Subject: [PATCH 40/41] Polish the code a little --- docdeid/tokenizer.py | 4 ++-- docdeid/utils.py | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py index a889ac7..9e69208 100644 --- a/docdeid/tokenizer.py +++ b/docdeid/tokenizer.py @@ -4,9 +4,9 @@ import sys from abc import ABC, abstractmethod from collections import defaultdict -from collections.abc import Sequence +from collections.abc import Generator, Iterator, Sequence from dataclasses import dataclass, field -from typing import Generator, Iterator, Literal, Optional, SupportsIndex, overload +from typing import Literal, Optional, SupportsIndex, overload from docdeid.direction import Direction from docdeid.str import StringModifier diff --git a/docdeid/utils.py b/docdeid/utils.py index 3c57a20..d5bfe5a 100644 --- a/docdeid/utils.py +++ b/docdeid/utils.py @@ -71,9 +71,7 @@ def annotate_doc(doc: Document) -> str: Handles also nested 
mentions and in a way also overlapping mentions, even though this kind of markup cannot really represent them. """ - annos_from_shortest = sorted( - doc.annotations, key=lambda anno: anno.end_char - anno.start_char - ) + annos_from_shortest = doc.annotations.sorted(by=("length", )) idx_to_anno_starts = defaultdict(list) idx_to_anno_ends = defaultdict(list) for anno in annos_from_shortest: From 659a694b75e32719fa8527e53fb7d596f74c46e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 10 Jan 2025 08:59:57 +0100 Subject: [PATCH 41/41] Don't fail validation on refs to metadata --- docdeid/process/annotator.py | 2 +- tests/unit/process/test_annotator.py | 28 +++++++++++++++++++--------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index ee82334..8b7f5c2 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -624,7 +624,7 @@ def validate_pattern(cls, pattern, ds): referenced_ents = {match_val for tok_pattern in pattern for func, match_val in leaf_items(tok_pattern) - if func.endswith("lookup")} + if func.endswith("lookup") and "." not in match_val} if referenced_ents and ds is None: raise ValueError("Pattern relies on entity lookups but no lookup " "structures were provided.") diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index d71296b..380e76e 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -350,25 +350,25 @@ def korean_doc(self): def test_validation(self, ds): with pytest.raises(ValueError) as exc_info: - tpa = SequenceAnnotator(pattern=[], ds=ds, tag="_") + SequenceAnnotator(pattern=[], ds=ds, tag="_") assert "missing or empty" in str(exc_info) # Lookup structures are not required if there are no lookup token patterns. 
- tpa = SequenceAnnotator(pattern=[{"like_name": True}], tag="_") + SequenceAnnotator(pattern=[{"like_name": True}], tag="_") assert True with pytest.raises(ValueError) as exc_info: - tpa = SequenceAnnotator(pattern=[{"lookup": "undefined_entity"}], tag="_") + SequenceAnnotator(pattern=[{"lookup": "undefined_entity"}], tag="_") assert "no lookup structures were provided" in str(exc_info) with pytest.raises(ValueError) as exc_info: - tpa = SequenceAnnotator(pattern=[{"lookup": "undefined_entity"}], - ds=ds, - tag="_") + SequenceAnnotator(pattern=[{"lookup": "undefined_entity"}], + ds=ds, + tag="_") assert "Unknown lookup entity types: undefined_entity." in str(exc_info) with pytest.raises(ValueError) as exc_info: - tpa = SequenceAnnotator( + SequenceAnnotator( pattern=[{"or": [{"lookup": "undefined_entity"}, {"lookup": "another_entity"}]}], ds=ds, @@ -376,8 +376,18 @@ def test_validation(self, ds): assert ("Unknown lookup entity types: another_entity, undefined_entity." in str(exc_info)) + # References to entities from metadata must not cause validation errors. + SequenceAnnotator(pattern=[{"or": [{"lookup": "patient.name"}, + {"lookup": "doctor.surname"}]}], + tag="_") + SequenceAnnotator(pattern=[{"or": [{"lookup": "interfixes"}, + {"lookup": "doctor.surname"}]}], + ds=ds, + tag="_") + assert True + with pytest.raises(ValueError) as exc_info: - tpa = SequenceAnnotator( + SequenceAnnotator( pattern=[{"or": [{"lookup": "interfixes"}, {"and": [{"lookup": "first_names"}, {"lookup": "alien_entity"}]}]}, @@ -388,7 +398,7 @@ def test_validation(self, ds): in str(exc_info)) with pytest.raises(ValueError) as exc_info: - tpa = SequenceAnnotator( + SequenceAnnotator( pattern=[{"lookup": "interfixed_surnames"}], ds=ds, tag="_")