From 30ce9369b7f187c540ed364539452fba0a77a75a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 1 Mar 2024 22:29:06 +0100 Subject: [PATCH 01/41] Make MultiTok..Annotator notice changes in the trie --- docdeid/process/annotator.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 60689df..9a5a2c5 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -168,19 +168,31 @@ def _init_lookup_structures( self._trie.add_item(texts) start_token = texts[0] - + # Apply the "matching pipeline" to the start token -- the same + # normalization that was applied to the tokens inside + # `add_item` already, or when building the trie in the (trie is + # not None) case, too. for string_modifier in self._matching_pipeline: start_token = string_modifier.process(start_token) self._start_words.add(start_token) + @property + def start_words(self): + # If the trie has been modified (added to) since we computed + # _start_words, + if len(self._start_words) != len(self._trie.children): + # Recompute _start_words. + self._start_words = set(self._trie.children) + return self._start_words + def annotate(self, doc: Document) -> list[Annotation]: tokens = doc.get_tokens() start_tokens = sorted( tokens.token_lookup( - self._start_words, matching_pipeline=self._matching_pipeline + self.start_words, matching_pipeline=self._matching_pipeline ), key=lambda token: token.start_char, ) From e12d0d06160892d9ea5e12c00c0edf5139bc769b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 1 Mar 2024 23:33:11 +0100 Subject: [PATCH 02/41] Provide `LowercaseTail` string modifier It's like titlecasing but it touches the word only if it was originally uppercase. There might be better names for it... 
--- docdeid/str/__init__.py | 1 + docdeid/str/processor.py | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/docdeid/str/__init__.py b/docdeid/str/__init__.py index 7e6db61..2eba56a 100644 --- a/docdeid/str/__init__.py +++ b/docdeid/str/__init__.py @@ -1,6 +1,7 @@ from .processor import ( FilterByLength, LowercaseString, + LowercaseTail, RemoveNonAsciiCharacters, ReplaceNonAsciiCharacters, ReplaceValue, diff --git a/docdeid/str/processor.py b/docdeid/str/processor.py index b1023a5..1fa4869 100644 --- a/docdeid/str/processor.py +++ b/docdeid/str/processor.py @@ -74,6 +74,26 @@ def process(self, item: str) -> str: return item.casefold() +_WORD_RX = re.compile('\\w+', re.U) + + +class LowercaseTail(StringModifier): + """Lowercases the tail of words.""" + + @staticmethod + def _process_word_match(m: re.Match) -> str: + word = m.group(0) + if word.isupper(): + # FIXME Is there a language-independent way to properly titlecase? + if word.startswith('IJ'): + return word[0:2] + word[2:].lower() + return word[0] + word[1:].lower() + return word + + def process(self, item: str) -> str: + return _WORD_RX.sub(LowercaseTail._process_word_match, item) + + class StripString(StringModifier): """ Strip string (whitespaces, tabs, newlines, etc. 
From 82aab5febe73c7c8b1fd24f6395c1ffdd56f3f82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 4 Mar 2024 13:44:26 +0100 Subject: [PATCH 03/41] Enable specifying the lang for titlecasing --- docdeid/str/processor.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docdeid/str/processor.py b/docdeid/str/processor.py index 1fa4869..10126ce 100644 --- a/docdeid/str/processor.py +++ b/docdeid/str/processor.py @@ -80,18 +80,19 @@ def process(self, item: str) -> str: class LowercaseTail(StringModifier): """Lowercases the tail of words.""" - @staticmethod - def _process_word_match(m: re.Match) -> str: + def __init__(self, lang='nl'): + self._lang = lang + + def _process_word_match(self, m: re.Match) -> str: word = m.group(0) if word.isupper(): - # FIXME Is there a language-independent way to properly titlecase? - if word.startswith('IJ'): + if self._lang == 'nl' and word.startswith('IJ'): return word[0:2] + word[2:].lower() return word[0] + word[1:].lower() return word def process(self, item: str) -> str: - return _WORD_RX.sub(LowercaseTail._process_word_match, item) + return _WORD_RX.sub(self._process_word_match, item) class StripString(StringModifier): From 156e201897d681f2200cd39ca335ed06bb7451b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 4 Mar 2024 23:20:02 +0100 Subject: [PATCH 04/41] Minimize data fixtures for tests --- docdeid/docdeid.iml | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 docdeid/docdeid.iml diff --git a/docdeid/docdeid.iml b/docdeid/docdeid.iml new file mode 100644 index 0000000..35fdd4f --- /dev/null +++ b/docdeid/docdeid.iml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file From c7b4c896f1ed3718a118d70cae5156d7b481b1ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 6 Mar 2024 12:10:04 +0100 Subject: [PATCH 05/41] Log annotated text after every processor --- docdeid/process/doc_processor.py | 4 ++++ 
docdeid/utils.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/docdeid/process/doc_processor.py b/docdeid/process/doc_processor.py index 6db05f7..2085418 100644 --- a/docdeid/process/doc_processor.py +++ b/docdeid/process/doc_processor.py @@ -1,8 +1,10 @@ +import logging from abc import ABC, abstractmethod from collections import OrderedDict from typing import Iterator, Optional, Union from docdeid.document import Document +from docdeid.utils import annotate_doc class DocProcessor(ABC): # pylint: disable=R0903 @@ -143,6 +145,8 @@ def process( elif isinstance(proc, DocProcessorGroup): proc.process(doc, enabled=enabled, disabled=disabled) + logging.debug("after %s: %s", name, annotate_doc(doc)) + def __iter__(self) -> Iterator: return iter(self._processors.items()) diff --git a/docdeid/utils.py b/docdeid/utils.py index 1d3cf7c..f134689 100644 --- a/docdeid/utils.py +++ b/docdeid/utils.py @@ -1,3 +1,5 @@ +from collections import defaultdict + from frozendict import frozendict from docdeid.document import Document @@ -32,3 +34,32 @@ def annotate_intext(doc: Document) -> str: ) return text + + +def annotate_doc(doc: Document) -> str: + """\ + Adds XML-like markup for annotations into the text of a document. + + Handles also nested mentions and in a way also overlapping mentions, even + though this kind of markup cannot really represent them. 
+ """ + annos_from_shortest = sorted( + doc.annotations, + key=lambda anno: anno.end_char - anno.start_char) + idx_to_anno_starts = defaultdict(list) + idx_to_anno_ends = defaultdict(list) + for anno in annos_from_shortest: + idx_to_anno_starts[anno.start_char].append(anno) + idx_to_anno_ends[anno.end_char].append(anno) + markup_indices = sorted(set(idx_to_anno_starts).union(idx_to_anno_ends)) + chunks = list() + last_idx = 0 + for idx in markup_indices: + chunks.append(doc.text[last_idx:idx]) + for ending_anno in idx_to_anno_ends[idx]: + chunks.append(f'') + for starting_anno in reversed(idx_to_anno_starts[idx]): + chunks.append(f'<{starting_anno.tag.upper()}>') + last_idx = idx + chunks.append(doc.text[last_idx:]) + return ''.join(chunks) From 4459c14de13691949b64a3d58fb439ae56968abd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 6 Mar 2024 15:40:05 +0100 Subject: [PATCH 06/41] Update documentation slightly --- docdeid/ds/lookup.py | 2 +- docdeid/process/annotator.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docdeid/ds/lookup.py b/docdeid/ds/lookup.py index 4df8bdc..2d9d270 100644 --- a/docdeid/ds/lookup.py +++ b/docdeid/ds/lookup.py @@ -140,7 +140,7 @@ def add_items_from_self( ) -> None: """ Add items from self (this items of this :class:`.LookupSet`). This can be used - to do a transformation or replacment of the items. + to do a transformation or replacement of the items. Args: cleaning_pipeline: A cleaning pipeline applied to the items of this set. diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 9a5a2c5..60c92b5 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -15,7 +15,7 @@ class Annotator(DocProcessor, ABC): """ Abstract class for annotators, which are responsible for generating annotations from - a given document. Instatiations should implement the annotate method. + a given document. Instantiations should implement the annotate method. 
Args: tag: The tag to use in the annotations. @@ -119,7 +119,7 @@ class MultiTokenLookupAnnotator(Annotator): or should process from left to right. Raises: - RunTimeError, when an incorrect combination of `lookup_values`, + RuntimeError, when an incorrect combination of `lookup_values`, `matching_pipeline` and `trie` is supplied. """ @@ -149,7 +149,7 @@ def __init__( else: raise RuntimeError( - "Please provide either looup_values and a tokenizer, or a trie." + "Please provide either lookup_values and a tokenizer, or a trie." ) self.overlapping = overlapping From 810b8b3bee0967ce5460b8bd2d0c5c4c13153def Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 6 Mar 2024 22:02:09 +0100 Subject: [PATCH 07/41] Expose `Document.token_lists` as a property --- docdeid/document.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/docdeid/document.py b/docdeid/document.py index dd515ce..9d4ccf7 100644 --- a/docdeid/document.py +++ b/docdeid/document.py @@ -1,5 +1,8 @@ +from collections.abc import Mapping from typing import Any, Optional +from frozendict import frozendict + from docdeid.annotation import AnnotationSet from docdeid.tokenizer import Tokenizer, TokenList @@ -74,7 +77,8 @@ def __init__( ) -> None: self._text = text - self._tokenizers = tokenizers + self._tokenizers = (None if tokenizers is None else + frozendict(tokenizers)) self.metadata = MetaData(metadata) """The :class:`.MetaData` of this :class:`.Document`, that can be interacted @@ -94,6 +98,14 @@ def text(self) -> str: """ return self._text + @property + def tokenizers(self) -> Optional[Mapping[str, Tokenizer]]: + return self._tokenizers + + @property + def token_lists(self) -> Mapping[str, TokenList]: + return self._token_lists + def get_tokens(self, tokenizer_name: str = "default") -> TokenList: """ Get the tokens corresponding to the input text, for a specific tokenizer. 
From 50026967db5e7b78e819e56d9d0c8ee3ec1bdfa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Thu, 7 Mar 2024 14:45:39 +0100 Subject: [PATCH 08/41] (Almost) automatically format code --- docdeid/document.py | 3 +-- docdeid/str/processor.py | 6 +++--- docdeid/utils.py | 16 ++++++++-------- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/docdeid/document.py b/docdeid/document.py index 9d4ccf7..fade167 100644 --- a/docdeid/document.py +++ b/docdeid/document.py @@ -77,8 +77,7 @@ def __init__( ) -> None: self._text = text - self._tokenizers = (None if tokenizers is None else - frozendict(tokenizers)) + self._tokenizers = None if tokenizers is None else frozendict(tokenizers) self.metadata = MetaData(metadata) """The :class:`.MetaData` of this :class:`.Document`, that can be interacted diff --git a/docdeid/str/processor.py b/docdeid/str/processor.py index 10126ce..32e5a51 100644 --- a/docdeid/str/processor.py +++ b/docdeid/str/processor.py @@ -74,19 +74,19 @@ def process(self, item: str) -> str: return item.casefold() -_WORD_RX = re.compile('\\w+', re.U) +_WORD_RX = re.compile("\\w+", re.U) class LowercaseTail(StringModifier): """Lowercases the tail of words.""" - def __init__(self, lang='nl'): + def __init__(self, lang="nl"): self._lang = lang def _process_word_match(self, m: re.Match) -> str: word = m.group(0) if word.isupper(): - if self._lang == 'nl' and word.startswith('IJ'): + if self._lang == "nl" and word.startswith("IJ"): return word[0:2] + word[2:].lower() return word[0] + word[1:].lower() return word diff --git a/docdeid/utils.py b/docdeid/utils.py index f134689..ef46071 100644 --- a/docdeid/utils.py +++ b/docdeid/utils.py @@ -37,15 +37,15 @@ def annotate_intext(doc: Document) -> str: def annotate_doc(doc: Document) -> str: - """\ + """ Adds XML-like markup for annotations into the text of a document. 
- Handles also nested mentions and in a way also overlapping mentions, even - though this kind of markup cannot really represent them. + Handles also nested mentions and in a way also overlapping mentions, even though + this kind of markup cannot really represent them. """ annos_from_shortest = sorted( - doc.annotations, - key=lambda anno: anno.end_char - anno.start_char) + doc.annotations, key=lambda anno: anno.end_char - anno.start_char + ) idx_to_anno_starts = defaultdict(list) idx_to_anno_ends = defaultdict(list) for anno in annos_from_shortest: @@ -57,9 +57,9 @@ def annotate_doc(doc: Document) -> str: for idx in markup_indices: chunks.append(doc.text[last_idx:idx]) for ending_anno in idx_to_anno_ends[idx]: - chunks.append(f'') + chunks.append(f"") for starting_anno in reversed(idx_to_anno_starts[idx]): - chunks.append(f'<{starting_anno.tag.upper()}>') + chunks.append(f"<{starting_anno.tag.upper()}>") last_idx = idx chunks.append(doc.text[last_idx:]) - return ''.join(chunks) + return "".join(chunks) From 7d2d8668fa9cea78669a6d4848fb11a510bb3868 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Thu, 7 Mar 2024 17:08:17 +0100 Subject: [PATCH 09/41] Simplify `MultiTokenLookupAnnotator`... ...as required by pylint. 
--- docdeid/document.py | 3 ++ docdeid/ds/lookup.py | 7 ++-- docdeid/process/annotator.py | 57 +++++----------------------- docdeid/str/processor.py | 6 +-- docdeid/tokenizer.py | 28 ++++++++++++-- docdeid/utils.py | 2 +- tests/pipeline/test_pipeline.py | 16 +++++--- tests/unit/process/test_annotator.py | 34 ++++++++--------- 8 files changed, 72 insertions(+), 81 deletions(-) diff --git a/docdeid/document.py b/docdeid/document.py index fade167..c7b6c20 100644 --- a/docdeid/document.py +++ b/docdeid/document.py @@ -99,10 +99,13 @@ def text(self) -> str: @property def tokenizers(self) -> Optional[Mapping[str, Tokenizer]]: + """Available tokenizers indexed by their name.""" return self._tokenizers @property def token_lists(self) -> Mapping[str, TokenList]: + """Lists of tokens of the document, indexed by the name of the corresponding + tokenizer.""" return self._token_lists def get_tokens(self, tokenizer_name: str = "default") -> TokenList: diff --git a/docdeid/ds/lookup.py b/docdeid/ds/lookup.py index 2d9d270..f0daa49 100644 --- a/docdeid/ds/lookup.py +++ b/docdeid/ds/lookup.py @@ -2,6 +2,7 @@ import codecs import itertools +from collections.abc import Sequence from typing import Iterable, Iterator, Optional, Union from docdeid.ds.ds import Datastructure @@ -265,7 +266,7 @@ def __init__(self, *args, **kwargs) -> None: self.children: dict[str, LookupTrie] = {} self.is_terminal = False - def add_item(self, item: list[str]) -> None: + def add_item(self, item: Sequence[str]) -> None: """ Add an item, i.e. a list of strings, to this Trie. @@ -285,7 +286,7 @@ def add_item(self, item: list[str]) -> None: self.children[head].add_item(tail) - def __contains__(self, item: list[str]) -> bool: + def __contains__(self, item: Sequence[str]) -> bool: """ Whether the trie contains the item. Respects the matching pipeline. 
@@ -304,7 +305,7 @@ def __contains__(self, item: list[str]) -> bool: return (head in self.children) and tail in self.children[head] def longest_matching_prefix( - self, item: list[str], start_i: int = 0 + self, item: Sequence[str], start_i: int = 0 ) -> Union[list[str], None]: """ Finds the longest matching prefix of a list of strings. This is used to find the diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 60c92b5..d1e31a0 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -9,7 +9,7 @@ from docdeid.pattern import TokenPattern from docdeid.process.doc_processor import DocProcessor from docdeid.str.processor import StringModifier -from docdeid.tokenizer import Token, Tokenizer +from docdeid.tokenizer import Token class Annotator(DocProcessor, ABC): @@ -126,59 +126,20 @@ class MultiTokenLookupAnnotator(Annotator): def __init__( self, *args, - lookup_values: Optional[Iterable[str]] = None, - matching_pipeline: Optional[list[StringModifier]] = None, - tokenizer: Optional[Tokenizer] = None, - trie: Optional[LookupTrie] = None, + trie: LookupTrie, overlapping: bool = False, **kwargs, ) -> None: - self._start_words: set[str] = set() - - if (trie is not None) and (lookup_values is None) and (tokenizer is None): - - self._trie = trie - self._matching_pipeline = trie.matching_pipeline or [] - self._start_words = set(trie.children.keys()) - - elif (trie is None) and (lookup_values is not None) and (tokenizer is not None): - self._matching_pipeline = matching_pipeline or [] - self._trie = LookupTrie(matching_pipeline=matching_pipeline) - self._init_lookup_structures(lookup_values, tokenizer) - - else: - raise RuntimeError( - "Please provide either lookup_values and a tokenizer, or a trie." 
- ) - - self.overlapping = overlapping + self._trie = trie + self._overlapping = overlapping + self._start_words = set(trie.children) super().__init__(*args, **kwargs) - def _init_lookup_structures( - self, lookup_values: Iterable[str], tokenizer: Tokenizer - ) -> None: - - for val in lookup_values: - - texts = [token.text for token in tokenizer.tokenize(val)] - - if len(texts) > 0: - self._trie.add_item(texts) - - start_token = texts[0] - # Apply the "matching pipeline" to the start token -- the same - # normalization that was applied to the tokens inside - # `add_item` already, or when building the trie in the (trie is - # not None) case, too. - for string_modifier in self._matching_pipeline: - start_token = string_modifier.process(start_token) - - self._start_words.add(start_token) - @property - def start_words(self): + def start_words(self) -> set[str]: + """First words of phrases detected by this annotator.""" # If the trie has been modified (added to) since we computed # _start_words, if len(self._start_words) != len(self._trie.children): @@ -192,7 +153,7 @@ def annotate(self, doc: Document) -> list[Annotation]: start_tokens = sorted( tokens.token_lookup( - self.start_words, matching_pipeline=self._matching_pipeline + self.start_words, matching_pipeline=self._trie.matching_pipeline ), key=lambda token: token.start_char, ) @@ -230,7 +191,7 @@ def annotate(self, doc: Document) -> list[Annotation]: ) ) - if not self.overlapping: + if not self._overlapping: min_i = i + len(longest_matching_prefix) # skip ahead return annotations diff --git a/docdeid/str/processor.py b/docdeid/str/processor.py index 32e5a51..44ee468 100644 --- a/docdeid/str/processor.py +++ b/docdeid/str/processor.py @@ -80,11 +80,11 @@ def process(self, item: str) -> str: class LowercaseTail(StringModifier): """Lowercases the tail of words.""" - def __init__(self, lang="nl"): + def __init__(self, lang: str = "nl") -> None: self._lang = lang - def _process_word_match(self, m: re.Match) -> str: - 
word = m.group(0) + def _process_word_match(self, match: re.Match) -> str: + word = match.group(0) if word.isupper(): if self._lang == "nl" and word.startswith("IJ"): return word[0:2] + word[2:].lower() diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py index 8813caf..c69197c 100644 --- a/docdeid/tokenizer.py +++ b/docdeid/tokenizer.py @@ -1,10 +1,12 @@ from __future__ import annotations import re +import sys from abc import ABC, abstractmethod from collections import defaultdict +from collections.abc import Sequence from dataclasses import dataclass, field -from typing import Iterator, Literal, Optional +from typing import Iterator, Literal, Optional, SupportsIndex, overload from docdeid.str import StringModifier @@ -130,7 +132,7 @@ def __len__(self) -> int: return len(self.text) -class TokenList: +class TokenList(Sequence[Token]): """ Contains a sequence of tokens, along with some lookup logic. @@ -248,9 +250,29 @@ def __len__(self) -> int: return len(self._tokens) + @overload def __getitem__(self, index: int) -> Token: + ... - return self._tokens[index] + @overload + def __getitem__(self, indexes: slice) -> Sequence[Token]: + ... + + def __getitem__(self, item): + return self._tokens[item] + + def index( + self, + __token: Token, + __start: SupportsIndex = 0, + __stop: SupportsIndex = sys.maxsize, + ) -> int: + try: + return self._token_index[__token] + except KeyError: + # Raise a plain ValueError, just like list.index. 
+ # pylint: disable=W0707 + raise ValueError(f"'{__token}' is not in TokenList") def __eq__(self, other: object) -> bool: """ diff --git a/docdeid/utils.py b/docdeid/utils.py index ef46071..a1fcdd7 100644 --- a/docdeid/utils.py +++ b/docdeid/utils.py @@ -52,7 +52,7 @@ def annotate_doc(doc: Document) -> str: idx_to_anno_starts[anno.start_char].append(anno) idx_to_anno_ends[anno.end_char].append(anno) markup_indices = sorted(set(idx_to_anno_starts).union(idx_to_anno_ends)) - chunks = list() + chunks = [] last_idx = 0 for idx in markup_indices: chunks.append(doc.text[last_idx:idx]) diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 62422e7..d4e0ba7 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -2,6 +2,7 @@ from docdeid.annotation import Annotation, AnnotationSet from docdeid.deidentifier import DocDeid +from docdeid.ds import LookupTrie from docdeid.process.annotator import ( MultiTokenLookupAnnotator, SingleTokenLookupAnnotator, @@ -49,11 +50,12 @@ def test_multipe_annotators(self, long_text): "name_annotator", SingleTokenLookupAnnotator(lookup_values=["Bob"], tag="name"), ) + loc_trie = LookupTrie() + loc_trie.add_item("the United States of America".split()) deidentifier.processors.add_processor( "location_annotator", MultiTokenLookupAnnotator( - lookup_values=["the United States of America"], - tokenizer=tokenizer, + trie=loc_trie, tag="location", ), ) @@ -86,11 +88,12 @@ def test_enabled(self, long_text): "name_annotator", SingleTokenLookupAnnotator(lookup_values=["Bob"], tag="name"), ) + loc_trie = LookupTrie() + loc_trie.add_item("the United States of America".split()) deidentifier.processors.add_processor( "location_annotator", MultiTokenLookupAnnotator( - lookup_values=["the United States of America"], - tokenizer=tokenizer, + trie=loc_trie, tag="location", ), ) @@ -124,11 +127,12 @@ def test_disabled(self, long_text): "name_annotator", SingleTokenLookupAnnotator(lookup_values=["Bob"], 
tag="name"), ) + loc_trie = LookupTrie() + loc_trie.add_item("the United States of America".split()) deidentifier.processors.add_processor( "location_annotator", MultiTokenLookupAnnotator( - lookup_values=["the United States of America"], - tokenizer=tokenizer, + trie=loc_trie, tag="location", ), ) diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index a71dc54..a94e05c 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -4,6 +4,7 @@ import docdeid.ds from docdeid.annotation import Annotation from docdeid.document import Document +from docdeid.ds import LookupTrie from docdeid.pattern import TokenPattern from docdeid.process.annotator import ( MultiTokenLookupAnnotator, @@ -12,7 +13,6 @@ TokenPatternAnnotator, ) from docdeid.str.processor import LowercaseString -from docdeid.tokenizer import WordBoundaryTokenizer class TestSingleTokenLookupAnnotator: @@ -55,11 +55,10 @@ def test_single_token_with_matching_pipeline(self, long_text, long_tokenlist): class TestMultiTokenLookupAnnotator: def test_multi_token(self, long_text, long_tokenlist): doc = Document(long_text) - annotator = MultiTokenLookupAnnotator( - lookup_values=["my name", "my wife"], - tokenizer=WordBoundaryTokenizer(), - tag="prefix", - ) + my_trie = LookupTrie() + my_trie.add_item(("my", " ", "name")) + my_trie.add_item(("my", " ", "wife")) + annotator = MultiTokenLookupAnnotator(trie=my_trie, tag="prefix") expected_annotations = [ Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"), ] @@ -73,12 +72,10 @@ def test_multi_token(self, long_text, long_tokenlist): def test_multi_token_with_matching_pipeline(self, long_text, long_tokenlist): doc = Document(long_text) - annotator = MultiTokenLookupAnnotator( - lookup_values=["my name", "my wife"], - tokenizer=WordBoundaryTokenizer(), - matching_pipeline=[LowercaseString()], - tag="prefix", - ) + my_trie = LookupTrie(matching_pipeline=[LowercaseString()]) + 
my_trie.add_item(("my", " ", "name")) + my_trie.add_item(("my", " ", "wife")) + annotator = MultiTokenLookupAnnotator(trie=my_trie, tag="prefix") expected_annotations = [ Annotation(text="My name", start_char=0, end_char=7, tag="prefix"), Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"), @@ -93,9 +90,11 @@ def test_multi_token_lookup_with_overlap(self, long_text, long_tokenlist): doc = Document(long_text) + dr_trie = LookupTrie() + dr_trie.add_item(("dr", ". ", "John")) + dr_trie.add_item(("John", " ", "Smith")) annotator = MultiTokenLookupAnnotator( - lookup_values=["dr. John", "John Smith"], - tokenizer=WordBoundaryTokenizer(), + trie=dr_trie, tag="prefix", overlapping=True, ) @@ -114,9 +113,11 @@ def test_multi_token_lookup_no_overlap(self, long_text, long_tokenlist): doc = Document(long_text) + dr_trie = LookupTrie() + dr_trie.add_item(("dr", ". ", "John")) + dr_trie.add_item(("John", " ", "Smith")) annotator = MultiTokenLookupAnnotator( - lookup_values=["dr. John", "John Smith"], - tokenizer=WordBoundaryTokenizer(), + trie=dr_trie, tag="prefix", overlapping=False, ) @@ -137,7 +138,6 @@ def test_multi_token_lookup_with_trie(self, long_text, long_tokenlist): trie = docdeid.ds.LookupTrie(matching_pipeline=[LowercaseString()]) trie.add_item(["my", " ", "name"]) trie.add_item(["my", " ", "wife"]) - annotator = MultiTokenLookupAnnotator( trie=trie, tag="prefix", From 762866aaa141120bcd408559604d93a6cc76666c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 8 Mar 2024 12:42:11 +0100 Subject: [PATCH 10/41] Update the `MultiTok...Annotator` docstring --- docdeid/process/annotator.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index d1e31a0..f97b3cc 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -102,25 +102,13 @@ def annotate(self, doc: Document) -> list[Annotation]: class 
MultiTokenLookupAnnotator(Annotator): """ - Matches lookup values against tokens, where the ``lookup_values`` may themselves be - sequences. + Annotates entity mentions by looking them up in a `LookupTrie`. Args: - lookup_values: An iterable of strings, that should be matched. These are - tokenized internally. - matching_pipeline: An optional pipeline that can be used for matching - (e.g. lowercasing). This has no specific impact on matching performance, - other than overhead for applying the pipeline to each string. - tokenizer: A tokenizer that is used to create the sequence patterns from - ``lookup_values``. - trie: A trie that is used for matching, rather than a combination of - `lookup_values` and a `matching_pipeline` (cannot be used simultaneously). - overlapping: Whether the annotator should match overlapping sequences, - or should process from left to right. - - Raises: - RuntimeError, when an incorrect combination of `lookup_values`, - `matching_pipeline` and `trie` is supplied. + trie: The `LookupTrie` containing all entity mentions that should be annotated. + overlapping: Whether overlapping phrases are to be returned. + *args, **kwargs: Passed through to the `Annotator` constructor (which accepts + the arguments `tag` and `priority`). 
""" def __init__( From 1ae6846d921a5fffb9ca179780867cbfc8f8796a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 8 Mar 2024 13:24:17 +0100 Subject: [PATCH 11/41] Test user additions to the lookup trie --- tests/unit/process/test_annotator.py | 29 ++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index a94e05c..7b27196 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -13,6 +13,7 @@ TokenPatternAnnotator, ) from docdeid.str.processor import LowercaseString +from docdeid.tokenizer import SpaceSplitTokenizer class TestSingleTokenLookupAnnotator: @@ -153,6 +154,34 @@ def test_multi_token_lookup_with_trie(self, long_text, long_tokenlist): assert annotations == expected_annotations + def test_trie_modified(self, long_text): + # The user of Deduce may want to amend the resources shipped with Deduce. + # Loading those happens in the Deduce initializer, which also constructs + # annotators according to the configuration. + + # Run the interesting portions of Deduce initialization. + doc = Document(long_text,tokenizers={"default": SpaceSplitTokenizer()}) + trie = docdeid.ds.LookupTrie() + # Yeah, the comma in "Smith," seems off... but then again, WordBoundaryTokenizer + # considers whitespace to be tokens. There is no good choice. + trie.add_item(("John", "Smith,")) + annotator = MultiTokenLookupAnnotator(trie=trie, tag="name") + + # Let's add our own resources. + trie.add_item(("jane", "Keith-Lucas")) + # ...including phrases with a potential to confuse the algorithm. + trie.add_item(("jane", "joplane")) + trie.add_item(("dr.", "John", "Hopkin")) + trie.add_item(("Smith,", "please")) + + # Expect also our phrases to be detected. 
+ want = [ + Annotation(text="John Smith,", start_char=15, end_char=26, tag="name"), + Annotation(text="jane Keith-Lucas", start_char=47, end_char=63, tag="name"), + ] + got = annotator.annotate(doc) + assert got == want + class TestRegexpAnnotator: def test_regexp_annotator(self, long_text): From ae1f93ea60ba5d4769cf09ead11cd28f68b1eb73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 8 Mar 2024 16:19:11 +0100 Subject: [PATCH 12/41] Test the `tokenizers` and `token_lists` props --- tests/unit/test_document.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py index 05e3e3c..c37ada0 100644 --- a/tests/unit/test_document.py +++ b/tests/unit/test_document.py @@ -84,8 +84,13 @@ def test_get_tokens_multiple_tokenizers(self, short_tokens): tokenizer1, "tokenize", return_value=short_tokens ), patch.object(tokenizer2, "_split_text", return_value=[]): + assert set(doc.tokenizers.keys()) == {"tokenizer_1", "tokenizer_2"} assert doc.get_tokens(tokenizer_name="tokenizer_1") == short_tokens assert doc.get_tokens(tokenizer_name="tokenizer_2") == TokenList([]) + assert doc.token_lists == { + "tokenizer_1": short_tokens, + "tokenizer_2": TokenList([]), + } def test_metadata(self): text = "Hello I'm Bob" From d415f5177a7f8594a157a404f23258950597935e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 8 Mar 2024 16:22:11 +0100 Subject: [PATCH 13/41] Remove and ignore the IDEA project file --- .gitignore | 5 ++++- docdeid/docdeid.iml | 9 --------- 2 files changed, 4 insertions(+), 10 deletions(-) delete mode 100644 docdeid/docdeid.iml diff --git a/.gitignore b/.gitignore index 7fec20a..32cd942 100644 --- a/.gitignore +++ b/.gitignore @@ -130,4 +130,7 @@ dmypy.json # Pyre type checker .pyre/ -/.idea/* \ No newline at end of file +/.idea/* + +# IDEs +*.iml diff --git a/docdeid/docdeid.iml b/docdeid/docdeid.iml deleted file mode 100644 index 35fdd4f..0000000 --- 
a/docdeid/docdeid.iml +++ /dev/null @@ -1,9 +0,0 @@ - - - - - - - - - \ No newline at end of file From d8e8ed36c9efb9c945dfea94ac6414fdf973278c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 8 Mar 2024 16:28:01 +0100 Subject: [PATCH 14/41] Annotate docs for logging only if level is DEBUG --- docdeid/process/doc_processor.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docdeid/process/doc_processor.py b/docdeid/process/doc_processor.py index 2085418..3bc8556 100644 --- a/docdeid/process/doc_processor.py +++ b/docdeid/process/doc_processor.py @@ -6,6 +6,8 @@ from docdeid.document import Document from docdeid.utils import annotate_doc +_ROOT_LOGGER = logging.getLogger() + class DocProcessor(ABC): # pylint: disable=R0903 """Something that processes a document.""" @@ -145,7 +147,8 @@ def process( elif isinstance(proc, DocProcessorGroup): proc.process(doc, enabled=enabled, disabled=disabled) - logging.debug("after %s: %s", name, annotate_doc(doc)) + if _ROOT_LOGGER.isEnabledFor(logging.DEBUG): + logging.debug("after %s: %s", name, annotate_doc(doc)) def __iter__(self) -> Iterator: From 03fc99dec6df5b2b126112bdc90643966443e15c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 8 Mar 2024 18:10:47 +0100 Subject: [PATCH 15/41] Cosmetics --- docdeid/annotation.py | 12 +++++++++--- docdeid/process/annotation_processor.py | 4 ++-- tests/unit/process/test_annotator.py | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/docdeid/annotation.py b/docdeid/annotation.py index 60fd533..e7d7d7d 100644 --- a/docdeid/annotation.py +++ b/docdeid/annotation.py @@ -100,7 +100,7 @@ def get_sort_key( val = getattr(self, attr, UNKNOWN_ATTR_DEFAULT) - if callbacks is not None and (attr in callbacks): + if callbacks is not None and attr in callbacks: val = callbacks[attr](val) sort_key.append(val) @@ -150,14 +150,20 @@ def sorted( A RunTimeError, if the callbacks are not provided as a frozen dict. 
""" - if callbacks is not None and not isinstance(callbacks, frozendict): + # Not liked by Mypy, even though + # https://docs.python.org/3/library/stdtypes.html#types-union + # says the "X | Y" notation is equivalent to `typing.Union[X, Y]` and the + # docstring of `typing.Optional` says it's equivalent to + # `typing.Union[None, _]`: + # if not isinstance(callbacks, Optional[frozendict]): + if not isinstance(callbacks, frozendict | None): raise RuntimeError( "Please provide the callbacks as a frozen dict, e.g. " "frozendict.frozendict(end_char=lambda x: -x)" ) return sorted( - list(self), + self, key=lambda x: x.get_sort_key( by=by, callbacks=callbacks, deterministic=deterministic ), diff --git a/docdeid/process/annotation_processor.py b/docdeid/process/annotation_processor.py index 0b3c277..41e9507 100644 --- a/docdeid/process/annotation_processor.py +++ b/docdeid/process/annotation_processor.py @@ -60,7 +60,7 @@ def __init__( @staticmethod def _zero_runs(arr: npt.NDArray) -> npt.NDArray: """ - Finds al zero runs in a numpy array. + Finds all zero runs in a numpy array. Source: https://stackoverflow.com/questions/24885092/ finding-the-consecutive-zeros-in-a-numpy-array @@ -68,7 +68,7 @@ def _zero_runs(arr: npt.NDArray) -> npt.NDArray: arr: The input array. Returns: - A (num_zero_runs, 2)-dim array, containing the start and end indeces + A (num_zero_runs, 2)-dim array, containing the start and end indices of the zero runs. Examples: diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index 7b27196..6304f6f 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -160,7 +160,7 @@ def test_trie_modified(self, long_text): # annotators according to the configuration. # Run the interesting portions of Deduce initialization. 
- doc = Document(long_text,tokenizers={"default": SpaceSplitTokenizer()}) + doc = Document(long_text, tokenizers={"default": SpaceSplitTokenizer()}) trie = docdeid.ds.LookupTrie() # Yeah, the comma in "Smith," seems off... but then again, WordBoundaryTokenizer # considers whitespace to be tokens. There is no good choice. From 5d188cd494ec7020aeb2a0520563a483edb229d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 11 Mar 2024 12:48:21 +0100 Subject: [PATCH 16/41] Support whitespace trimming in `WordBoundaryTokenizer` --- docdeid/tokenizer.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py index c69197c..7826c39 100644 --- a/docdeid/tokenizer.py +++ b/docdeid/tokenizer.py @@ -355,11 +355,17 @@ def _split_text(self, text: str) -> list[Token]: class WordBoundaryTokenizer(Tokenizer): # pylint: disable=R0903 """ - Tokenizes based on word boundary. + Tokenizes based on word boundary. Sequences of non-alphanumeric characters are + also represented as tokens. - Whitespaces and similar characters are included as tokens. + Args: + keep_blanks: Keep whitespace in tokens, and whitespace-only tokens? 
""" + def __init__(self, keep_blanks=True): + super().__init__() + self._trim = not keep_blanks + def _split_text(self, text: str) -> list[Token]: tokens = [] matches = [*re.finditer(r"\b", text)] @@ -369,9 +375,21 @@ def _split_text(self, text: str) -> list[Token]: start_char = start_match.span(0)[0] end_char = end_match.span(0)[0] + if self._trim: + word = text[start_char:end_char] + orig_length = len(word) + word = word.rstrip() + end_char -= orig_length - len(word) + word = word.lstrip() + start_char = end_char - len(word) + if not word: + continue + else: + word = text[start_char:end_char] + tokens.append( Token( - text=text[start_char:end_char], + text=word, start_char=start_char, end_char=end_char, ) From 6ea9b744db40b3c97d9dd543ca9536b189f3062f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 11 Mar 2024 12:47:56 +0100 Subject: [PATCH 17/41] Move `SequenceTokenizer` to Docdeid This is needed so as to reduce the number of arguments for the `_match_sequence` method and creates a cleaner inheritance hierarchy between annotators, too. --- docdeid/annotation.py | 2 +- docdeid/process/__init__.py | 3 +- docdeid/process/annotator.py | 339 ++++++++++++++++++++++++++- tests/unit/process/test_annotator.py | 113 ++++++++- 4 files changed, 451 insertions(+), 6 deletions(-) diff --git a/docdeid/annotation.py b/docdeid/annotation.py index e7d7d7d..142a223 100644 --- a/docdeid/annotation.py +++ b/docdeid/annotation.py @@ -46,7 +46,7 @@ class Annotation: # pylint: disable=R0902 Should only be used when the annotation ends on a token boundary. 
""" - length: int = field(init=False) + length: int = field(init=False, compare=False) """The number of characters of the annotation text.""" _key_cache: dict = field(default_factory=dict, repr=False, compare=False) diff --git a/docdeid/process/__init__.py b/docdeid/process/__init__.py index 79387f1..6113db6 100644 --- a/docdeid/process/__init__.py +++ b/docdeid/process/__init__.py @@ -4,11 +4,12 @@ OverlapResolver, ) from .annotator import ( + _DIRECTION_MAP, # FIXME Stop using this. Annotator, MultiTokenLookupAnnotator, RegexpAnnotator, + SequenceAnnotator, SingleTokenLookupAnnotator, - TokenPatternAnnotator, ) from .doc_processor import DocProcessor, DocProcessorGroup from .redactor import RedactAllText, Redactor, SimpleRedactor diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index f97b3cc..b6c51b7 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -1,15 +1,61 @@ import re +import warnings from abc import ABC, abstractmethod -from typing import Iterable, Optional, Union +from collections import defaultdict +from dataclasses import dataclass +from typing import Iterable, Optional, Union, Literal, Mapping import docdeid.str from docdeid.annotation import Annotation from docdeid.document import Document +from docdeid.tokenizer import Token, TokenList +from docdeid.ds import DsCollection from docdeid.ds.lookup import LookupSet, LookupTrie from docdeid.pattern import TokenPattern from docdeid.process.doc_processor import DocProcessor from docdeid.str.processor import StringModifier -from docdeid.tokenizer import Token + + +_DIRECTION_MAP = { + "left": { + "attr": "previous", + "order": reversed, + "start_token": lambda annotation: annotation.start_token, + }, + "right": { + "attr": "next", + "order": lambda pattern: pattern, + "start_token": lambda annotation: annotation.end_token, + }, +} + + +@dataclass +class SimpleTokenPattern: + """A pattern for a token (and possibly its annotation, too).""" + func: 
Literal["equal", "re_match", "is_initial", "is_initials", "like_name", + "lookup", "neg_lookup", "tag"] + pattern: str + + +@dataclass +class NestedTokenPattern: + """Coordination of token patterns.""" + func: Literal["and", "or"] + pattern: list[TokenPattern] + + +TokenPattern = Union[SimpleTokenPattern, NestedTokenPattern] + + +@dataclass +class SequencePattern: + """ + Pattern for matching a sequence of tokens. + """ + direction: Literal["left", "right"] + skip: set[str] + pattern: list[TokenPattern] class Annotator(DocProcessor, ABC): @@ -46,6 +92,76 @@ def annotate(self, doc: Document) -> list[Annotation]: A list of annotations. """ + # FIXME This doesn't really belong here. Maybe to TokenList, rather. + @staticmethod + def _get_chained_token(token: Token, attr: str, skip: set[str]) -> Optional[Token]: + while True: + token = getattr(token, attr)() + + if token is None or token.text not in skip: + break + + return token + + def _match_sequence( + self, + doc: Document, + seq_pattern: SequencePattern, + start_token: Token, + annos_by_token: defaultdict[Token, Iterable[Annotation]], + ds: Optional[DsCollection] + ) -> Optional[Annotation]: + """ + Matches a token sequence pattern at `start_token`. + + Args: + doc: The document. + seq_pattern: The pattern to match. + start_token: The start token to match. + annos_by_token: Map from tokens to annotations covering it. + ds: Lookup dictionaries available. + + Returns: + An Annotation if matching is possible, None otherwise. + """ + + direction = seq_pattern.direction + # FIXME Avoid the dependency loop. 
+ attr = _DIRECTION_MAP[direction]["attr"] + pattern = _DIRECTION_MAP[direction]["order"](seq_pattern.pattern) + + current_token = start_token + end_token = start_token + + for pattern_position in pattern: + if current_token is None or not _PatternPositionMatcher.match( + token_pattern=pattern_position, + token=current_token, + annos=annos_by_token[current_token], + ds=ds, + metadata=doc.metadata, + ): + return None + + end_token = current_token + current_token = SequenceAnnotator._get_chained_token( + current_token, attr, seq_pattern.skip + ) + + start_token, end_token = _DIRECTION_MAP[direction]["order"]( + (start_token, end_token) + ) + + return Annotation( + text=doc.text[start_token.start_char : end_token.end_char], + start_char=start_token.start_char, + end_char=end_token.end_char, + tag=self.tag, + priority=self.priority, + start_token=start_token, + end_token=end_token, + ) + class SingleTokenLookupAnnotator(Annotator): """ @@ -309,3 +425,222 @@ def annotate(self, doc: Document) -> list[Annotation]: ) return annotations + + +class _PatternPositionMatcher: # pylint: disable=R0903 + """Checks if a token matches against a single pattern.""" + + @classmethod + def match(cls, token_pattern: dict | TokenPattern, **kwargs) -> bool: # pylint: + # disable=R0911 + """ + Matches a pattern position (a dict with one key). Other information should be + presented as kwargs. + + Args: + token_pattern: A dictionary with a single key, e.g. {'is_initial': True} + kwargs: Any other information, like the token or ds + + Returns: + True if the pattern position matches, false otherwise. 
+ """ + + if isinstance(token_pattern, dict): + return cls.match(as_token_pattern(token_pattern), **kwargs) + + func = token_pattern.func + value = token_pattern.pattern + + if func == "equal": + return kwargs.get("token").text == value + if func == "re_match": + return re.match(value, kwargs.get("token").text) is not None + if func == "is_initial": + + warnings.warn( + "is_initial matcher pattern is deprecated and will be removed " + "in a future version", + DeprecationWarning, + ) + + return ( + ( + len(kwargs.get("token").text) == 1 + and kwargs.get("token").text[0].isupper() + ) + or kwargs.get("token").text in {"Ch", "Chr", "Ph", "Th"} + ) == value + if func == "is_initials": + return ( + len(kwargs.get("token").text) <= 4 + and kwargs.get("token").text.isupper() + ) == value + if func == "like_name": + return ( + len(kwargs.get("token").text) >= 3 + and kwargs.get("token").text.istitle() + and not any(ch.isdigit() for ch in kwargs.get("token").text) + ) == value + if func == "lookup": + return cls._lookup(value, **kwargs) + if func == "neg_lookup": + return not cls._lookup(value, **kwargs) + if func == "tag": + annos = kwargs.get("annos", ()) + return any(anno.tag == value for anno in annos) + if func == "and": + return all(cls.match(x, **kwargs) for x in value) + if func == "or": + return any(cls.match(x, **kwargs) for x in value) + + raise NotImplementedError(f"No known logic for pattern {func}") + + @classmethod + def _lookup(cls, ent_type: str, **kwargs) -> bool: + token = kwargs.get("token").text + if "." 
in ent_type: + meta_key, meta_attr = ent_type.split(".", 1) + try: + meta_val = getattr(kwargs["metadata"][meta_key], meta_attr) + except (TypeError, KeyError, AttributeError): + return False + else: + return ( + token == meta_val + if isinstance(meta_val, str) + else token in meta_val + ) + else: # pylint: disable=R1705 + return token in kwargs.get("ds")[ent_type] + + +def as_token_pattern(pat_dict: dict) -> TokenPattern: + if len(pat_dict) != 1: + raise ValueError( + f"Cannot parse a token pattern which doesn't have exactly 1 key: " + f"{pat_dict}." + ) + func, value = next(iter(pat_dict.items())) + if func in ("and", "or"): + return NestedTokenPattern(func, list(map(as_token_pattern, value))) + return SimpleTokenPattern(func, value) + + +class SequenceAnnotator(Annotator): + """ + Annotates based on token patterns, which should be provided as a list of dicts. Each + position in the list denotes a token position, e.g.: [{'is_initial': True}, + {'like_name': True}] matches sequences of two tokens, where the first one is an + initial, and the second one is like a name. + + Arguments: + pattern: The pattern + ds: Lookup dictionaries. Those referenced by the pattern should be + LookupSets. (Don't ask why.) + skip: Any string values that should be skipped in matching (e.g. periods) + """ + + def __init__( + self, + pattern: list[dict], + *args, + ds: Optional[DsCollection] = None, + skip: Optional[list[str]] = None, + **kwargs, + ) -> None: + self.pattern = pattern + self.dicts = ds + self.skip = set(skip or []) + + self._start_words = None + self._matching_pipeline = None + + if len(self.pattern) > 0 and "lookup" in self.pattern[0]: + + if self.ds is None: + raise RuntimeError( + "Created pattern with lookup in TokenPatternAnnotator, but " + "no lookup structures provided." + ) + + lookup_list = self.ds[self.pattern[0]["lookup"]] + + # FIXME This doesn't work correctly for multiple ([{"lookup":"prefix"}, + # {"lookup":"interfix"}]) and nested patterns ("or", "and"). 
+ if not isinstance(lookup_list, LookupSet): + raise ValueError( + f"Expected a LookupSet, but got a " f"{type(lookup_list)}." + ) + + # FIXME This doesn't work correctly for multiple ([{"lookup":"prefix"}, + # {"lookup":"interfix"}]) and nested patterns ("or", "and"). + self._start_words = lookup_list.items() + # FIXME This doesn't work correctly for multiple ([{"lookup":"prefix"}, + # {"lookup":"interfix"}]) and nested patterns ("or", "and"). + self._matching_pipeline = lookup_list.matching_pipeline + + self._seq_pattern = SequencePattern("right", + set(skip or ()), + list(map(as_token_pattern, pattern))) + + super().__init__(*args, **kwargs) + + def annotate(self, doc: Document) -> list[Annotation]: + """ + Annotate the document, by matching the pattern against all tokens. + + Args: + doc: The document being processed. + + Returns: + A list of Annotation. + """ + + annotations = [] + + tokens = doc.get_tokens() + + if self._start_words is not None: + tokens = tokens.token_lookup( + lookup_values=self._start_words, + matching_pipeline=self._matching_pipeline, + ) + + annos_by_token = SequenceAnnotator._index_by_token( + doc.annotations, doc.token_lists + ) + + for token in tokens: + + annotation = self._match_sequence(doc, + self._seq_pattern, + token, + annos_by_token, + self.ds) + + if annotation is not None: + annotations.append(annotation) + + return annotations + + # TODO Test. + @classmethod + def _index_by_token( + cls, + annotations: Iterable[Annotation], + token_lists: Mapping[str, TokenList], + ) -> defaultdict[Token, set[Annotation]]: + """Assigns existing annotations to tokens.""" + annos_by_token = defaultdict(set) + for token_list in token_lists.values(): + # TODO Improve efficiency, simplify. 
+ for anno in annotations: + found_first = False + for token in token_list: + if anno.start_char < token.end_char: + found_first = True + if token.start_char >= anno.end_char: + break + if found_first: + annos_by_token[token].add(anno) + return annos_by_token diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index 6304f6f..aca7230 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -1,19 +1,25 @@ import re +from collections import defaultdict from unittest.mock import patch +import pytest + import docdeid.ds from docdeid.annotation import Annotation from docdeid.document import Document -from docdeid.ds import LookupTrie +from docdeid.ds import LookupTrie, DsCollection, LookupSet from docdeid.pattern import TokenPattern from docdeid.process.annotator import ( + as_token_pattern, MultiTokenLookupAnnotator, RegexpAnnotator, + SequenceAnnotator, + SequencePattern, SingleTokenLookupAnnotator, TokenPatternAnnotator, ) from docdeid.str.processor import LowercaseString -from docdeid.tokenizer import SpaceSplitTokenizer +from docdeid.tokenizer import SpaceSplitTokenizer, WordBoundaryTokenizer class TestSingleTokenLookupAnnotator: @@ -292,3 +298,106 @@ def test_multi_pattern(self, long_text, long_tokens_linked, multi_pattern): annotations = annotator.annotate(doc) assert annotations == expected_annotations + + +class TestSequenceAnnotator: + @pytest.fixture + def ds(self): + ds = DsCollection() + + first_names = ["Andries", "pieter", "Aziz", "Bernard"] + surnames = ["Meijer", "Smit", "Bakker", "Heerma"] + + ds["first_names"] = LookupSet() + ds["first_names"].add_items_from_iterable(items=first_names) + + ds["surnames"] = LookupSet() + ds["surnames"].add_items_from_iterable(items=surnames) + + return ds + + @pytest.fixture + def pattern_doc(self): + return Document( + text="De man heet Andries Meijer-Heerma, voornaam Andries.", + tokenizers={"default": WordBoundaryTokenizer(False)} + ) + + def 
test_match_sequence(self, pattern_doc, ds): + pattern = [{"lookup": "first_names"}, {"like_name": True}] + + tpa = SequenceAnnotator(pattern=[], ds=ds, tag="_") + + assert tpa._match_sequence( + pattern_doc, + SequencePattern("right", set(), list(map(as_token_pattern, pattern))), + start_token=pattern_doc.get_tokens()[3], + annos_by_token=defaultdict(list), + ds=ds, + ) == Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") + assert ( + tpa._match_sequence( + pattern_doc, + SequencePattern("right", set(), list(map(as_token_pattern, pattern))), + start_token=pattern_doc.get_tokens()[7], + annos_by_token=defaultdict(list), + ds=ds, + ) + is None + ) + + def test_match_sequence_left(self, pattern_doc, ds): + pattern = [{"lookup": "first_names"}, {"like_name": True}] + + tpa = SequenceAnnotator(pattern=[], ds=ds, tag="_") + + assert tpa._match_sequence( + pattern_doc, + SequencePattern("left", set(), list(map(as_token_pattern, pattern))), + start_token=pattern_doc.get_tokens()[4], + annos_by_token=defaultdict(list), + ds=ds, + ) == Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") + + assert ( + tpa._match_sequence( + pattern_doc, + SequencePattern("left", set(), list(map(as_token_pattern, pattern))), + start_token=pattern_doc.get_tokens()[8], + annos_by_token=defaultdict(list), + ds=ds, + ) + is None + ) + + def test_match_sequence_skip(self, pattern_doc, ds): + pattern = [{"lookup": "surnames"}, {"like_name": True}] + + tpa = SequenceAnnotator(pattern=[], ds=ds, tag="_") + + assert tpa._match_sequence( + pattern_doc, + SequencePattern("right", {"-"}, list(map(as_token_pattern, pattern))), + start_token=pattern_doc.get_tokens()[4], + annos_by_token=defaultdict(list), + ds=ds, + ) == Annotation(text="Meijer-Heerma", start_char=20, end_char=33, tag="_") + assert ( + tpa._match_sequence( + pattern_doc, + SequencePattern("right", set(), list(map(as_token_pattern, pattern))), + start_token=pattern_doc.get_tokens()[4], + 
annos_by_token=defaultdict(list), + ds=ds, + ) + is None + ) + + def test_annotate(self, pattern_doc, ds): + pattern = [{"lookup": "first_names"}, {"like_name": True}] + + tpa = SequenceAnnotator(pattern=pattern, ds=ds, tag="_") + + assert tpa.annotate(pattern_doc) == [ + Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") + ] From 4110a53b15ee5cfa72d2601be426b15542734dc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 11 Mar 2024 17:55:11 +0100 Subject: [PATCH 18/41] Format code --- docdeid/process/__init__.py | 2 +- docdeid/process/annotator.py | 55 +++++++++++++++++----------- docdeid/tokenizer.py | 6 +-- tests/unit/process/test_annotator.py | 54 +++++++++++++-------------- 4 files changed, 65 insertions(+), 52 deletions(-) diff --git a/docdeid/process/__init__.py b/docdeid/process/__init__.py index 6113db6..1333cee 100644 --- a/docdeid/process/__init__.py +++ b/docdeid/process/__init__.py @@ -3,8 +3,8 @@ MergeAdjacentAnnotations, OverlapResolver, ) +from .annotator import _DIRECTION_MAP # FIXME Stop using this. from .annotator import ( - _DIRECTION_MAP, # FIXME Stop using this. 
Annotator, MultiTokenLookupAnnotator, RegexpAnnotator, diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index b6c51b7..911e4fb 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -3,18 +3,17 @@ from abc import ABC, abstractmethod from collections import defaultdict from dataclasses import dataclass -from typing import Iterable, Optional, Union, Literal, Mapping +from typing import Iterable, Literal, Mapping, Optional, Union import docdeid.str from docdeid.annotation import Annotation from docdeid.document import Document -from docdeid.tokenizer import Token, TokenList from docdeid.ds import DsCollection from docdeid.ds.lookup import LookupSet, LookupTrie from docdeid.pattern import TokenPattern from docdeid.process.doc_processor import DocProcessor from docdeid.str.processor import StringModifier - +from docdeid.tokenizer import Token, TokenList _DIRECTION_MAP = { "left": { @@ -33,14 +32,24 @@ @dataclass class SimpleTokenPattern: """A pattern for a token (and possibly its annotation, too).""" - func: Literal["equal", "re_match", "is_initial", "is_initials", "like_name", - "lookup", "neg_lookup", "tag"] + + func: Literal[ + "equal", + "re_match", + "is_initial", + "is_initials", + "like_name", + "lookup", + "neg_lookup", + "tag", + ] pattern: str @dataclass class NestedTokenPattern: """Coordination of token patterns.""" + func: Literal["and", "or"] pattern: list[TokenPattern] @@ -50,9 +59,8 @@ class NestedTokenPattern: @dataclass class SequencePattern: - """ - Pattern for matching a sequence of tokens. 
- """ + """Pattern for matching a sequence of tokens.""" + direction: Literal["left", "right"] skip: set[str] pattern: list[TokenPattern] @@ -109,7 +117,7 @@ def _match_sequence( seq_pattern: SequencePattern, start_token: Token, annos_by_token: defaultdict[Token, Iterable[Annotation]], - ds: Optional[DsCollection] + ds: Optional[DsCollection], ) -> Optional[Annotation]: """ Matches a token sequence pattern at `start_token`. @@ -515,6 +523,13 @@ def _lookup(cls, ent_type: str, **kwargs) -> bool: def as_token_pattern(pat_dict: dict) -> TokenPattern: + """ + Converts the JSON dictionary representation of token patterns into a + `TokenPattern` instance. + + Args: + pat_dict: the JSON representation of the pattern + """ if len(pat_dict) != 1: raise ValueError( f"Cannot parse a token pattern which doesn't have exactly 1 key: " @@ -535,8 +550,8 @@ class SequenceAnnotator(Annotator): Arguments: pattern: The pattern - ds: Lookup dictionaries. Those referenced by the pattern should be - LookupSets. (Don't ask why.) + ds: Lookup dictionaries. Those referenced by the pattern should be LookupSets. + (Don't ask why.) skip: Any string values that should be skipped in matching (e.g. periods) """ @@ -557,13 +572,13 @@ def __init__( if len(self.pattern) > 0 and "lookup" in self.pattern[0]: - if self.ds is None: + if self.dicts is None: raise RuntimeError( "Created pattern with lookup in TokenPatternAnnotator, but " "no lookup structures provided." ) - lookup_list = self.ds[self.pattern[0]["lookup"]] + lookup_list = self.dicts[self.pattern[0]["lookup"]] # FIXME This doesn't work correctly for multiple ([{"lookup":"prefix"}, # {"lookup":"interfix"}]) and nested patterns ("or", "and"). @@ -579,9 +594,9 @@ def __init__( # {"lookup":"interfix"}]) and nested patterns ("or", "and"). 
self._matching_pipeline = lookup_list.matching_pipeline - self._seq_pattern = SequencePattern("right", - set(skip or ()), - list(map(as_token_pattern, pattern))) + self._seq_pattern = SequencePattern( + "right", set(skip or ()), list(map(as_token_pattern, pattern)) + ) super().__init__(*args, **kwargs) @@ -612,11 +627,9 @@ def annotate(self, doc: Document) -> list[Annotation]: for token in tokens: - annotation = self._match_sequence(doc, - self._seq_pattern, - token, - annos_by_token, - self.ds) + annotation = self._match_sequence( + doc, self._seq_pattern, token, annos_by_token, self.dicts + ) if annotation is not None: annotations.append(annotation) diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py index 7826c39..e2eae5d 100644 --- a/docdeid/tokenizer.py +++ b/docdeid/tokenizer.py @@ -355,14 +355,14 @@ def _split_text(self, text: str) -> list[Token]: class WordBoundaryTokenizer(Tokenizer): # pylint: disable=R0903 """ - Tokenizes based on word boundary. Sequences of non-alphanumeric characters are - also represented as tokens. + Tokenizes based on word boundary. Sequences of non-alphanumeric characters are also + represented as tokens. Args: keep_blanks: Keep whitespace in tokens, and whitespace-only tokens? 
""" - def __init__(self, keep_blanks=True): + def __init__(self, keep_blanks: bool = True) -> None: super().__init__() self._trim = not keep_blanks diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index aca7230..ab150ef 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -7,16 +7,16 @@ import docdeid.ds from docdeid.annotation import Annotation from docdeid.document import Document -from docdeid.ds import LookupTrie, DsCollection, LookupSet +from docdeid.ds import DsCollection, LookupSet, LookupTrie from docdeid.pattern import TokenPattern from docdeid.process.annotator import ( - as_token_pattern, MultiTokenLookupAnnotator, RegexpAnnotator, SequenceAnnotator, SequencePattern, SingleTokenLookupAnnotator, TokenPatternAnnotator, + as_token_pattern, ) from docdeid.str.processor import LowercaseString from docdeid.tokenizer import SpaceSplitTokenizer, WordBoundaryTokenizer @@ -320,7 +320,7 @@ def ds(self): def pattern_doc(self): return Document( text="De man heet Andries Meijer-Heerma, voornaam Andries.", - tokenizers={"default": WordBoundaryTokenizer(False)} + tokenizers={"default": WordBoundaryTokenizer(False)}, ) def test_match_sequence(self, pattern_doc, ds): @@ -336,14 +336,14 @@ def test_match_sequence(self, pattern_doc, ds): ds=ds, ) == Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") assert ( - tpa._match_sequence( - pattern_doc, - SequencePattern("right", set(), list(map(as_token_pattern, pattern))), - start_token=pattern_doc.get_tokens()[7], - annos_by_token=defaultdict(list), - ds=ds, - ) - is None + tpa._match_sequence( + pattern_doc, + SequencePattern("right", set(), list(map(as_token_pattern, pattern))), + start_token=pattern_doc.get_tokens()[7], + annos_by_token=defaultdict(list), + ds=ds, + ) + is None ) def test_match_sequence_left(self, pattern_doc, ds): @@ -360,14 +360,14 @@ def test_match_sequence_left(self, pattern_doc, ds): ) == 
Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") assert ( - tpa._match_sequence( - pattern_doc, - SequencePattern("left", set(), list(map(as_token_pattern, pattern))), - start_token=pattern_doc.get_tokens()[8], - annos_by_token=defaultdict(list), - ds=ds, - ) - is None + tpa._match_sequence( + pattern_doc, + SequencePattern("left", set(), list(map(as_token_pattern, pattern))), + start_token=pattern_doc.get_tokens()[8], + annos_by_token=defaultdict(list), + ds=ds, + ) + is None ) def test_match_sequence_skip(self, pattern_doc, ds): @@ -383,14 +383,14 @@ def test_match_sequence_skip(self, pattern_doc, ds): ds=ds, ) == Annotation(text="Meijer-Heerma", start_char=20, end_char=33, tag="_") assert ( - tpa._match_sequence( - pattern_doc, - SequencePattern("right", set(), list(map(as_token_pattern, pattern))), - start_token=pattern_doc.get_tokens()[4], - annos_by_token=defaultdict(list), - ds=ds, - ) - is None + tpa._match_sequence( + pattern_doc, + SequencePattern("right", set(), list(map(as_token_pattern, pattern))), + start_token=pattern_doc.get_tokens()[4], + annos_by_token=defaultdict(list), + ds=ds, + ) + is None ) def test_annotate(self, pattern_doc, ds): From df73e54730d2ed3aa8b3df94d0b8803e17c6e309 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 11 Mar 2024 19:57:59 +0100 Subject: [PATCH 19/41] Replace `_DIRECTION_MAP` with an enum --- docdeid/direction.py | 41 ++++++++++++ docdeid/process/__init__.py | 1 - docdeid/process/annotator.py | 94 +++++++++++----------------- docdeid/tokenizer.py | 32 +++++++++- tests/unit/process/test_annotator.py | 25 ++++++-- 5 files changed, 128 insertions(+), 65 deletions(-) create mode 100644 docdeid/direction.py diff --git a/docdeid/direction.py b/docdeid/direction.py new file mode 100644 index 0000000..6c4b104 --- /dev/null +++ b/docdeid/direction.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +from enum import IntEnum +from typing import Iterable, Sequence, TypeVar + +T 
= TypeVar('T') + + +class Direction(IntEnum): + """Direction in text -- either left or right.""" + LEFT = -1 + RIGHT = 1 + + @property + def opposite(self) -> Direction: + """The opposite direction to this.""" + return Direction(-self) + + @staticmethod + def from_string(val: str) -> Direction: + """ + Parses a Direction from a string, which must be either 'left' or 'right' after + lowercasing. + """ + norm = val.lower() + if norm == "left": + return Direction.LEFT + if norm == "right": + return Direction.RIGHT + raise ValueError("Invalid direction: '%s'".format(val)) + + def iter(self, seq: Sequence[T]) -> Iterable[T]: + """ + Returns an iterator over the given sequence that traverses it in this direction. + + Args: + seq: sequence to iterate over + """ + if self is Direction.RIGHT: + return seq + return reversed(seq) diff --git a/docdeid/process/__init__.py b/docdeid/process/__init__.py index 1333cee..d0f040f 100644 --- a/docdeid/process/__init__.py +++ b/docdeid/process/__init__.py @@ -3,7 +3,6 @@ MergeAdjacentAnnotations, OverlapResolver, ) -from .annotator import _DIRECTION_MAP # FIXME Stop using this. 
from .annotator import ( Annotator, MultiTokenLookupAnnotator, diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 911e4fb..690c44e 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import re import warnings from abc import ABC, abstractmethod @@ -7,6 +9,7 @@ import docdeid.str from docdeid.annotation import Annotation +from docdeid.direction import Direction from docdeid.document import Document from docdeid.ds import DsCollection from docdeid.ds.lookup import LookupSet, LookupTrie @@ -15,19 +18,6 @@ from docdeid.str.processor import StringModifier from docdeid.tokenizer import Token, TokenList -_DIRECTION_MAP = { - "left": { - "attr": "previous", - "order": reversed, - "start_token": lambda annotation: annotation.start_token, - }, - "right": { - "attr": "next", - "order": lambda pattern: pattern, - "start_token": lambda annotation: annotation.end_token, - }, -} - @dataclass class SimpleTokenPattern: @@ -61,7 +51,7 @@ class NestedTokenPattern: class SequencePattern: """Pattern for matching a sequence of tokens.""" - direction: Literal["left", "right"] + direction: Direction skip: set[str] pattern: list[TokenPattern] @@ -100,17 +90,6 @@ def annotate(self, doc: Document) -> list[Annotation]: A list of annotations. """ - # FIXME This doesn't really belong here. Maybe to TokenList, rather. - @staticmethod - def _get_chained_token(token: Token, attr: str, skip: set[str]) -> Optional[Token]: - while True: - token = getattr(token, attr)() - - if token is None or token.text not in skip: - break - - return token - def _match_sequence( self, doc: Document, @@ -133,42 +112,41 @@ def _match_sequence( An Annotation if matching is possible, None otherwise. """ - direction = seq_pattern.direction - # FIXME Avoid the dependency loop. 
- attr = _DIRECTION_MAP[direction]["attr"] - pattern = _DIRECTION_MAP[direction]["order"](seq_pattern.pattern) + dir_ = seq_pattern.direction - current_token = start_token - end_token = start_token + tokens = (token for token in start_token.iter_to(dir_) + if token.text not in seq_pattern.skip) + # Iterate the token patterns in the direction corresponding to the surface + # order it's supposed to match (i.e. "left" means "iterate patterns from the + # end"). + tok_patterns = dir_.iter(seq_pattern.pattern) - for pattern_position in pattern: - if current_token is None or not _PatternPositionMatcher.match( - token_pattern=pattern_position, - token=current_token, - annos=annos_by_token[current_token], - ds=ds, - metadata=doc.metadata, + num_matched = 0 + end_token = start_token + for tok_pattern, end_token in zip(tok_patterns, tokens): + if _PatternPositionMatcher.match( + token_pattern=tok_pattern, + token=end_token, + annos=annos_by_token[end_token], + ds=ds, + metadata=doc.metadata, ): - return None - - end_token = current_token - current_token = SequenceAnnotator._get_chained_token( - current_token, attr, seq_pattern.skip - ) + num_matched += 1 + else: + break - start_token, end_token = _DIRECTION_MAP[direction]["order"]( - (start_token, end_token) - ) + if num_matched == len(seq_pattern.pattern): + left_token, right_token = dir_.iter((start_token, end_token)) - return Annotation( - text=doc.text[start_token.start_char : end_token.end_char], - start_char=start_token.start_char, - end_char=end_token.end_char, - tag=self.tag, - priority=self.priority, - start_token=start_token, - end_token=end_token, - ) + return Annotation( + text=doc.text[left_token.start_char : right_token.end_char], + start_char=left_token.start_char, + end_char=right_token.end_char, + tag=self.tag, + priority=self.priority, + start_token=left_token, + end_token=right_token, + ) class SingleTokenLookupAnnotator(Annotator): @@ -595,7 +573,9 @@ def __init__( self._matching_pipeline = 
lookup_list.matching_pipeline self._seq_pattern = SequencePattern( - "right", set(skip or ()), list(map(as_token_pattern, pattern)) + Direction.RIGHT, + set(skip or ()), + list(map(as_token_pattern, pattern)) ) super().__init__(*args, **kwargs) diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py index e2eae5d..249a0ab 100644 --- a/docdeid/tokenizer.py +++ b/docdeid/tokenizer.py @@ -6,8 +6,9 @@ from collections import defaultdict from collections.abc import Sequence from dataclasses import dataclass, field -from typing import Iterator, Literal, Optional, SupportsIndex, overload +from typing import Iterator, Literal, Optional, SupportsIndex, overload, Generator +from docdeid.direction import Direction from docdeid.str import StringModifier @@ -122,6 +123,35 @@ def next(self, num: int = 1) -> Optional[Token]: """ return self._get_linked_token(num=num, attr="_next_token") + def get_nth(self, + num: int = 1, + dir_: Direction = Direction.RIGHT, + ) -> Optional[Token]: + """ + Finds the _n_-th token to the left or right. + + Args: + num: number of tokens to move + dir_: direction to go + """ + if num < 0: + return self.get_nth(-num, dir_.opposite) + return self.next(num) if dir_ is Direction.RIGHT else self.previous(num) + + def iter_to(self, + dir_: Direction = Direction.RIGHT, + ) -> Generator[Token, None, None]: + """ + Iterates linked tokens in the specified direction. + + Args: + dir_: direction to go + """ + token = self + while token is not None: + yield token + token = token.next() if dir_ is Direction.RIGHT else token.previous() + def __len__(self) -> int: """ The length of the text. 
diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index ab150ef..6bcb3ec 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -6,6 +6,7 @@ import docdeid.ds from docdeid.annotation import Annotation +from docdeid.direction import Direction from docdeid.document import Document from docdeid.ds import DsCollection, LookupSet, LookupTrie from docdeid.pattern import TokenPattern @@ -330,7 +331,9 @@ def test_match_sequence(self, pattern_doc, ds): assert tpa._match_sequence( pattern_doc, - SequencePattern("right", set(), list(map(as_token_pattern, pattern))), + SequencePattern(Direction.RIGHT, + set(), + list(map(as_token_pattern, pattern))), start_token=pattern_doc.get_tokens()[3], annos_by_token=defaultdict(list), ds=ds, @@ -338,7 +341,9 @@ def test_match_sequence(self, pattern_doc, ds): assert ( tpa._match_sequence( pattern_doc, - SequencePattern("right", set(), list(map(as_token_pattern, pattern))), + SequencePattern(Direction.RIGHT, + set(), + list(map(as_token_pattern, pattern))), start_token=pattern_doc.get_tokens()[7], annos_by_token=defaultdict(list), ds=ds, @@ -353,7 +358,9 @@ def test_match_sequence_left(self, pattern_doc, ds): assert tpa._match_sequence( pattern_doc, - SequencePattern("left", set(), list(map(as_token_pattern, pattern))), + SequencePattern(Direction.LEFT, + set(), + list(map(as_token_pattern, pattern))), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), ds=ds, @@ -362,7 +369,9 @@ def test_match_sequence_left(self, pattern_doc, ds): assert ( tpa._match_sequence( pattern_doc, - SequencePattern("left", set(), list(map(as_token_pattern, pattern))), + SequencePattern(Direction.LEFT, + set(), + list(map(as_token_pattern, pattern))), start_token=pattern_doc.get_tokens()[8], annos_by_token=defaultdict(list), ds=ds, @@ -377,7 +386,9 @@ def test_match_sequence_skip(self, pattern_doc, ds): assert tpa._match_sequence( pattern_doc, - 
SequencePattern("right", {"-"}, list(map(as_token_pattern, pattern))), + SequencePattern(Direction.RIGHT, + {"-"}, + list(map(as_token_pattern, pattern))), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), ds=ds, @@ -385,7 +396,9 @@ def test_match_sequence_skip(self, pattern_doc, ds): assert ( tpa._match_sequence( pattern_doc, - SequencePattern("right", set(), list(map(as_token_pattern, pattern))), + SequencePattern(Direction.RIGHT, + set(), + list(map(as_token_pattern, pattern))), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), ds=ds, From 99163d6700b43f34392d79222ea08417b8745c97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 11 Mar 2024 22:16:11 +0100 Subject: [PATCH 20/41] Improve and test `annos_by_token()` --- docdeid/annotation.py | 45 ++++++++++++++++++++++++ docdeid/document.py | 5 +++ docdeid/process/annotator.py | 26 +------------- tests/unit/test_annotation.py | 65 ++++++++++++++++++++++++++++++++++- 4 files changed, 115 insertions(+), 26 deletions(-) diff --git a/docdeid/annotation.py b/docdeid/annotation.py index 142a223..7f4f538 100644 --- a/docdeid/annotation.py +++ b/docdeid/annotation.py @@ -1,3 +1,4 @@ +from collections import defaultdict from dataclasses import dataclass, field from typing import Any, Callable, Optional @@ -126,6 +127,10 @@ class AnnotationSet(set[Annotation]): It extends the builtin ``set``. """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._annos_by_tokenizers_by_token = {} + def sorted( self, by: tuple, # pylint: disable=C0103 @@ -185,3 +190,43 @@ def has_overlap(self) -> bool: return True return False + + def annos_by_token(self, doc: "Document") -> defaultdict[Token, set[Annotation]]: + """ + Returns a mapping from document tokens to annotations. 
+ + Args: + doc: document whose tokens are to be linked + """ + # We key the token->annotations cache only by the set of tokenizers where it + # actually (obviously) depends also on the document. However, it's assumed + # that an AnnotationSet is always bound only to one document. + tokenizers = frozenset(doc.token_lists) + if tokenizers not in self._annos_by_tokenizers_by_token: + annos_by_token = defaultdict(set) + for token_list in doc.token_lists.values(): + if not token_list: + continue + cur_tok_idx = 0 + tok = token_list[cur_tok_idx] + for anno in self.sorted(by=("start_char", )): + try: + # Iterate over tokens till we reach the annotation. + while tok.end_char < anno.start_char: + cur_tok_idx += 1 + tok = token_list[cur_tok_idx] + except IndexError: + break + else: + # Iterate over tokens in the annotation till we reach the end + # of it or the end of the tokens. + anno_tok_idx = cur_tok_idx + anno_tok = tok + while anno_tok.start_char < anno.end_char: + annos_by_token[anno_tok].add(anno) + if anno_tok_idx == len(token_list) - 1: + break + anno_tok_idx += 1 + anno_tok = token_list[anno_tok_idx] + self._annos_by_tokenizers_by_token[tokenizers] = annos_by_token + return self._annos_by_tokenizers_by_token[tokenizers] diff --git a/docdeid/document.py b/docdeid/document.py index c7b6c20..aa3f1a2 100644 --- a/docdeid/document.py +++ b/docdeid/document.py @@ -100,12 +100,17 @@ def text(self) -> str: @property def tokenizers(self) -> Optional[Mapping[str, Tokenizer]]: """Available tokenizers indexed by their name.""" + if self._tokenizers is None: + raise RuntimeError("No tokenizers initialized.") return self._tokenizers @property def token_lists(self) -> Mapping[str, TokenList]: """Lists of tokens of the document, indexed by the name of the corresponding tokenizer.""" + for tokker_name in set(self.tokenizers) - set(self._token_lists): + tokker = self._tokenizers[tokker_name] + self._token_lists[tokker_name] = tokker.tokenize(self._text) return self._token_lists 
def get_tokens(self, tokenizer_name: str = "default") -> TokenList: diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 690c44e..38a826b 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -601,9 +601,7 @@ def annotate(self, doc: Document) -> list[Annotation]: matching_pipeline=self._matching_pipeline, ) - annos_by_token = SequenceAnnotator._index_by_token( - doc.annotations, doc.token_lists - ) + annos_by_token = doc.annotations.annos_by_token(doc) for token in tokens: @@ -615,25 +613,3 @@ def annotate(self, doc: Document) -> list[Annotation]: annotations.append(annotation) return annotations - - # TODO Test. - @classmethod - def _index_by_token( - cls, - annotations: Iterable[Annotation], - token_lists: Mapping[str, TokenList], - ) -> defaultdict[Token, set[Annotation]]: - """Assigns existing annotations to tokens.""" - annos_by_token = defaultdict(set) - for token_list in token_lists.values(): - # TODO Improve efficiency, simplify. 
- for anno in annotations: - found_first = False - for token in token_list: - if anno.start_char < token.end_char: - found_first = True - if token.start_char >= anno.end_char: - break - if found_first: - annos_by_token[token].add(anno) - return annos_by_token diff --git a/tests/unit/test_annotation.py b/tests/unit/test_annotation.py index fe4f785..7e32ca4 100644 --- a/tests/unit/test_annotation.py +++ b/tests/unit/test_annotation.py @@ -1,8 +1,11 @@ +import re + import pytest from frozendict import frozendict +from docdeid import Document from docdeid.annotation import Annotation, AnnotationSet -from docdeid.tokenizer import Token +from docdeid.tokenizer import Token, WordBoundaryTokenizer, Tokenizer class TestAnnotation: @@ -157,3 +160,63 @@ def test_get_annotations_sorted_no_frozendict(self, annotations): _ = annotation_set.sorted( by=("priority", "length"), callbacks=dict(length=lambda x: -x) ) + + def test_annos_by_token(self, annotations): + doc = Document("1 2 3 1 2 3 hum Hello hum I'm Bob - said Cindy", + tokenizers={"default": WordBoundaryTokenizer(False)}) + aset = AnnotationSet([ + a1 := Annotation("Hello", 16, 21, "word"), + a2 := Annotation("I", 26, 27, "ltr"), + a3 := Annotation("I'm", 26, 29, "words"), + a4 := Annotation("Bob", 30, 33, "name"), + a5 := Annotation("I'm Bob", 26, 33, "stmt"), + ]) + + # import pydevd_pycharm + # pydevd_pycharm.settrace() + + got = aset.annos_by_token(doc) + + want = { + Token("Hello", 16, 21): {a1}, + Token("I", 26, 27): {a2, a3, a5}, + Token("'", 27, 28): {a3, a5}, + Token("m", 28, 29): {a3, a5}, + Token("Bob", 30, 33): {a4, a5}, + } + + assert got == want + + def test_annos_by_token_2(self, annotations): + class HumTokenizer(Tokenizer): + """Extracts each "hum" word and the following word as a token.""" + def _split_text(self, text: str) -> list[Token]: + return [ + Token(match.group(0), match.start(), match.end()) + for match in re.finditer("\\bhum\\s+\\w+", text) + ] + + doc = Document("1 2 3 1 2 3 hum Hello hum I'm 
Bob - said Cindy", + tokenizers={"default": WordBoundaryTokenizer(False), + "for_fun": HumTokenizer()}) + aset = AnnotationSet([ + a1 := Annotation("Hello", 16, 21, "word"), + a2 := Annotation("I", 26, 27, "ltr"), + a3 := Annotation("I'm", 26, 29, "words"), + a4 := Annotation("Bob", 30, 33, "name"), + a5 := Annotation("I'm Bob", 26, 33, "stmt"), + ]) + + got = aset.annos_by_token(doc) + + want = { + Token("Hello", 16, 21): {a1}, + Token("I", 26, 27): {a2, a3, a5}, + Token("'", 27, 28): {a3, a5}, + Token("m", 28, 29): {a3, a5}, + Token("Bob", 30, 33): {a4, a5}, + Token("hum Hello", 12, 21): {a1}, + Token("hum I", 22, 27): {a2, a3, a5}, + } + + assert got == want From c7ba5bc8890ab7b1dbeb11ad83e3da75f88042fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Tue, 12 Mar 2024 12:34:24 +0100 Subject: [PATCH 21/41] Drop `Token.get_nth`, simplify `Token.iter_to` --- docdeid/tokenizer.py | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py index 249a0ab..f055503 100644 --- a/docdeid/tokenizer.py +++ b/docdeid/tokenizer.py @@ -123,24 +123,8 @@ def next(self, num: int = 1) -> Optional[Token]: """ return self._get_linked_token(num=num, attr="_next_token") - def get_nth(self, - num: int = 1, - dir_: Direction = Direction.RIGHT, - ) -> Optional[Token]: - """ - Finds the _n_-th token to the left or right. - - Args: - num: number of tokens to move - dir_: direction to go - """ - if num < 0: - return self.get_nth(-num, dir_.opposite) - return self.next(num) if dir_ is Direction.RIGHT else self.previous(num) - - def iter_to(self, - dir_: Direction = Direction.RIGHT, - ) -> Generator[Token, None, None]: + def iter_to(self, dir_: Direction = Direction.RIGHT, + ) -> Generator[Token, None, None]: """ Iterates linked tokens in the specified direction. 
@@ -150,7 +134,8 @@ def iter_to(self, token = self while token is not None: yield token - token = token.next() if dir_ is Direction.RIGHT else token.previous() + token = (token._next_token if dir_ is Direction.RIGHT else + token._previous_token) def __len__(self) -> int: """ From c80e2adda4239fc6d033884d701ec51f975d211d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Tue, 12 Mar 2024 22:03:28 +0100 Subject: [PATCH 22/41] Format code --- docdeid/annotation.py | 2 +- docdeid/direction.py | 9 +++-- docdeid/process/annotator.py | 25 +++++++------- docdeid/tokenizer.py | 13 ++++--- tests/unit/process/test_annotator.py | 36 ++++++++++---------- tests/unit/test_annotation.py | 51 +++++++++++++++++----------- 6 files changed, 75 insertions(+), 61 deletions(-) diff --git a/docdeid/annotation.py b/docdeid/annotation.py index 7f4f538..6689365 100644 --- a/docdeid/annotation.py +++ b/docdeid/annotation.py @@ -209,7 +209,7 @@ def annos_by_token(self, doc: "Document") -> defaultdict[Token, set[Annotation]] continue cur_tok_idx = 0 tok = token_list[cur_tok_idx] - for anno in self.sorted(by=("start_char", )): + for anno in self.sorted(by=("start_char",)): try: # Iterate over tokens till we reach the annotation. while tok.end_char < anno.start_char: diff --git a/docdeid/direction.py b/docdeid/direction.py index 6c4b104..a6eaeb5 100644 --- a/docdeid/direction.py +++ b/docdeid/direction.py @@ -3,11 +3,12 @@ from enum import IntEnum from typing import Iterable, Sequence, TypeVar -T = TypeVar('T') +T = TypeVar("T") class Direction(IntEnum): """Direction in text -- either left or right.""" + LEFT = -1 RIGHT = 1 @@ -18,10 +19,8 @@ def opposite(self) -> Direction: @staticmethod def from_string(val: str) -> Direction: - """ - Parses a Direction from a string, which must be either 'left' or 'right' after - lowercasing. 
- """ + """Parses a Direction from a string, which must be either 'left' or 'right' + after lowercasing.""" norm = val.lower() if norm == "left": return Direction.LEFT diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 38a826b..67fc169 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -114,8 +114,11 @@ def _match_sequence( dir_ = seq_pattern.direction - tokens = (token for token in start_token.iter_to(dir_) - if token.text not in seq_pattern.skip) + tokens = ( + token + for token in start_token.iter_to(dir_) + if token.text not in seq_pattern.skip + ) # Iterate the token patterns in the direction corresponding to the surface # order it's supposed to match (i.e. "left" means "iterate patterns from the # end"). @@ -125,11 +128,11 @@ def _match_sequence( end_token = start_token for tok_pattern, end_token in zip(tok_patterns, tokens): if _PatternPositionMatcher.match( - token_pattern=tok_pattern, - token=end_token, - annos=annos_by_token[end_token], - ds=ds, - metadata=doc.metadata, + token_pattern=tok_pattern, + token=end_token, + annos=annos_by_token[end_token], + ds=ds, + metadata=doc.metadata, ): num_matched += 1 else: @@ -502,8 +505,8 @@ def _lookup(cls, ent_type: str, **kwargs) -> bool: def as_token_pattern(pat_dict: dict) -> TokenPattern: """ - Converts the JSON dictionary representation of token patterns into a - `TokenPattern` instance. + Converts the JSON dictionary representation of token patterns into a `TokenPattern` + instance. 
Args: pat_dict: the JSON representation of the pattern @@ -573,9 +576,7 @@ def __init__( self._matching_pipeline = lookup_list.matching_pipeline self._seq_pattern = SequencePattern( - Direction.RIGHT, - set(skip or ()), - list(map(as_token_pattern, pattern)) + Direction.RIGHT, set(skip or ()), list(map(as_token_pattern, pattern)) ) super().__init__(*args, **kwargs) diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py index f055503..d5c4efa 100644 --- a/docdeid/tokenizer.py +++ b/docdeid/tokenizer.py @@ -6,7 +6,7 @@ from collections import defaultdict from collections.abc import Sequence from dataclasses import dataclass, field -from typing import Iterator, Literal, Optional, SupportsIndex, overload, Generator +from typing import Generator, Iterator, Literal, Optional, SupportsIndex, overload from docdeid.direction import Direction from docdeid.str import StringModifier @@ -123,8 +123,10 @@ def next(self, num: int = 1) -> Optional[Token]: """ return self._get_linked_token(num=num, attr="_next_token") - def iter_to(self, dir_: Direction = Direction.RIGHT, - ) -> Generator[Token, None, None]: + def iter_to( + self, + dir_: Direction = Direction.RIGHT, + ) -> Generator[Token, None, None]: """ Iterates linked tokens in the specified direction. 
@@ -134,8 +136,9 @@ def iter_to(self, dir_: Direction = Direction.RIGHT, token = self while token is not None: yield token - token = (token._next_token if dir_ is Direction.RIGHT else - token._previous_token) + token = ( + token._next_token if dir_ is Direction.RIGHT else token._previous_token + ) def __len__(self) -> int: """ diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index 6bcb3ec..efe7a6a 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -331,9 +331,9 @@ def test_match_sequence(self, pattern_doc, ds): assert tpa._match_sequence( pattern_doc, - SequencePattern(Direction.RIGHT, - set(), - list(map(as_token_pattern, pattern))), + SequencePattern( + Direction.RIGHT, set(), list(map(as_token_pattern, pattern)) + ), start_token=pattern_doc.get_tokens()[3], annos_by_token=defaultdict(list), ds=ds, @@ -341,9 +341,9 @@ def test_match_sequence(self, pattern_doc, ds): assert ( tpa._match_sequence( pattern_doc, - SequencePattern(Direction.RIGHT, - set(), - list(map(as_token_pattern, pattern))), + SequencePattern( + Direction.RIGHT, set(), list(map(as_token_pattern, pattern)) + ), start_token=pattern_doc.get_tokens()[7], annos_by_token=defaultdict(list), ds=ds, @@ -358,9 +358,9 @@ def test_match_sequence_left(self, pattern_doc, ds): assert tpa._match_sequence( pattern_doc, - SequencePattern(Direction.LEFT, - set(), - list(map(as_token_pattern, pattern))), + SequencePattern( + Direction.LEFT, set(), list(map(as_token_pattern, pattern)) + ), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), ds=ds, @@ -369,9 +369,9 @@ def test_match_sequence_left(self, pattern_doc, ds): assert ( tpa._match_sequence( pattern_doc, - SequencePattern(Direction.LEFT, - set(), - list(map(as_token_pattern, pattern))), + SequencePattern( + Direction.LEFT, set(), list(map(as_token_pattern, pattern)) + ), start_token=pattern_doc.get_tokens()[8], annos_by_token=defaultdict(list), ds=ds, @@ 
-386,9 +386,9 @@ def test_match_sequence_skip(self, pattern_doc, ds): assert tpa._match_sequence( pattern_doc, - SequencePattern(Direction.RIGHT, - {"-"}, - list(map(as_token_pattern, pattern))), + SequencePattern( + Direction.RIGHT, {"-"}, list(map(as_token_pattern, pattern)) + ), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), ds=ds, @@ -396,9 +396,9 @@ def test_match_sequence_skip(self, pattern_doc, ds): assert ( tpa._match_sequence( pattern_doc, - SequencePattern(Direction.RIGHT, - set(), - list(map(as_token_pattern, pattern))), + SequencePattern( + Direction.RIGHT, set(), list(map(as_token_pattern, pattern)) + ), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), ds=ds, diff --git a/tests/unit/test_annotation.py b/tests/unit/test_annotation.py index 7e32ca4..2ed882f 100644 --- a/tests/unit/test_annotation.py +++ b/tests/unit/test_annotation.py @@ -5,7 +5,7 @@ from docdeid import Document from docdeid.annotation import Annotation, AnnotationSet -from docdeid.tokenizer import Token, WordBoundaryTokenizer, Tokenizer +from docdeid.tokenizer import Token, Tokenizer, WordBoundaryTokenizer class TestAnnotation: @@ -162,15 +162,19 @@ def test_get_annotations_sorted_no_frozendict(self, annotations): ) def test_annos_by_token(self, annotations): - doc = Document("1 2 3 1 2 3 hum Hello hum I'm Bob - said Cindy", - tokenizers={"default": WordBoundaryTokenizer(False)}) - aset = AnnotationSet([ - a1 := Annotation("Hello", 16, 21, "word"), - a2 := Annotation("I", 26, 27, "ltr"), - a3 := Annotation("I'm", 26, 29, "words"), - a4 := Annotation("Bob", 30, 33, "name"), - a5 := Annotation("I'm Bob", 26, 33, "stmt"), - ]) + doc = Document( + "1 2 3 1 2 3 hum Hello hum I'm Bob - said Cindy", + tokenizers={"default": WordBoundaryTokenizer(False)}, + ) + aset = AnnotationSet( + [ + a1 := Annotation("Hello", 16, 21, "word"), + a2 := Annotation("I", 26, 27, "ltr"), + a3 := Annotation("I'm", 26, 29, "words"), + a4 := Annotation("Bob", 
30, 33, "name"), + a5 := Annotation("I'm Bob", 26, 33, "stmt"), + ] + ) # import pydevd_pycharm # pydevd_pycharm.settrace() @@ -190,22 +194,29 @@ def test_annos_by_token(self, annotations): def test_annos_by_token_2(self, annotations): class HumTokenizer(Tokenizer): """Extracts each "hum" word and the following word as a token.""" + def _split_text(self, text: str) -> list[Token]: return [ Token(match.group(0), match.start(), match.end()) for match in re.finditer("\\bhum\\s+\\w+", text) ] - doc = Document("1 2 3 1 2 3 hum Hello hum I'm Bob - said Cindy", - tokenizers={"default": WordBoundaryTokenizer(False), - "for_fun": HumTokenizer()}) - aset = AnnotationSet([ - a1 := Annotation("Hello", 16, 21, "word"), - a2 := Annotation("I", 26, 27, "ltr"), - a3 := Annotation("I'm", 26, 29, "words"), - a4 := Annotation("Bob", 30, 33, "name"), - a5 := Annotation("I'm Bob", 26, 33, "stmt"), - ]) + doc = Document( + "1 2 3 1 2 3 hum Hello hum I'm Bob - said Cindy", + tokenizers={ + "default": WordBoundaryTokenizer(False), + "for_fun": HumTokenizer(), + }, + ) + aset = AnnotationSet( + [ + a1 := Annotation("Hello", 16, 21, "word"), + a2 := Annotation("I", 26, 27, "ltr"), + a3 := Annotation("I'm", 26, 29, "words"), + a4 := Annotation("Bob", 30, 33, "name"), + a5 := Annotation("I'm Bob", 26, 33, "stmt"), + ] + ) got = aset.annos_by_token(doc) From 40fcd62d46c2df3791ce4111f4b872a6bcf3f8eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Tue, 12 Mar 2024 22:16:15 +0100 Subject: [PATCH 23/41] Test and fix `Direction` --- docdeid/direction.py | 2 +- tests/unit/test_direction.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 tests/unit/test_direction.py diff --git a/docdeid/direction.py b/docdeid/direction.py index a6eaeb5..15cdbe8 100644 --- a/docdeid/direction.py +++ b/docdeid/direction.py @@ -26,7 +26,7 @@ def from_string(val: str) -> Direction: return Direction.LEFT if norm == "right": return Direction.RIGHT 
- raise ValueError("Invalid direction: '%s'".format(val)) + raise ValueError("Invalid direction: '{}'".format(val)) def iter(self, seq: Sequence[T]) -> Iterable[T]: """ diff --git a/tests/unit/test_direction.py b/tests/unit/test_direction.py new file mode 100644 index 0000000..07537b1 --- /dev/null +++ b/tests/unit/test_direction.py @@ -0,0 +1,30 @@ +import pytest + +from docdeid.direction import Direction + + +class TestDirection: + def test_basics(self): + assert Direction.LEFT != Direction.RIGHT + assert Direction.LEFT.opposite == Direction.RIGHT + assert Direction.RIGHT.opposite == Direction.LEFT + + def test_parsing(self): + assert Direction.from_string("left") == Direction.LEFT + assert Direction.from_string("Left") == Direction.LEFT + assert Direction.from_string("LEFT") == Direction.LEFT + assert Direction.from_string("right") == Direction.RIGHT + assert Direction.from_string("Right") == Direction.RIGHT + assert Direction.from_string("RIGHT") == Direction.RIGHT + + def test_parsing_failure(self): + with pytest.raises(ValueError, match="Invalid direction: 'down'"): + Direction.from_string("down") + with pytest.raises(ValueError, match="Invalid direction: ' left'"): + Direction.from_string(" left") + + def test_iteration(self): + assert list(Direction.RIGHT.iter([])) == [] + assert list(Direction.LEFT.iter([])) == [] + assert list(Direction.RIGHT.iter([1, 2, "three"])) == [1, 2, "three"] + assert list(Direction.LEFT.iter([1, 2, "three"])) == ["three", 2, 1] From 15b864890a062e74052fbee7bc55594bb9b5a5e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Tue, 12 Mar 2024 22:26:38 +0100 Subject: [PATCH 24/41] Fix Flake8-reported errors --- docdeid/annotation.py | 8 ++++++-- docdeid/process/annotator.py | 20 ++++++++++---------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/docdeid/annotation.py b/docdeid/annotation.py index 6689365..5afcbc8 100644 --- a/docdeid/annotation.py +++ b/docdeid/annotation.py @@ -4,6 +4,7 @@ from 
frozendict import frozendict +import docdeid from docdeid.tokenizer import Token UNKNOWN_ATTR_DEFAULT: Any = 0 @@ -127,7 +128,7 @@ class AnnotationSet(set[Annotation]): It extends the builtin ``set``. """ - def __init__(self, *args, **kwargs): + def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self._annos_by_tokenizers_by_token = {} @@ -191,7 +192,10 @@ def has_overlap(self) -> bool: return False - def annos_by_token(self, doc: "Document") -> defaultdict[Token, set[Annotation]]: + def annos_by_token( + self, + doc: "docdeid.document.Document", + ) -> defaultdict[Token, set[Annotation]]: """ Returns a mapping from document tokens to annotations. diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 67fc169..258b032 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -5,7 +5,7 @@ from abc import ABC, abstractmethod from collections import defaultdict from dataclasses import dataclass -from typing import Iterable, Literal, Mapping, Optional, Union +from typing import Iterable, Literal, Optional, Union import docdeid.str from docdeid.annotation import Annotation @@ -16,7 +16,7 @@ from docdeid.pattern import TokenPattern from docdeid.process.doc_processor import DocProcessor from docdeid.str.processor import StringModifier -from docdeid.tokenizer import Token, TokenList +from docdeid.tokenizer import Token @dataclass @@ -41,10 +41,10 @@ class NestedTokenPattern: """Coordination of token patterns.""" func: Literal["and", "or"] - pattern: list[TokenPattern] + pattern: list[TokenPatternFromCfg] -TokenPattern = Union[SimpleTokenPattern, NestedTokenPattern] +TokenPatternFromCfg = Union[SimpleTokenPattern, NestedTokenPattern] @dataclass @@ -53,7 +53,7 @@ class SequencePattern: direction: Direction skip: set[str] - pattern: list[TokenPattern] + pattern: list[TokenPatternFromCfg] class Annotator(DocProcessor, ABC): @@ -420,8 +420,8 @@ class _PatternPositionMatcher: # pylint: disable=R0903 
"""Checks if a token matches against a single pattern.""" @classmethod - def match(cls, token_pattern: dict | TokenPattern, **kwargs) -> bool: # pylint: - # disable=R0911 + def match(cls, token_pattern: dict | TokenPatternFromCfg, **kwargs) -> bool: + # pylint: disable=R0911 """ Matches a pattern position (a dict with one key). Other information should be presented as kwargs. @@ -503,10 +503,10 @@ def _lookup(cls, ent_type: str, **kwargs) -> bool: return token in kwargs.get("ds")[ent_type] -def as_token_pattern(pat_dict: dict) -> TokenPattern: +def as_token_pattern(pat_dict: dict) -> TokenPatternFromCfg: """ - Converts the JSON dictionary representation of token patterns into a `TokenPattern` - instance. + Converts the JSON dictionary representation of token patterns into a + `TokenPatternFromCfg` instance. Args: pat_dict: the JSON representation of the pattern From ebdefa4e4dae4f0e16acff67411bff9f6df12df6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Tue, 12 Mar 2024 22:48:14 +0100 Subject: [PATCH 25/41] Address most non-Mypy lint issues --- Makefile | 8 +++--- docdeid/annotation.py | 24 +++++++++--------- docdeid/direction.py | 2 +- docdeid/process/annotator.py | 37 +++++++++++++--------------- docdeid/tokenizer.py | 4 +-- tests/unit/process/test_annotator.py | 12 ++++----- 6 files changed, 42 insertions(+), 45 deletions(-) diff --git a/Makefile b/Makefile index 82129f5..7312987 100644 --- a/Makefile +++ b/Makefile @@ -4,9 +4,11 @@ format: python -m docformatter . lint: - python -m flake8 . 
- python -m pylint docdeid/ - python -m mypy docdeid/ + { python -m flake8 .; fret=$$?; }; \ + { python -m pylint docdeid/; pret=$$?; }; \ + { python -m mypy docdeid/; mret=$$?; }; \ + echo "flake8: $$fret, pylint: $$pret, mypy: $$mret"; \ + [ $$fret,$$pret,$$mret = "0,0,0" ] build-docs: sphinx-apidoc --module-first --force --templatedir=docs/templates -o docs/source/api docdeid diff --git a/docdeid/annotation.py b/docdeid/annotation.py index 5afcbc8..562b917 100644 --- a/docdeid/annotation.py +++ b/docdeid/annotation.py @@ -4,7 +4,6 @@ from frozendict import frozendict -import docdeid from docdeid.tokenizer import Token UNKNOWN_ATTR_DEFAULT: Any = 0 @@ -192,6 +191,8 @@ def has_overlap(self) -> bool: return False + import docdeid # needed to type-annotate the `doc` argument below + def annos_by_token( self, doc: "docdeid.document.Document", @@ -221,16 +222,15 @@ def annos_by_token( tok = token_list[cur_tok_idx] except IndexError: break - else: - # Iterate over tokens in the annotation till we reach the end - # of it or the end of the tokens. - anno_tok_idx = cur_tok_idx - anno_tok = tok - while anno_tok.start_char < anno.end_char: - annos_by_token[anno_tok].add(anno) - if anno_tok_idx == len(token_list) - 1: - break - anno_tok_idx += 1 - anno_tok = token_list[anno_tok_idx] + # Iterate over tokens in the annotation till we reach the end + # of it or the end of the tokens. 
+ anno_tok_idx = cur_tok_idx + anno_tok = tok + while anno_tok.start_char < anno.end_char: + annos_by_token[anno_tok].add(anno) + if anno_tok_idx == len(token_list) - 1: + break + anno_tok_idx += 1 + anno_tok = token_list[anno_tok_idx] self._annos_by_tokenizers_by_token[tokenizers] = annos_by_token return self._annos_by_tokenizers_by_token[tokenizers] diff --git a/docdeid/direction.py b/docdeid/direction.py index 15cdbe8..7083459 100644 --- a/docdeid/direction.py +++ b/docdeid/direction.py @@ -26,7 +26,7 @@ def from_string(val: str) -> Direction: return Direction.LEFT if norm == "right": return Direction.RIGHT - raise ValueError("Invalid direction: '{}'".format(val)) + raise ValueError(f"Invalid direction: '{val}'") def iter(self, seq: Sequence[T]) -> Iterable[T]: """ diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 258b032..3bb4ca9 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -96,7 +96,7 @@ def _match_sequence( seq_pattern: SequencePattern, start_token: Token, annos_by_token: defaultdict[Token, Iterable[Annotation]], - ds: Optional[DsCollection], + dicts: Optional[DsCollection], ) -> Optional[Annotation]: """ Matches a token sequence pattern at `start_token`. @@ -106,7 +106,7 @@ def _match_sequence( seq_pattern: The pattern to match. start_token: The start token to match. annos_by_token: Map from tokens to annotations covering it. - ds: Lookup dictionaries available. + dicts: Lookup dictionaries available. Returns: An Annotation if matching is possible, None otherwise. 
@@ -131,25 +131,27 @@ def _match_sequence( token_pattern=tok_pattern, token=end_token, annos=annos_by_token[end_token], - ds=ds, + ds=dicts, metadata=doc.metadata, ): num_matched += 1 else: break - if num_matched == len(seq_pattern.pattern): - left_token, right_token = dir_.iter((start_token, end_token)) + if num_matched != len(seq_pattern.pattern): + return None - return Annotation( - text=doc.text[left_token.start_char : right_token.end_char], - start_char=left_token.start_char, - end_char=right_token.end_char, - tag=self.tag, - priority=self.priority, - start_token=left_token, - end_token=right_token, - ) + left_token, right_token = dir_.iter((start_token, end_token)) + + return Annotation( + text=doc.text[left_token.start_char : right_token.end_char], + start_char=left_token.start_char, + end_char=right_token.end_char, + tag=self.tag, + priority=self.priority, + start_token=left_token, + end_token=right_token, + ) class SingleTokenLookupAnnotator(Annotator): @@ -493,12 +495,7 @@ def _lookup(cls, ent_type: str, **kwargs) -> bool: meta_val = getattr(kwargs["metadata"][meta_key], meta_attr) except (TypeError, KeyError, AttributeError): return False - else: - return ( - token == meta_val - if isinstance(meta_val, str) - else token in meta_val - ) + return token == meta_val if isinstance(meta_val, str) else token in meta_val else: # pylint: disable=R1705 return token in kwargs.get("ds")[ent_type] diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py index d5c4efa..d1cf97f 100644 --- a/docdeid/tokenizer.py +++ b/docdeid/tokenizer.py @@ -136,9 +136,7 @@ def iter_to( token = self while token is not None: yield token - token = ( - token._next_token if dir_ is Direction.RIGHT else token._previous_token - ) + token = token.next() if dir_ is Direction.RIGHT else token.previous() def __len__(self) -> int: """ diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index efe7a6a..1ee7952 100644 --- a/tests/unit/process/test_annotator.py 
+++ b/tests/unit/process/test_annotator.py @@ -336,7 +336,7 @@ def test_match_sequence(self, pattern_doc, ds): ), start_token=pattern_doc.get_tokens()[3], annos_by_token=defaultdict(list), - ds=ds, + dicts=ds, ) == Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") assert ( tpa._match_sequence( @@ -346,7 +346,7 @@ def test_match_sequence(self, pattern_doc, ds): ), start_token=pattern_doc.get_tokens()[7], annos_by_token=defaultdict(list), - ds=ds, + dicts=ds, ) is None ) @@ -363,7 +363,7 @@ def test_match_sequence_left(self, pattern_doc, ds): ), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), - ds=ds, + dicts=ds, ) == Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") assert ( @@ -374,7 +374,7 @@ def test_match_sequence_left(self, pattern_doc, ds): ), start_token=pattern_doc.get_tokens()[8], annos_by_token=defaultdict(list), - ds=ds, + dicts=ds, ) is None ) @@ -391,7 +391,7 @@ def test_match_sequence_skip(self, pattern_doc, ds): ), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), - ds=ds, + dicts=ds, ) == Annotation(text="Meijer-Heerma", start_char=20, end_char=33, tag="_") assert ( tpa._match_sequence( @@ -401,7 +401,7 @@ def test_match_sequence_skip(self, pattern_doc, ds): ), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), - ds=ds, + dicts=ds, ) is None ) From 4a082b8e4080ef63203623ae50a2f1ecfc345f24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Tue, 12 Mar 2024 23:09:16 +0100 Subject: [PATCH 26/41] Address easy and valid Mypy issues --- docdeid/annotation.py | 5 ++++- docdeid/document.py | 4 ++-- docdeid/process/annotator.py | 30 ++++++++++++++---------------- docdeid/tokenizer.py | 2 +- 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/docdeid/annotation.py b/docdeid/annotation.py index 562b917..95bd38d 100644 --- a/docdeid/annotation.py +++ b/docdeid/annotation.py @@ -129,7 +129,10 @@ class 
AnnotationSet(set[Annotation]): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - self._annos_by_tokenizers_by_token = {} + # Ugh, this feels like Java 9. (For sake of Mypy:) + self._annos_by_tokenizers_by_token: dict[ + frozenset[str], defaultdict[Token, set[Annotation]] + ] = {} def sorted( self, diff --git a/docdeid/document.py b/docdeid/document.py index aa3f1a2..274b66e 100644 --- a/docdeid/document.py +++ b/docdeid/document.py @@ -98,7 +98,7 @@ def text(self) -> str: return self._text @property - def tokenizers(self) -> Optional[Mapping[str, Tokenizer]]: + def tokenizers(self) -> Mapping[str, Tokenizer]: """Available tokenizers indexed by their name.""" if self._tokenizers is None: raise RuntimeError("No tokenizers initialized.") @@ -109,7 +109,7 @@ def token_lists(self) -> Mapping[str, TokenList]: """Lists of tokens of the document, indexed by the name of the corresponding tokenizer.""" for tokker_name in set(self.tokenizers) - set(self._token_lists): - tokker = self._tokenizers[tokker_name] + tokker = self.tokenizers[tokker_name] self._token_lists[tokker_name] = tokker.tokenize(self._text) return self._token_lists diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 3bb4ca9..9df1e57 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -443,9 +443,9 @@ def match(cls, token_pattern: dict | TokenPatternFromCfg, **kwargs) -> bool: value = token_pattern.pattern if func == "equal": - return kwargs.get("token").text == value + return kwargs["token"].text == value if func == "re_match": - return re.match(value, kwargs.get("token").text) is not None + return re.match(value, kwargs["token"].text) is not None if func == "is_initial": warnings.warn( @@ -455,22 +455,18 @@ def match(cls, token_pattern: dict | TokenPatternFromCfg, **kwargs) -> bool: ) return ( - ( - len(kwargs.get("token").text) == 1 - and kwargs.get("token").text[0].isupper() - ) - or kwargs.get("token").text in {"Ch", 
"Chr", "Ph", "Th"} + (len(kwargs["token"].text) == 1 and kwargs["token"].text[0].isupper()) + or kwargs["token"].text in {"Ch", "Chr", "Ph", "Th"} ) == value if func == "is_initials": return ( - len(kwargs.get("token").text) <= 4 - and kwargs.get("token").text.isupper() + len(kwargs["token"].text) <= 4 and kwargs["token"].text.isupper() ) == value if func == "like_name": return ( - len(kwargs.get("token").text) >= 3 - and kwargs.get("token").text.istitle() - and not any(ch.isdigit() for ch in kwargs.get("token").text) + len(kwargs["token"].text) >= 3 + and kwargs["token"].text.istitle() + and not any(ch.isdigit() for ch in kwargs["token"].text) ) == value if func == "lookup": return cls._lookup(value, **kwargs) @@ -488,7 +484,7 @@ def match(cls, token_pattern: dict | TokenPatternFromCfg, **kwargs) -> bool: @classmethod def _lookup(cls, ent_type: str, **kwargs) -> bool: - token = kwargs.get("token").text + token = kwargs["token"].text if "." in ent_type: meta_key, meta_attr = ent_type.split(".", 1) try: @@ -497,7 +493,7 @@ def _lookup(cls, ent_type: str, **kwargs) -> bool: return False return token == meta_val if isinstance(meta_val, str) else token in meta_val else: # pylint: disable=R1705 - return token in kwargs.get("ds")[ent_type] + return token in kwargs["ds"][ent_type] def as_token_pattern(pat_dict: dict) -> TokenPatternFromCfg: @@ -591,13 +587,15 @@ def annotate(self, doc: Document) -> list[Annotation]: annotations = [] - tokens = doc.get_tokens() + token_list = doc.get_tokens() if self._start_words is not None: - tokens = tokens.token_lookup( + tokens: Iterable[Token] = token_list.token_lookup( lookup_values=self._start_words, matching_pipeline=self._matching_pipeline, ) + else: + tokens = token_list # ...to make Mypy happy. 
annos_by_token = doc.annotations.annos_by_token(doc) diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py index d1cf97f..9f8d66b 100644 --- a/docdeid/tokenizer.py +++ b/docdeid/tokenizer.py @@ -133,7 +133,7 @@ def iter_to( Args: dir_: direction to go """ - token = self + token: Optional[Token] = self while token is not None: yield token token = token.next() if dir_ is Direction.RIGHT else token.previous() From 3319df118c4768907ffaea9cd1778b0c475d4d59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 12 Jul 2024 18:33:42 +0200 Subject: [PATCH 27/41] Add a test for keep_blanks=False in WBTokenizer --- docdeid/tokenizer.py | 3 +-- tests/unit/test_tokenizer.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py index 9f8d66b..39b2bb4 100644 --- a/docdeid/tokenizer.py +++ b/docdeid/tokenizer.py @@ -393,9 +393,8 @@ def _split_text(self, text: str) -> list[Token]: if self._trim: word = text[start_char:end_char] - orig_length = len(word) word = word.rstrip() - end_char -= orig_length - len(word) + end_char = start_char + len(word) word = word.lstrip() start_char = end_char - len(word) if not word: diff --git a/tests/unit/test_tokenizer.py b/tests/unit/test_tokenizer.py index ce118ce..463b1cd 100644 --- a/tests/unit/test_tokenizer.py +++ b/tests/unit/test_tokenizer.py @@ -215,3 +215,17 @@ def test_word_boundary_tokenizer(self): tokens = tokenizer._split_text(text) assert tokens == expected_tokens + + def test_trimming(self): + text = "Jane Keith-Lucas" + tokenizer = WordBoundaryTokenizer(keep_blanks=False) + expected_tokens = [ + Token(text="Jane", start_char=0, end_char=4), + Token(text="Keith", start_char=5, end_char=10), + Token(text="-", start_char=10, end_char=11), + Token(text="Lucas", start_char=11, end_char=16), + ] + + tokens = tokenizer._split_text(text) + + assert tokens == expected_tokens From 1afb16ff23dc3857ff81459d076f990135fa046f Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 12 Jul 2024 18:40:12 +0200 Subject: [PATCH 28/41] Document how to run tests better + cosmetics --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 32cd942..d1dc9dd 100644 --- a/.gitignore +++ b/.gitignore @@ -134,3 +134,6 @@ dmypy.json # IDEs *.iml + +# misc +*~ From 53db956db05dc9f8006ad0c7fb0900356fe180bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Tue, 7 Jan 2025 17:28:46 +0100 Subject: [PATCH 29/41] Drop the `Document.token_lists` property --- docdeid/annotation.py | 5 +++-- docdeid/document.py | 9 --------- tests/unit/test_document.py | 4 ---- 3 files changed, 3 insertions(+), 15 deletions(-) diff --git a/docdeid/annotation.py b/docdeid/annotation.py index 95bd38d..a658f3b 100644 --- a/docdeid/annotation.py +++ b/docdeid/annotation.py @@ -209,10 +209,11 @@ def annos_by_token( # We key the token->annotations cache only by the set of tokenizers where it # actually (obviously) depends also on the document. However, it's assumed # that an AnnotationSet is always bound only to one document. 
- tokenizers = frozenset(doc.token_lists) + tokenizers = frozenset(doc.tokenizers) if tokenizers not in self._annos_by_tokenizers_by_token: annos_by_token = defaultdict(set) - for token_list in doc.token_lists.values(): + for tokenizer in tokenizers: + token_list = doc.get_tokens(tokenizer) if not token_list: continue cur_tok_idx = 0 diff --git a/docdeid/document.py b/docdeid/document.py index 274b66e..843bce3 100644 --- a/docdeid/document.py +++ b/docdeid/document.py @@ -104,15 +104,6 @@ def tokenizers(self) -> Mapping[str, Tokenizer]: raise RuntimeError("No tokenizers initialized.") return self._tokenizers - @property - def token_lists(self) -> Mapping[str, TokenList]: - """Lists of tokens of the document, indexed by the name of the corresponding - tokenizer.""" - for tokker_name in set(self.tokenizers) - set(self._token_lists): - tokker = self.tokenizers[tokker_name] - self._token_lists[tokker_name] = tokker.tokenize(self._text) - return self._token_lists - def get_tokens(self, tokenizer_name: str = "default") -> TokenList: """ Get the tokens corresponding to the input text, for a specific tokenizer. diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py index c37ada0..e63bd94 100644 --- a/tests/unit/test_document.py +++ b/tests/unit/test_document.py @@ -87,10 +87,6 @@ def test_get_tokens_multiple_tokenizers(self, short_tokens): assert set(doc.tokenizers.keys()) == {"tokenizer_1", "tokenizer_2"} assert doc.get_tokens(tokenizer_name="tokenizer_1") == short_tokens assert doc.get_tokens(tokenizer_name="tokenizer_2") == TokenList([]) - assert doc.token_lists == { - "tokenizer_1": short_tokens, - "tokenizer_2": TokenList([]), - } def test_metadata(self): text = "Hello I'm Bob" From 230c507dde2a959adfea178841027ff642c84bc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 8 Jan 2025 10:15:33 +0100 Subject: [PATCH 30/41] Avoid "|" for union types This syntax is not supported in Python 3.9. 
--- docdeid/annotation.py | 8 +------- docdeid/process/annotator.py | 2 +- docdeid/process/doc_processor.py | 2 +- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/docdeid/annotation.py b/docdeid/annotation.py index a658f3b..4e6b29e 100644 --- a/docdeid/annotation.py +++ b/docdeid/annotation.py @@ -158,13 +158,7 @@ def sorted( A RunTimeError, if the callbacks are not provided as a frozen dict. """ - # Not liked by Mypy, even though - # https://docs.python.org/3/library/stdtypes.html#types-union - # says the "X | Y" notation is equivalent to `typing.Union[X, Y]` and the - # docstring of `typing.Optional` says it's equivalent to - # `typing.Union[None, _]`: - # if not isinstance(callbacks, Optional[frozendict]): - if not isinstance(callbacks, frozendict | None): + if not isinstance(callbacks, (type(None), frozendict)): raise RuntimeError( "Please provide the callbacks as a frozen dict, e.g. " "frozendict.frozendict(end_char=lambda x: -x)" diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 9df1e57..227267b 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -422,7 +422,7 @@ class _PatternPositionMatcher: # pylint: disable=R0903 """Checks if a token matches against a single pattern.""" @classmethod - def match(cls, token_pattern: dict | TokenPatternFromCfg, **kwargs) -> bool: + def match(cls, token_pattern: Union[dict, TokenPatternFromCfg], **kwargs) -> bool: # pylint: disable=R0911 """ Matches a pattern position (a dict with one key). 
Other information should be diff --git a/docdeid/process/doc_processor.py b/docdeid/process/doc_processor.py index 3bc8556..1e12115 100644 --- a/docdeid/process/doc_processor.py +++ b/docdeid/process/doc_processor.py @@ -32,7 +32,7 @@ class DocProcessorGroup: def __init__(self) -> None: self._processors: OrderedDict[ - str, Union[DocProcessor | DocProcessorGroup] + str, Union[DocProcessor, DocProcessorGroup] ] = OrderedDict() def get_names(self, recursive: bool = True) -> list[str]: From 25cbcfd2491bdfe253dea55bee86a2fe233c5eff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 8 Jan 2025 14:18:58 +0100 Subject: [PATCH 31/41] Move `annos_by_token` to `Document` --- docdeid/annotation.py | 50 ------------------------- docdeid/document.py | 70 ++++++++++++++++++++++++++++++++++- docdeid/process/annotator.py | 2 +- tests/unit/test_annotation.py | 4 +- 4 files changed, 71 insertions(+), 55 deletions(-) diff --git a/docdeid/annotation.py b/docdeid/annotation.py index 4e6b29e..a52fa0c 100644 --- a/docdeid/annotation.py +++ b/docdeid/annotation.py @@ -1,4 +1,3 @@ -from collections import defaultdict from dataclasses import dataclass, field from typing import Any, Callable, Optional @@ -129,10 +128,6 @@ class AnnotationSet(set[Annotation]): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - # Ugh, this feels like Java 9. (For sake of Mypy:) - self._annos_by_tokenizers_by_token: dict[ - frozenset[str], defaultdict[Token, set[Annotation]] - ] = {} def sorted( self, @@ -187,48 +182,3 @@ def has_overlap(self) -> bool: return True return False - - import docdeid # needed to type-annotate the `doc` argument below - - def annos_by_token( - self, - doc: "docdeid.document.Document", - ) -> defaultdict[Token, set[Annotation]]: - """ - Returns a mapping from document tokens to annotations. 
- - Args: - doc: document whose tokens are to be linked - """ - # We key the token->annotations cache only by the set of tokenizers where it - # actually (obviously) depends also on the document. However, it's assumed - # that an AnnotationSet is always bound only to one document. - tokenizers = frozenset(doc.tokenizers) - if tokenizers not in self._annos_by_tokenizers_by_token: - annos_by_token = defaultdict(set) - for tokenizer in tokenizers: - token_list = doc.get_tokens(tokenizer) - if not token_list: - continue - cur_tok_idx = 0 - tok = token_list[cur_tok_idx] - for anno in self.sorted(by=("start_char",)): - try: - # Iterate over tokens till we reach the annotation. - while tok.end_char < anno.start_char: - cur_tok_idx += 1 - tok = token_list[cur_tok_idx] - except IndexError: - break - # Iterate over tokens in the annotation till we reach the end - # of it or the end of the tokens. - anno_tok_idx = cur_tok_idx - anno_tok = tok - while anno_tok.start_char < anno.end_char: - annos_by_token[anno_tok].add(anno) - if anno_tok_idx == len(token_list) - 1: - break - anno_tok_idx += 1 - anno_tok = token_list[anno_tok_idx] - self._annos_by_tokenizers_by_token[tokenizers] = annos_by_token - return self._annos_by_tokenizers_by_token[tokenizers] diff --git a/docdeid/document.py b/docdeid/document.py index 843bce3..b7a8444 100644 --- a/docdeid/document.py +++ b/docdeid/document.py @@ -1,10 +1,12 @@ +from collections import defaultdict from collections.abc import Mapping +from dataclasses import dataclass from typing import Any, Optional from frozendict import frozendict -from docdeid.annotation import AnnotationSet -from docdeid.tokenizer import Tokenizer, TokenList +from docdeid.annotation import Annotation, AnnotationSet +from docdeid.tokenizer import Token, Tokenizer, TokenList class MetaData: @@ -69,6 +71,12 @@ class Document: Will be stored in a :class:`.MetaData` object. 
""" + @dataclass + class AnnosByToken: + """A cache entry associating an `AnnotationSet` with a token->annos map.""" + anno_set: AnnotationSet + value: defaultdict[Token, set[Annotation]] + def __init__( self, text: str, @@ -78,6 +86,8 @@ def __init__( self._text = text self._tokenizers = None if tokenizers is None else frozendict(tokenizers) + self._default_annos_by_token = Document.AnnosByToken(None, None) + self._tmp_annos_by_token = Document.AnnosByToken(None, None) self.metadata = MetaData(metadata) """The :class:`.MetaData` of this :class:`.Document`, that can be interacted @@ -156,6 +166,62 @@ def annotations(self, annotations: AnnotationSet) -> None: """ self._annotations = annotations + def annos_by_token( + self, + annos: AnnotationSet = None, + ) -> defaultdict[Token, set[Annotation]]: + """ + Returns a mapping from document tokens to annotations. + + Args: + annos: annotations for this document to index by token (default: current + annotations of this `Document`) + """ + + # Fill the default arg value. + if annos is None: + eff_annos = self._annotations + cache = self._default_annos_by_token + else: + eff_annos = annos + cache = self._tmp_annos_by_token + + # Try to use a cached response. + if eff_annos == cache.anno_set: + return cache.value + + # Compute the return value. + annos_by_token = defaultdict(set) + for tokenizer in self.tokenizers: + token_list = self.get_tokens(tokenizer) + if not token_list: + continue + cur_tok_idx = 0 + tok = token_list[cur_tok_idx] + for anno in eff_annos.sorted(by=("start_char",)): + try: + # Iterate over tokens till we reach the annotation. + while tok.end_char < anno.start_char: + cur_tok_idx += 1 + tok = token_list[cur_tok_idx] + except IndexError: + break + # Iterate over tokens in the annotation till we reach the end + # of it or the end of the tokens. 
+ anno_tok_idx = cur_tok_idx + anno_tok = tok + while anno_tok.start_char < anno.end_char: + annos_by_token[anno_tok].add(anno) + if anno_tok_idx == len(token_list) - 1: + break + anno_tok_idx += 1 + anno_tok = token_list[anno_tok_idx] + + # Cache the value before returning. + cache.anno_set = eff_annos + cache.value = annos_by_token + return annos_by_token + @property def deidentified_text(self) -> Optional[str]: """ diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 227267b..f74ab92 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -597,7 +597,7 @@ def annotate(self, doc: Document) -> list[Annotation]: else: tokens = token_list # ...to make Mypy happy. - annos_by_token = doc.annotations.annos_by_token(doc) + annos_by_token = doc.annos_by_token() for token in tokens: diff --git a/tests/unit/test_annotation.py b/tests/unit/test_annotation.py index 2ed882f..072979f 100644 --- a/tests/unit/test_annotation.py +++ b/tests/unit/test_annotation.py @@ -179,7 +179,7 @@ def test_annos_by_token(self, annotations): # import pydevd_pycharm # pydevd_pycharm.settrace() - got = aset.annos_by_token(doc) + got = doc.annos_by_token(aset) want = { Token("Hello", 16, 21): {a1}, @@ -218,7 +218,7 @@ def _split_text(self, text: str) -> list[Token]: ] ) - got = aset.annos_by_token(doc) + got = doc.annos_by_token(aset) want = { Token("Hello", 16, 21): {a1}, From 36eb1e3beaa3ed05a3d3e9bf895bcf700ed8a641 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 8 Jan 2025 14:27:49 +0100 Subject: [PATCH 32/41] Simplify `Direction.from_string` --- docdeid/direction.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/docdeid/direction.py b/docdeid/direction.py index 7083459..7c53507 100644 --- a/docdeid/direction.py +++ b/docdeid/direction.py @@ -19,14 +19,11 @@ def opposite(self) -> Direction: @staticmethod def from_string(val: str) -> Direction: - """Parses a Direction from a string, which 
must be either 'left' or 'right' - after lowercasing.""" - norm = val.lower() - if norm == "left": - return Direction.LEFT - if norm == "right": - return Direction.RIGHT - raise ValueError(f"Invalid direction: '{val}'") + """Parses a Direction from a string (case insensitive).""" + try: + return Direction[val.upper()] + except KeyError as key_error: + raise ValueError(f"Invalid direction: '{val}'") from key_error def iter(self, seq: Sequence[T]) -> Iterable[T]: """ From 573deffa3ac5705e0d4bebab286b01f3c0566fe5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 8 Jan 2025 15:48:43 +0100 Subject: [PATCH 33/41] Rename `SequenceAnnotator.dicts` to `ds` --- docdeid/process/annotator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index f74ab92..926344f 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -538,7 +538,7 @@ def __init__( **kwargs, ) -> None: self.pattern = pattern - self.dicts = ds + self.ds = ds self.skip = set(skip or []) self._start_words = None @@ -546,13 +546,13 @@ def __init__( if len(self.pattern) > 0 and "lookup" in self.pattern[0]: - if self.dicts is None: + if self.ds is None: raise RuntimeError( "Created pattern with lookup in TokenPatternAnnotator, but " "no lookup structures provided." ) - lookup_list = self.dicts[self.pattern[0]["lookup"]] + lookup_list = self.ds[self.pattern[0]["lookup"]] # FIXME This doesn't work correctly for multiple ([{"lookup":"prefix"}, # {"lookup":"interfix"}]) and nested patterns ("or", "and"). 
@@ -602,7 +602,7 @@ def annotate(self, doc: Document) -> list[Annotation]: for token in tokens: annotation = self._match_sequence( - doc, self._seq_pattern, token, annos_by_token, self.dicts + doc, self._seq_pattern, token, annos_by_token, self.ds ) if annotation is not None: From a2704c5d8a0b70175090146f9dc7f1e7509baa45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 8 Jan 2025 16:33:17 +0100 Subject: [PATCH 34/41] Replace `list(map(f, xs))` with list comprehension --- docdeid/process/annotator.py | 7 +++---- tests/unit/process/test_annotator.py | 12 ++++++------ 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 926344f..bc699c5 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -511,7 +511,7 @@ def as_token_pattern(pat_dict: dict) -> TokenPatternFromCfg: ) func, value = next(iter(pat_dict.items())) if func in ("and", "or"): - return NestedTokenPattern(func, list(map(as_token_pattern, value))) + return NestedTokenPattern(func, [as_token_pattern(it) for it in value]) return SimpleTokenPattern(func, value) @@ -539,7 +539,6 @@ def __init__( ) -> None: self.pattern = pattern self.ds = ds - self.skip = set(skip or []) self._start_words = None self._matching_pipeline = None @@ -558,7 +557,7 @@ def __init__( # {"lookup":"interfix"}]) and nested patterns ("or", "and"). if not isinstance(lookup_list, LookupSet): raise ValueError( - f"Expected a LookupSet, but got a " f"{type(lookup_list)}." + f"Expected a LookupSet, but got a {type(lookup_list)}." 
) # FIXME This doesn't work correctly for multiple ([{"lookup":"prefix"}, @@ -569,7 +568,7 @@ def __init__( self._matching_pipeline = lookup_list.matching_pipeline self._seq_pattern = SequencePattern( - Direction.RIGHT, set(skip or ()), list(map(as_token_pattern, pattern)) + Direction.RIGHT, set(skip or ()), [as_token_pattern(it) for it in pattern] ) super().__init__(*args, **kwargs) diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index 1ee7952..fb4ec23 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -332,7 +332,7 @@ def test_match_sequence(self, pattern_doc, ds): assert tpa._match_sequence( pattern_doc, SequencePattern( - Direction.RIGHT, set(), list(map(as_token_pattern, pattern)) + Direction.RIGHT, set(), [as_token_pattern(it) for it in pattern] ), start_token=pattern_doc.get_tokens()[3], annos_by_token=defaultdict(list), @@ -342,7 +342,7 @@ def test_match_sequence(self, pattern_doc, ds): tpa._match_sequence( pattern_doc, SequencePattern( - Direction.RIGHT, set(), list(map(as_token_pattern, pattern)) + Direction.RIGHT, set(), [as_token_pattern(it) for it in pattern] ), start_token=pattern_doc.get_tokens()[7], annos_by_token=defaultdict(list), @@ -359,7 +359,7 @@ def test_match_sequence_left(self, pattern_doc, ds): assert tpa._match_sequence( pattern_doc, SequencePattern( - Direction.LEFT, set(), list(map(as_token_pattern, pattern)) + Direction.LEFT, set(), [as_token_pattern(it) for it in pattern] ), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), @@ -370,7 +370,7 @@ def test_match_sequence_left(self, pattern_doc, ds): tpa._match_sequence( pattern_doc, SequencePattern( - Direction.LEFT, set(), list(map(as_token_pattern, pattern)) + Direction.LEFT, set(), [as_token_pattern(it) for it in pattern] ), start_token=pattern_doc.get_tokens()[8], annos_by_token=defaultdict(list), @@ -387,7 +387,7 @@ def test_match_sequence_skip(self, pattern_doc, ds): assert 
tpa._match_sequence( pattern_doc, SequencePattern( - Direction.RIGHT, {"-"}, list(map(as_token_pattern, pattern)) + Direction.RIGHT, {"-"}, [as_token_pattern(it) for it in pattern] ), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), @@ -397,7 +397,7 @@ def test_match_sequence_skip(self, pattern_doc, ds): tpa._match_sequence( pattern_doc, SequencePattern( - Direction.RIGHT, set(), list(map(as_token_pattern, pattern)) + Direction.RIGHT, set(), [as_token_pattern(it) for it in pattern] ), start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), From 3ca37aa6b1a8aef34d48c1220fd5c2b195377abd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 8 Jan 2025 18:02:48 +0100 Subject: [PATCH 35/41] Re-add `MultiTokenLookupAnnotator` accepting a `LookupSet` --- docdeid/process/__init__.py | 1 + docdeid/process/annotator.py | 71 +++++++++++++++++++++++++++++++----- 2 files changed, 63 insertions(+), 9 deletions(-) diff --git a/docdeid/process/__init__.py b/docdeid/process/__init__.py index d0f040f..6a40be3 100644 --- a/docdeid/process/__init__.py +++ b/docdeid/process/__init__.py @@ -6,6 +6,7 @@ from .annotator import ( Annotator, MultiTokenLookupAnnotator, + MultiTokenTrieAnnotator, RegexpAnnotator, SequenceAnnotator, SingleTokenLookupAnnotator, diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index bc699c5..88c1b67 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -16,7 +16,7 @@ from docdeid.pattern import TokenPattern from docdeid.process.doc_processor import DocProcessor from docdeid.str.processor import StringModifier -from docdeid.tokenizer import Token +from docdeid.tokenizer import Token, Tokenizer @dataclass @@ -207,7 +207,7 @@ def annotate(self, doc: Document) -> list[Annotation]: return self._tokens_to_annotations(annotate_tokens) -class MultiTokenLookupAnnotator(Annotator): +class MultiTokenTrieAnnotator(Annotator): """ Annotates entity mentions by 
looking them up in a `LookupTrie`. @@ -219,11 +219,11 @@ class MultiTokenLookupAnnotator(Annotator): """ def __init__( - self, - *args, - trie: LookupTrie, - overlapping: bool = False, - **kwargs, + self, + *args, + trie: LookupTrie, + overlapping: bool = False, + **kwargs, ) -> None: self._trie = trie @@ -235,8 +235,7 @@ def __init__( @property def start_words(self) -> set[str]: """First words of phrases detected by this annotator.""" - # If the trie has been modified (added to) since we computed - # _start_words, + # If the trie has been modified (added to) since we computed _start_words, if len(self._start_words) != len(self._trie.children): # Recompute _start_words. self._start_words = set(self._trie.children) @@ -292,6 +291,60 @@ def annotate(self, doc: Document) -> list[Annotation]: return annotations +class MultiTokenLookupAnnotator(MultiTokenTrieAnnotator): + """ + Annotates entity mentions by looking them up in a `LookupTrie` or + a collection of phrases. This is a thin wrapper for + class:`MultiTokenTrieAnnotator` that additionally handles non-trie lookup + structures by building tries out of them and delegating to the parent class. + + Args: + lookup_values: An iterable of phrases that should be matched. These are + tokenized using ``tokenizer``. + matching_pipeline: An optional pipeline that can be used for matching + (e.g. lowercasing). This has no specific impact on matching performance, + other than overhead for applying the pipeline to each string. + tokenizer: A tokenizer that is used to create the sequence patterns from + ``lookup_values``. + trie: A `LookupTrie` containing all entity mentions that should be + annotated. Specifying this is mutually exclusive with specifying + ``lookup_values`` and ``tokenizer``. + overlapping: Whether overlapping phrases are to be returned. + *args, **kwargs: Passed through to the `Annotator` constructor (which accepts + the arguments `tag` and `priority`). 
+
+    Raises:
+        RuntimeError, when an incorrect combination of `lookup_values`,
+        `matching_pipeline` and `trie` is supplied.
+    """
+
+    def __init__(
+        self,
+        *args,
+        lookup_values: Optional[Iterable[str]] = None,
+        matching_pipeline: Optional[list[StringModifier]] = None,
+        tokenizer: Optional[Tokenizer] = None,
+        trie: Optional[LookupTrie] = None,
+        overlapping: bool = False,
+        **kwargs,
+    ) -> None:
+
+        if (trie is not None) and (lookup_values is None) and (tokenizer is None):
+            eff_trie = trie
+
+        elif (trie is None) and (lookup_values is not None) and (tokenizer is not None):
+            eff_trie = LookupTrie(matching_pipeline=matching_pipeline)
+            for phrase in filter(None, map(tokenizer.tokenize, lookup_values)):
+                eff_trie.add_item([token.text for token in phrase])
+
+        else:
+            raise RuntimeError(
+                "Please provide either lookup_values and a tokenizer, or a trie."
+            )
+
+        super().__init__(*args, trie=eff_trie, overlapping=overlapping, **kwargs)
+
+
 class RegexpAnnotator(Annotator):
     """
     Create annotations based on regular expression patterns. 
Note that these patterns do From 68f4afba4f72f3d00de7441c5ce60e8d2e226355 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Thu, 9 Jan 2025 13:38:54 +0100 Subject: [PATCH 36/41] Add a test for matching multi-word phrases --- tests/unit/process/test_annotator.py | 42 +++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index fb4ec23..466064a 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -306,8 +306,10 @@ class TestSequenceAnnotator: def ds(self): ds = DsCollection() - first_names = ["Andries", "pieter", "Aziz", "Bernard"] + first_names = ["Andries", "pieter", "Aziz", "Bernard", "Won Jung"] surnames = ["Meijer", "Smit", "Bakker", "Heerma"] + interfixes = ["v/d"] + interfixed_surnames = ["Heck"] ds["first_names"] = LookupSet() ds["first_names"].add_items_from_iterable(items=first_names) @@ -315,6 +317,12 @@ def ds(self): ds["surnames"] = LookupSet() ds["surnames"].add_items_from_iterable(items=surnames) + ds["interfixes"] = LookupSet() + ds["interfixes"].add_items_from_iterable(items=interfixes) + + ds["interfixed_surnames"] = LookupSet() + ds["interfixed_surnames"].add_items_from_iterable(items=interfixed_surnames) + return ds @pytest.fixture @@ -324,6 +332,20 @@ def pattern_doc(self): tokenizers={"default": WordBoundaryTokenizer(False)}, ) + @pytest.fixture + def interfixed_doc(self): + return Document( + text="De man heet v/d Heck.", + tokenizers={"default": WordBoundaryTokenizer(False)}, + ) + + @pytest.fixture + def korean_doc(self): + return Document( + text="De mevrouw heet Won Jung Meijer-Heerma.", + tokenizers={"default": WordBoundaryTokenizer(False)}, + ) + def test_match_sequence(self, pattern_doc, ds): pattern = [{"lookup": "first_names"}, {"like_name": True}] @@ -414,3 +436,21 @@ def test_annotate(self, pattern_doc, ds): assert tpa.annotate(pattern_doc) == [ 
Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") ] + + def test_annotate_multiword(self, interfixed_doc, korean_doc, ds): + # XXX This tests functionality (matching multiple tokens with one member of + # the "pattern" list) which is not supported as per the SequenceAnnotator + # docstring, nonetheless is exercised by the packaged base_config.json (most + # notably in the case of the interfix_with_name annotator). + + inter_pattern = [{"lookup": "interfixes"}, {"lookup": "interfixed_surnames"}] + ipa = SequenceAnnotator(pattern=inter_pattern, ds=ds, tag="_") + assert ipa.annotate(interfixed_doc) == [ + Annotation(text="v/d Heck", start_char=12, end_char=20, tag="_") + ] + + pattern = [{"lookup": "first_names"}, {"like_name": True}] + kpa = SequenceAnnotator(pattern=pattern, ds=ds, tag="_") + assert kpa.annotate(korean_doc) == [ + Annotation(text="Won Jung Meijer", start_char=16, end_char=31, tag="_") + ] From fb3cbd84d34b5cb292d4e3c1d3fa788e6ed4a8fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Thu, 9 Jan 2025 14:19:47 +0100 Subject: [PATCH 37/41] Try to support multi-word matching in SequenceAnnotator --- docdeid/process/annotator.py | 34 ++++++++++++++++------------ docdeid/tokenizer.py | 9 ++++++++ tests/unit/process/test_annotator.py | 6 +++-- 3 files changed, 32 insertions(+), 17 deletions(-) diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 88c1b67..193a57a 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -16,7 +16,7 @@ from docdeid.pattern import TokenPattern from docdeid.process.doc_processor import DocProcessor from docdeid.str.processor import StringModifier -from docdeid.tokenizer import Token, Tokenizer +from docdeid.tokenizer import Token, Tokenizer, DummyTokenizer @dataclass @@ -571,15 +571,18 @@ def as_token_pattern(pat_dict: dict) -> TokenPatternFromCfg: class SequenceAnnotator(Annotator): """ Annotates based on token patterns, which should be 
provided as a list of dicts. Each - position in the list denotes a token position, e.g.: [{'is_initial': True}, - {'like_name': True}] matches sequences of two tokens, where the first one is an - initial, and the second one is like a name. + position in the list corresponds to a token or a token sequence. For example: + ``[{'is_initial': True}, {'like_name': True}]`` matches sequences of two tokens + where the first one is an initial and the second one looks like a name. Arguments: pattern: The pattern ds: Lookup dictionaries. Those referenced by the pattern should be LookupSets. (Don't ask why.) skip: Any string values that should be skipped in matching (e.g. periods) + tokenizer: A tokenizer called to determine the first word to look for each + phrase in ``lookup_values``. If none is provided, phrases in + ``lookup_values`` are all assumed to be a single word. """ def __init__( @@ -588,15 +591,17 @@ def __init__( *args, ds: Optional[DsCollection] = None, skip: Optional[list[str]] = None, + tokenizer: Optional[Tokenizer] = None, **kwargs, ) -> None: self.pattern = pattern self.ds = ds self._start_words = None - self._matching_pipeline = None + self._start_matching_pipeline = None - if len(self.pattern) > 0 and "lookup" in self.pattern[0]: + # If the first token pattern is lookup, determine the possible starting words. + if self.pattern and "lookup" in self.pattern[0]: if self.ds is None: raise RuntimeError( @@ -606,19 +611,18 @@ def __init__( lookup_list = self.ds[self.pattern[0]["lookup"]] - # FIXME This doesn't work correctly for multiple ([{"lookup":"prefix"}, - # {"lookup":"interfix"}]) and nested patterns ("or", "and"). if not isinstance(lookup_list, LookupSet): raise ValueError( f"Expected a LookupSet, but got a {type(lookup_list)}." ) - # FIXME This doesn't work correctly for multiple ([{"lookup":"prefix"}, - # {"lookup":"interfix"}]) and nested patterns ("or", "and"). 
- self._start_words = lookup_list.items() - # FIXME This doesn't work correctly for multiple ([{"lookup":"prefix"}, - # {"lookup":"interfix"}]) and nested patterns ("or", "and"). - self._matching_pipeline = lookup_list.matching_pipeline + eff_tokenizer = tokenizer or DummyTokenizer() + self._start_words = { + phrase[0].text + for phrase in filter(None, map(eff_tokenizer.tokenize, + lookup_list.items())) + } + self._start_matching_pipeline = lookup_list.matching_pipeline self._seq_pattern = SequencePattern( Direction.RIGHT, set(skip or ()), [as_token_pattern(it) for it in pattern] @@ -644,7 +648,7 @@ def annotate(self, doc: Document) -> list[Annotation]: if self._start_words is not None: tokens: Iterable[Token] = token_list.token_lookup( lookup_values=self._start_words, - matching_pipeline=self._matching_pipeline, + matching_pipeline=self._start_matching_pipeline, ) else: tokens = token_list # ...to make Mypy happy. diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py index 39b2bb4..a889ac7 100644 --- a/docdeid/tokenizer.py +++ b/docdeid/tokenizer.py @@ -355,6 +355,15 @@ def tokenize(self, text: str) -> TokenList: return TokenList(tokens, link_tokens=self.link_tokens) +class DummyTokenizer(Tokenizer): # pylint: disable=R0903 + """ + Treats any given string as a single token. + """ + + def _split_text(self, text: str) -> list[Token]: + return [Token(text=text, start_char=0, end_char=len(text))] + + class SpaceSplitTokenizer(Tokenizer): # pylint: disable=R0903 """ Tokenizes based on splitting on whitespaces. diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index 466064a..3fcb773 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -444,13 +444,15 @@ def test_annotate_multiword(self, interfixed_doc, korean_doc, ds): # notably in the case of the interfix_with_name annotator). 
inter_pattern = [{"lookup": "interfixes"}, {"lookup": "interfixed_surnames"}] - ipa = SequenceAnnotator(pattern=inter_pattern, ds=ds, tag="_") + ipa = SequenceAnnotator(pattern=inter_pattern, ds=ds, + tokenizer=WordBoundaryTokenizer(False), tag="_") assert ipa.annotate(interfixed_doc) == [ Annotation(text="v/d Heck", start_char=12, end_char=20, tag="_") ] pattern = [{"lookup": "first_names"}, {"like_name": True}] - kpa = SequenceAnnotator(pattern=pattern, ds=ds, tag="_") + kpa = SequenceAnnotator(pattern=pattern, ds=ds, + tokenizer=WordBoundaryTokenizer(False), tag="_") assert kpa.annotate(korean_doc) == [ Annotation(text="Won Jung Meijer", start_char=16, end_char=31, tag="_") ] From 0c04a784178cd20cac7dc3ed22a2ba7dad35cb80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Thu, 9 Jan 2025 14:32:27 +0100 Subject: [PATCH 38/41] Give up multi-word matching in SequenceAnnotator --- docdeid/process/annotator.py | 19 +++++++------------ tests/unit/process/test_annotator.py | 17 ++++++++++------- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 193a57a..6e014e8 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -16,7 +16,7 @@ from docdeid.pattern import TokenPattern from docdeid.process.doc_processor import DocProcessor from docdeid.str.processor import StringModifier -from docdeid.tokenizer import Token, Tokenizer, DummyTokenizer +from docdeid.tokenizer import Token, Tokenizer @dataclass @@ -571,7 +571,7 @@ def as_token_pattern(pat_dict: dict) -> TokenPatternFromCfg: class SequenceAnnotator(Annotator): """ Annotates based on token patterns, which should be provided as a list of dicts. Each - position in the list corresponds to a token or a token sequence. For example: + position in the list corresponds to a token. 
For example: ``[{'is_initial': True}, {'like_name': True}]`` matches sequences of two tokens where the first one is an initial and the second one looks like a name. @@ -580,9 +580,6 @@ class SequenceAnnotator(Annotator): ds: Lookup dictionaries. Those referenced by the pattern should be LookupSets. (Don't ask why.) skip: Any string values that should be skipped in matching (e.g. periods) - tokenizer: A tokenizer called to determine the first word to look for each - phrase in ``lookup_values``. If none is provided, phrases in - ``lookup_values`` are all assumed to be a single word. """ def __init__( @@ -591,7 +588,6 @@ def __init__( *args, ds: Optional[DsCollection] = None, skip: Optional[list[str]] = None, - tokenizer: Optional[Tokenizer] = None, **kwargs, ) -> None: self.pattern = pattern @@ -616,12 +612,11 @@ def __init__( f"Expected a LookupSet, but got a {type(lookup_list)}." ) - eff_tokenizer = tokenizer or DummyTokenizer() - self._start_words = { - phrase[0].text - for phrase in filter(None, map(eff_tokenizer.tokenize, - lookup_list.items())) - } + # XXX We assume the items of the lookup list are all single words. This + # is not always the case but just splitting the phrases wouldn't help + # because the "lookup" token matcher assumes matching against a single + # token. + self._start_words = lookup_list.items() self._start_matching_pipeline = lookup_list.matching_pipeline self._seq_pattern = SequencePattern( diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index 3fcb773..3b6e0d0 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -437,22 +437,25 @@ def test_annotate(self, pattern_doc, ds): Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") ] + @pytest.mark.xfail(reason="The lookup token pattern only ever matches a single " + "token and the SequenceAnnotator docstring accordingly " + "rules the case of multiple tokens per pattern out of " + "scope. 
Yet, the packaged base_config.json seems to " + "rely on such multi-word matches, most notably in the " + "case of the interfix_with_name annotator.") def test_annotate_multiword(self, interfixed_doc, korean_doc, ds): - # XXX This tests functionality (matching multiple tokens with one member of - # the "pattern" list) which is not supported as per the SequenceAnnotator - # docstring, nonetheless is exercised by the packaged base_config.json (most - # notably in the case of the interfix_with_name annotator). - inter_pattern = [{"lookup": "interfixes"}, {"lookup": "interfixed_surnames"}] ipa = SequenceAnnotator(pattern=inter_pattern, ds=ds, - tokenizer=WordBoundaryTokenizer(False), tag="_") + # tokenizer=WordBoundaryTokenizer(False), + tag="_") assert ipa.annotate(interfixed_doc) == [ Annotation(text="v/d Heck", start_char=12, end_char=20, tag="_") ] pattern = [{"lookup": "first_names"}, {"like_name": True}] kpa = SequenceAnnotator(pattern=pattern, ds=ds, - tokenizer=WordBoundaryTokenizer(False), tag="_") + # tokenizer=WordBoundaryTokenizer(False), + tag="_") assert kpa.annotate(korean_doc) == [ Annotation(text="Won Jung Meijer", start_char=16, end_char=31, tag="_") ] From 82c52fc6686d6cf34672cf6da96324641b6cd8c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Thu, 9 Jan 2025 21:34:34 +0100 Subject: [PATCH 39/41] Move seq pattern validation to a new method --- docdeid/process/annotator.py | 48 +++++++++++------ docdeid/utils.py | 28 ++++++++++ tests/unit/process/test_annotator.py | 78 ++++++++++++++++++++++------ 3 files changed, 122 insertions(+), 32 deletions(-) diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index 6e014e8..ee82334 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -2,6 +2,7 @@ import re import warnings + from abc import ABC, abstractmethod from collections import defaultdict from dataclasses import dataclass @@ -17,6 +18,7 @@ from docdeid.process.doc_processor import 
DocProcessor from docdeid.str.processor import StringModifier from docdeid.tokenizer import Token, Tokenizer +from docdeid.utils import leaf_items @dataclass @@ -56,6 +58,7 @@ class SequencePattern: pattern: list[TokenPatternFromCfg] + class Annotator(DocProcessor, ABC): """ Abstract class for annotators, which are responsible for generating annotations from @@ -596,28 +599,16 @@ def __init__( self._start_words = None self._start_matching_pipeline = None - # If the first token pattern is lookup, determine the possible starting words. - if self.pattern and "lookup" in self.pattern[0]: - - if self.ds is None: - raise RuntimeError( - "Created pattern with lookup in TokenPatternAnnotator, but " - "no lookup structures provided." - ) - - lookup_list = self.ds[self.pattern[0]["lookup"]] - - if not isinstance(lookup_list, LookupSet): - raise ValueError( - f"Expected a LookupSet, but got a {type(lookup_list)}." - ) + SequenceAnnotator.validate_pattern(pattern, ds) + # If the first token pattern is lookup, determine the possible starting words. + if start_ent_type := pattern[0].get("lookup"): # XXX We assume the items of the lookup list are all single words. This # is not always the case but just splitting the phrases wouldn't help # because the "lookup" token matcher assumes matching against a single # token. 
- self._start_words = lookup_list.items() - self._start_matching_pipeline = lookup_list.matching_pipeline + self._start_words = ds[start_ent_type].items() + self._start_matching_pipeline = ds[start_ent_type].matching_pipeline self._seq_pattern = SequencePattern( Direction.RIGHT, set(skip or ()), [as_token_pattern(it) for it in pattern] @@ -625,6 +616,29 @@ def __init__( super().__init__(*args, **kwargs) + @classmethod + def validate_pattern(cls, pattern, ds): + if not pattern: + raise ValueError(f"Sequence pattern is missing or empty: {pattern}.") + + referenced_ents = {match_val + for tok_pattern in pattern + for func, match_val in leaf_items(tok_pattern) + if func.endswith("lookup")} + if referenced_ents and ds is None: + raise ValueError("Pattern relies on entity lookups but no lookup " + "structures were provided.") + + if missing := referenced_ents - set(ds or ()): + raise ValueError("Unknown lookup entity types: {}." + .format(", ".join(sorted(missing)))) + + if start_ent_type := pattern[0].get("lookup"): + if not isinstance(ds[start_ent_type], LookupSet): + raise ValueError('If the first token pattern is lookup, it must be ' + f'backed by a LookupSet, but "{start_ent_type}" is ' + f'backed by a {type(ds[start_ent_type]).__name__}.') + def annotate(self, doc: Document) -> list[Annotation]: """ Annotate the document, by matching the pattern against all tokens. 
diff --git a/docdeid/utils.py b/docdeid/utils.py index a1fcdd7..3c57a20 100644 --- a/docdeid/utils.py +++ b/docdeid/utils.py @@ -1,10 +1,38 @@ from collections import defaultdict +from collections.abc import Generator, Iterable, Iterator, Mapping +from typing import Any, Optional from frozendict import frozendict from docdeid.document import Document +def leaf_items(json_struct: Mapping) -> Iterator[tuple]: + """ + Generates all `(key, value)` items that appear as leaves of the potentially deeply + nested JSON-like structure `json_struct`, where being a leaf item means that + `key` is associated with a `value` in a dict and `value` is of an atomic type + (such as a `str` but not list-like or map-like). + + :param json_struct: nested structure to iterate + :return: generator of leaf `(key, value)` items + """ + return __leaf_items(json_struct, None) + + +def __leaf_items(obj: Any, par_key: Optional[str]) -> Generator[tuple, None, None]: + if isinstance(obj, Mapping): + for key, val in obj.items(): + for item in __leaf_items(val, key): + yield item + elif isinstance(obj, Iterable) and not isinstance(obj, (bytes, str)): + for member in obj: + for item in __leaf_items(member, None): + yield item + elif par_key is not None: + yield par_key, obj + + def annotate_intext(doc: Document) -> str: """ Annotate intext, which can be useful to compare the annotations of two different diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index 3b6e0d0..d71296b 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -320,8 +320,10 @@ def ds(self): ds["interfixes"] = LookupSet() ds["interfixes"].add_items_from_iterable(items=interfixes) - ds["interfixed_surnames"] = LookupSet() - ds["interfixed_surnames"].add_items_from_iterable(items=interfixed_surnames) + trie = LookupTrie() + for phrase in interfixed_surnames: + trie.add_item(phrase.split()) + ds["interfixed_surnames"] = trie return ds @@ -346,16 +348,60 
@@ def korean_doc(self): tokenizers={"default": WordBoundaryTokenizer(False)}, ) + def test_validation(self, ds): + with pytest.raises(ValueError) as exc_info: + tpa = SequenceAnnotator(pattern=[], ds=ds, tag="_") + assert "missing or empty" in str(exc_info) + + # Lookup structures are not required if there are no lookup token patterns. + tpa = SequenceAnnotator(pattern=[{"like_name": True}], tag="_") + assert True + + with pytest.raises(ValueError) as exc_info: + tpa = SequenceAnnotator(pattern=[{"lookup": "undefined_entity"}], tag="_") + assert "no lookup structures were provided" in str(exc_info) + + with pytest.raises(ValueError) as exc_info: + tpa = SequenceAnnotator(pattern=[{"lookup": "undefined_entity"}], + ds=ds, + tag="_") + assert "Unknown lookup entity types: undefined_entity." in str(exc_info) + + with pytest.raises(ValueError) as exc_info: + tpa = SequenceAnnotator( + pattern=[{"or": [{"lookup": "undefined_entity"}, + {"lookup": "another_entity"}]}], + ds=ds, + tag="_") + assert ("Unknown lookup entity types: another_entity, undefined_entity." + in str(exc_info)) + + with pytest.raises(ValueError) as exc_info: + tpa = SequenceAnnotator( + pattern=[{"or": [{"lookup": "interfixes"}, + {"and": [{"lookup": "first_names"}, + {"lookup": "alien_entity"}]}]}, + {"lookup": "another_entity"}], + ds=ds, + tag="_") + assert ("Unknown lookup entity types: alien_entity, another_entity." 
+ in str(exc_info)) + + with pytest.raises(ValueError) as exc_info: + tpa = SequenceAnnotator( + pattern=[{"lookup": "interfixed_surnames"}], + ds=ds, + tag="_") + assert ("is backed by a LookupTrie" in str(exc_info)) + def test_match_sequence(self, pattern_doc, ds): pattern = [{"lookup": "first_names"}, {"like_name": True}] - tpa = SequenceAnnotator(pattern=[], ds=ds, tag="_") + tpa = SequenceAnnotator(pattern=pattern, ds=ds, tag="_") assert tpa._match_sequence( pattern_doc, - SequencePattern( - Direction.RIGHT, set(), [as_token_pattern(it) for it in pattern] - ), + tpa._seq_pattern, start_token=pattern_doc.get_tokens()[3], annos_by_token=defaultdict(list), dicts=ds, @@ -363,9 +409,7 @@ def test_match_sequence(self, pattern_doc, ds): assert ( tpa._match_sequence( pattern_doc, - SequencePattern( - Direction.RIGHT, set(), [as_token_pattern(it) for it in pattern] - ), + tpa._seq_pattern, start_token=pattern_doc.get_tokens()[7], annos_by_token=defaultdict(list), dicts=ds, @@ -374,9 +418,14 @@ def test_match_sequence(self, pattern_doc, ds): ) def test_match_sequence_left(self, pattern_doc, ds): + """ + Matching is always performed in the direction left-to-right by + SequenceAnnotator proper but the same method is also called by + ContextAnnotator in Deduce, where matching may proceed also right-to-left. 
+ """ pattern = [{"lookup": "first_names"}, {"like_name": True}] - tpa = SequenceAnnotator(pattern=[], ds=ds, tag="_") + tpa = SequenceAnnotator(pattern=pattern, ds=ds, tag="_") assert tpa._match_sequence( pattern_doc, @@ -404,13 +453,12 @@ def test_match_sequence_left(self, pattern_doc, ds): def test_match_sequence_skip(self, pattern_doc, ds): pattern = [{"lookup": "surnames"}, {"like_name": True}] - tpa = SequenceAnnotator(pattern=[], ds=ds, tag="_") + tpa = SequenceAnnotator(pattern=pattern, ds=ds, tag="_") + tpa_skipping = SequenceAnnotator(pattern=pattern, ds=ds, skip=["-"], tag="_") - assert tpa._match_sequence( + assert tpa_skipping._match_sequence( pattern_doc, - SequencePattern( - Direction.RIGHT, {"-"}, [as_token_pattern(it) for it in pattern] - ), + tpa_skipping._seq_pattern, start_token=pattern_doc.get_tokens()[4], annos_by_token=defaultdict(list), dicts=ds, From 9dcc4f08cd7f8c115816e252d54a2f399d224e74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Thu, 9 Jan 2025 21:35:12 +0100 Subject: [PATCH 40/41] Polish the code a little --- docdeid/tokenizer.py | 4 ++-- docdeid/utils.py | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py index a889ac7..9e69208 100644 --- a/docdeid/tokenizer.py +++ b/docdeid/tokenizer.py @@ -4,9 +4,9 @@ import sys from abc import ABC, abstractmethod from collections import defaultdict -from collections.abc import Sequence +from collections.abc import Generator, Iterator, Sequence from dataclasses import dataclass, field -from typing import Generator, Iterator, Literal, Optional, SupportsIndex, overload +from typing import Literal, Optional, SupportsIndex, overload from docdeid.direction import Direction from docdeid.str import StringModifier diff --git a/docdeid/utils.py b/docdeid/utils.py index 3c57a20..d5bfe5a 100644 --- a/docdeid/utils.py +++ b/docdeid/utils.py @@ -71,9 +71,7 @@ def annotate_doc(doc: Document) -> str: Handles also nested 
mentions and in a way also overlapping mentions, even though this kind of markup cannot really represent them. """ - annos_from_shortest = sorted( - doc.annotations, key=lambda anno: anno.end_char - anno.start_char - ) + annos_from_shortest = doc.annotations.sorted(by=("length", )) idx_to_anno_starts = defaultdict(list) idx_to_anno_ends = defaultdict(list) for anno in annos_from_shortest: From 659a694b75e32719fa8527e53fb7d596f74c46e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 10 Jan 2025 08:59:57 +0100 Subject: [PATCH 41/41] Don't fail validation on refs to metadata --- docdeid/process/annotator.py | 2 +- tests/unit/process/test_annotator.py | 28 +++++++++++++++++++--------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/docdeid/process/annotator.py b/docdeid/process/annotator.py index ee82334..8b7f5c2 100644 --- a/docdeid/process/annotator.py +++ b/docdeid/process/annotator.py @@ -624,7 +624,7 @@ def validate_pattern(cls, pattern, ds): referenced_ents = {match_val for tok_pattern in pattern for func, match_val in leaf_items(tok_pattern) - if func.endswith("lookup")} + if func.endswith("lookup") and "." not in match_val} if referenced_ents and ds is None: raise ValueError("Pattern relies on entity lookups but no lookup " "structures were provided.") diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py index d71296b..380e76e 100644 --- a/tests/unit/process/test_annotator.py +++ b/tests/unit/process/test_annotator.py @@ -350,25 +350,25 @@ def korean_doc(self): def test_validation(self, ds): with pytest.raises(ValueError) as exc_info: - tpa = SequenceAnnotator(pattern=[], ds=ds, tag="_") + SequenceAnnotator(pattern=[], ds=ds, tag="_") assert "missing or empty" in str(exc_info) # Lookup structures are not required if there are no lookup token patterns. 
- tpa = SequenceAnnotator(pattern=[{"like_name": True}], tag="_") + SequenceAnnotator(pattern=[{"like_name": True}], tag="_") assert True with pytest.raises(ValueError) as exc_info: - tpa = SequenceAnnotator(pattern=[{"lookup": "undefined_entity"}], tag="_") + SequenceAnnotator(pattern=[{"lookup": "undefined_entity"}], tag="_") assert "no lookup structures were provided" in str(exc_info) with pytest.raises(ValueError) as exc_info: - tpa = SequenceAnnotator(pattern=[{"lookup": "undefined_entity"}], - ds=ds, - tag="_") + SequenceAnnotator(pattern=[{"lookup": "undefined_entity"}], + ds=ds, + tag="_") assert "Unknown lookup entity types: undefined_entity." in str(exc_info) with pytest.raises(ValueError) as exc_info: - tpa = SequenceAnnotator( + SequenceAnnotator( pattern=[{"or": [{"lookup": "undefined_entity"}, {"lookup": "another_entity"}]}], ds=ds, @@ -376,8 +376,18 @@ def test_validation(self, ds): assert ("Unknown lookup entity types: another_entity, undefined_entity." in str(exc_info)) + # References to entities from metadata must not cause validation errors. + SequenceAnnotator(pattern=[{"or": [{"lookup": "patient.name"}, + {"lookup": "doctor.surname"}]}], + tag="_") + SequenceAnnotator(pattern=[{"or": [{"lookup": "interfixes"}, + {"lookup": "doctor.surname"}]}], + ds=ds, + tag="_") + assert True + with pytest.raises(ValueError) as exc_info: - tpa = SequenceAnnotator( + SequenceAnnotator( pattern=[{"or": [{"lookup": "interfixes"}, {"and": [{"lookup": "first_names"}, {"lookup": "alien_entity"}]}]}, @@ -388,7 +398,7 @@ def test_validation(self, ds): in str(exc_info)) with pytest.raises(ValueError) as exc_info: - tpa = SequenceAnnotator( + SequenceAnnotator( pattern=[{"lookup": "interfixed_surnames"}], ds=ds, tag="_")