Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
30ce936
Make MultiTok..Annotator notice changes in the trie
matej-ibis-ai Mar 1, 2024
e12d0d0
Provide `LowercaseTail` string modifier
matej-ibis-ai Mar 1, 2024
82aab5f
Enable specifying the lang for titlecasing
matej-ibis-ai Mar 4, 2024
156e201
Minimize data fixtures for tests
matej-ibis-ai Mar 4, 2024
c7b4c89
Log annotated text after every processor
matej-ibis-ai Mar 6, 2024
4459c14
Update documentation slightly
matej-ibis-ai Mar 6, 2024
810b8b3
Expose `Document.token_lists` as a property
matej-ibis-ai Mar 6, 2024
5002696
(Almost) automatically format code
matej-ibis-ai Mar 7, 2024
7d2d866
Simplify `MultiTokenLookupAnnotator`...
matej-ibis-ai Mar 7, 2024
762866a
Update the `MultiTok...Annotator` docstring
matej-ibis-ai Mar 8, 2024
1ae6846
Test user additions to the lookup trie
matej-ibis-ai Mar 8, 2024
ae1f93e
Test the `tokenizers` and `token_lists` props
matej-ibis-ai Mar 8, 2024
d415f51
Remove and ignore the IDEA project file
matej-ibis-ai Mar 8, 2024
d8e8ed3
Annotate docs for logging only if level is DEBUG
matej-ibis-ai Mar 8, 2024
03fc99d
Cosmetics
matej-ibis-ai Mar 8, 2024
5d188cd
Support whitespace trimming in `WordBoundaryTokenizer`
matej-ibis-ai Mar 11, 2024
6ea9b74
Move `SequenceTokenizer` to Docdeid
matej-ibis-ai Mar 11, 2024
4110a53
Format code
matej-ibis-ai Mar 11, 2024
df73e54
Replace `_DIRECTION_MAP` with an enum
matej-ibis-ai Mar 11, 2024
99163d6
Improve and test `annos_by_token()`
matej-ibis-ai Mar 11, 2024
c7ba5bc
Drop `Token.get_nth`, simplify `Token.iter_to`
matej-ibis-ai Mar 12, 2024
c80e2ad
Format code
matej-ibis-ai Mar 12, 2024
40fcd62
Test and fix `Direction`
matej-ibis-ai Mar 12, 2024
15b8648
Fix Flake8-reported errors
matej-ibis-ai Mar 12, 2024
ebdefa4
Address most non-Mypy lint issues
matej-ibis-ai Mar 12, 2024
4a082b8
Address easy and valid Mypy issues
matej-ibis-ai Mar 12, 2024
3319df1
Add a test for keep_blanks=False in WBTokenizer
matej-ibis-ai Jul 12, 2024
1afb16f
Document how to run tests better + cosmetics
matej-ibis-ai Jul 12, 2024
53db956
Drop the `Document.token_lists` property
matej-ibis-ai Jan 7, 2025
230c507
Avoid "|" for union types
matej-ibis-ai Jan 8, 2025
25cbcfd
Move `annos_by_token` to `Document`
matej-ibis-ai Jan 8, 2025
36eb1e3
Simplify `Direction.from_string`
matej-ibis-ai Jan 8, 2025
573deff
Rename `SequenceAnnotator.dicts` to `ds`
matej-ibis-ai Jan 8, 2025
a2704c5
Replace `list(map(f, xs))` with list comprehension
matej-ibis-ai Jan 8, 2025
3ca37aa
Re-add `MultiTokenLookupAnnotator` accepting a `LookupSet`
matej-ibis-ai Jan 8, 2025
68f4afb
Add a test for matching multi-word phrases
matej-ibis-ai Jan 9, 2025
fb3cbd8
Try to support multi-word matching in SequenceAnnotator
matej-ibis-ai Jan 9, 2025
0c04a78
Give up multi-word matching in SequenceAnnotator
matej-ibis-ai Jan 9, 2025
82c52fc
Move seq pattern validation to a new method
matej-ibis-ai Jan 9, 2025
9dcc4f0
Polish the code a little
matej-ibis-ai Jan 9, 2025
659a694
Don't fail validation on refs to metadata
matej-ibis-ai Jan 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -130,4 +130,10 @@ dmypy.json

# Pyre type checker
.pyre/
/.idea/*
/.idea/*

# IDEs
*.iml

# misc
*~
8 changes: 5 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@ format:
python -m docformatter .

lint:
python -m flake8 .
python -m pylint docdeid/
python -m mypy docdeid/
{ python -m flake8 .; fret=$$?; }; \
{ python -m pylint docdeid/; pret=$$?; }; \
{ python -m mypy docdeid/; mret=$$?; }; \
echo "flake8: $$fret, pylint: $$pret, mypy: $$mret"; \
[ $$fret,$$pret,$$mret = "0,0,0" ]

build-docs:
sphinx-apidoc --module-first --force --templatedir=docs/templates -o docs/source/api docdeid
Expand Down
11 changes: 7 additions & 4 deletions docdeid/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class Annotation: # pylint: disable=R0902
Should only be used when the annotation ends on a token boundary.
"""

length: int = field(init=False)
length: int = field(init=False, compare=False)
"""The number of characters of the annotation text."""

_key_cache: dict = field(default_factory=dict, repr=False, compare=False)
Expand Down Expand Up @@ -100,7 +100,7 @@ def get_sort_key(

val = getattr(self, attr, UNKNOWN_ATTR_DEFAULT)

if callbacks is not None and (attr in callbacks):
if callbacks is not None and attr in callbacks:
val = callbacks[attr](val)

sort_key.append(val)
Expand All @@ -126,6 +126,9 @@ class AnnotationSet(set[Annotation]):
It extends the builtin ``set``.
"""

def __init__(self, *args, **kwargs) -> None:
    # NOTE(review): this override adds no behavior beyond ``set.__init__``;
    # presumably kept as an explicit extension point — consider removing.
    super().__init__(*args, **kwargs)

def sorted(
self,
by: tuple, # pylint: disable=C0103
Expand All @@ -150,14 +153,14 @@ def sorted(
A RunTimeError, if the callbacks are not provided as a frozen dict.
"""

if callbacks is not None and not isinstance(callbacks, frozendict):
if not isinstance(callbacks, (type(None), frozendict)):
raise RuntimeError(
"Please provide the callbacks as a frozen dict, e.g. "
"frozendict.frozendict(end_char=lambda x: -x)"
)

return sorted(
list(self),
self,
key=lambda x: x.get_sort_key(
by=by, callbacks=callbacks, deterministic=deterministic
),
Expand Down
37 changes: 37 additions & 0 deletions docdeid/direction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from __future__ import annotations

from enum import IntEnum
from typing import Iterable, Sequence, TypeVar

T = TypeVar("T")


class Direction(IntEnum):
    """Direction in text -- either left or right."""

    LEFT = -1
    RIGHT = 1

    @property
    def opposite(self) -> Direction:
        """The direction opposite to this one (LEFT <-> RIGHT)."""
        return Direction(-self)

    @staticmethod
    def from_string(val: str) -> Direction:
        """
        Parses a Direction from a string (case insensitive).

        Args:
            val: name of the direction, e.g. ``"left"`` or ``"RIGHT"``

        Raises:
            ValueError: if ``val`` does not name a direction.
        """
        try:
            # Subscripting an Enum class looks a member up by its name.
            return Direction[val.upper()]
        except KeyError as key_error:
            raise ValueError(f"Invalid direction: '{val}'") from key_error

    def iter(self, seq: Sequence[T]) -> Iterable[T]:
        """
        Returns an iterator over the given sequence that traverses it in this direction.

        Args:
            seq: sequence to iterate over
        """
        return seq if self is Direction.RIGHT else reversed(seq)
82 changes: 79 additions & 3 deletions docdeid/document.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
from collections import defaultdict
from collections.abc import Mapping
from dataclasses import dataclass
from typing import Any, Optional

from docdeid.annotation import AnnotationSet
from docdeid.tokenizer import Tokenizer, TokenList
from frozendict import frozendict

from docdeid.annotation import Annotation, AnnotationSet
from docdeid.tokenizer import Token, Tokenizer, TokenList


class MetaData:
Expand Down Expand Up @@ -66,6 +71,12 @@ class Document:
Will be stored in a :class:`.MetaData` object.
"""

@dataclass
class AnnosByToken:
    """A cache entry associating an `AnnotationSet` with a token->annos map."""

    # Annotation set the cached mapping was computed from; None until the
    # first computation (see the `AnnosByToken(None, None)` sentinels
    # created in `Document.__init__`).
    anno_set: Optional[AnnotationSet]
    # Cached mapping from each token to the annotations overlapping it;
    # None until the first computation.
    value: Optional[defaultdict[Token, set[Annotation]]]

def __init__(
self,
text: str,
Expand All @@ -74,7 +85,9 @@ def __init__(
) -> None:

self._text = text
self._tokenizers = tokenizers
self._tokenizers = None if tokenizers is None else frozendict(tokenizers)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this to make mypy happy?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it's to make sure that any additions to the tokenizers dict made after it was passed to this Document.__init__ method are not going to affect the tokenizers used by this Document instance. I don't see immediately whether this safety measure is necessary, but it certainly looks more correct this way.

(The same argument would apply to metadata -- I find it ugly that the object, the dictionary passed to the Document initializer here can be modified later and the Document instance's metadata field will reflect the modifications. But I didn't have the need to fix that. I would prefer to just use a simple dict instead of the MetaData class in fact.)

self._default_annos_by_token = Document.AnnosByToken(None, None)
self._tmp_annos_by_token = Document.AnnosByToken(None, None)

self.metadata = MetaData(metadata)
"""The :class:`.MetaData` of this :class:`.Document`, that can be interacted
Expand All @@ -94,6 +107,13 @@ def text(self) -> str:
"""
return self._text

@property
def tokenizers(self) -> Mapping[str, Tokenizer]:
    """Available tokenizers indexed by their name."""
    tokenizers = self._tokenizers
    if tokenizers is None:
        raise RuntimeError("No tokenizers initialized.")
    return tokenizers

def get_tokens(self, tokenizer_name: str = "default") -> TokenList:
"""
Get the tokens corresponding to the input text, for a specific tokenizer.
Expand Down Expand Up @@ -146,6 +166,62 @@ def annotations(self, annotations: AnnotationSet) -> None:
"""
self._annotations = annotations

def annos_by_token(
    self,
    annos: Optional[AnnotationSet] = None,
) -> defaultdict[Token, set[Annotation]]:
    """
    Returns a mapping from document tokens to annotations.

    A token is mapped to an annotation when their character spans overlap.
    The result is cached per annotation set: one cache slot for the
    document's own annotations, one for the most recently passed set.

    Args:
        annos: annotations for this document to index by token (default: current
            annotations of this `Document`)

    Returns:
        A `defaultdict` mapping each covered `Token` to the set of
        `Annotation`s overlapping it.
    """

    # Fill the default arg value and pick the matching cache slot.
    if annos is None:
        eff_annos = self._annotations
        cache = self._default_annos_by_token
    else:
        eff_annos = annos
        cache = self._tmp_annos_by_token

    # Try to use a cached response.
    if eff_annos == cache.anno_set:
        return cache.value

    # Compute the return value.
    annos_by_token: defaultdict[Token, set[Annotation]] = defaultdict(set)
    # Sorting does not depend on the tokenizer, so do it only once.
    sorted_annos = eff_annos.sorted(by=("start_char",))
    for tokenizer in self.tokenizers:
        token_list = self.get_tokens(tokenizer)
        if not token_list:
            continue
        cur_tok_idx = 0
        tok = token_list[cur_tok_idx]
        for anno in sorted_annos:
            try:
                # Iterate over tokens till we reach the annotation.
                while tok.end_char < anno.start_char:
                    cur_tok_idx += 1
                    tok = token_list[cur_tok_idx]
            except IndexError:
                # Ran out of tokens; later annotations start even further
                # right, so no further match is possible for this tokenizer.
                break
            # Iterate over tokens in the annotation till we reach the end
            # of it or the end of the tokens.
            anno_tok_idx = cur_tok_idx
            anno_tok = tok
            while anno_tok.start_char < anno.end_char:
                annos_by_token[anno_tok].add(anno)
                if anno_tok_idx == len(token_list) - 1:
                    break
                anno_tok_idx += 1
                anno_tok = token_list[anno_tok_idx]

    # Cache the value before returning.
    cache.anno_set = eff_annos
    cache.value = annos_by_token
    return annos_by_token

@property
def deidentified_text(self) -> Optional[str]:
"""
Expand Down
9 changes: 5 additions & 4 deletions docdeid/ds/lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import codecs
import itertools
from collections.abc import Sequence
from typing import Iterable, Iterator, Optional, Union

from docdeid.ds.ds import Datastructure
Expand Down Expand Up @@ -140,7 +141,7 @@ def add_items_from_self(
) -> None:
"""
Add items from self (this items of this :class:`.LookupSet`). This can be used
to do a transformation or replacment of the items.
to do a transformation or replacement of the items.

Args:
cleaning_pipeline: A cleaning pipeline applied to the items of this set.
Expand Down Expand Up @@ -265,7 +266,7 @@ def __init__(self, *args, **kwargs) -> None:
self.children: dict[str, LookupTrie] = {}
self.is_terminal = False

def add_item(self, item: list[str]) -> None:
def add_item(self, item: Sequence[str]) -> None:
"""
Add an item, i.e. a list of strings, to this Trie.

Expand All @@ -285,7 +286,7 @@ def add_item(self, item: list[str]) -> None:

self.children[head].add_item(tail)

def __contains__(self, item: list[str]) -> bool:
def __contains__(self, item: Sequence[str]) -> bool:
"""
Whether the trie contains the item. Respects the matching pipeline.

Expand All @@ -304,7 +305,7 @@ def __contains__(self, item: list[str]) -> bool:
return (head in self.children) and tail in self.children[head]

def longest_matching_prefix(
self, item: list[str], start_i: int = 0
self, item: Sequence[str], start_i: int = 0
) -> Union[list[str], None]:
"""
Finds the longest matching prefix of a list of strings. This is used to find the
Expand Down
3 changes: 2 additions & 1 deletion docdeid/process/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
from .annotator import (
Annotator,
MultiTokenLookupAnnotator,
MultiTokenTrieAnnotator,
RegexpAnnotator,
SequenceAnnotator,
SingleTokenLookupAnnotator,
TokenPatternAnnotator,
)
from .doc_processor import DocProcessor, DocProcessorGroup
from .redactor import RedactAllText, Redactor, SimpleRedactor
4 changes: 2 additions & 2 deletions docdeid/process/annotation_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,15 @@ def __init__(
@staticmethod
def _zero_runs(arr: npt.NDArray) -> npt.NDArray:
"""
Finds al zero runs in a numpy array.
Finds all zero runs in a numpy array.
Source: https://stackoverflow.com/questions/24885092/
finding-the-consecutive-zeros-in-a-numpy-array

Args:
arr: The input array.

Returns:
A (num_zero_runs, 2)-dim array, containing the start and end indeces
A (num_zero_runs, 2)-dim array, containing the start and end indices
of the zero runs.

Examples:
Expand Down
Loading