Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
30ce936
Make MultiTok..Annotator notice changes in the trie
matej-ibis-ai Mar 1, 2024
e12d0d0
Provide `LowercaseTail` string modifier
matej-ibis-ai Mar 1, 2024
82aab5f
Enable specifying the lang for titlecasing
matej-ibis-ai Mar 4, 2024
156e201
Minimize data fixtures for tests
matej-ibis-ai Mar 4, 2024
c7b4c89
Log annotated text after every processor
matej-ibis-ai Mar 6, 2024
4459c14
Update documentation slightly
matej-ibis-ai Mar 6, 2024
810b8b3
Expose `Document.token_lists` as a property
matej-ibis-ai Mar 6, 2024
5002696
(Almost) automatically format code
matej-ibis-ai Mar 7, 2024
7d2d866
Simplify `MultiTokenLookupAnnotator`...
matej-ibis-ai Mar 7, 2024
762866a
Update the `MultiTok...Annotator` docstring
matej-ibis-ai Mar 8, 2024
1ae6846
Test user additions to the lookup trie
matej-ibis-ai Mar 8, 2024
ae1f93e
Test the `tokenizers` and `token_lists` props
matej-ibis-ai Mar 8, 2024
d415f51
Remove and ignore the IDEA project file
matej-ibis-ai Mar 8, 2024
d8e8ed3
Annotate docs for logging only if level is DEBUG
matej-ibis-ai Mar 8, 2024
03fc99d
Cosmetics
matej-ibis-ai Mar 8, 2024
5d188cd
Support whitespace trimming in `WordBoundaryTokenizer`
matej-ibis-ai Mar 11, 2024
6ea9b74
Move `SequenceTokenizer` to Docdeid
matej-ibis-ai Mar 11, 2024
4110a53
Format code
matej-ibis-ai Mar 11, 2024
df73e54
Replace `_DIRECTION_MAP` with an enum
matej-ibis-ai Mar 11, 2024
99163d6
Improve and test `annos_by_token()`
matej-ibis-ai Mar 11, 2024
c7ba5bc
Drop `Token.get_nth`, simplify `Token.iter_to`
matej-ibis-ai Mar 12, 2024
c80e2ad
Format code
matej-ibis-ai Mar 12, 2024
40fcd62
Test and fix `Direction`
matej-ibis-ai Mar 12, 2024
15b8648
Fix Flake8-reported errors
matej-ibis-ai Mar 12, 2024
ebdefa4
Address most non-Mypy lint issues
matej-ibis-ai Mar 12, 2024
4a082b8
Address easy and valid Mypy issues
matej-ibis-ai Mar 12, 2024
3319df1
Add a test for keep_blanks=False in WBTokenizer
matej-ibis-ai Jul 12, 2024
1afb16f
Document how to run tests better + cosmetics
matej-ibis-ai Jul 12, 2024
53db956
Drop the `Document.token_lists` property
matej-ibis-ai Jan 7, 2025
230c507
Avoid "|" for union types
matej-ibis-ai Jan 8, 2025
25cbcfd
Move `annos_by_token` to `Document`
matej-ibis-ai Jan 8, 2025
36eb1e3
Simplify `Direction.from_string`
matej-ibis-ai Jan 8, 2025
573deff
Rename `SequenceAnnotator.dicts` to `ds`
matej-ibis-ai Jan 8, 2025
a2704c5
Replace `list(map(f, xs))` with list comprehension
matej-ibis-ai Jan 8, 2025
3ca37aa
Re-add `MultiTokenLookupAnnotator` accepting a `LookupSet`
matej-ibis-ai Jan 8, 2025
68f4afb
Add a test for matching multi-word phrases
matej-ibis-ai Jan 9, 2025
fb3cbd8
Try to support multi-word matching in SequenceAnnotator
matej-ibis-ai Jan 9, 2025
0c04a78
Give up multi-word matching in SequenceAnnotator
matej-ibis-ai Jan 9, 2025
82c52fc
Move seq pattern validation to a new method
matej-ibis-ai Jan 9, 2025
9dcc4f0
Polish the code a little
matej-ibis-ai Jan 9, 2025
659a694
Don't fail validation on refs to metadata
matej-ibis-ai Jan 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -130,4 +130,10 @@ dmypy.json

# Pyre type checker
.pyre/
/.idea/*
/.idea/*

# IDEs
*.iml

# misc
*~
8 changes: 5 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@ format:
python -m docformatter .

lint:
python -m flake8 .
python -m pylint docdeid/
python -m mypy docdeid/
{ python -m flake8 .; fret=$$?; }; \
{ python -m pylint docdeid/; pret=$$?; }; \
{ python -m mypy docdeid/; mret=$$?; }; \
echo "flake8: $$fret, pylint: $$pret, mypy: $$mret"; \
[ $$fret,$$pret,$$mret = "0,0,0" ]

build-docs:
sphinx-apidoc --module-first --force --templatedir=docs/templates -o docs/source/api docdeid
Expand Down
11 changes: 7 additions & 4 deletions docdeid/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class Annotation: # pylint: disable=R0902
Should only be used when the annotation ends on a token boundary.
"""

length: int = field(init=False)
length: int = field(init=False, compare=False)
"""The number of characters of the annotation text."""

_key_cache: dict = field(default_factory=dict, repr=False, compare=False)
Expand Down Expand Up @@ -100,7 +100,7 @@ def get_sort_key(

val = getattr(self, attr, UNKNOWN_ATTR_DEFAULT)

if callbacks is not None and (attr in callbacks):
if callbacks is not None and attr in callbacks:
val = callbacks[attr](val)

sort_key.append(val)
Expand All @@ -126,6 +126,9 @@ class AnnotationSet(set[Annotation]):
It extends the builtin ``set``.
"""

def __init__(self, *args, **kwargs) -> None:
    # NOTE(review): this override adds no behavior beyond ``set.__init__``;
    # presumably kept as an explicit extension point — consider removing.
    super().__init__(*args, **kwargs)

def sorted(
self,
by: tuple, # pylint: disable=C0103
Expand All @@ -150,14 +153,14 @@ def sorted(
A RunTimeError, if the callbacks are not provided as a frozen dict.
"""

if callbacks is not None and not isinstance(callbacks, frozendict):
if not isinstance(callbacks, (type(None), frozendict)):
raise RuntimeError(
"Please provide the callbacks as a frozen dict, e.g. "
"frozendict.frozendict(end_char=lambda x: -x)"
)

return sorted(
list(self),
self,
key=lambda x: x.get_sort_key(
by=by, callbacks=callbacks, deterministic=deterministic
),
Expand Down
37 changes: 37 additions & 0 deletions docdeid/direction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from __future__ import annotations

from enum import IntEnum
from typing import Iterable, Sequence, TypeVar

T = TypeVar("T")


class Direction(IntEnum):
    """Direction in text -- either left or right."""

    LEFT = -1
    RIGHT = 1

    @property
    def opposite(self) -> Direction:
        """The direction opposite to this one (LEFT <-> RIGHT)."""
        return Direction(-self)

    @staticmethod
    def from_string(val: str) -> Direction:
        """
        Parses a Direction from a string (case insensitive).

        Args:
            val: name of the direction, e.g. ``"left"`` or ``"RIGHT"``

        Raises:
            ValueError: if ``val`` does not name a direction.
        """
        try:
            # Subscripting an Enum class looks a member up by its name.
            return Direction[val.upper()]
        except KeyError as key_error:
            raise ValueError(f"Invalid direction: '{val}'") from key_error

    def iter(self, seq: Sequence[T]) -> Iterable[T]:
        """
        Returns an iterator over the given sequence that traverses it in this direction.

        Args:
            seq: sequence to iterate over
        """
        return seq if self is Direction.RIGHT else reversed(seq)
82 changes: 79 additions & 3 deletions docdeid/document.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
from collections import defaultdict
from collections.abc import Mapping
from dataclasses import dataclass
from typing import Any, Optional

from docdeid.annotation import AnnotationSet
from docdeid.tokenizer import Tokenizer, TokenList
from frozendict import frozendict

from docdeid.annotation import Annotation, AnnotationSet
from docdeid.tokenizer import Token, Tokenizer, TokenList


class MetaData:
Expand Down Expand Up @@ -66,6 +71,12 @@ class Document:
Will be stored in a :class:`.MetaData` object.
"""

@dataclass
class AnnosByToken:
    """A cache entry associating an `AnnotationSet` with a token->annos map."""

    # Annotation set the cached mapping was computed from; None until the
    # first computation (see the `AnnosByToken(None, None)` sentinels
    # created in `Document.__init__`).
    anno_set: Optional[AnnotationSet]
    # Cached mapping from each token to the annotations overlapping it;
    # None until the first computation.
    value: Optional[defaultdict[Token, set[Annotation]]]

def __init__(
self,
text: str,
Expand All @@ -74,7 +85,9 @@ def __init__(
) -> None:

self._text = text
self._tokenizers = tokenizers
self._tokenizers = None if tokenizers is None else frozendict(tokenizers)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this to make mypy happy?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it's to make sure that any additions to the tokenizers dict made after it was passed to this Document.__init__ method are not going to affect the tokenizers used by this Document instance. I don't see immediately whether this safety measure is necessary, but it certainly looks more correct this way.

(The same argument would apply to metadata -- I find it ugly that the object, the dictionary passed to the Document initializer here can be modified later and the Document instance's metadata field will reflect the modifications. But I didn't have the need to fix that. I would prefer to just use a simple dict instead of the MetaData class in fact.)

self._default_annos_by_token = Document.AnnosByToken(None, None)
self._tmp_annos_by_token = Document.AnnosByToken(None, None)

self.metadata = MetaData(metadata)
"""The :class:`.MetaData` of this :class:`.Document`, that can be interacted
Expand All @@ -94,6 +107,13 @@ def text(self) -> str:
"""
return self._text

@property
def tokenizers(self) -> Mapping[str, Tokenizer]:
    """Available tokenizers indexed by their name."""
    tokenizers = self._tokenizers
    if tokenizers is None:
        raise RuntimeError("No tokenizers initialized.")
    return tokenizers

def get_tokens(self, tokenizer_name: str = "default") -> TokenList:
"""
Get the tokens corresponding to the input text, for a specific tokenizer.
Expand Down Expand Up @@ -146,6 +166,62 @@ def annotations(self, annotations: AnnotationSet) -> None:
"""
self._annotations = annotations

def annos_by_token(
    self,
    annos: Optional[AnnotationSet] = None,
) -> defaultdict[Token, set[Annotation]]:
    """
    Returns a mapping from document tokens to annotations.

    A token is mapped to an annotation when their character spans overlap.
    The result is cached per annotation set: one cache slot for the
    document's own annotations, one for the most recently passed set.

    Args:
        annos: annotations for this document to index by token (default: current
            annotations of this `Document`)

    Returns:
        A `defaultdict` mapping each covered `Token` to the set of
        `Annotation`s overlapping it.
    """

    # Fill the default arg value and pick the matching cache slot.
    if annos is None:
        eff_annos = self._annotations
        cache = self._default_annos_by_token
    else:
        eff_annos = annos
        cache = self._tmp_annos_by_token

    # Try to use a cached response.
    if eff_annos == cache.anno_set:
        return cache.value

    # Compute the return value.
    annos_by_token: defaultdict[Token, set[Annotation]] = defaultdict(set)
    # Sorting does not depend on the tokenizer, so do it only once.
    sorted_annos = eff_annos.sorted(by=("start_char",))
    for tokenizer in self.tokenizers:
        token_list = self.get_tokens(tokenizer)
        if not token_list:
            continue
        cur_tok_idx = 0
        tok = token_list[cur_tok_idx]
        for anno in sorted_annos:
            try:
                # Iterate over tokens till we reach the annotation.
                while tok.end_char < anno.start_char:
                    cur_tok_idx += 1
                    tok = token_list[cur_tok_idx]
            except IndexError:
                # Ran out of tokens; later annotations start even further
                # right, so no further match is possible for this tokenizer.
                break
            # Iterate over tokens in the annotation till we reach the end
            # of it or the end of the tokens.
            anno_tok_idx = cur_tok_idx
            anno_tok = tok
            while anno_tok.start_char < anno.end_char:
                annos_by_token[anno_tok].add(anno)
                if anno_tok_idx == len(token_list) - 1:
                    break
                anno_tok_idx += 1
                anno_tok = token_list[anno_tok_idx]

    # Cache the value before returning.
    cache.anno_set = eff_annos
    cache.value = annos_by_token
    return annos_by_token

@property
def deidentified_text(self) -> Optional[str]:
"""
Expand Down
9 changes: 5 additions & 4 deletions docdeid/ds/lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import codecs
import itertools
from collections.abc import Sequence
from typing import Iterable, Iterator, Optional, Union

from docdeid.ds.ds import Datastructure
Expand Down Expand Up @@ -140,7 +141,7 @@ def add_items_from_self(
) -> None:
"""
Add items from self (this items of this :class:`.LookupSet`). This can be used
to do a transformation or replacment of the items.
to do a transformation or replacement of the items.

Args:
cleaning_pipeline: A cleaning pipeline applied to the items of this set.
Expand Down Expand Up @@ -265,7 +266,7 @@ def __init__(self, *args, **kwargs) -> None:
self.children: dict[str, LookupTrie] = {}
self.is_terminal = False

def add_item(self, item: list[str]) -> None:
def add_item(self, item: Sequence[str]) -> None:
"""
Add an item, i.e. a list of strings, to this Trie.

Expand All @@ -285,7 +286,7 @@ def add_item(self, item: list[str]) -> None:

self.children[head].add_item(tail)

def __contains__(self, item: list[str]) -> bool:
def __contains__(self, item: Sequence[str]) -> bool:
"""
Whether the trie contains the item. Respects the matching pipeline.

Expand All @@ -304,7 +305,7 @@ def __contains__(self, item: list[str]) -> bool:
return (head in self.children) and tail in self.children[head]

def longest_matching_prefix(
self, item: list[str], start_i: int = 0
self, item: Sequence[str], start_i: int = 0
) -> Union[list[str], None]:
"""
Finds the longest matching prefix of a list of strings. This is used to find the
Expand Down
3 changes: 2 additions & 1 deletion docdeid/process/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
from .annotator import (
Annotator,
MultiTokenLookupAnnotator,
MultiTokenTrieAnnotator,
RegexpAnnotator,
SequenceAnnotator,
SingleTokenLookupAnnotator,
TokenPatternAnnotator,
)
from .doc_processor import DocProcessor, DocProcessorGroup
from .redactor import RedactAllText, Redactor, SimpleRedactor
4 changes: 2 additions & 2 deletions docdeid/process/annotation_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,15 @@ def __init__(
@staticmethod
def _zero_runs(arr: npt.NDArray) -> npt.NDArray:
"""
Finds al zero runs in a numpy array.
Finds all zero runs in a numpy array.
Source: https://stackoverflow.com/questions/24885092/
finding-the-consecutive-zeros-in-a-numpy-array

Args:
arr: The input array.

Returns:
A (num_zero_runs, 2)-dim array, containing the start and end indeces
A (num_zero_runs, 2)-dim array, containing the start and end indices
of the zero runs.

Examples:
Expand Down
Loading