-
Notifications
You must be signed in to change notification settings - Fork 3
Improve PATIENT/PERSOON processing and more #20
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
30ce936
e12d0d0
82aab5f
156e201
c7b4c89
4459c14
810b8b3
5002696
7d2d866
762866a
1ae6846
ae1f93e
d415f51
d8e8ed3
03fc99d
5d188cd
6ea9b74
4110a53
df73e54
99163d6
c7ba5bc
c80e2ad
40fcd62
15b8648
ebdefa4
4a082b8
3319df1
1afb16f
53db956
230c507
25cbcfd
36eb1e3
573deff
a2704c5
3ca37aa
68f4afb
fb3cbd8
0c04a78
82c52fc
9dcc4f0
659a694
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -130,4 +130,10 @@ dmypy.json | |
|
|
||
| # Pyre type checker | ||
| .pyre/ | ||
| /.idea/* | ||
| /.idea/* | ||
|
|
||
| # IDEs | ||
| *.iml | ||
|
|
||
| # misc | ||
| *~ | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| from __future__ import annotations | ||
|
|
||
| from enum import IntEnum | ||
| from typing import Iterable, Sequence, TypeVar | ||
|
|
||
| T = TypeVar("T") | ||
|
|
||
|
|
||
class Direction(IntEnum):
    """A reading direction within text: towards the left or towards the right."""

    LEFT = -1
    RIGHT = 1

    @property
    def opposite(self) -> Direction:
        """The direction pointing the other way from this one."""
        # The two members are each other's negation (-1 and 1), so flipping
        # one always yields the other.
        if self is Direction.LEFT:
            return Direction.RIGHT
        return Direction.LEFT

    @staticmethod
    def from_string(val: str) -> Direction:
        """
        Looks up a Direction by its member name, ignoring letter case.

        Args:
            val: name of the direction, e.g. ``"left"`` or ``"RIGHT"``

        Raises:
            ValueError: if the upper-cased ``val`` is not a member name; the
                underlying ``KeyError`` is attached as the cause
        """
        member_name = val.upper()
        try:
            parsed = Direction[member_name]
        except KeyError as lookup_error:
            raise ValueError(f"Invalid direction: '{val}'") from lookup_error
        return parsed

    def iter(self, seq: Sequence[T]) -> Iterable[T]:
        """
        Gives an iterable that walks the given sequence in this direction.

        For ``RIGHT`` the sequence itself is handed back unchanged; for
        ``LEFT`` a reversed iterator over it is returned.

        Args:
            seq: sequence to iterate over
        """
        # The TypeVar ties the element type of the result to that of `seq`.
        return reversed(seq) if self is Direction.LEFT else seq
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,12 @@ | ||
| from collections import defaultdict | ||
| from collections.abc import Mapping | ||
| from dataclasses import dataclass | ||
| from typing import Any, Optional | ||
|
|
||
| from docdeid.annotation import AnnotationSet | ||
| from docdeid.tokenizer import Tokenizer, TokenList | ||
| from frozendict import frozendict | ||
|
|
||
| from docdeid.annotation import Annotation, AnnotationSet | ||
| from docdeid.tokenizer import Token, Tokenizer, TokenList | ||
|
|
||
|
|
||
| class MetaData: | ||
|
|
@@ -66,6 +71,12 @@ class Document: | |
| Will be stored in a :class:`.MetaData` object. | ||
| """ | ||
|
|
||
@dataclass
class AnnosByToken:
    """
    A cache entry associating an `AnnotationSet` with a token->annos map.

    An entry starts out empty, with both fields set to ``None``, and is filled
    in once a mapping has been computed for some annotation set.
    """

    # Annotation set the cached mapping was computed for; None while empty.
    anno_set: Optional[AnnotationSet] = None
    # Cached mapping from tokens to their annotations; None while empty.
    value: Optional[defaultdict[Token, set[Annotation]]] = None
|
|
||
| def __init__( | ||
| self, | ||
| text: str, | ||
|
|
@@ -74,7 +85,9 @@ def __init__( | |
| ) -> None: | ||
|
|
||
| self._text = text | ||
| self._tokenizers = tokenizers | ||
| self._tokenizers = None if tokenizers is None else frozendict(tokenizers) | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this to make mypy happy?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess it's to make sure that any additions to the (The same argument would apply to |
||
| self._default_annos_by_token = Document.AnnosByToken(None, None) | ||
| self._tmp_annos_by_token = Document.AnnosByToken(None, None) | ||
|
|
||
| self.metadata = MetaData(metadata) | ||
| """The :class:`.MetaData` of this :class:`.Document`, that can be interacted | ||
|
|
@@ -94,6 +107,13 @@ def text(self) -> str: | |
| """ | ||
| return self._text | ||
|
|
||
@property
def tokenizers(self) -> Mapping[str, Tokenizer]:
    """
    The tokenizers available to this document, keyed by name.

    Raises:
        RuntimeError: if no tokenizers were set (the stored value is ``None``)
    """
    available = self._tokenizers
    # Only ``None`` means "not initialized"; an empty mapping is returned as-is.
    if available is not None:
        return available
    raise RuntimeError("No tokenizers initialized.")
|
|
||
| def get_tokens(self, tokenizer_name: str = "default") -> TokenList: | ||
| """ | ||
| Get the tokens corresponding to the input text, for a specific tokenizer. | ||
|
|
@@ -146,6 +166,62 @@ def annotations(self, annotations: AnnotationSet) -> None: | |
| """ | ||
| self._annotations = annotations | ||
|
|
||
def annos_by_token(
    self,
    annos: Optional[AnnotationSet] = None,
) -> defaultdict[Token, set[Annotation]]:
    """
    Returns a mapping from document tokens to annotations.

    Tokens of every tokenizer in ``self.tokenizers`` are indexed. The result
    is cached per call flavour: one cache slot for the document's own
    annotations and one for explicitly passed annotations. A cached mapping is
    reused only while the annotation set still equals the snapshot it was
    computed from. The returned ``defaultdict`` is the cached object itself.

    Args:
        annos: annotations for this document to index by token (default: current
            annotations of this `Document`)

    Returns:
        A ``defaultdict`` mapping each token touched by an annotation to the set
        of those annotations.
    """
    # Local import so this method does not depend on a module-level import.
    import copy

    # Fill the default arg value and pick the matching cache slot.
    if annos is None:
        eff_annos = self._annotations
        cache = self._default_annos_by_token
    else:
        eff_annos = annos
        cache = self._tmp_annos_by_token

    # Try to use a cached response.
    if eff_annos == cache.anno_set:
        return cache.value

    # Compute the return value.
    annos_by_token = defaultdict(set)
    for tokenizer in self.tokenizers:
        token_list = self.get_tokens(tokenizer)
        if not token_list:
            continue
        cur_tok_idx = 0
        tok = token_list[cur_tok_idx]
        # Annotations are visited by increasing start, so the token cursor
        # only ever needs to move forward.
        for anno in eff_annos.sorted(by=("start_char",)):
            try:
                # Iterate over tokens till we reach the annotation.
                # NOTE(review): if end_char is exclusive, a token ending exactly
                # at anno.start_char is not skipped here and gets associated
                # with the annotation below -- confirm this is intended.
                while tok.end_char < anno.start_char:
                    cur_tok_idx += 1
                    tok = token_list[cur_tok_idx]
            except IndexError:
                # Ran out of tokens: no later annotation can match either.
                break
            # Iterate over tokens in the annotation till we reach the end
            # of it or the end of the tokens.
            anno_tok_idx = cur_tok_idx
            anno_tok = tok
            while anno_tok.start_char < anno.end_char:
                annos_by_token[anno_tok].add(anno)
                if anno_tok_idx == len(token_list) - 1:
                    break
                anno_tok_idx += 1
                anno_tok = token_list[anno_tok_idx]

    # Cache the value before returning. Store a shallow snapshot rather than
    # the live object: otherwise an in-place change to the annotation set
    # would still compare equal to itself and a stale mapping would be served.
    cache.anno_set = copy.copy(eff_annos)
    cache.value = annos_by_token
    return annos_by_token
|
|
||
| @property | ||
| def deidentified_text(self) -> Optional[str]: | ||
| """ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Here's a nice simplification:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Indeed! I completely missed the possibility to use indexing for accessing enum members (and I still don't see it explicitly documented in the Python 3.9 docs).