Skip to content

Commit e2d0940

Browse files
authored
feat(medcat): CU-869b9h7y6 Add faster linker (#243)
* CU-869b9h7y6: Add faster linker that only links to primary names * CU-869b9h7y6: Remove debug output * CU-869b9h7y6: Add proper filtering as well as usage of single-possible CUI options * CU-869b9h7y6: Add a simple test for the new linker * CU-869b9h7y6: Rename primary name linker with a shorter name * CU-869b9h7y6: Fix typos in logged output * CU-869b9h7y6: Lower logged output priority (info -> debug)
1 parent 574a913 commit e2d0940

File tree

3 files changed

+131
-0
lines changed

3 files changed

+131
-0
lines changed
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
from typing import Iterator, Optional, Union
2+
import logging
3+
4+
from medcat.tokenizing.tokens import MutableDocument, MutableEntity
5+
from medcat.components.linking.context_based_linker import Linker
6+
from medcat.components.linking.vector_context_model import (
7+
PerDocumentTokenCache)
8+
from medcat.utils.defaults import StatusTypes
9+
from medcat.cdb import CDB
10+
from medcat.vocab import Vocab
11+
from medcat.config import Config
12+
13+
14+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
15+
16+
17+
class PrimNameLinker(Linker):
    """Linker that only links primary names (or other 1-1 matches).

    This linker avoids the hard part of linking - the disambiguation.
    This should allow it to work faster, but (generally) at the expense
    of model performance (i.e. linking quality), since any entity that
    would need disambiguation is simply left unlinked.

    An entity is linked only when exactly one CUI can be chosen without
    looking at the context:
      * the entity has a single candidate CUI that passes the filters, or
      * exactly one of its candidate CUIs has a primary status for the
        detected name and passes the filters.

    Training is not supported; both training entry points raise
    ``NoTrainingException``.
    """
    name = 'primary_name_only_linker'

    def __init__(self, cdb: CDB, vocab: Vocab, config: Config) -> None:
        """Initialise the base linker and drop its context model.

        Args:
            cdb (CDB): The concept database.
            vocab (Vocab): The vocabulary (passed through to the base class).
            config (Config): The config.
        """
        super().__init__(cdb, vocab, config)
        # don't need / use the context model
        del self.context_model

    def _process_entity_inference(
            self, doc: MutableDocument,
            entity: MutableEntity,
            per_doc_valid_token_cache: PerDocumentTokenCache
            ) -> Iterator[MutableEntity]:
        """Link the entity if (and only if) there is one unambiguous CUI.

        Args:
            doc (MutableDocument): The document (not used by this linker).
            entity (MutableEntity): The entity to link.
            per_doc_valid_token_cache (PerDocumentTokenCache):
                The per-document token cache (not used by this linker).

        Yields:
            MutableEntity: The same entity with ``cui`` set and
                ``context_similarity`` set to 1.0. Nothing is yielded if
                the entity cannot be linked unambiguously.
        """
        cuis = entity.link_candidates
        if not cuis:
            return
        # Check does it have a detected name
        name = entity.detected_name
        if name is None:
            # debug (not info) to match the rest of this linker's output
            logger.debug("No name detected for entity %s", entity)
            return
        cnf_l = self.config.components.linking
        if cnf_l.filter_before_disamb:
            cuis = [cui for cui in cuis if cnf_l.filters.check_filters(cui)]
            if not cuis:
                logger.debug("No CUIs that fit filter for %s", entity)
                return
        if len(cuis) == 1:
            # only one option - no disambiguation needed, but the filter
            # still needs checking as it may not have been applied above
            if cnf_l.filters.check_filters(cuis[0]):
                logger.debug("Choosing only possible CUI %s for %s",
                             cuis[0], entity)
                entity.cui = cuis[0]
                entity.context_similarity = 1.0
                yield entity
            else:
                logger.debug(
                    "A single CUI (%s) was mapped to for %s but not in filter",
                    cuis[0], entity)
            return
        # the per-CUI statuses depend on the name only, so look them up once
        per_cui_status = self.cdb.name2info[name]['per_cui_status']
        primary_cuis = [cui for cui in cuis
                        if (per_cui_status[cui]
                            in StatusTypes.PRIMARY_STATUS and
                            cnf_l.filters.check_filters(cui))]
        if not primary_cuis:
            logger.debug("No primary CUIs for name %s", name)
            return
        if len(primary_cuis) > 1:
            # would need disambiguation, which this linker does not do
            logger.debug(
                "Ambiguous primary CUIs for name %s: %s", name, primary_cuis)
            return
        cui = primary_cuis[0]
        entity.cui = cui
        entity.context_similarity = 1.0
        yield entity

    # NOTE: the mutable default for `names` is kept so the signature stays
    #       unchanged; it is never mutated since the method always raises.
    def train(self, cui: str,
              entity: MutableEntity,
              doc: MutableDocument,
              negative: bool = False,
              names: Union[list[str], dict] = [],
              per_doc_valid_token_cache: Optional[PerDocumentTokenCache] = None
              ) -> None:
        """Not supported by this linker.

        Raises:
            NoTrainingException: Always.
        """
        raise NoTrainingException("Training is not supported for this linker")

    def _train_on_doc(self, doc: MutableDocument,
                      ner_ents: list[MutableEntity]
                      ) -> Iterator[MutableEntity]:
        """Not supported by this linker.

        Raises:
            NoTrainingException: Always (as soon as the method is called).
        """
        raise NoTrainingException("Training is not supported for this linker")
91+
92+
93+
class NoTrainingException(ValueError):
    """Raised when training is requested from a linker that cannot train."""

medcat-v2/medcat/components/types.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,10 @@ def train(self, cui: str,
213213
"medcat2_embedding_linker": (
214214
"medcat.components.linking.embedding_linker",
215215
"Linker.create_new_component"),
216+
# primary name only
217+
"primary_name_only_linker": (
218+
"medcat.components.linking.only_primary_name_linker",
219+
"PrimNameLinker.create_new_component"),
216220
}
217221

218222

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import os
2+
3+
from medcat.cdb import CDB
4+
from medcat.cat import CAT
5+
from medcat.vocab import Vocab
6+
from medcat.components.linking.only_primary_name_linker import (
7+
PrimNameLinker)
8+
9+
import unittest
10+
11+
from ... import UNPACKED_EXAMPLE_MODEL_PACK_PATH
12+
13+
14+
# Path of the "cdb" entry inside the unpacked example model pack.
EXAMPLE_CDB_PATH = os.path.join(UNPACKED_EXAMPLE_MODEL_PACK_PATH, "cdb")
15+
# Path of the "vocab" entry inside the unpacked example model pack.
EXAMPLE_VOCAB_PATH = os.path.join(UNPACKED_EXAMPLE_MODEL_PACK_PATH, "vocab")
16+
17+
18+
class PrimaryNamesLinkerTests(unittest.TestCase):
    """Smoke test for a CAT instance configured with ``PrimNameLinker``."""
    TEXT = (
        "Man was diagnosed with severe kidney failure and acute diabetes "
        "and presented with a light fever")

    @classmethod
    def setUpClass(cls):
        """Load the example vocab and CDB and build a CAT using the linker."""
        example_vocab = Vocab.load(EXAMPLE_VOCAB_PATH)
        example_cdb = CDB.load(EXAMPLE_CDB_PATH)
        # select the primary-name-only linker before the CAT is built
        linking_cnf = example_cdb.config.components.linking
        linking_cnf.comp_name = PrimNameLinker.name
        cls.cat = CAT(example_cdb, example_vocab)

    def test_gets_entities(self):
        """The example text should produce a non-empty set of entities."""
        output = self.cat.get_entities(self.TEXT)
        self.assertTrue(output)
        found = output["entities"]
        self.assertTrue(len(found))

0 commit comments

Comments
 (0)