Support expansion of transformers ner models to include new concepts (CogStack/MedCAT#519)

baixiac · web-flow · commit ee3c6400d237 · 2025-02-14T10:53:52.000Z
* CU-8697v6qr2 support expansion of transformers ner models to include new concepts
* CU-8697v6qr2 add logging suggested by the review
diff --git a/medcat-v1/medcat/ner/transformers_ner.py b/medcat-v1/medcat/ner/transformers_ner.py
@@ -2,6 +2,7 @@
 import json
 import logging
 import datasets
+import torch
 from spacy.tokens import Doc
 from datetime import datetime
 from typing import Iterable, Iterator, Optional, Dict, List, cast, Union, Tuple, Callable, Type
@@ -330,6 +331,63 @@ def save(self, save_dir_path: str) -> None:
         # This is everything we need to save from the class, we do not
         #save the class itself.
 
+    def expand_model_with_concepts(self, cui2preferred_name: Dict[str, str], use_avg_init: bool = True) -> None:
+        """Expand the model with new concepts and their preferred names, which requires subsequent retraining on the model.
+
+        For every concept ID that is not yet a label of the model, the preferred
+        name is added to the CDB, the ID is appended to the label maps of the
+        model config and of the tokenizer, and the classification layer gains
+        one extra output row. IDs that are already known are skipped with a warning.
+
+        Args:
+            cui2preferred_name(Dict[str, str]):
+                Dictionary where each key is the literal ID of the concept to be added and each value is its preferred name.
+            use_avg_init(bool):
+                Whether to initialise the new row with the average of the existing weights and biases (True)
+                or with values drawn from a standard normal distribution (False). Defaults to True.
+        """
+
+        config = self.model.config
+        classifier = self.model.classifier
+
+        if use_avg_init:
+            # Snapshot the averages once, before anything is appended, so every
+            # new concept starts from the statistics of the original labels only.
+            # Detaching keeps the autograd graph out of this bookkeeping.
+            init_weight = classifier.weight.detach().mean(dim=0, keepdim=True)
+            init_bias = classifier.bias.detach().mean(dim=0, keepdim=True)
+
+        new_cuis = set()
+        for label, preferred_name in cui2preferred_name.items():
+            if label in config.label2id:
+                logger.warning("Concept ID '%s' already exists in the model, skipping...", label)
+                continue
+
+            sname = preferred_name.lower().replace(" ", "~")
+            new_names = {
+                sname: {
+                    "tokens": [],
+                    "snames": [sname],
+                    "raw_name": preferred_name,
+                    "is_upper": True
+                }
+            }
+            self.cdb.add_names(cui=label, names=new_names, name_status="P", full_build=True)
+
+            # The new label takes the next free ID; an empty label map starts at 0.
+            new_label_id = max(config.label2id.values(), default=-1) + 1
+            config.label2id[label] = new_label_id
+            config.id2label[new_label_id] = label
+            self.tokenizer.label_map[label] = new_label_id
+            self.tokenizer.cui2name = {k: self.cdb.get_name(k) for k in self.tokenizer.label_map}
+
+            weight = classifier.weight.detach()
+            bias = classifier.bias.detach()
+            if not use_avg_init:
+                # Move the random row onto the classifier's own device and dtype;
+                # concatenating tensors that live on different devices fails.
+                # The row width is taken from the weight matrix itself.
+                init_weight = torch.randn(1, weight.shape[1]).to(weight)
+                init_bias = torch.randn(1).to(bias)
+            classifier.weight = torch.nn.Parameter(torch.cat((weight, init_weight), 0))
+            classifier.bias = torch.nn.Parameter(torch.cat((bias, init_bias), 0))
+            # Keep the declared sizes in step with the enlarged tensors.
+            self.model.num_labels += 1
+            classifier.out_features += 1
+
+            new_cuis.add(label)
+
+        # Only announce an expansion when at least one concept was really added.
+        if new_cuis:
+            logger.info("Model expanded with the new concept(s): %s and shall be retrained before use.", str(new_cuis))
+
     @classmethod
     def load(cls, save_dir_path: str, config_dict: Optional[Dict] = None) -> "TransformersNER":
         """Load a meta_cat object.
diff --git a/medcat-v1/medcat/utils/ner/model.py b/medcat-v1/medcat/utils/ner/model.py
@@ -76,6 +76,21 @@ def get_entities(self, text: str, *args, **kwargs) -> dict:
         """
         return self.cat.get_entities(text, *args, **kwargs)
 
+    def add_new_concepts(self,
+                         cui2preferred_name: Dict[str, str],
+                         train_nr: int = 0,
+                         with_random_init: bool = False) -> None:
+        """Register new concepts with one of the additional NER components.
+
+        The work is delegated to that component's `expand_model_with_concepts`.
+        The model must be retrained after this call.
+
+        Args:
+            cui2preferred_name(Dict[str, str]): Dictionary where each key is the literal ID of the concept to be added and each value is its preferred name.
+            train_nr (int): Index into cat._addl_ner selecting the NER object that receives the new concepts. Defaults to 0.
+            with_random_init (bool): Use random initialisation for the new concepts instead of the average-based one. Defaults to False.
+        """
+        target_ner = self.cat._addl_ner[train_nr]
+        # The NER component expresses the same choice the other way round.
+        use_avg_init = not with_random_init
+        target_ner.expand_model_with_concepts(cui2preferred_name, use_avg_init=use_avg_init)
+
     @property
     def config(self) -> Config:
         return self.cat.config
diff --git a/medcat-v1/tests/ner/test_transformers_ner.py b/medcat-v1/tests/ner/test_transformers_ner.py
@@ -48,3 +48,20 @@ def on_epoch_end(self, *args, **kwargs) -> None:
         assert dataset["train"].num_rows == 48
         assert dataset["test"].num_rows == 12
         self.assertEqual(tracker.call.call_count, 2)
+
+    def test_expand_model_with_concepts(self):
+        # Two unseen concepts must grow the label count, the classifier output
+        # size and the tokenizer label map by two, and expose both names.
+        new_concepts = {"concept_1": "Preferred Name 1", "concept_2": "Preferred Name 2"}
+        added = len(new_concepts)
+        ner = self.undertest
+
+        # Sizes before the expansion, kept for the comparison below.
+        labels_before = ner.model.num_labels
+        out_features_before = ner.model.classifier.out_features
+        label_map_size_before = len(ner.tokenizer.label_map)
+
+        ner.expand_model_with_concepts(new_concepts)
+
+        assert ner.model.num_labels == labels_before + added
+        assert ner.model.classifier.out_features == out_features_before + added
+        assert len(ner.tokenizer.label_map) == label_map_size_before + added
+        for cui, preferred_name in new_concepts.items():
+            assert ner.tokenizer.cui2name.get(cui) == preferred_name
diff --git a/medcat-v1/tests/utils/ner/test_deid.py b/medcat-v1/tests/utils/ner/test_deid.py
@@ -90,6 +90,13 @@ def test_training(self):
         self.assertIsNotNone(examples)
         self.assertIsNotNone(dataset)
 
+    def test_add_new_concepts(self):
+        # A concept added through the de-id wrapper must show up in the CDB and
+        # in every label registry of the first additional NER component.
+        cui = "CONCEPT"
+        self.deid_model.add_new_concepts({cui: "Concept"}, with_random_init=True)
+
+        cat = self.deid_model.cat
+        self.assertTrue(cui in cat.cdb.cui2names)
+        self.assertEqual(cat.cdb.cui2names[cui], {"concept"})
+        self.assertTrue(cui in cat._addl_ner[0].model.config.label2id)
+        self.assertTrue(cui in cat._addl_ner[0].tokenizer.label_map)
+        self.assertTrue(cui in cat._addl_ner[0].tokenizer.cui2name)
 
 input_text = '''
 James Joyce