CU-86995ddec: Extra args on the train to pass in dataset to be split, or train / test files

Tom Searle · Tom Searle · commit 98302e172783 · 2025-05-22T14:31:46.000+01:00
diff --git a/medcat-v1/medcat/datasets/transformers_ner.py b/medcat-v1/medcat/datasets/transformers_ner.py
@@ -71,6 +71,12 @@ def _split_generators(self, dl_manager): # noqa
                     "filepaths": self.config.data_files['train'],
                 },
             ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepaths": self.config.data_files['test'],
+                },
+            ),
         ]
 
     def _generate_examples(self, filepaths): # noqa
diff --git a/medcat-v1/medcat/ner/transformers_ner.py b/medcat-v1/medcat/ner/transformers_ner.py
@@ -176,7 +176,9 @@ def train(self,
               ignore_extra_labels=False,
               dataset=None,
               meta_requirements=None,
-              trainer_callbacks: Optional[List[TrainerCallback]]=None) -> Tuple:
+              trainer_callbacks: Optional[List[TrainerCallback]]=None,
+              train_json_path: str=None,
+              test_json_path: str=None) -> Tuple:
         """Train or continue training a model give a json_path containing a MedCATtrainer export. It will
         continue training if an existing model is loaded or start new training if the model is blank/new.
 
@@ -186,21 +188,27 @@ def train(self,
             ignore_extra_labels:
                 Makes only sense when an existing deid model was loaded and from the new data we want to ignore
                 labels that did not exist in the old model.
-            dataset: Defaults to None.
+            dataset: Defaults to None. Will be split by self.config.general['test_size'] into train and test datasets.
             meta_requirements: Defaults to None
             trainer_callbacks (List[TrainerCallback]):
                 A list of trainer callbacks for collecting metrics during the training at the client side. The
                 transformers Trainer object will be passed in when each callback is called.
-
+            train_json_path (str): Defaults to None. If provided, will be used as the training dataset json_path to load from
+            test_json_path (str): Defaults to None. If provided, will be used as the test dataset json_path to load from
         Returns:
             Tuple: The dataframe, examples, and the dataset
         """
 
-        if dataset is None and json_path is not None:
+        if dataset is None:
             # Load the medcattrainer export
-            json_path = self._prepare_dataset(json_path, ignore_extra_labels=ignore_extra_labels,
+            if json_path is not None:
+                json_path = self._prepare_dataset(json_path, ignore_extra_labels=ignore_extra_labels,
                                               meta_requirements=meta_requirements, file_name='data_eval.json')
-            # Load dataset
+            elif test_json_path is not None and train_json_path is not None:
+                train_json_path = self._prepare_dataset(train_json_path, ignore_extra_labels=ignore_extra_labels,
+                                                meta_requirements=meta_requirements, file_name='data_train.json')
+                test_json_path = self._prepare_dataset(test_json_path, ignore_extra_labels=ignore_extra_labels,
+                                                meta_requirements=meta_requirements, file_name='data_test.json')  
 
             # NOTE: The following is for backwards comppatibility
             #       in datasets==2.20.0 `trust_remote_code=True` must be explicitly
@@ -212,13 +220,21 @@ def train(self,
                 ds_load_dataset = partial(datasets.load_dataset, trust_remote_code=True)
             else:
                 ds_load_dataset = datasets.load_dataset
-            dataset = ds_load_dataset(os.path.abspath(transformers_ner.__file__),
-                                      data_files={'train': json_path}, # type: ignore
-                                      split='train',
-                                      cache_dir='/tmp/')
-            # We split before encoding so the split is document level, as encoding
-            #does the document splitting into max_seq_len
-            dataset = dataset.train_test_split(test_size=self.config.general['test_size']) # type: ignore
+            
+            if json_path:
+                dataset = ds_load_dataset(os.path.abspath(transformers_ner.__file__),
+                                        data_files={'train': json_path}, # type: ignore
+                                        split='train',
+                                        cache_dir='/tmp/')
+                # We split before encoding so the split is document level, as encoding
+                # does the document splitting into max_seq_len
+                dataset = dataset.train_test_split(test_size=self.config.general['test_size']) # type: ignore
+            elif train_json_path and test_json_path:
+                dataset = ds_load_dataset(os.path.abspath(transformers_ner.__file__), 
+                                          data_files={'train': train_json_path, 'test': test_json_path}, # type: ignore 
+                                          cache_dir='/tmp/')
+            else:
+                raise ValueError("Either json_path or train_json_path and test_json_path must be provided when no dataset is provided")
 
         # Update labelmap in case the current dataset has more labels than what we had before
         self.tokenizer.calculate_label_map(dataset['train'])
@@ -520,3 +536,27 @@ def __call__(self, doc: Doc) -> Doc:
 def func_has_kwarg(func: Callable, keyword: str):
     sig = inspect.signature(func)
     return keyword in sig.parameters
+
+
+if __name__ == "__main__":
+    import json
+    from copy import copy
+    from medcat.utils.ner.deid import DeIdModel
+
+    mct_export = json.load(open('/Users/k1897038/Downloads/MedCAT_Export_With_Text_2025-03-28_18_49_30.json'))
+
+    train_set = {'projects': [copy(mct_export['projects'][0])]}
+    train_set['projects'][0]['documents'] = mct_export['projects'][0]['documents'][0:8]
+
+    test_set = {'projects': [copy(mct_export['projects'][0])]}
+    test_set['projects'][0]['documents'] = mct_export['projects'][0]['documents'][8:]
+
+    json.dump(train_set, open('train_set.json', 'w'))
+    json.dump(test_set, open('test_set.json', 'w'))
+    
+    train_json_path = 'train_set.json'
+    test_json_path = 'test_set.json'
+    deid_model_path = '/Users/k1897038/Documents/cogstack_docs/medcat_models/medcat_deid_model_691c3f6a6e5400e7.zip'
+    deid_cat = DeIdModel.load_model_pack(deid_model_path) 
+    deid_cat.train(train_json_path=train_json_path, test_json_path=test_json_path)
+
diff --git a/medcat-v1/medcat/utils/ner/deid.py b/medcat-v1/medcat/utils/ner/deid.py
@@ -62,9 +62,16 @@ class DeIdModel(NerModel):
     def __init__(self, cat: CAT) -> None:
         self.cat = cat
 
-    def train(self, json_path: Union[str, list, None],
+    def train(self, json_path: Union[str, list, None]=None, 
+              train_json_path: Union[str, list, None]=None, 
+              test_json_path: Union[str, list, None]=None, 
               *args, **kwargs) -> Tuple[Any, Any, Any]:
-        return super().train(json_path, *args, train_nr=0, **kwargs)  # type: ignore
+        assert not all([json_path, train_json_path, test_json_path]), \
+                "Either json_path or train_json_path and test_json_path must be provided when no dataset is provided"
+
+        return super().train(json_path=json_path, 
+                             train_json_path=train_json_path, 
+                             test_json_path=test_json_path, *args, **kwargs)  # type: ignore
 
     def deid_text(self, text: str, redact: bool = False) -> str:
         """Deidentify text and potentially redact information.