Commit df0080c

Author: Tom Searle (committed)
CU-8698jzjj3: flake8 fixes
1 parent 65c84dc commit df0080c

File tree

3 files changed: +79 −75 lines changed


medcat-v1/medcat/ner/transformers_ner.py

Lines changed: 6 additions & 7 deletions
@@ -213,7 +213,7 @@ def train(self,
 train_json_path = self._prepare_dataset(train_json_path, ignore_extra_labels=ignore_extra_labels,
 meta_requirements=meta_requirements, file_name='data_train.json')
 test_json_path = self._prepare_dataset(test_json_path, ignore_extra_labels=ignore_extra_labels,
-meta_requirements=meta_requirements, file_name='data_test.json')
+meta_requirements=meta_requirements, file_name='data_test.json')

 # NOTE: The following is for backwards comppatibility
 # in datasets==2.20.0 `trust_remote_code=True` must be explicitly
@@ -225,7 +225,7 @@ def train(self,
 ds_load_dataset = partial(datasets.load_dataset, trust_remote_code=True)
 else:
 ds_load_dataset = datasets.load_dataset
-
+
 if json_path:
 dataset = ds_load_dataset(os.path.abspath(transformers_ner.__file__),
 data_files={'train': json_path}, # type: ignore
@@ -235,8 +235,8 @@ def train(self,
 # does the document splitting into max_seq_len
 dataset = dataset.train_test_split(test_size=self.config.general['test_size']) # type: ignore
 elif train_json_path and test_json_path:
-dataset = ds_load_dataset(os.path.abspath(transformers_ner.__file__),
-data_files={'train': train_json_path, 'test': test_json_path}, # type: ignore
+dataset = ds_load_dataset(os.path.abspath(transformers_ner.__file__),
+data_files={'train': train_json_path, 'test': test_json_path}, # type: ignore
 cache_dir='/tmp/')
 else:
 raise ValueError("Either json_path or train_json_path and test_json_path must be provided when no dataset is provided")
@@ -248,8 +248,8 @@ def train(self,
 if self.model.num_labels != len(self.tokenizer.label_map):
 logger.warning("The dataset contains labels we've not seen before, model is being reinitialized")
 logger.warning("Model: {} vs Dataset: {}".format(self.model.num_labels, len(self.tokenizer.label_map)))
-self.model = AutoModelForTokenClassification.from_pretrained(self.config.general['model_name'],
-num_labels=len(self.tokenizer.label_map),
+self.model = AutoModelForTokenClassification.from_pretrained(self.config.general['model_name'],
+num_labels=len(self.tokenizer.label_map),
 ignore_mismatched_sizes=True)
 self.tokenizer.cui2name = {k:self.cdb.get_name(k) for k in self.tokenizer.label_map.keys()}

@@ -290,7 +290,6 @@ def train(self,
 # NOTE: this shouldn't really happen, but we'll do this for type safety
 raise ValueError("Output path should not be None!")
 self.save(save_dir_path=os.path.join(output_dir, 'final_model'))
-
 # Run an eval step and return metrics
 p = trainer.predict(encoded_dataset['test']) # type: ignore
 df, examples = metrics(p, return_df=True, tokenizer=self.tokenizer, dataset=encoded_dataset['test'])
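For context, the branch being re-indented here chooses between two ways of supplying training data. A minimal sketch of both call patterns, assuming `ner` is an already-constructed instance of the NER component defined in this module; the file names and the `result` variable are illustrative placeholders, not values from this commit:

```python
# Hedged sketch: `ner` is assumed to be the object whose train() method is
# patched above; the JSON paths are placeholder MedCATTrainer exports.

# Option 1: a single export, split internally via config.general['test_size'].
result = ner.train(json_path='annotations_export.json')

# Option 2: explicit train/test exports loaded as separate dataset splits.
result = ner.train(train_json_path='data_train.json',
                   test_json_path='data_test.json')

# Supplying neither raises the ValueError shown in the diff:
# "Either json_path or train_json_path and test_json_path must be provided
#  when no dataset is provided"
```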

medcat-v1/medcat/utils/ner/deid.py

Lines changed: 57 additions & 51 deletions
@@ -63,7 +63,7 @@ class DeIdModel(NerModel):
 def __init__(self, cat: CAT) -> None:
 self.cat = cat

-def train(self, json_path: Union[str, list, None]=None,
+def train(self, json_path: Union[str, list, None] = None,
 *args, **kwargs) -> Tuple[Any, Any, Any]:
 assert not all([json_path, kwargs.get('train_json_path'), kwargs.get('test_json_path')]), \
 "Either json_path or train_json_path and test_json_path must be provided when no dataset is provided"
@@ -149,7 +149,8 @@ def deid_multi_texts(self,
 return out

 @classmethod
-def load_model_pack(cls, model_pack_path: str, config: Optional[Dict] = None) -> 'DeIdModel':
+def load_model_pack(cls, model_pack_path: str,
+config: Optional[Dict] = None) -> 'DeIdModel':
 """Load DeId model from model pack.

 The method first loads the CAT instance.
@@ -167,7 +168,7 @@ def load_model_pack(cls, model_pack_path: str, config: Optional[Dict] = None) ->
 Returns:
 DeIdModel: The resulting DeI model.
 """
-ner_model = NerModel.load_model_pack(model_pack_path,config=config)
+ner_model = NerModel.load_model_pack(model_pack_path, config=config)
 cat = ner_model.cat
 if not cls._is_deid_model(cat):
 raise ValueError(
@@ -190,25 +191,25 @@ def _get_reason_not_deid(cls, cat: CAT) -> str:


 def match_rules(rules: List[Tuple[str, str]], texts: List[str], cat: CAT):
-"""
-Match a set of rules - pat / cui combos as post processing labels, uses
-a cat DeID model forp pretty name mapping
+"""Match a set of rules - pat / cui combos as post processing labels.
+
+Uses a cat DeID model for pretty name mapping.

 Examples:
->>> rules = [
-('(123) 456-7890', '134'),
-('1234567890', '134'),
-('123.456.7890', '134'),
-('1234567890', '134'),
-('1234567890', '134'),
-]
->>> texts = [
-'My phone number is (123) 456-7890',
-'My phone number is 1234567890',
-'My phone number is 123.456.7890',
-'My phone number is 1234567890',
-]
->>> matches = match_rules(rules, texts, cat)
+>>> rules = [
+('(123) 456-7890', '134'),
+('1234567890', '134'),
+('123.456.7890', '134'),
+('1234567890', '134'),
+('1234567890', '134'),
+]
+>>> texts = [
+'My phone number is (123) 456-7890',
+'My phone number is 1234567890',
+'My phone number is 123.456.7890',
+'My phone number is 1234567890',
+]
+>>> matches = match_rules(rules, texts, cat)
 """
 # Iterate through each text and pattern combination
 rule_matches_per_text = []
@@ -217,7 +218,6 @@ def match_rules(rules: List[Tuple[str, str]], texts: List[str], cat: CAT):
 for pattern, concept in rules:
 # Find all matches of current pattern in current text
 text_matches = re.finditer(pattern, text, flags=re.M)
-
 # Add each match with its pattern and text info
 for match in text_matches:
 matches_in_text.append({
@@ -233,31 +233,40 @@ def match_rules(rules: List[Tuple[str, str]], texts: List[str], cat: CAT):
 return rule_matches_per_text


-def merge_preds(model_preds_by_text: List[List[Dict]], rule_matches_per_text: List[List[Dict]], accept_preds=True):
-"""
-Merge predictions from rule based and deID model predictions for further evaluation
+def merge_preds(model_preds_by_text: List[List[Dict]],
+rule_matches_per_text: List[List[Dict]],
+accept_preds: bool = True):
+"""Merge predictions from rule based and deID model predictions.

-Args:
-model_preds_by_text (List[Dict]): list of predictions from `cat.get_entities()`, then `[list(m['entities'].values()) for m in model_preds]`
-rule_matches_by_text (List[Dict]): list of predictions from output of running `match_rules`
-accept_preds (bool): uses the predicted label from the model, model_preds_by_text, over the rule matches if they overlap. Defaults to using model preds over rules.
+Args:
+model_preds_by_text (List[Dict]): list of predictions from
+`cat.get_entities()`, then `[list(m['entities'].values()) for m in model_preds]`
+rule_matches_by_text (List[Dict]): list of predictions from output of
+running `match_rules`
+accept_preds (bool): uses the predicted label from the model,
+model_preds_by_text, over the rule matches if they overlap.
+Defaults to using model preds over rules.

 Examples:
->>> # a list of lists of predictions from `cat.get_entities()`
->>> model_preds_by_text = [
-[
-{'cui': '134', 'start': 10, 'end': 20, 'acc': 1.0, 'pretty_name': 'Phone Number'},
-{'cui': '134', 'start': 25, 'end': 35, 'acc': 1.0, 'pretty_name': 'Phone Number'}
+>>> # a list of lists of predictions from `cat.get_entities()`
+>>> model_preds_by_text = [
+[
+{'cui': '134', 'start': 10, 'end': 20, 'acc': 1.0,
+'pretty_name': 'Phone Number'},
+{'cui': '134', 'start': 25, 'end': 35, 'acc': 1.0,
+'pretty_name': 'Phone Number'}
+]
 ]
-]
->>> # a list of lists of predictions from `match_rules`
->>> rule_matches_by_text = [
-[
-{'cui': '134', 'start': 10, 'end': 20, 'acc': 1.0, 'pretty_name': 'Phone Number'},
-{'cui': '134', 'start': 25, 'end': 35, 'acc': 1.0, 'pretty_name': 'Phone Number'}
+>>> # a list of lists of predictions from `match_rules`
+>>> rule_matches_by_text = [
+[
+{'cui': '134', 'start': 10, 'end': 20, 'acc': 1.0,
+'pretty_name': 'Phone Number'},
+{'cui': '134', 'start': 25, 'end': 35, 'acc': 1.0,
+'pretty_name': 'Phone Number'}
+]
 ]
-]
->>> merged_preds = merge_preds(model_preds_by_text, rule_matches_by_text)
+>>> merged_preds = merge_preds(model_preds_by_text, rule_matches_by_text)
 """
 all_preds = []
 if accept_preds:
@@ -266,28 +275,25 @@ def merge_preds(model_preds_by_text: List[List[Dict]], rule_matches_per_text: Li
 else:
 labels1 = rule_matches_per_text
 labels2 = model_preds_by_text
-
 for matches_text1, matches_text2 in zip(labels1, labels2):
 # Function to check if two spans overlap
 def has_overlap(span1, span2):
-return not (span1['end'] <= span2['start'] or span2['end'] <= span1['start'])
-
+return not (span1['end'] <= span2['start'] or
+span2['end'] <= span1['start'])
+
 # Mark model predictions that overlap with rule matches
-
 to_remove = set()
 for text_match1 in matches_text1:
 for i, text_match2 in enumerate(matches_text2):
 if has_overlap(text_match1, text_match2):
 to_remove.add(i)
-
+
 # Keep only non-overlapping model predictions
-matches_text2 = [text_match for i, text_match in enumerate(matches_text2) if i not in to_remove]
-
+matches_text2 = [text_match for i, text_match in
+enumerate(matches_text2) if i not in to_remove]
+
 # merge preds and sort on start
 merged_preds = matches_text1 + matches_text2
 merged_preds.sort(key=lambda x: x['start'])
 all_preds.append(merged_preds)
 return all_preds
-
-
-
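Taken together, the functions touched in this file form a small rule-plus-model de-identification pipeline. A minimal usage sketch assembled from the docstring examples above, assuming the module path `medcat.utils.ner.deid`; the model-pack path and the phone-number regex rule are illustrative placeholders, not values from this commit:

```python
from medcat.utils.ner.deid import DeIdModel, match_rules, merge_preds

# Placeholder path to a trained DeID model pack (assumption for illustration).
deid_model = DeIdModel.load_model_pack('<path-to-deid-model-pack>')
cat = deid_model.cat

rules = [(r'\(\d{3}\) \d{3}-\d{4}', '134')]   # (regex pattern, CUI) pairs
texts = ['My phone number is (123) 456-7890']

# Rule-based matches; the CAT instance supplies pretty names for the CUIs.
rule_matches_per_text = match_rules(rules, texts, cat)

# Model predictions, reshaped as described in the merge_preds docstring.
model_preds = [cat.get_entities(t) for t in texts]
model_preds_by_text = [list(m['entities'].values()) for m in model_preds]

# Merge the two, preferring model predictions on overlapping spans.
merged = merge_preds(model_preds_by_text, rule_matches_per_text, accept_preds=True)
```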

medcat-v1/medcat/utils/ner/metrics.py

Lines changed: 16 additions & 17 deletions
@@ -116,7 +116,7 @@ def metrics(p, return_df=False, plus_recall=0, tokenizer=None, dataset=None, mer
 for key in _cr:
 cui = ilabel_map[key]
 p_merged = tp_all / (tp_all + fp_all) if (tp_all + fp_all) > 0 else 0
-data.append([cui, tokenizer.cui2name.get(cui, cui), _cr[key]['precision'],
+data.append([cui, tokenizer.cui2name.get(cui, cui), _cr[key]['precision'],
 _cr[key]['recall'], _cr[key]['f1-score'], _cr[key]['support'], _cr[key]['r_merged'], p_merged])

 df = pd.DataFrame(data[1:], columns=data[0])
@@ -133,7 +133,7 @@ def metrics(p, return_df=False, plus_recall=0, tokenizer=None, dataset=None, mer

 def _anno_within_pred_list(label: Dict, preds: List[Dict]) -> bool:
 """
-Check if a label is within a list of predictions,
+Check if a label is within a list of predictions,

 Args:
 label (Dict): an annotation likely from a MedCATTrainer project
@@ -147,9 +147,9 @@ def _anno_within_pred_list(label: Dict, preds: List[Dict]) -> bool:

 def evaluate_predictions(true_annotations: List[List[Dict]], all_preds: List[List[Dict]], texts: List[str], deid_cdb: CDB):
 """
-Evaluate predictions against sets of collected labels as collected and output from a MedCATTrainer project.
+Evaluate predictions against sets of collected labels as collected and output from a MedCATTrainer project.
 Counts predictions as correct if the prediction fully encloses the label.
-
+
 Args:
 true_annotations (List[List[Dict]]): Ground truth predictions by text
 all_preds (List[List[Dict]]): Model predictions by text
@@ -165,37 +165,37 @@ def evaluate_predictions(true_annotations: List[List[Dict]], all_preds: List[Lis
 per_cui_anno_counts = {}
 per_cui_annos_missed = defaultdict(list)
 uniq_labels = set([p['cui'] for ap in true_annotations for p in ap])
-
+
 for cui in uniq_labels:
 # annos in test set
 anno_count = sum([len([p for p in cui_annos if p['cui'] == cui]) for cui_annos in true_annotations])
 pred_counts = sum([len([p for p in d if p['cui'] == cui]) for d in all_preds])
-
+
 # print(anno_count)
 # print(pred_counts)
-
+
 # print(f'pred_count: {pred_counts}, anno_count:{anno_count}')
 per_cui_anno_counts[cui] = anno_count
-
+
 doc_annos_left, preds_left, doc_annos_left_any_cui = [], [], []
-
+
 for doc_preds, doc_labels, text in zip(all_preds, true_annotations, texts):
 # num of annos that are not found - recall
-cui_labels = [l for l in doc_labels if l['cui'] == cui]
-cui_doc_preds = [p for p in doc_preds if p['cui'] == cui]
-
+cui_labels = [label for label in doc_labels if label['cui'] == cui]
+cui_doc_preds = [pred for pred in doc_preds if pred['cui'] == cui]
+
 labels_not_found = [label for label in cui_labels if not _anno_within_pred_list(label, cui_doc_preds)]
 doc_annos_left.append(len(labels_not_found))
-
+
 # num of annos that are not found across any cui prediction - recall_merged
 any_labels_not_found = [label for label in cui_labels if not _anno_within_pred_list(label, doc_preds)]
 doc_annos_left_any_cui.append(len(any_labels_not_found))

 per_cui_annos_missed[cui].append(any_labels_not_found)
-
+
 # num of preds that are incorrect - precision
 preds_left.append(len([label for label in cui_doc_preds if not _anno_within_pred_list(label, cui_labels)]))
-
+
 if anno_count != 0 and pred_counts != 0:
 per_cui_recall[cui] = (anno_count - sum(doc_annos_left)) / anno_count
 per_cui_recall_merged[cui] = (anno_count - sum(doc_annos_left_any_cui)) / anno_count
@@ -211,6 +211,5 @@ def evaluate_predictions(true_annotations: List[List[Dict]], all_preds: List[Lis
 'recall': per_cui_recall.values(),
 'precision': per_cui_prec.values(),
 'label_count': per_cui_anno_counts.values()}, index=[deid_cdb.cui2preferred_name[k] for k in per_cui_recall_merged])
-
-return res_df, per_cui_annos_missed

+return res_df, per_cui_annos_missed
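For completeness, a sketch of how the evaluation entry point cleaned up above is typically driven. The toy annotations, the span offsets, and the way `deid_cdb` is obtained (here via a previously loaded model pack's CAT instance) are assumptions for illustration, not part of this commit:

```python
from medcat.utils.ner.metrics import evaluate_predictions

# One inner list per text, shaped as in the docstring Args above (toy values).
texts = ['My phone number is (123) 456-7890']
true_annotations = [[{'cui': '134', 'start': 19, 'end': 33}]]
all_preds = [[{'cui': '134', 'start': 19, 'end': 33}]]   # e.g. merge_preds output

# The concept database of the DeID model; assumed here to come from the
# DeIdModel loaded in the earlier sketch, used for preferred names in the index.
deid_cdb = deid_model.cat.cdb

res_df, per_cui_annos_missed = evaluate_predictions(
    true_annotations, all_preds, texts, deid_cdb)

print(res_df[['recall', 'precision', 'label_count']])
```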
