Commit 092729d

Author: Tom Searle
CU-8698jzjj3: Add tests and respond to comments
1 parent 3713aa2 · commit 092729d

File tree

3 files changed: +251 -53 lines changed

medcat-v1/medcat/utils/ner/deid.py

Lines changed: 52 additions & 44 deletions
@@ -184,23 +184,25 @@ def _is_deid_model(cls, cat: CAT) -> bool:
     @classmethod
     def _get_reason_not_deid(cls, cat: CAT) -> str:
         if cat.vocab is not None:
-            return "Has vocab"
+            return "Has vocab"
         if len(cat._addl_ner) != 1:
             return f"Incorrect number of addl_ner: {len(cat._addl_ner)}"
         return ""
 
 
-def match_rules(rules: List[Tuple[str, str]], texts: List[str], cat: CAT) -> List[List[Dict]]:
+def match_rules(rules: List[Tuple[str, str]], texts: List[str], cui2preferred_name: Dict[str, str]) -> List[List[Dict]]:
     """Match a set of rules - pat / cui combos as post processing labels.
 
     Uses a cat DeID model for pretty name mapping.
 
     Args:
         rules (List[Tuple[str, str]]): List of tuples of pattern and cui
         texts (List[str]): List of texts to match rules on
-        cat (CAT): The CAT instance
+        cui2preferred_name (Dict[str, str]): Dictionary of CUI to preferred name, likely to be cat.cdb.cui2preferred_name.
 
     Examples:
+        >>> cat = CAT.load_model_pack(model_pack_path)
+        ...
         >>> rules = [
             ('(123) 456-7890', '134'),
             ('1234567890', '134'),
@@ -214,7 +216,7 @@ def match_rules(rules: List[Tuple[str, str]], texts: List[str], cat: CAT) -> Lis
             'My phone number is 123.456.7890',
             'My phone number is 1234567890',
         ]
-        >>> matches = match_rules(rules, texts, cat)
+        >>> matches = match_rules(rules, texts, cat.cdb.cui2preferred_name)
 
     Returns:
         List[List[Dict]]: List of lists of predictions from `match_rules`
@@ -230,21 +232,20 @@ def match_rules(rules: List[Tuple[str, str]], texts: List[str], cat: CAT) -> Lis
         for match in text_matches:
             matches_in_text.append({
                 'source_value': match.group(),
-                'pretty_name': cat.cdb.cui2preferred_name[concept],
+                'pretty_name': cui2preferred_name[concept],
                 'start': match.start(),
                 'end': match.end(),
                 'cui': concept,
-                'acc': 1.0,
-                'soure_value': match.group(0)
+                'acc': 1.0
             })
         rule_matches_per_text.append(matches_in_text)
     return rule_matches_per_text
 
 
-def merge_preds(model_preds_by_text: List[List[Dict]],
-                rule_matches_per_text: List[List[Dict]],
-                accept_preds: bool = True) -> List[List[Dict]]:
-    """Merge predictions from rule based and deID model predictions.
+def merge_all_preds(model_preds_by_text: List[List[Dict]],
+                    rule_matches_per_text: List[List[Dict]],
+                    accept_preds: bool = True) -> List[List[Dict]]:
+    """Convenience method to merge predictions from rule based and deID model predictions.
 
     Args:
         model_preds_by_text (List[Dict]): list of predictions from
@@ -255,56 +256,63 @@ def merge_preds(model_preds_by_text: List[List[Dict]],
             model_preds_by_text, over the rule matches if they overlap.
             Defaults to using model preds over rules.
 
+    Returns:
+        List[List[Dict]]: List of lists of predictions from `merge_all_preds`
+    """
+    assert len(model_preds_by_text) == len(rule_matches_per_text), \
+        "model_preds_by_text and rule_matches_per_text must have the same length as they should be CAT.get_entities and match_rules outputs of the same text"
+    return [merge_preds(model_preds_by_text[i], rule_matches_per_text[i], accept_preds) for i in range(len(model_preds_by_text))]
+
+
+def merge_preds(model_preds: List[Dict],
+                rule_matches: List[Dict],
+                accept_preds: bool = True) -> List[Dict]:
+    """Merge predictions from rule based and deID model predictions.
+
+    Args:
+        model_preds (List[Dict]): predictions from `cat.get_entities()`
+        rule_matches (List[Dict]): predictions from output of running `match_rules` on a text
+        accept_preds (bool): uses the predicted label from the model,
+            model_preds, over the rule matches if they overlap.
+            Defaults to using model preds over rules.
+
     Examples:
-        >>> # a list of lists of predictions from `cat.get_entities()`
-        >>> model_preds_by_text = [
+        >>> # a list of predictions from `cat.get_entities()`
+        >>> model_preds = [
             [
                 {'cui': '134', 'start': 10, 'end': 20, 'acc': 1.0,
                  'pretty_name': 'Phone Number'},
                 {'cui': '134', 'start': 25, 'end': 35, 'acc': 1.0,
                  'pretty_name': 'Phone Number'}
             ]
         ]
-        >>> # a list of lists of predictions from `match_rules`
-        >>> rule_matches_by_text = [
+        >>> # a list of predictions from `match_rules`
+        >>> rule_matches = [
             [
                 {'cui': '134', 'start': 10, 'end': 20, 'acc': 1.0,
                  'pretty_name': 'Phone Number'},
                 {'cui': '134', 'start': 25, 'end': 35, 'acc': 1.0,
                  'pretty_name': 'Phone Number'}
             ]
         ]
-        >>> merged_preds = merge_preds(model_preds_by_text, rule_matches_by_text)
+        >>> merged_preds = merge_preds(model_preds, rule_matches)
 
     Returns:
-        List[List[Dict]]: List of lists of predictions from `merge_preds`
+        List[Dict]: List of predictions from `merge_preds`
     """
-    all_preds = []
     if accept_preds:
-        labels1 = model_preds_by_text
-        labels2 = rule_matches_per_text
+        labels1 = model_preds
+        labels2 = rule_matches
     else:
-        labels1 = rule_matches_per_text
-        labels2 = model_preds_by_text
-    for matches_text1, matches_text2 in zip(labels1, labels2):
-        # Function to check if two spans overlap
-        def has_overlap(span1, span2):
-            return not (span1['end'] <= span2['start'] or
-                        span2['end'] <= span1['start'])
-
-        # Mark model predictions that overlap with rule matches
-        to_remove = set()
-        for text_match1 in matches_text1:
-            for i, text_match2 in enumerate(matches_text2):
-                if has_overlap(text_match1, text_match2):
-                    to_remove.add(i)
-
-        # Keep only non-overlapping model predictions
-        matches_text2 = [text_match for i, text_match in
-                         enumerate(matches_text2) if i not in to_remove]
-
-        # merge preds and sort on start
-        merged_preds = matches_text1 + matches_text2
-        merged_preds.sort(key=lambda x: x['start'])
-        all_preds.append(merged_preds)
-    return all_preds
+        labels1 = rule_matches
+        labels2 = model_preds
+
+    # Keep only non-overlapping model predictions
+    labels2 = [span2 for span2 in labels2
+               if not any(not (span2['end'] <= span1['start'] or span1['end'] <= span2['start'])
+                          for span1 in labels1)]
+    # merge preds and sort on start
+    merged_preds = labels1 + labels2
+    merged_preds.sort(key=lambda x: x['start'])
+    return merged_preds
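
With this change, rule matching and prediction merging no longer need the full CAT instance at call time, only the CUI-to-preferred-name mapping. A minimal usage sketch of the new flow, assuming a DeID model pack is available locally and that per-text model predictions come from `cat.get_entities` as the docstrings describe (the model pack path below is a placeholder; the rules and texts mirror the docstring example):

from medcat.cat import CAT
from medcat.utils.ner.deid import match_rules, merge_all_preds

# Placeholder model pack path; rules are (pattern, CUI) pairs as in the docstring example.
cat = CAT.load_model_pack("deid_model_pack.zip")
rules = [
    ('(123) 456-7890', '134'),
    ('1234567890', '134'),
]
texts = [
    'My phone number is 123.456.7890',
    'My phone number is 1234567890',
]

# One list of entity dicts per text, taken from cat.get_entities (assumed shape:
# each dict carries 'cui', 'start', 'end', 'acc', 'pretty_name').
model_preds_by_text = [list(cat.get_entities(t)['entities'].values()) for t in texts]

# Rule matches per text; only the CUI -> preferred name mapping is passed now.
rule_matches_per_text = match_rules(rules, texts, cat.cdb.cui2preferred_name)

# Merge per text; with accept_preds=True a rule match that overlaps a model
# prediction is dropped in favour of the model prediction, then spans are sorted by start.
merged = merge_all_preds(model_preds_by_text, rule_matches_per_text, accept_preds=True)

Setting accept_preds=False flips the preference, keeping rule matches over any overlapping model predictions.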

medcat-v1/medcat/utils/ner/metrics.py

Lines changed: 4 additions & 6 deletions
@@ -6,8 +6,6 @@
 from scipy.special import softmax
 import logging
 
-from medcat.cdb import CDB
-
 
 logger = logging.getLogger(__name__)
 
@@ -32,7 +30,7 @@ def metrics(p, return_df=False, plus_recall=0, tokenizer=None, dataset=None, mer
     Returns:
         Dict: A dictionary of metrics.
     """
-    """TODO: This could be done better, for sure. But it works.""" # noqa
+
     predictions = np.array(p.predictions)
     predictions = softmax(predictions, axis=2)
     examples = None
@@ -154,7 +152,7 @@ def _anno_within_pred_list(label: Dict, preds: List[Dict]) -> bool:
     return any(label['start'] >= p['start'] and label['end'] <= p['end'] for p in preds)
 
 
-def evaluate_predictions(true_annotations: List[List[Dict]], all_preds: List[List[Dict]], texts: List[str], deid_cdb: CDB):
+def evaluate_predictions(true_annotations: List[List[Dict]], all_preds: List[List[Dict]], texts: List[str], cui2preferred_name: Dict[str, str]):
     """
     Evaluate predictions against sets of collected labels as collected and output from a MedCATTrainer project.
     Counts predictions as correct if the prediction fully encloses the label.
@@ -163,7 +161,7 @@ def evaluate_predictions(true_annotations: List[List[Dict]], all_preds: List[Lis
         true_annotations (List[List[Dict]]): Ground truth predictions by text
         all_preds (List[List[Dict]]): Model predictions by text
         texts (List[str]): Original list of texts
-        deid_cdb (CDB): Concept database
+        cui2preferred_name (Dict[str, str]): Dictionary of CUI to preferred name, likely to be cat.cdb.cui2preferred_name.
 
     Returns:
         Tuple[pd.DataFrame, Dict]: A tuple containing a DataFrame of evaluation metrics and a dictionary of missed annotations per CUI.
@@ -219,6 +217,6 @@ def evaluate_predictions(true_annotations: List[List[Dict]], all_preds: List[Lis
         'recall_merged': per_cui_recall_merged.values(),
         'recall': per_cui_recall.values(),
         'precision': per_cui_prec.values(),
-        'label_count': per_cui_anno_counts.values()}, index=[deid_cdb.cui2preferred_name[k] for k in per_cui_recall_merged])
+        'label_count': per_cui_anno_counts.values()}, index=[cui2preferred_name[k] for k in per_cui_recall_merged])
 
     return res_df, per_cui_annos_missed
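
The `evaluate_predictions` call site changes in the same way: callers hand over the name mapping rather than the concept database. A rough sketch under the assumption that `true_annotations`, `all_preds` and `texts` already hold the per-text lists the docstring describes (the literal values below are illustrative only, and the exact keys a ground-truth annotation needs beyond cui/start/end are an assumption):

from medcat.utils.ner.metrics import evaluate_predictions

# Illustrative per-text inputs following the docstring shapes.
texts = ['My phone number is 123.456.7890']
true_annotations = [[{'cui': '134', 'start': 19, 'end': 31}]]
all_preds = [[{'cui': '134', 'start': 19, 'end': 31, 'acc': 1.0,
               'pretty_name': 'Phone Number'}]]

# Previously the CDB object was passed (deid_cdb); now only the mapping is needed,
# e.g. cat.cdb.cui2preferred_name from a loaded model pack, or a plain dict.
res_df, missed_per_cui = evaluate_predictions(
    true_annotations, all_preds, texts, {'134': 'Phone Number'})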