@@ -1,16 +1,28 @@
+from typing import Dict, List
 from sklearn.metrics import classification_report
 import numpy as np
 import pandas as pd
 from collections import defaultdict
 from scipy.special import softmax
 import logging

+from medcat.cdb import CDB
+

 logger = logging.getLogger(__name__)


 def metrics(p, return_df=False, plus_recall=0, tokenizer=None, dataset=None, merged_negative={0, 1, -100}, padding_label=-100, csize=15, subword_label=1,
             verbose=False):
+    """
+    Calculate metrics for a model's predictions, based on the tokenized output of a MedCATTrainer project.
+
+    Args:
+        p: The model's predictions.
+        return_df: Whether to return the per-label metrics DataFrame (and collected examples) instead of an aggregated metrics dict.
+        plus_recall: An amount added to bias the model's predictions towards higher recall.
+        tokenizer: The tokenizer used to tokenize the texts.
+    """
     """TODO: This could be done better, for sure. But it works.""" # noqa
     predictions = np.array(p.predictions)
     predictions = softmax(predictions, axis=2)
@@ -117,3 +129,88 @@ def metrics(p, return_df=False, plus_recall=0, tokenizer=None, dataset=None, mer
                 'precison_merged': np.average([x for x in df.p_merged.values if pd.notna(x)])}
     else:
         return df, examples
+
+
+def _anno_within_pred_list(label: Dict, preds: List[Dict]) -> bool:
+    """
+    Check whether a label's span is fully enclosed by any prediction in a list of predictions.
+
+    Args:
+        label (Dict): An annotation, likely from a MedCATTrainer project.
+        preds (List[Dict]): A list of predictions, likely from a cat.__call__.
+
+    Returns:
+        bool: True if the label falls within one of the predictions, False otherwise.
+    """
+    return any(label['start'] >= p['start'] and label['end'] <= p['end'] for p in preds)
+
+
+def evaluate_predictions(true_annotations: List[List[Dict]], all_preds: List[List[Dict]], texts: List[str], deid_cdb: CDB):
+    """
+    Evaluate predictions against sets of labels collected and exported from a MedCATTrainer project.
+    A prediction counts as correct if it fully encloses the label span.
+
+    Args:
+        true_annotations (List[List[Dict]]): Ground truth annotations, one list per text.
+        all_preds (List[List[Dict]]): Model predictions, one list per text.
+        texts (List[str]): The original list of texts.
+        deid_cdb (CDB): Concept database used to resolve CUI preferred names.
+
+    Returns:
+        Tuple[pd.DataFrame, Dict]: A tuple containing a DataFrame of evaluation metrics and a dictionary of missed annotations per CUI.
+    """
+    per_cui_recall = {}
+    per_cui_prec = {}
+    per_cui_recall_merged = {}
+    per_cui_anno_counts = {}
+    per_cui_annos_missed = defaultdict(list)
+    uniq_labels = {p['cui'] for ap in true_annotations for p in ap}
+
+    for cui in uniq_labels:
+        # annos in test set
+        anno_count = sum([len([p for p in cui_annos if p['cui'] == cui]) for cui_annos in true_annotations])
+        pred_counts = sum([len([p for p in d if p['cui'] == cui]) for d in all_preds])
+
+        # print(anno_count)
+        # print(pred_counts)
+
+        # print(f'pred_count: {pred_counts}, anno_count:{anno_count}')
+        per_cui_anno_counts[cui] = anno_count
+
+        doc_annos_left, preds_left, doc_annos_left_any_cui = [], [], []
+
+        for doc_preds, doc_labels, text in zip(all_preds, true_annotations, texts):
+            # num of annos that are not found - recall
+            cui_labels = [lab for lab in doc_labels if lab['cui'] == cui]
+            cui_doc_preds = [p for p in doc_preds if p['cui'] == cui]
+
+            labels_not_found = [label for label in cui_labels if not _anno_within_pred_list(label, cui_doc_preds)]
+            doc_annos_left.append(len(labels_not_found))
+
+            # num of annos that are not found across any cui prediction - recall_merged
+            any_labels_not_found = [label for label in cui_labels if not _anno_within_pred_list(label, doc_preds)]
+            doc_annos_left_any_cui.append(len(any_labels_not_found))
+
+            per_cui_annos_missed[cui].append(any_labels_not_found)
+
+            # num of preds that are incorrect - precision
+            preds_left.append(len([label for label in cui_doc_preds if not _anno_within_pred_list(label, cui_labels)]))
+
+        if anno_count != 0 and pred_counts != 0:
+            per_cui_recall[cui] = (anno_count - sum(doc_annos_left)) / anno_count
+            per_cui_recall_merged[cui] = (anno_count - sum(doc_annos_left_any_cui)) / anno_count
+            per_cui_prec[cui] = (pred_counts - sum(preds_left)) / pred_counts
+        else:
+            per_cui_recall[cui] = 0
+            per_cui_recall_merged[cui] = 0
+            per_cui_prec[cui] = 0
+
+    res_df = pd.DataFrame({
+        'cui': per_cui_recall_merged.keys(),
+        'recall_merged': per_cui_recall_merged.values(),
+        'recall': per_cui_recall.values(),
+        'precision': per_cui_prec.values(),
+        'label_count': per_cui_anno_counts.values()}, index=[deid_cdb.cui2preferred_name[k] for k in per_cui_recall_merged])
+
+    return res_df, per_cui_annos_missed
+
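
A minimal usage sketch follows; it is not part of the commit above. It exercises evaluate_predictions (and, through it, the enclosure check in _anno_within_pred_list) on made-up data. The example text, the CUIs 'N1' and 'D1', and the SimpleNamespace object are illustrative assumptions: the SimpleNamespace only stands in for a real MedCAT CDB because the function reads nothing but cui2preferred_name from it. In practice the DeId model's own CDB and annotations exported from a MedCATTrainer project would be passed.

# Hedged sketch: assumes the two functions above are importable or defined in the session.
from types import SimpleNamespace

texts = ["John Smith attended on 1 Jan 2020."]
# Ground-truth annotations per text, using MedCATTrainer-style span dicts.
true_annotations = [[{'cui': 'N1', 'start': 0, 'end': 10},    # "John Smith"
                     {'cui': 'D1', 'start': 23, 'end': 33}]]  # "1 Jan 2020"
# Model predictions per text: the name span is found, the date is missed entirely.
all_preds = [[{'cui': 'N1', 'start': 0, 'end': 10}]]
# Stand-in for a real CDB; only cui2preferred_name is needed here.
fake_cdb = SimpleNamespace(cui2preferred_name={'N1': 'NAME', 'D1': 'DATE'})

res_df, missed = evaluate_predictions(true_annotations, all_preds, texts, fake_cdb)
print(res_df)        # NAME row: recall/precision 1.0; DATE row: 0.0 (no prediction covers it)
print(missed['D1'])  # the missed date annotation, grouped per document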