From 45fa3f476c5c2a3dfe1d9cd971f305636a6185ae Mon Sep 17 00:00:00 2001 From: Mayo Faulkner Date: Thu, 23 Oct 2025 11:24:11 +0100 Subject: [PATCH 1/5] WIP on develop fix qc tests for stimFreeze, pass in custom qc criteria from note --- ibllib/qc/task_metrics.py | 55 ++++++++++++++++-- ibllib/tests/qc/test_critical_reasons.py | 5 +- ibllib/tests/qc/test_task_metrics.py | 73 +++++++++++++++++++++++- ibllib/tests/test_base_tasks.py | 3 + 4 files changed, 127 insertions(+), 9 deletions(-) diff --git a/ibllib/qc/task_metrics.py b/ibllib/qc/task_metrics.py index 549c1a2dd..aec231dde 100644 --- a/ibllib/qc/task_metrics.py +++ b/ibllib/qc/task_metrics.py @@ -47,6 +47,7 @@ >>> outcome, results = qc.run() """ import logging +import json import sys from packaging import version from pathlib import PurePosixPath @@ -61,6 +62,7 @@ from brainbox.behavior.wheel import cm_to_rad, traces_by_trial from ibllib.io.extractors import ephys_fpga from one.alf import spec + from . import base _log = logging.getLogger(__name__) @@ -359,7 +361,7 @@ def run(self, update=False, **kwargs): self.update(outcome, kwargs.get('namespace', self.namespace)) return outcome, results - def compute_session_status(self): + def compute_session_status(self, use_custom=True): """ Compute the overall session QC for each key and aggregates in a single value. @@ -377,9 +379,29 @@ def compute_session_status(self): # Get mean passed of each check, or None if passed is None or all NaN results = {k: None if v is None or np.isnan(v).all() else np.nanmean(v) for k, v in self.passed.items()} + + # If a custom criteria is defined for this session, use it + if use_custom: + custom_criteria = self.get_custom_session_criteria() + if custom_criteria is not None: + self.criteria = custom_criteria + session_outcome, outcomes = compute_session_status_from_dict(results, self.criteria) return session_outcome, results, outcomes + def get_custom_session_criteria(self): + + if self.one: + note_title = f'=== SESSION QC CRITERIA {self.namespace} ===' + query = f'text__icontains,{note_title},object_id,{str(self.eid)}' + notes = self.one.alyx.rest('notes', 'list', django=query) + if len(notes) > 0: + notes = json.loads(notes[0]['text']) + criteria = {k if k == 'default' else f'_{self.namespace}_{k}': v + for k, v in notes['criteria'].items()} + self.log.info('Using custom QC criteria found on Alyx note associated with session') + return criteria + @staticmethod def compute_dataset_qc_status(outcomes, namespace='task'): """Return map of dataset specific QC values. @@ -548,6 +570,8 @@ def compute(self, **kwargs): # === Delays between events checks === def check_stimOn_goCue_delays(data, audio_output='harp', **_): + # TODO check + """ Check the go cue tone occurs less than 10ms before stimulus on. 
@@ -1124,6 +1148,7 @@ def check_n_trial_events(data, **_): intervals = data['intervals'] correct = data['correct'] err_trig = data['errorCueTrigger_times'] + stim_trig = data['stimFreezeTrigger_times'] # Exclude these fields; valve and errorCue times are the same as feedback_times and we must # test errorCueTrigger_times separately @@ -1132,13 +1157,25 @@ def check_n_trial_events(data, **_): 'wheelMoves_peakVelocity_times', 'valveOpen_times', 'wheelMoves_peakAmplitude', 'wheelMoves_intervals', 'wheel_timestamps', 'stimFreeze_times'] events = [k for k in data.keys() if k.endswith('_times') and k not in exclude] + exclude_nogo = exclude + ['stimFreezeTrigger_times', 'firstMovement_times'] + events_nogo = [k for k in data.keys() if k.endswith('_times') and k not in exclude_nogo] + metric = np.zeros(data['intervals'].shape[0], dtype=bool) - # For each trial interval check that one of each trial event occurred. For incorrect trials, - # check the error cue trigger occurred within the interval, otherwise check it is nan. + # For each trial interval check that one of each trial event occurred. + # For incorrect trials, check the error cue trigger occurred within the interval, otherwise check it is nan. + # For no go trials, stimFreeze is nan for i, (start, end) in enumerate(intervals): - metric[i] = (all([start < data[k][i] < end for k in events]) and - (np.isnan(err_trig[i]) if correct[i] else start < err_trig[i] < end)) + if correct[i]: + met = all([start < data[k][i] < end for k in events]) and np.isnan(err_trig[i]) + else: + if data['choice'][i] != 0: + met = all([start < data[k][i] < end for k in events]) and (start < err_trig[i] < end) + else: + met = (all([start < data[k][i] < end for k in events_nogo]) and + np.isnan(stim_trig[i]) and (start < err_trig[i] < end)) + metric[i] = met + passed = metric.astype(bool) assert intervals.shape[0] == len(metric) == len(passed) return metric, passed @@ -1228,9 +1265,13 @@ def check_errorCue_delays(data, audio_output='harp', **_): percentile of delays over 500 training sessions using the Xonar soundcard. """ threshold = 0.0015 if audio_output.lower() == 'harp' else 0.062 + # There are some instances when the mouse responds before the goCue ttl is finished. In these cases the errorCue + # tone is delayed. # TODO can this be a metric and also check xonar + idx = data['response_times'] - data['goCue_times'] < 0.105 # 0.1 is the length of the goCue tone, add a little buffer metric = np.nan_to_num(data['errorCue_times'] - data['errorCueTrigger_times'], nan=np.inf) passed = ((metric <= threshold) & (metric > 0)).astype(float) passed[data['correct']] = metric[data['correct']] = np.nan + passed[idx] = metric[idx] = np.nan assert data['intervals'].shape[0] == len(metric) == len(passed) return metric, passed @@ -1287,6 +1328,7 @@ def check_stimOff_delays(data, **_): def check_stimFreeze_delays(data, **_): + # TODO account for nogo where we don't have stimFreeze """Check the stimulus freezes within 150ms of the intended time. 
Check that the time difference between the visual stimulus freeze-command @@ -1307,6 +1349,9 @@ def check_stimFreeze_delays(data, **_): """ metric = np.nan_to_num(data['stimFreeze_times'] - data['stimFreezeTrigger_times'], nan=np.inf) passed = (metric <= 0.15) & (metric > 0) + # Remove no_go trials (stimFreeze not triggered in no-go trials) + passed[data['choice'] == 0] = np.nan + assert data['intervals'].shape[0] == len(metric) == len(passed) return metric, passed diff --git a/ibllib/tests/qc/test_critical_reasons.py b/ibllib/tests/qc/test_critical_reasons.py index cf4963eb4..93b9c5deb 100644 --- a/ibllib/tests/qc/test_critical_reasons.py +++ b/ibllib/tests/qc/test_critical_reasons.py @@ -218,7 +218,10 @@ def test_upload_existing_note(self): assert expected_dict == note_dict def tearDown(self) -> None: - self.one.alyx.rest('sessions', 'delete', id=self.eid) + + notes = self.one.alyx.rest('notes', 'list', django=f'object_id,{self.eid}', no_cache=True) + for n in notes: + self.one.alyx.rest('notes', 'delete', id=n['id']) if __name__ == '__main__': diff --git a/ibllib/tests/qc/test_task_metrics.py b/ibllib/tests/qc/test_task_metrics.py index 0dbcb32d7..98f50eb63 100644 --- a/ibllib/tests/qc/test_task_metrics.py +++ b/ibllib/tests/qc/test_task_metrics.py @@ -10,6 +10,7 @@ from one.api import ONE from one.alf import spec from ibllib.tests import TEST_DB +from ibllib.tests.fixtures.utils import register_new_session from ibllib.qc import task_metrics as qcmetrics from brainbox.behavior.wheel import cm_to_rad @@ -128,7 +129,8 @@ def test_update_dataset_qc(self): self.assertRaises(AssertionError, qcmetrics.update_dataset_qc, qc, registered_datasets.copy(), one) -class TestTaskMetrics(unittest.TestCase): +class TaskQCTestData(unittest.TestCase): + def setUp(self): self.data = self.load_fake_bpod_data() self.wheel_gain = 4 @@ -188,6 +190,8 @@ def load_fake_bpod_data(n=5): ) data['feedback_times'] = data['response_times'] + resp_feeback_delay data['stimFreeze_times'] = data['response_times'] + 1e-2 + # StimFreeze for no go trials is nan + data['stimFreeze_times'][0] = np.nan data['stimFreezeTrigger_times'] = data['stimFreeze_times'] - trigg_delay data['feedbackType'] = np.vectorize(lambda x: -1 if x == 0 else x)(data['correct']) outcome = data['feedbackType'].copy() @@ -281,6 +285,9 @@ def add_frag(t, p): 'firstMovement_times': np.array(movement_times) } + +class TestTaskMetrics(TaskQCTestData): + def test_check_stimOn_goCue_delays(self): metric, passed = qcmetrics.check_stimOn_goCue_delays(self.data) self.assertTrue(np.allclose(metric, 0.0011), 'failed to return correct metric') @@ -303,7 +310,9 @@ def test_check_response_feedback_delays(self): def test_check_response_stimFreeze_delays(self): metric, passed = qcmetrics.check_response_stimFreeze_delays(self.data) - self.assertTrue(np.allclose(metric, 1e-2), 'failed to return correct metric') + self.assertTrue(np.allclose(metric[1:], 1e-2), 'failed to return correct metric') + # No go trial has inf value as stimFreeze values are nan + self.assertEqual(metric[0], np.inf) # Set incorrect timestamp (stimFreeze occurs before response) self.data['stimFreeze_times'][-1] = self.data['response_times'][-1] - 1e-4 metric, passed = qcmetrics.check_response_stimFreeze_delays(self.data) @@ -414,7 +423,9 @@ def test_check_stimOff_delays(self): def test_check_stimFreeze_delays(self): metric, passed = qcmetrics.check_stimFreeze_delays(self.data) - self.assertTrue(np.allclose(metric, 1e-4), 'failed to return correct metric') + self.assertTrue(np.allclose(metric[1:], 
1e-4), 'failed to return correct metric')
+        # No go trial has inf value as stimFreeze values are nan
+        self.assertEqual(metric[0], np.inf)
         # Set incorrect timestamp
         self.data['stimFreeze_times'][-1] = self.data['stimFreezeTrigger_times'][-1] + 0.2
         metric, passed = qcmetrics.check_stimFreeze_delays(self.data)
@@ -669,5 +680,61 @@ def test_compute(self):
         self.assertEqual(outcomes['_task_habituation_time'], spec.QC.NOT_SET)
 
 
+class TestTaskQCWithCustomCriteria(TaskQCTestData):
+    """
+    Test running the task QC but with custom criteria stored in a note attached to the session
+    """
+
+    def setUp(self):
+
+        super().setUp()
+
+        self.one = ONE(**TEST_DB)
+        _, self.eid = register_new_session(self.one)
+        self.qc = qcmetrics.TaskQC(self.eid, one=self.one)
+        self.qc.extractor = Bunch({'data': self.data, 'settings': {}})
+
+        import json
+        note_title = '=== SESSION QC CRITERIA task ==='
+        note_text = {
+            "title": note_title,
+            "criteria": {'default': {'WARNING': 1, 'FAIL': 0.5}}
+        }
+
+        note_data = {'user': self.one.alyx.user,
+                     'content_type': 'session',
+                     'object_id': self.eid,
+                     'text': f'{json.dumps(note_text)}'}
+
+        self.note = self.one.alyx.rest('notes', 'create', data=note_data)
+
+    def test_compute(self):
+
+        # Build a dict of metrics and passed for a few of the task QCs
+        metrics = dict()
+        passed = dict()
+
+        metrics['_task_stimOff_itiIn_delays'], passed['_task_stimOff_itiIn_delays'] = (
+            qcmetrics.check_stimOff_itiIn_delays(self.data))
+
+        metrics['_task_stimOn_delays'], passed['_task_stimOn_delays'] = (
+            qcmetrics.check_stimOff_itiIn_delays(self.data))
+
+        self.qc.metrics = metrics
+        self.qc.passed = passed
+
+        # Outcome using default BWM criteria
+        outcome, *_ = self.qc.compute_session_status(use_custom=False)
+        self.assertEqual(spec.QC.PASS, outcome)
+
+        # Outcome using custom note criteria
+        outcome, *_ = self.qc.compute_session_status(use_custom=True)
+        self.assertEqual(spec.QC.WARNING, outcome)
+
+    def tearDown(self):
+        self.one.alyx.rest('notes', 'delete', id=self.note['id'])
+        self.one.alyx.rest('sessions', 'delete', id=self.eid)
+
+
 if __name__ == '__main__':
     unittest.main(exit=False, verbosity=2)
diff --git a/ibllib/tests/test_base_tasks.py b/ibllib/tests/test_base_tasks.py
index 74e8a4471..78b787130 100644
--- a/ibllib/tests/test_base_tasks.py
+++ b/ibllib/tests/test_base_tasks.py
@@ -82,6 +82,9 @@ def tearDownClass(cls) -> None:
         if cls.tmpdir:
             cls.tmpdir.cleanup()
         if cls.one and cls.eid:
+            notes = cls.one.alyx.rest('notes', 'list', django=f'object_id,{cls.eid}', no_cache=True)
+            for n in notes:
+                cls.one.alyx.rest('notes', 'delete', id=n['id'])
             cls.one.alyx.rest('sessions', 'delete', id=cls.eid)
 

From 29845cfee8eeac9dc4195ad05f1dbcf21cc71186 Mon Sep 17 00:00:00 2001
From: Mayo Faulkner
Date: Fri, 24 Oct 2025 16:18:41 +0100
Subject: [PATCH 2/5] cast to float for correct reassignment

---
 ibllib/qc/task_metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ibllib/qc/task_metrics.py b/ibllib/qc/task_metrics.py
index aec231dde..335a97eb7 100644
--- a/ibllib/qc/task_metrics.py
+++ b/ibllib/qc/task_metrics.py
@@ -1348,7 +1348,7 @@ def check_stimFreeze_delays(data, **_):
         'intervals')
     """
     metric = np.nan_to_num(data['stimFreeze_times'] - data['stimFreezeTrigger_times'], nan=np.inf)
-    passed = (metric <= 0.15) & (metric > 0)
+    passed = ((metric <= 0.15) & (metric > 0)).astype(float)
     # Remove no_go trials (stimFreeze not triggered in no-go trials)
     passed[data['choice'] == 0] = np.nan
 

From e803c6cc433ddf5f272c58f791987e8a277a0af8 Mon Sep 17 00:00:00 2001
From: Mayo Faulkner
Date: Fri, 24 Oct 2025 16:25:10 +0100
Subject: [PATCH 3/5] docstring

---
 ibllib/qc/task_metrics.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/ibllib/qc/task_metrics.py b/ibllib/qc/task_metrics.py
index 335a97eb7..23bd9777e 100644
--- a/ibllib/qc/task_metrics.py
+++ b/ibllib/qc/task_metrics.py
@@ -390,7 +390,15 @@ def compute_session_status(self, use_custom=True):
         return session_outcome, results, outcomes
 
     def get_custom_session_criteria(self):
+        """
+        Use custom criteria stored in a note associated with the session to evaluate the
+        QC outcome.
+        Returns
+        -------
+        dict
+            The QC criteria to use
+        """
         if self.one:
             note_title = f'=== SESSION QC CRITERIA {self.namespace} ==='
             query = f'text__icontains,{note_title},object_id,{str(self.eid)}'
             notes = self.one.alyx.rest('notes', 'list', django=query)
@@ -570,8 +578,6 @@ def compute(self, **kwargs):
 
 # === Delays between events checks ===
 def check_stimOn_goCue_delays(data, audio_output='harp', **_):
-    # TODO check
-
     """
     Check the go cue tone occurs less than 10ms before stimulus on.
 
@@ -1328,7 +1334,6 @@ def check_stimOff_delays(data, **_):
 
 
 def check_stimFreeze_delays(data, **_):
-    # TODO account for nogo where we don't have stimFreeze
     """Check the stimulus freezes within 150ms of the intended time.
 
     Check that the time difference between the visual stimulus freeze-command

From c83147515a58d6931c2dd97b5afebbff89ecca6a Mon Sep 17 00:00:00 2001
From: Mayo Faulkner
Date: Fri, 24 Oct 2025 16:28:45 +0100
Subject: [PATCH 4/5] flake

---
 ibllib/qc/task_metrics.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/ibllib/qc/task_metrics.py b/ibllib/qc/task_metrics.py
index 23bd9777e..6df9dddb9 100644
--- a/ibllib/qc/task_metrics.py
+++ b/ibllib/qc/task_metrics.py
@@ -391,8 +391,7 @@ def compute_session_status(self, use_custom=True):
 
     def get_custom_session_criteria(self):
         """
-        Use custom criteria stored in a note associated with the session to evaluate the
-        QC outcome.
+        Use custom QC criteria associated with the session.
         Returns
         -------
         dict

From dac67b710d4fe308d1340e79bc3fd6546e1df910 Mon Sep 17 00:00:00 2001
From: Mayo Faulkner
Date: Thu, 30 Oct 2025 13:15:46 +0000
Subject: [PATCH 5/5] account for no go trial

---
 ibllib/tests/qc/test_task_metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ibllib/tests/qc/test_task_metrics.py b/ibllib/tests/qc/test_task_metrics.py
index 98f50eb63..463f1cd9d 100644
--- a/ibllib/tests/qc/test_task_metrics.py
+++ b/ibllib/tests/qc/test_task_metrics.py
@@ -429,7 +429,7 @@ def test_check_stimFreeze_delays(self):
         # Set incorrect timestamp
         self.data['stimFreeze_times'][-1] = self.data['stimFreezeTrigger_times'][-1] + 0.2
         metric, passed = qcmetrics.check_stimFreeze_delays(self.data)
-        n = len(self.data['stimFreeze_times'])
+        n = len(self.data['stimFreeze_times']) - 1  # remove the nogo trial which we expect to be nan
         expected = (n - 1) / n
         self.assertEqual(np.nanmean(passed), expected, 'failed to detect dodgy timestamp')
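As a usage sketch (not part of the patch series itself), the snippet below shows how the custom-criteria mechanism introduced in PATCH 1/5 could be exercised, modelled on TestTaskQCWithCustomCriteria.setUp: a note whose title matches '=== SESSION QC CRITERIA task ===' is attached to the session, and compute_session_status picks it up because use_custom defaults to True (passing use_custom=False keeps the built-in TaskQC.criteria). The eid and threshold values are placeholders, and a configured ONE/Alyx connection is assumed.

import json

from one.api import ONE
from ibllib.qc.task_metrics import TaskQC

one = ONE()  # assumes ONE is already set up for the target Alyx instance
eid = 'xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx'  # placeholder session eid

# Store the custom criteria in a session note; keys other than 'default' would be
# prefixed with '_task_' by get_custom_session_criteria before being applied
note_text = {
    'title': '=== SESSION QC CRITERIA task ===',
    'criteria': {'default': {'WARNING': 0.95, 'FAIL': 0.75}},  # placeholder thresholds
}
one.alyx.rest('notes', 'create', data={
    'user': one.alyx.user,
    'content_type': 'session',
    'object_id': eid,
    'text': json.dumps(note_text)})

# Run the task QC; the aggregate session outcome is then evaluated against the note criteria
qc = TaskQC(eid, one=one)
outcome, results = qc.run(update=False)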