From 9623cfaacc58872dc4eef23a61333e20c36f5de7 Mon Sep 17 00:00:00 2001
From: viv3ckj <viveck.kingsley@gmail.com>
Date: Fri, 1 Aug 2025 10:26:50 +0100
Subject: [PATCH 1/7] Add comments to improve clarity of code

---
 analysis/dm_reg_dataset_milan.py | 41 +++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/analysis/dm_reg_dataset_milan.py b/analysis/dm_reg_dataset_milan.py
index ec67fd8..80c2a99 100644
--- a/analysis/dm_reg_dataset_milan.py
+++ b/analysis/dm_reg_dataset_milan.py
@@ -1,8 +1,11 @@
+# Objective: Create a dataset with columns of interest, taken from TPP tables
+
 from pathlib import Path
 
-from ehrql import create_dataset
-from ehrql.codes import codelist_from_csv
+# 1) Import the required ehrql functions
+from ehrql import create_dataset, codelist_from_csv
 
+# 2) Import relevant tables from the TPP backend (OpenSAFELY-TPP)
 from ehrql.tables.tpp import (
     clinical_events,
     patients,
@@ -11,6 +14,8 @@
 
 CODELIST_DIR = Path("codelists")
 
+# 3) Load SNOMED-CT codelists from CSV files
+
 # Cluster name: DM_COD
 # Description: Diabetes mellitus codes
 # Refset ID: ^999004691000230108
@@ -27,8 +32,9 @@
     column="code",
 )
 
+# 4) Define variables and the study population
 
-# Helper function for finding the last matching event
+# Utility function: return each patient's most recent event (if any) that matches a given codelist
 def last_matching_event(events, codelist, where=True):
     return (
         events.where(where)
@@ -37,44 +43,45 @@ def last_matching_event(events, codelist, where=True):
         .last_for_patient()
     )
 
-
+# Define the index date for this dataset (e.g. reporting period end)
 index_date = "2024-03-31"
+# Create a dataset object
 dataset = create_dataset()
 
+# Identify patients who were registered on the index date
 has_registration = practice_registrations.for_patient_on(
     index_date
 ).exists_for_patient()
 
-# Extract prior events for further use in variable definitions below
+# Filter clinical events to only those on or before the index date
 prior_events = clinical_events.where(clinical_events.date.is_on_or_before(index_date))
 
 # Field number: 4
-# PAT_AGE: The age of the patient in full years at the achievement date.
+# PAT_AGE: The age of the patient in full years at the index date.
 dataset.pat_age = patients.age_on(index_date)
 
 # Field number: 6
-# DMLAT_DAT: Date of the most recent diabetes diagnosis up to and
-# including the achievement date.
+# DMLAT_DAT: Date of the most recent diabetes diagnosis on or before index date
 dataset.dmlat_dat = last_matching_event(prior_events, dm_cod).date
 
 # Field number: 7
 # DMRES_DAT: Date of the most recent diabetes diagnosis resolved code
-# recorded after the most recent diabetes diagnosis and up to and
-# including the achievement date.
+# on or before the index date (not restricted to codes after the latest diagnosis)
 dataset.dmres_dat = last_matching_event(prior_events, dm_res_cod).date
 
 # DM_REG rule 1:
-# Pass to the next rule all patients from the specified population who meet
-# both of the criteria below:  Have a diabetes diagnosis in the patient record
-# up to and including the achievement date. Latest diabetes diagnosis is not
-# followed by a diabetes resolved code.
+# Include patients with a diabetes diagnosis and no subsequent resolution
+# (i.e. latest resolved code is earlier than diagnosis, or no resolved code recorded)
 dataset.dm_reg_r1 = (dataset.dmres_dat < dataset.dmlat_dat) | (
     dataset.dmlat_dat.is_not_null() & dataset.dmres_dat.is_null()
 )
 
 # DM_REG rule 2:
-# Reject patients passed to this rule who are aged under 17 years old on the
-# achievement date.
+# Flag patients aged under 17 on the index date (flagged patients are excluded from the population)
 dataset.dm_reg_r2 = dataset.pat_age < 17
 
-dataset.define_population(has_registration & dataset.dm_reg_r1 & ~dataset.dm_reg_r2)
+# Define the final population:
+# - Registered on index date
+# - Meet rule 1 (active diabetes diagnosis)
+# - Do not meet rule 2 (i.e. are 17 or older)
+dataset.define_population(has_registration & dataset.dm_reg_r1 & ~dataset.dm_reg_r2)
\ No newline at end of file

From ad1159a5851ab43cc5d6d3931d5b20cd7eeacfa1 Mon Sep 17 00:00:00 2001
From: viv3ckj <viveck.kingsley@gmail.com>
Date: Fri, 1 Aug 2025 10:34:08 +0100
Subject: [PATCH 2/7] Add a few comments to test script

---
 analysis/test_dm_reg_dataset_milan.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/analysis/test_dm_reg_dataset_milan.py b/analysis/test_dm_reg_dataset_milan.py
index 08e1aeb..8026ac9 100644
--- a/analysis/test_dm_reg_dataset_milan.py
+++ b/analysis/test_dm_reg_dataset_milan.py
@@ -1,10 +1,14 @@
 from datetime import date
+
+# 1) Import dataset from the dataset definition
 from dm_reg_dataset_milan import dataset
 
 # Patient data for the FY23/24 with index date = "2024-03-31"
 # Run the tests with the following command:
 # opensafely exec ehrql:v1 assure analysis/test_dm_reg_dataset_milan.py
 
+# 2) Add test cases to test the dataset definition
+
 test_data = {
     # Correctly not expected in population
     # No clinical events

From b66cd4a0fb856bf301655ce89c607a92940fe51a Mon Sep 17 00:00:00 2001
From: viv3ckj <viveck.kingsley@gmail.com>
Date: Fri, 1 Aug 2025 10:35:56 +0100
Subject: [PATCH 3/7] Remove other QOF files

---
 analysis/dem_reg_measures.py               | 100 ------------
 analysis/dem_reg_plots.R                   |  36 -----
 analysis/dep_reg_dataset.py                |  62 --------
 analysis/dep_reg_measures_jaidip.py        |  73 ---------
 analysis/dm_reg_dataset_atamborska.py      |  87 -----------
 analysis/dm_reg_dataset_jaidip_gill.py     |  31 ----
 analysis/dm_reg_dataset_viveck.py          |  72 ---------
 analysis/hyp_reg_dataset.py                |  53 -------
 analysis/test_dm_reg_dataset_atamborska.py | 156 ------------------
 analysis/test_hyp_reg_dataset.py           | 174 ---------------------
 10 files changed, 844 deletions(-)
 delete mode 100644 analysis/dem_reg_measures.py
 delete mode 100644 analysis/dem_reg_plots.R
 delete mode 100644 analysis/dep_reg_dataset.py
 delete mode 100644 analysis/dep_reg_measures_jaidip.py
 delete mode 100644 analysis/dm_reg_dataset_atamborska.py
 delete mode 100644 analysis/dm_reg_dataset_jaidip_gill.py
 delete mode 100644 analysis/dm_reg_dataset_viveck.py
 delete mode 100644 analysis/hyp_reg_dataset.py
 delete mode 100644 analysis/test_dm_reg_dataset_atamborska.py
 delete mode 100644 analysis/test_hyp_reg_dataset.py

diff --git a/analysis/dem_reg_measures.py b/analysis/dem_reg_measures.py
deleted file mode 100644
index debf786..0000000
--- a/analysis/dem_reg_measures.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# Task: Calulcate the monthly prevalence trends for the dementia QOF
-
-# 1) Import the required functions from ehrql
-from ehrql import codelist_from_csv
-from ehrql import INTERVAL, case, create_measures, months, when
-
-# 2) Import the tables of interest from TPP
-from ehrql.tables.tpp import (
-    patients,
-    practice_registrations,
-    clinical_events,
-    addresses,
-)
-
-# 3) Import the codelists of interest
-DEM_COD = codelist_from_csv(
-    "codelists/nhsd-primary-care-domain-refsets-dm_cod.csv",
-    column="code",
-)
-
-# 4) Create series for numerator and denominator for each interval
-## Numerator - number of people with diagnosis of dementia before the end of elegible period
-selected_events = clinical_events.where(
-    clinical_events.date.is_on_or_before(INTERVAL.end_date)
-)
-
-latest_dementia_date = (
-    selected_events.where(selected_events.snomedct_code.is_in(DEM_COD))
-    .sort_by(selected_events.date)
-    .last_for_patient()
-    .date
-)
-
-# Simpler version
-
-latest_dementia_date_dementia = (
-    clinical_events.where(
-        (clinical_events.date.is_on_or_before(INTERVAL.end_date))
-        & (clinical_events.snomedct_code.is_in(DEM_COD))
-    )
-    .sort_by(clinical_events.date)
-    .last_for_patient()
-    .date
-)
-
-
-## Denominator - number of people with practice registration before the beginning of the interval
-## and with no deregsotration or with reregistration after the end of each interval
-has_registration_date = (
-    practice_registrations.where(
-        practice_registrations.start_date.is_on_or_before(INTERVAL.start_date)
-        & (
-            practice_registrations.end_date.is_null()
-            | practice_registrations.end_date.is_on_or_after(INTERVAL.end_date)
-        )
-    )
-    .sort_by(practice_registrations.start_date)
-    .first_for_patient()
-    .start_date
-)
-
-# Simpler
-has_registration_date = practice_registrations.spanning(
-    INTERVAL.start_date, INTERVAL.end_date
-).exists_for_patient()
-
-# Create measures framework
-measures = create_measures()
-
-measures.define_measure(
-    name="dem_reg",
-    numerator=latest_dementia_date.is_not_null(),
-    denominator=has_registration_date.is_not_null(),
-    intervals=months(12).starting_on("2023-04-01"),
-)
-
-# The output is a dataframe of measure name, interval_start,
-# interval_end, ratio, numerator, denominator.
-
-# Analyse dementia rates by subrgoup (socio-economic status)
-
-
-imd_rounded = addresses.for_patient_on(INTERVAL.start_date).imd_rounded
-max_imd = 32844
-
-imd_quintile = case(
-    when(imd_rounded < int(max_imd * 1 / 5)).then(1),
-    when(imd_rounded < int(max_imd * 2 / 5)).then(2),
-    when(imd_rounded < int(max_imd * 3 / 5)).then(3),
-    when(imd_rounded < int(max_imd * 4 / 5)).then(4),
-    when(imd_rounded <= max_imd).then(5),
-)
-
-measures.define_measure(
-    name="dem_qof_monthy_imd",
-    numerator=latest_dementia_date.is_not_null(),
-    denominator=has_registration_date,
-    group_by={"imd": imd_quintile},
-    intervals=months(12).starting_on("2023-04-01"),
-)
diff --git a/analysis/dem_reg_plots.R b/analysis/dem_reg_plots.R
deleted file mode 100644
index ba69aff..0000000
--- a/analysis/dem_reg_plots.R
+++ /dev/null
@@ -1,36 +0,0 @@
-# Set up
-library(tidyverse)
-library(lubridate)
-df <- read.csv("output/dem/dem_reg_measures.csv")
-
-# summary(df) - confirms that there are 37 missing ratios and 24 missing imds
-
-# Dementia prevalence plot, stratified by IMD
-df %>%
-  filter(!is.na(imd)) %>%
-  ggplot(aes (x = ymd(interval_start), y = ratio, color = as.factor(imd))) +
-  geom_line()+
-  labs (
-    title = "Dementia prevalence over time, per IMD group", 
-    x = NULL, 
-    y = "Prevalence of dementia",
-    color = "Index of multiple depravation"
-    ) +
-  theme_light()
-
-ggsave("output/dem/dem_reg_rates_imd.png")
-
-# Dementia prevalence, unstratified
-
-df %>%
-filter(is.na(imd)) %>%
-ggplot(aes (x = ymd(interval_start), y = ratio)) +
-geom_line() +
-labs (
-    title = "Dementia prevalence over time, unstratfied", 
-    x = NULL, 
-    y = "Prevalence of dementia"
-    ) +
-theme_light()
-
-ggsave("output/dem/dem_reg_rates.png")
\ No newline at end of file
diff --git a/analysis/dep_reg_dataset.py b/analysis/dep_reg_dataset.py
deleted file mode 100644
index 95bba39..0000000
--- a/analysis/dep_reg_dataset.py
+++ /dev/null
@@ -1,62 +0,0 @@
-from ehrql import create_dataset
-from ehrql.codes import codelist_from_csv
-
-from ehrql.tables.tpp import (
-    clinical_events,
-    patients,
-    practice_registrations,
-)
-
-dataset = create_dataset()
-# Quality Service Start Date
-qs_start_date = '2024-04-01' 
-# Fixed date used in DEPR1_REG and DEPCC01 logic.
-# Depression register: Patients aged at least 18 years old whose latest
-# unresolved episode of depression is since 1 st April 2006.
-min_date = '2006-04-01'
-
-# import codelists
-dep_codelist = codelist_from_csv(
-    'codelists/nhsd-primary-care-domain-refsets-depr_cod.csv',
-    column='code')
-depres_codelist = codelist_from_csv(
-    'codelists/nhsd-primary-care-domain-refsets-depres_cod.csv',
-    column='code')
-
-# inclusion criteria
-has_registration = (practice_registrations
-                    .for_patient_on(qs_start_date)
-                    .exists_for_patient())
-
-# Create disease variables
-# - DEPR_DAT = Latest date of depression diagnosis
-# - DEPRES_DAT = Latest depression resolved date
-# - PAT_AGE = Age at quality service start date
-
-dataset.depr_dat = ((clinical_events.
-                    where(clinical_events
-                          .snomedct_code
-                          .is_in(dep_codelist))
-                          ).sort_by(clinical_events.date)
-                          .last_for_patient()
-                          .date)
-dataset.depres_dat = (clinical_events
-                      .where(clinical_events
-                             .snomedct_code
-                             .is_in(depres_codelist))
-                             .sort_by(clinical_events.date)
-                             .last_for_patient()
-                             .date)
-dataset.pat_age = (patients
-                   .age_on(qs_start_date))
-
-# --- Creating rules ---
-# Have their latest first or new episode of depression on or after 1 st April 2006.
-# Latest episode of depression is not followed by a depression resolved code.
-# Must be aged 18 or over on the achievement date
-
-dep1_reg_r1 = ((dataset.depr_dat.is_on_or_after(min_date))
-               & (dataset.depres_dat.is_null()))
-dep2_reg_r2 = dataset.pat_age >= 18
-
-dataset.define_population(has_registration & dep1_reg_r1 & dep2_reg_r2)
diff --git a/analysis/dep_reg_measures_jaidip.py b/analysis/dep_reg_measures_jaidip.py
deleted file mode 100644
index c646c98..0000000
--- a/analysis/dep_reg_measures_jaidip.py
+++ /dev/null
@@ -1,73 +0,0 @@
-from ehrql import INTERVAL, create_measures, months, codelist_from_csv, case, when
-from ehrql.tables.tpp import practice_registrations, clinical_events, patients
-
-# Instantiate measures
-measures = create_measures()
-
-# Import codelists
-dep_codelist = codelist_from_csv(
-    "codelists/nhsd-primary-care-domain-refsets-depr_cod.csv",
-    column="code",
-)
-
-depres_codelist = codelist_from_csv(
-    "codelists/nhsd-primary-care-domain-refsets-depres_cod.csv",
-    column="code",
-)
-
-# Important dates
-# Quality Service Start Date
-qs_start_date = "2023-04-01"
-
-# Have their latest first or new episode of depression on or after 1 st April 2006
-min_date = "2006-04-01"
-
-# Inclusion criteria
-has_registration = practice_registrations.spanning(
-    INTERVAL.start_date, INTERVAL.end_date
-).exists_for_patient()
-
-# Create disease variables
-# - depr_dat = Latest date of depression diagnosis
-# - depres_dat = Latest depression resolved date
-# - pat_age = Age at quality service start date
-depr_dat = (
-    (clinical_events.where(clinical_events.snomedct_code.is_in(dep_codelist)))
-    .sort_by(clinical_events.date)
-    .last_for_patient()
-    .date
-)
-depres_dat = (
-    clinical_events.where(clinical_events.snomedct_code.is_in(depres_codelist))
-    .sort_by(clinical_events.date)
-    .last_for_patient()
-    .date
-)
-pat_age = patients.age_on(INTERVAL.start_date)
-
-
-# Creating rules
-dep1_reg_r1 = (depr_dat.is_on_or_after(min_date)) & (depres_dat.is_null())
-dep1_reg_r2 = pat_age >= 18
-
-# Define measures
-dep_in_interval = depr_dat.is_during(INTERVAL)
-
-# Stratification groups
-age_band = case(
-    when((pat_age >= 18) & (pat_age <= 21)).then("Young adult"),
-    when((pat_age >= 22) & (pat_age <= 64)).then("Adult"),
-    when(pat_age >= 65).then("Elderly"),
-)
-
-# Create measures
-measures.define_measure(
-    name="monthly_prevalence",
-    numerator=dep_in_interval,
-    denominator=has_registration & dep1_reg_r1 & dep1_reg_r2,
-    intervals=months(12).starting_on(qs_start_date),
-    group_by={
-        "sex": patients.sex,
-        "age": age_band,
-    },
-)
diff --git a/analysis/dm_reg_dataset_atamborska.py b/analysis/dm_reg_dataset_atamborska.py
deleted file mode 100644
index 81f2ae3..0000000
--- a/analysis/dm_reg_dataset_atamborska.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Import functions that will be used for dataset definition
-from ehrql import create_dataset
-from ehrql import codelist_from_csv
-from ehrql import Path
-
-# Import the tables of the variables which you need to define the dataset
-from ehrql.tables.tpp import patients, practice_registrations, clinical_events
-
-## could use from ehrql.tables.core or from ehrql.tables.emis
-
-# Create an empty dataset - we will be adding columns to this, based on which we will filter data
-dataset = create_dataset()
-
-# Create objects containing codelists - these will be used to define the dataset
-codelist_path = Path("codelists")
-dm_cod = codelist_from_csv(
-    codelist_path / "nhsd-primary-care-domain-refsets-dm_cod.csv",
-    column="code",
-)
-dmres_cod = codelist_from_csv(
-    codelist_path / "nhsd-primary-care-domain-refsets-dmres_cod.csv",
-    column="code",
-)
-
-# Simpler:
-dm_cod = codelist_from_csv(
-    "codelists/nhsd-primary-care-domain-refsets-dm_cod.csv",
-    column="code",
-)
-dmres_cod = codelist_from_csv(
-    "codelists/nhsd-primary-care-domain-refsets-dmres_cod.csv",
-    column="code",
-)
-
-# Create index date
-index_date = "2024-03-31"
-
-# Create a boolean vector for which patients were registered up until the index date
-registered = practice_registrations.for_patient_on(index_date).exists_for_patient()
-
-# However, unless we add this information to the dataset, we won't be able to filter based on it, using dataset.define_population
-##So first, we need to add a column to the dataset which defines which patients are registered
-dataset.registered = practice_registrations.for_patient_on(
-    index_date
-).exists_for_patient()
-
-# Derive patient age from the DOB and add to the dataset
-dataset.pat_age = patients.age_on(index_date)
-
-# Derive latest dates for Dx DM and Dx DM res for each patient, and add them to the dataset
-
-## Select the latest date with the diabetes code and add it to the dataset
-dataset.latest_dm = (
-    clinical_events.where(clinical_events.snomedct_code.is_in(dm_cod))
-    .sort_by(clinical_events.date)
-    .last_for_patient()
-    .date
-)
-
-
-## Select the latest date with the diabetes resolved code and add it to the dataset
-dataset.latest_dmres = (
-    clinical_events.where(clinical_events.snomedct_code.is_in(dmres_cod))
-    .sort_by(clinical_events.date)
-    .last_for_patient()
-    .date
-)
-
-# Define the dataset (=cohort) - using the rules
-
-##Rule 1
-### Has a diabetes diagnosis before the index date
-### Latest diabetes diagnosis is not followed by a diabetes resolved code.
-
-dm_reg_r1 = (
-    (dataset.latest_dm.is_not_null())
-    & (dataset.latest_dm < index_date)
-    & ((dataset.latest_dmres < dataset.latest_dm) | dataset.latest_dmres.is_null())
-)
-
-##Rule 2
-### Needs to be minimim 17 years old on the index date
-
-dm_reg_r2 = dataset.pat_age >= 17
-
-# Create the population
-dataset.define_population(registered & dm_reg_r1 & dm_reg_r2)
diff --git a/analysis/dm_reg_dataset_jaidip_gill.py b/analysis/dm_reg_dataset_jaidip_gill.py
deleted file mode 100644
index 0a82504..0000000
--- a/analysis/dm_reg_dataset_jaidip_gill.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from codelists import *
-from ehrql import create_dataset
-from ehrql.tables.tpp import patients, practice_registrations, clinical_events
-
-dataset = create_dataset()
-
-index_date = "2024-03-31"
-has_registration = practice_registrations.for_patient_on(index_date).exists_for_patient()
-'''
-Extract the following columns:
-1. age at achievement date
-2. earliest diabetes resolved date
-'''
-dataset.age = patients.age_on(index_date)
-prev_events = clinical_events.where(clinical_events.date.is_on_or_before(index_date))
-dataset.dmlate_date = prev_events.where(clinical_events.snomedct_code.is_in(diabetes_codelist)).sort_by(clinical_events.date).last_for_patient().date
-dataset.dmreso_date = prev_events.where(clinical_events.snomedct_code.is_in(diabetes_resolved_codelist)).sort_by(clinical_events.date).last_for_patient().date
-
-'''
-Filter each of the columns:
-1. dmreso_date < dmlate_date or dmreso_date doesn't exist
-2. age >= 17
-'''
-dataset.dm_reg_r1 = (
-    ((dataset.dmlate_date > dataset.dmreso_date) | dataset.dmreso_date.is_null()) 
-    & dataset.dmlate_date.is_not_null()
-)
-dataset.dm_reg_r2 = dataset.age >= 17
-
-# Filter the dataset
-dataset.define_population(has_registration & dataset.dm_reg_r1 & dataset.dm_reg_r2)
diff --git a/analysis/dm_reg_dataset_viveck.py b/analysis/dm_reg_dataset_viveck.py
deleted file mode 100644
index 9e6d863..0000000
--- a/analysis/dm_reg_dataset_viveck.py
+++ /dev/null
@@ -1,72 +0,0 @@
-from ehrql import codelist_from_csv
-from ehrql import create_dataset
-from ehrql.tables.tpp import (
-    clinical_events, 
-    patients,
-    practice_registrations,
-)
-
-### codelists ###
-
-# Diabetes mellitus codes
-dm_cod = codelist_from_csv(
-    "codelists/nhsd-primary-care-domain-refsets-dm_cod.csv",
-    column="code"
-)
-# Diabetes resolved codes
-dmres_cod = codelist_from_csv(
-    "codelists/nhsd-primary-care-domain-refsets-dmres_cod.csv",
-    column="code"
-)
-
-### create dataset and constant variable ###
-
-index_date = "2024-03-31"
-dataset = create_dataset()
-
-### variables ###
-
-# VARIABLE 1: pat_age
-# age of the patient in full years at the achievement date
-dataset.pat_age = patients.age_on(index_date)
-
-# VARIABLE 2: dmlat_dat
-# date of the most recent diabetes diagnosis up to and including the achievement date
-dataset.dmlat_dat = (
-    clinical_events.where(clinical_events.snomedct_code.is_in(dm_cod) & clinical_events.date.is_on_or_before(index_date))
-    .sort_by(clinical_events.date)
-    .last_for_patient()
-    .date
-)
-
-# VARIABLE 3: dmres_dat
-# date of the most recent diabetes diagnosis resolved code recorded after the most recent diabetes diagnosis and up to and including the achievement date
-dataset.dmres_dat = (
-    clinical_events.where(clinical_events.snomedct_code.is_in(dmres_cod) & clinical_events.date.is_on_or_before(index_date))
-    .sort_by(clinical_events.date)
-    .last_for_patient()
-    .date
-)
-
-### rules ###
-
-# RULE 1: dm_reg_r1
-# Pass to the next rule all patients from the specified population who meet both of the criteria below:
-# Have a diabetes diagnosis in the patient record up to and including the achievement date.
-# Latest diabetes diagnosis is not followed by a diabetes resolved code.
-dataset.dm_reg_r1 = dataset.dmres_dat.is_null() & dataset.dmlat_dat.is_not_null() | (dataset.dmlat_dat > dataset.dmres_dat)
-
-# RULE 2: dm_reg_r2
-# Reject patients passed to this rule who are aged under 17 years old on the
-# achievement date.
-dataset.dm_reg_r2 = dataset.pat_age < 17
-
-### defining population ###
-
-# Applying both rules and registration on index date to define population
-has_registration = practice_registrations.for_patient_on(index_date).exists_for_patient()
-dataset.define_population((has_registration) & ~(dataset.dm_reg_r2) & (dataset.dm_reg_r1))
-
-
-
-
diff --git a/analysis/hyp_reg_dataset.py b/analysis/hyp_reg_dataset.py
deleted file mode 100644
index c9eb072..0000000
--- a/analysis/hyp_reg_dataset.py
+++ /dev/null
@@ -1,53 +0,0 @@
-from ehrql import codelist_from_csv
-from ehrql import create_dataset
-from ehrql.tables.tpp import (
-    clinical_events, 
-    patients,
-    practice_registrations,
-)
-
-# Hypertension codes
-hyp_cod = codelist_from_csv(
-    "codelists/nhsd-primary-care-domain-refsets-hyp_cod.csv",
-    column="code"
-)
-# Hypertension resolved codes
-hypres_cod = codelist_from_csv(
-    "codelists/nhsd-primary-care-domain-refsets-hypres_cod.csv",
-    column="code"
-)
-
-index_date = "2024-03-31"
-dataset = create_dataset()
-
-### variables ###
-
-# date of the most recent hypertension diagnosis up to and including the achievement date
-dataset.hyplat_dat = (
-    clinical_events.where(clinical_events.snomedct_code.is_in(hyp_cod) & clinical_events.date.is_on_or_before(index_date))
-    .sort_by(clinical_events.date)
-    .last_for_patient()
-    .date
-)
-
-# date of the most recent hypertension diagnosis resolved code recorded after the most recent hypertension diagnosis and up to and including the achievement date
-dataset.hypres_dat = (
-    clinical_events.where(clinical_events.snomedct_code.is_in(hypres_cod) & clinical_events.date.is_on_or_before(index_date))
-    .sort_by(clinical_events.date)
-    .last_for_patient()
-    .date
-)
-
-
-### rules ###
-
-# Pass to the next rule all patients from the specified population who meet both of the criteria below:
-# Have a hypertension diagnosis in the patient record up to and including the achievement date.
-# Latest hypertension diagnosis is not followed by a hypertension resolved code.
-dataset.hyp_reg_r1 = dataset.hyplat_dat.is_not_null() & dataset.hypres_dat.is_null() | (dataset.hyplat_dat > dataset.hypres_dat)
-
-### defining population ###
-has_registration = practice_registrations.for_patient_on(index_date).exists_for_patient()
-dataset.define_population(
-    (has_registration) & dataset.hyp_reg_r1
-    )
diff --git a/analysis/test_dm_reg_dataset_atamborska.py b/analysis/test_dm_reg_dataset_atamborska.py
deleted file mode 100644
index c99b03e..0000000
--- a/analysis/test_dm_reg_dataset_atamborska.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# Set up
-from datetime import date
-from dm_reg_dataset_atamborska import dataset
-
-test_data = {
-    # Patient accepted in the dataset
-    1: {
-        "patients": {"date_of_birth": date(1992, 1, 1)},
-        "practice_registrations": [
-            {
-                # First registration
-                "start_date": date(1994, 1, 1),
-                "end_date": date(1999, 1, 1),
-            },
-            {
-                # Second registration
-                "start_date": date(1999, 1, 2),
-            },
-        ],
-        "clinical_events": [
-            # First clinical event
-            {
-                "snomedct_code": "313435000",
-                "date": date(2000, 4, 13),
-            },
-            # Second clinical event
-            {
-                "snomedct_code": "420270002",
-                "date": date(2004, 1, 17),
-            },
-        ],
-        "expected_in_population": True,
-        "expected_columns": {
-            "pat_age": 32,
-            "registered": True,
-            "latest_dm": date(2004, 1, 17),
-            "latest_dmres": None,
-        },
-    },
-    # Patient not accepted in the dataset: no active registration
-    2: {
-        "patients": {"date_of_birth": date(1992, 1, 1)},
-        "practice_registrations": [
-            {
-                # First registration
-                "start_date": date(1994, 1, 1),
-                "end_date": date(1999, 1, 1),
-            },
-            {
-                # Second registration
-                "start_date": date(1999, 1, 2),
-                "end_date": date(2005, 1, 1),
-            },
-        ],
-        "clinical_events": [
-            # First clinical event
-            {
-                "snomedct_code": "313435000",
-                "date": date(2000, 4, 13),
-            },
-            # Second clinical event
-            {
-                "snomedct_code": "420270002",
-                "date": date(2004, 1, 17),
-            },
-        ],
-        "expected_in_population": False,
-    },
-    # Patient not accepted in the dataset: diabetes resolved
-    3: {
-        "patients": {"date_of_birth": date(1992, 1, 1)},
-        "practice_registrations": [
-            {
-                # First registration
-                "start_date": date(1994, 1, 1),
-                "end_date": date(1999, 1, 1),
-            },
-            {
-                # Second registration
-                "start_date": date(1999, 1, 2),
-            },
-        ],
-        "clinical_events": [
-            # First clinical event
-            {
-                "snomedct_code": "313435000",
-                "date": date(2000, 4, 13),
-            },
-            # Second clinical event
-            {
-                "snomedct_code": "315051004",
-                "date": date(2004, 1, 17),
-            },
-        ],
-        "expected_in_population": False,
-    },
-    # Patient accepted in the dataset: diabetes resolved in the past, then another episode
-    4: {
-        "patients": {"date_of_birth": date(1992, 1, 1)},
-        "practice_registrations": [
-            {
-                # First registration
-                "start_date": date(1994, 1, 1),
-                "end_date": date(1999, 1, 1),
-            },
-            {
-                # Second registration
-                "start_date": date(1999, 1, 2),
-            },
-        ],
-        "clinical_events": [
-            # First clinical event
-            {
-                "snomedct_code": "313435000",
-                "date": date(2000, 4, 13),
-            },
-            # Second clinical event
-            {
-                "snomedct_code": "315051004",
-                "date": date(2004, 1, 17),
-            },
-            # Third clinical event
-            {"snomedct_code": "313436004", "date": date(2015, 1, 1)},
-        ],
-        "expected_in_population": True,
-        "expected_columns": {
-            "pat_age": 32,
-            "registered": True,
-            "latest_dm": date(2015, 1, 1),
-            "latest_dmres": date(2004, 1, 17),
-        },
-    },
-    # Patient not accepted in the dataset: too young
-    5: {
-        "patients": {"date_of_birth": date(2008, 1, 1)},
-        "practice_registrations": [
-            {
-                # First registration
-                "start_date": date(2017, 1, 1),
-                "end_date": date(2022, 1, 1),
-            },
-            {
-                # Second registration
-                "start_date": date(2022, 1, 2),
-            },
-        ],
-        "clinical_events": [
-            # First clinical event
-            {
-                "snomedct_code": "420270002",
-                "date": date(2018, 1, 1),
-            },
-        ],
-        "expected_in_population": False,
-    },
-}
diff --git a/analysis/test_hyp_reg_dataset.py b/analysis/test_hyp_reg_dataset.py
deleted file mode 100644
index fd847b0..0000000
--- a/analysis/test_hyp_reg_dataset.py
+++ /dev/null
@@ -1,174 +0,0 @@
-from datetime import date
-from hyp_reg_dataset import dataset
-
-test_data = {
-
-    # Correctly not expected in population
-    # No clinical events
-    1: {
-        "patients": {"date_of_birth": date(1950, 1, 1)},
-        "practice_registrations": [
-            {
-                "start_date": date(2010, 1, 1),
-            },
-        ],
-        "clinical_events": [{}],
-        "expected_in_population": False,
-    },
-
-    # Correctly expected in population
-    # Most recent hypertension diagnosis before index date
-    2: {
-        "patients": {"date_of_birth": date(1950,1,1)},
-        "practice_registrations": [
-            {
-                "start_date": date(2010,1,1)
-            }
-        ],
-        "clinical_events": [
-            {
-                # First hypertension diagnosis
-                "date": date(2000,1,1),
-                "snomedct_code": "38341003",
-            },
-            {
-                # Second hypertension diagnosis
-                "date": date(2020,3,3),
-                "snomedct_code": "38341003",
-            }
-        ],
-        "expected_in_population": True,
-        "expected_columns": {
-            "hyplat_dat": date(2020,3,3),
-            "hypres_dat": None,
-            "hyp_reg_r1": True, 
-        },
-
-    },
-
-    # Correctly expected in population
-    # Most recent resolved hypertension diagnosis before index date
-    3: {
-        "patients": {"date_of_birth": date(1950,1,1)},
-        "practice_registrations": [
-            {
-                "start_date": date(2010,1,1)
-            }
-        ],
-        "clinical_events": [
-            {
-                # First hypertension diagnosis
-                "date": date(2000,1,1),
-                "snomedct_code": "38341003",
-            },
-            {
-                # Second hypertension diagnosis
-                "date": date(2020,3,3),
-                "snomedct_code": "38341003",
-            },
-            {
-                # Third hypertension diagnosis
-                "date": date(2024,2,2),
-                "snomedct_code": "38341003"
-            },
-            {
-                # First resolved diagnosis
-                "date": date(2000,2,1),
-                "snomedct_code": "162659009"
-            },
-            {
-                # Second resolved diagnosis
-                "date": date(2020,3,4),
-                "snomedct_code": "162659009"
-            },
-        ],
-        "expected_in_population": True,
-        "expected_columns": {
-            "hyplat_dat": date(2024,2,2),
-            "hypres_dat": date(2020,3,4),
-            "hyp_reg_r1": True, 
-        },
-
-    },
-    # Correctly not expected in population
-    # Resolved diagnosis before index date
-    4: {
-        "patients": {"date_of_birth": date(1950, 1, 1)},
-        "practice_registrations": [
-            {
-                "start_date": date(2010, 1, 1),
-            },
-        ],
-        "clinical_events": [
-            {
-               # First hypertension diagnosis
-                "date": date(2000,1,1),
-                "snomedct_code": "38341003", 
-            },
-            {
-               # First resolved diagnosis
-                "date": date(2000,2,1),
-                "snomedct_code": "162659009" 
-            },
-            ],
-        "expected_in_population": False,
-    },
-
-    # Correctly not expected in population
-    # Patient not registered at practice
-    5: {
-        "patients": {"date_of_birth": date(1950, 1, 1)},
-        "practice_registrations": [
-            {
-                "start_date": date(2010, 1, 1),
-                "end_date": date(2020,1,2),
-            },
-        ],
-        "clinical_events": [
-            {
-               # First hypertension diagnosis
-                "date": date(2000,1,1),
-                "snomedct_code": "38341003", 
-            },
-            ],
-        "expected_in_population": False,
-    },
-
-    # Correctly not expected in population
-    # Patient diagnosed after index date
-    6: {
-        "patients": {"date_of_birth": date(1950, 1, 1)},
-        "practice_registrations": [
-            {
-                "start_date": date(2010, 1, 1),
-            },
-        ],
-        "clinical_events": [
-            {
-               # First hypertension diagnosis
-                "date": date(2024,4,1),
-                "snomedct_code": "38341003", 
-            },
-            ],
-        "expected_in_population": False,
-    },
-
-    # Correctly not expected in population
-    # Patient diagnosed with a cold before index date
-    7: {
-        "patients": {"date_of_birth": date(1960, 1, 1)},
-        "practice_registrations": [
-            {
-                "start_date": date(1960, 1, 1),
-            },
-        ],
-        "clinical_events": [
-            {
-                # First cold diagnosis
-                "date": date(2000, 6, 1),
-                "snomedct_code": "82272006",
-            },
-            ],
-        "expected_in_population": False,
-    },
- }
\ No newline at end of file

From dc9a5b9c7f719c826d9bfa4246961117e080634c Mon Sep 17 00:00:00 2001
From: viv3ckj <viveck.kingsley@gmail.com>
Date: Tue, 5 Aug 2025 16:51:41 +0100
Subject: [PATCH 4/7] Make dataset definitions consistent

---
 analysis/dep_reg_dataset.py | 76 +++++++++++++++++++++++++++++++++++++
 analysis/hyp_reg_dataset.py | 67 ++++++++++++++++++++++++++++++++
 2 files changed, 143 insertions(+)
 create mode 100644 analysis/dep_reg_dataset.py
 create mode 100644 analysis/hyp_reg_dataset.py

diff --git a/analysis/dep_reg_dataset.py b/analysis/dep_reg_dataset.py
new file mode 100644
index 0000000..d66c3e3
--- /dev/null
+++ b/analysis/dep_reg_dataset.py
@@ -0,0 +1,76 @@
+# 1) Import the required ehrql functions
+from ehrql import create_dataset, codelist_from_csv
+
+# 2) Import relevant tables from the TPP backend (OpenSAFELY-TPP)
+from ehrql.tables.tpp import (
+    clinical_events,
+    patients,
+    practice_registrations,
+)
+
+# 3) Load SNOMED-CT codelists from CSV files
+
+# Depression codes
+dep_codelist = codelist_from_csv(
+    'codelists/nhsd-primary-care-domain-refsets-depr_cod.csv',
+    column='code')
+# Resolved depression codes
+depres_codelist = codelist_from_csv(
+    'codelists/nhsd-primary-care-domain-refsets-depres_cod.csv',
+    column='code')
+
+# 4) Define variables and the study population
+
+# Utility function: return the most recent event (if any) for a patient matching a given codelist
+def last_matching_event(events, codelist, where=True):
+    return (
+        events.where(where)
+        .where(events.snomedct_code.is_in(codelist))
+        .sort_by(events.date)
+        .last_for_patient()
+    )
+
+# Quality Service Start Date (Index date)
+qs_start_date = '2024-04-01' 
+
+# Fixed date used in DEPR1_REG and DEPCC01 logic.
+# Depression register: Patients aged at least 18 years old whose latest unresolved episode of depression is since 1st April 2006.
+min_date = '2006-04-01'
+# Create a dataset object
+dataset = create_dataset()
+
+# Identify patients who were registered on the index date
+has_registration = practice_registrations.for_patient_on(
+        qs_start_date
+).exists_for_patient()
+
+# Filter clinical events to only those on or before the index date
+prior_events = clinical_events.where(clinical_events.date.is_on_or_before(qs_start_date))
+
+# Field number: 4
+# PAT_AGE: The age of the patient in full years at the index date.
+dataset.pat_age = (patients
+                   .age_on(qs_start_date))
+
+# Field number 5: 
+# DEPR_DAT: Date of the latest depression episode at the index date
+dataset.depr_dat = last_matching_event(prior_events, dep_codelist).date
+
+# Field number 6:
+# DEPRES_DAT: Date of the most recent depression resolved code at the index date
+dataset.depres_dat = last_matching_event(prior_events, depres_codelist).date
+
+# DEP_REG rule 1: 
+# Include patients that have their latest episode of unresolved depression on or after 1st April 2006.
+dep1_reg_r1 = ((dataset.depr_dat.is_on_or_after(min_date))
+               & (dataset.depres_dat.is_null()))
+
+# DEP_REG rule 2:
+# Flag patients under the age of 18 on the index date (these patients are excluded from the population)
+dep2_reg_r2 = dataset.pat_age < 18
+
+# Define the final population:
+# - Registered on index date
+# - Meet rule 1 (active depression diagnosis)
+# - Do not meet rule 2 (i.e. are 18 or older)
+dataset.define_population(has_registration & dep1_reg_r1 & ~dep2_reg_r2)
diff --git a/analysis/hyp_reg_dataset.py b/analysis/hyp_reg_dataset.py
new file mode 100644
index 0000000..5aa256d
--- /dev/null
+++ b/analysis/hyp_reg_dataset.py
@@ -0,0 +1,67 @@
+# 1) Import the required ehrql functions
+from ehrql import create_dataset, codelist_from_csv
+
+# 2) Import relevant tables from the TPP backend (OpenSAFELY-TPP)
+from ehrql.tables.tpp import (
+    clinical_events, 
+    practice_registrations,
+)
+
+# 3) Load SNOMED-CT codelists from CSV files
+
+# Hypertension codes
+hyp_cod = codelist_from_csv(
+    "codelists/nhsd-primary-care-domain-refsets-hyp_cod.csv",
+    column="code"
+)
+# Hypertension resolved codes
+hypres_cod = codelist_from_csv(
+    "codelists/nhsd-primary-care-domain-refsets-hypres_cod.csv",
+    column="code"
+)
+
+# 4) Define variables and the study population
+
+# Utility function: return the most recent event (if any) for a patient matching a given codelist
+def last_matching_event(events, codelist, where=True):
+    return (
+        events.where(where)
+        .where(events.snomedct_code.is_in(codelist))
+        .sort_by(events.date)
+        .last_for_patient()
+    )
+
+# Define the index date for this dataset (e.g. reporting period end)
+index_date = "2024-03-31"
+# Create a dataset object
+dataset = create_dataset()
+
+# Identify patients who were registered on the index date
+has_registration = practice_registrations.for_patient_on(
+    index_date
+).exists_for_patient()
+
+# Filter clinical events to only those on or before the index date
+prior_events = clinical_events.where(clinical_events.date.is_on_or_before(index_date))
+
+# Field number: 6
+# Date of the most recent hypertension diagnosis up to and including the achievement date
+dataset.hyplat_dat = last_matching_event(prior_events, hyp_cod).date
+
+# Field number: 7
+# Date of the most recent hypertension resolved code up to and including the achievement date (whether it falls after the latest diagnosis is checked in the HYP_REG rule)
+dataset.hypres_dat = last_matching_event(prior_events, hypres_cod).date
+
+
+# HYP_REG rule:
+
+# Include patients with a hypertension diagnosis on or before index date.
+# Latest hypertension diagnosis is not followed by a hypertension resolved code.
+dataset.hyp_reg_r1 = dataset.hyplat_dat.is_not_null() & dataset.hypres_dat.is_null() | (dataset.hyplat_dat > dataset.hypres_dat)
+
+# Define the final population:
+# - Registered on index date
+# - Meet the rule (active hypertension diagnosis)
+dataset.define_population(
+    (has_registration) & dataset.hyp_reg_r1
+    )

From 0b216a69fa689168365910bd9181d535cd8bfd10 Mon Sep 17 00:00:00 2001
From: viv3ckj <viveck.kingsley@gmail.com>
Date: Wed, 6 Aug 2025 13:40:53 +0100
Subject: [PATCH 5/7] Make test script consistent with diabetes test script

---
 analysis/test_hyp_reg_dataset.py | 181 +++++++++++++++++++++++++++++++
 1 file changed, 181 insertions(+)
 create mode 100644 analysis/test_hyp_reg_dataset.py

diff --git a/analysis/test_hyp_reg_dataset.py b/analysis/test_hyp_reg_dataset.py
new file mode 100644
index 0000000..8d60a29
--- /dev/null
+++ b/analysis/test_hyp_reg_dataset.py
@@ -0,0 +1,181 @@
+from datetime import date
+
+# 1) Import dataset from the dataset definition
+from hyp_reg_dataset import dataset
+
+# Patient data for the FY23/24 with index date = "2024-03-31"
+# Run the tests with the following command:
+# opensafely exec ehrql:v1 assure analysis/test_hyp_reg_dataset.py
+
+# 2) Add test cases to test the dataset definition
+
+test_data = {
+    # Correctly not expected in population
+    # No clinical events
+    1: {
+        "patients": {"date_of_birth": date(1950, 1, 1)},
+        "practice_registrations": [
+            {
+                "start_date": date(2010, 1, 1),
+            },
+        ],
+        "clinical_events": [{}],
+        "expected_in_population": False,
+    },
+
+    # Correctly expected in population
+    # Most recent hypertension diagnosis before index date
+    2: {
+        "patients": {"date_of_birth": date(1950,1,1)},
+        "practice_registrations": [
+            {
+                "start_date": date(2010,1,1)
+            }
+        ],
+        "clinical_events": [
+            {
+                # First hypertension diagnosis
+                "date": date(2000,1,1),
+                "snomedct_code": "38341003",
+            },
+            {
+                # Second hypertension diagnosis
+                "date": date(2020,3,3),
+                "snomedct_code": "38341003",
+            }
+        ],
+        "expected_in_population": True,
+        "expected_columns": {
+            "hyplat_dat": date(2020,3,3),
+            "hypres_dat": None,
+            "hyp_reg_r1": True, 
+        },
+
+    },
+
+    # Correctly expected in population
+    # Most recent resolved code is before the latest hypertension diagnosis
+    3: {
+        "patients": {"date_of_birth": date(1950,1,1)},
+        "practice_registrations": [
+            {
+                "start_date": date(2010,1,1)
+            }
+        ],
+        "clinical_events": [
+            {
+                # First hypertension diagnosis
+                "date": date(2000,1,1),
+                "snomedct_code": "38341003",
+            },
+            {
+                # Second hypertension diagnosis
+                "date": date(2020,3,3),
+                "snomedct_code": "38341003",
+            },
+            {
+                # Third hypertension diagnosis
+                "date": date(2024,2,2),
+                "snomedct_code": "38341003"
+            },
+            {
+                # First resolved diagnosis
+                "date": date(2000,2,1),
+                "snomedct_code": "162659009"
+            },
+            {
+                # Second resolved diagnosis
+                "date": date(2020,3,4),
+                "snomedct_code": "162659009"
+            },
+        ],
+        "expected_in_population": True,
+        "expected_columns": {
+            "hyplat_dat": date(2024,2,2),
+            "hypres_dat": date(2020,3,4),
+            "hyp_reg_r1": True, 
+        },
+
+    },
+    # Correctly not expected in population
+    # Resolved diagnosis before index date
+    4: {
+        "patients": {"date_of_birth": date(1950, 1, 1)},
+        "practice_registrations": [
+            {
+                "start_date": date(2010, 1, 1),
+            },
+        ],
+        "clinical_events": [
+            {
+               # First hypertension diagnosis
+                "date": date(2000,1,1),
+                "snomedct_code": "38341003", 
+            },
+            {
+               # First resolved diagnosis
+                "date": date(2000,2,1),
+                "snomedct_code": "162659009" 
+            },
+            ],
+        "expected_in_population": False,
+    },
+
+    # Correctly not expected in population
+    # Patient not registered at practice
+    5: {
+        "patients": {"date_of_birth": date(1950, 1, 1)},
+        "practice_registrations": [
+            {
+                "start_date": date(2010, 1, 1),
+                "end_date": date(2020,1,2),
+            },
+        ],
+        "clinical_events": [
+            {
+               # First hypertension diagnosis
+                "date": date(2000,1,1),
+                "snomedct_code": "38341003", 
+            },
+            ],
+        "expected_in_population": False,
+    },
+
+    # Correctly not expected in population
+    # Patient diagnosed after index date
+    6: {
+        "patients": {"date_of_birth": date(1950, 1, 1)},
+        "practice_registrations": [
+            {
+                "start_date": date(2010, 1, 1),
+            },
+        ],
+        "clinical_events": [
+            {
+               # First hypertension diagnosis
+                "date": date(2024,4,1),
+                "snomedct_code": "38341003", 
+            },
+            ],
+        "expected_in_population": False,
+    },
+
+    # Correctly not expected in population
+    # Patient diagnosed with a cold before index date
+    7: {
+        "patients": {"date_of_birth": date(1960, 1, 1)},
+        "practice_registrations": [
+            {
+                "start_date": date(1960, 1, 1),
+            },
+        ],
+        "clinical_events": [
+            {
+                # First cold diagnosis
+                "date": date(2000, 6, 1),
+                "snomedct_code": "82272006",
+            },
+            ],
+        "expected_in_population": False,
+    },
+ }
\ No newline at end of file

From 9f9f9ac7085a4179f3dee6c2dcd3cdb9bc713ca0 Mon Sep 17 00:00:00 2001
From: viv3ckj <viveck.kingsley@gmail.com>
Date: Wed, 6 Aug 2025 13:47:01 +0100
Subject: [PATCH 6/7] Update project yaml to contain only the remaining script
 actions

---
 project.yaml | 53 ++--------------------------------------------------
 1 file changed, 2 insertions(+), 51 deletions(-)

diff --git a/project.yaml b/project.yaml
index f1f6faa..575c327 100644
--- a/project.yaml
+++ b/project.yaml
@@ -5,38 +5,15 @@ expectations:
   population_size: 1000
 
 actions:
-  generate_dm_reg_atamborska:
-    run: ehrql:v1 
-      generate-dataset analysis/dm_reg_dataset_atamborska.py 
-      --output output/dm/dm_reg_atamborska.csv.gz
-    outputs:
-      highly_sensitive:
-        dataset: output/dm/dm_reg_atamborska.csv.gz
-
-  generate_registry_dm_jaidip:
-    run: ehrql:v1 generate-dataset analysis/dm_reg_dataset_jaidip_gill.py --output output/dm/registry_dm_jaidip.csv.gz
-    outputs:
-      highly_sensitive:
-        dataset: output/dm/registry_dm_jaidip.csv.gz
-        
   generate_hyp_reg_dataset:
       run: >
         ehrql:v1
         generate-dataset analysis/hyp_reg_dataset.py
         --test-data-file analysis/test_hyp_reg_dataset.py
-        --output output/hyp/hyp001_viveck.csv.gz
+        --output output/hyp/hyp001.csv.gz
       outputs:
         highly_sensitive:
-          cohort: output/hyp/hyp001_viveck.csv.gz
-
-  generate_dm_reg_dataset_viveck:
-    run: >
-      ehrql:v1
-       generate-dataset analysis/dm_reg_dataset_viveck.py
-       --output output/dm/dm017_viveck.csv.gz
-    outputs:
-      highly_sensitive:
-        cohort: output/dm/dm017_viveck.csv.gz
+          cohort: output/hyp/hyp001.csv.gz
 
   generate_dm_reg_dataset_milan:
     run: >
@@ -48,32 +25,6 @@ actions:
       highly_sensitive:
         cohort: output/dm/dm017_milan.csv.gz
 
-  generate_dem_reg_measures:
-    run: >
-      ehrql:v1
-        generate-measures analysis/dem_reg_measures.py
-        --output output/dem/dem_reg_measures.csv
-    outputs:
-      moderately_sensitive:
-        dem_reg_measures: output/dem/dem_reg_measures.csv
-
-  visualise_dem_rates:
-    run: > 
-      r:latest
-        analysis/dem_reg_plots.R
-        --output output/dem/dem_reg_rates.png
-        --output output/dem/dem_reg_rates_imd.png
-    needs: [generate_dem_reg_measures]
-    outputs:
-      moderately_sensitive:
-        dem_reg_rates_unstratified: output/dem/dem_reg_rates.png
-        dem_reg_rates_imd: output/dem/dem_reg_rates_imd.png
-
-  generate_dep_reg_measures_jaidip:
-    run: ehrql:v1 generate-measures analysis/dep_reg_measures_jaidip.py --output output/dep/dep_jaidip.csv.gz
-    outputs:
-      highly_sensitive:
-        measure: output/dep/dep_jaidip.csv.gz
   generate_dep_reg_dataset:
     run: ehrql:v1 generate-dataset analysis/dep_reg_dataset.py --output output/dep/dep_reg_dataset.csv.gz
     outputs:

From 88ac1f9ae0a8de9030513a341d12ee9958bdb411 Mon Sep 17 00:00:00 2001
From: viv3ckj <viveck.kingsley@gmail.com>
Date: Wed, 6 Aug 2025 14:30:21 +0100
Subject: [PATCH 7/7] Added depression test script and removed names from
 scripts

---
 ...reg_dataset_milan.py => dm_reg_dataset.py} |   0
 analysis/test_dep_reg_dataset.py              | 162 ++++++++++++++++++
 ...ataset_milan.py => test_dm_reg_dataset.py} |   4 +-
 project.yaml                                  |  16 +-
 4 files changed, 174 insertions(+), 8 deletions(-)
 rename analysis/{dm_reg_dataset_milan.py => dm_reg_dataset.py} (100%)
 create mode 100644 analysis/test_dep_reg_dataset.py
 rename analysis/{test_dm_reg_dataset_milan.py => test_dm_reg_dataset.py} (97%)

diff --git a/analysis/dm_reg_dataset_milan.py b/analysis/dm_reg_dataset.py
similarity index 100%
rename from analysis/dm_reg_dataset_milan.py
rename to analysis/dm_reg_dataset.py
diff --git a/analysis/test_dep_reg_dataset.py b/analysis/test_dep_reg_dataset.py
new file mode 100644
index 0000000..2937f53
--- /dev/null
+++ b/analysis/test_dep_reg_dataset.py
@@ -0,0 +1,162 @@
+from datetime import date
+
+# 1) Import dataset from the dataset definition
+from dep_reg_dataset import dataset
+
+# Patient data for the FY23/24 with index date = "2024-04-01"
+# Run the tests with the following command:
+# opensafely exec ehrql:v1 assure analysis/test_dep_reg_dataset.py
+
+# 2) Add test cases to test the dataset definition
+
+test_data = {
+    # Correctly not expected in population
+    # No clinical events
+    1: {
+        "patients": {"date_of_birth": date(1950, 1, 1)},
+        "practice_registrations": [
+            {
+                "start_date": date(2010, 1, 1),
+            },
+        ],
+        "clinical_events": [{}],
+        "expected_in_population": False,
+    },
+    # Correctly not expected in population
+    # Diagnosis after the index date
+    2: {
+        "patients": {"date_of_birth": date(1950, 1, 1)},
+        "practice_registrations": [
+            {
+                "start_date": date(2010, 1, 1),
+            },
+        ],
+        "clinical_events": [
+            {
+                # First depression diagnosis (dep_codelist)
+                "date": date(2024, 8, 1),
+                "snomedct_code": "191601008",
+            },
+        ],
+        "expected_in_population": False,
+    },
+    # Correctly not expected in population
+    # Younger than 18yo at index date
+    3: {
+        "patients": {"date_of_birth": date(2010, 1, 1)},
+        "practice_registrations": [
+            {
+                "start_date": date(2010, 1, 1),
+            },
+        ],
+        "clinical_events": [
+            {
+                # First depression diagnosis (dep_codelist)
+                "date": date(2022, 8, 1),
+                "snomedct_code": "191601008",
+            },
+        ],
+        "expected_in_population": False,
+    },
+    # Correctly not expected in population
+    # Not registered at index date
+    4: {
+        "patients": {"date_of_birth": date(1960, 1, 1)},
+        "practice_registrations": [
+            {
+                "start_date": date(1960, 1, 1),
+                "end_date": date(2020, 1, 1),
+            },
+        ],
+        "clinical_events": [
+            {
+                # First depression diagnosis (dep_codelist)
+                "date": date(2022, 8, 1),
+                "snomedct_code": "191601008",
+            },
+        ],
+        "expected_in_population": False,
+    },
+    # Correctly not expected in population
+    # Depression diagnosis resolved before index date
+    5: {
+        "patients": {"date_of_birth": date(1960, 1, 1)},
+        "practice_registrations": [
+            {
+                "start_date": date(1960, 1, 1),
+            },
+        ],
+        "clinical_events": [
+            {
+                # First depression diagnosis (dep_codelist)
+                "date": date(2008, 6, 1),
+                "snomedct_code": "191601008",
+            },
+            {
+                # depression diagnosis resolved (depres_codelist)
+                "date": date(2023, 1, 1),
+                "snomedct_code": "196381000000100",
+            },
+        ],
+        "expected_in_population": False,
+    },
+    # Correctly expected in population
+    # depression diagnosis before index date
+    6: {
+        "patients": {"date_of_birth": date(1960, 1, 1)},
+        "practice_registrations": [
+            {
+                "start_date": date(1960, 1, 1),
+            },
+        ],
+        "clinical_events": [
+            {
+                # First depression diagnosis (dep_codelist)
+                "date": date(2015, 6, 1),
+                "snomedct_code": "191601008",
+            },
+        ],
+        "expected_in_population": True,
+        "expected_columns": {
+            "pat_age": 64,
+            "depr_dat": date(2015, 6, 1),
+            "depres_dat": None,
+        },
+    },
+    # Correctly not expected in population
+    # Code not from dep_codelist codelist
+    7: {
+        "patients": {"date_of_birth": date(1960, 1, 1)},
+        "practice_registrations": [
+            {
+                "start_date": date(1960, 1, 1),
+            },
+        ],
+        "clinical_events": [
+            {
+                # Code not in the codelist
+                "date": date(2000, 6, 1),
+                "snomedct_code": "1111111111",
+            },
+        ],
+        "expected_in_population": False,
+    },
+    # Correctly not expected in population
+    # Depression diagnosis recorded before min date (2006-04-01)
+    8: {
+        "patients": {"date_of_birth": date(1960, 1, 1)},
+        "practice_registrations": [
+            {
+                "start_date": date(1960, 1, 1),
+            },
+        ],
+        "clinical_events": [
+            {
+                # Diagnosis before min date
+                "date": date(2000, 6, 1),
+                "snomedct_code": "191601008",
+            },
+        ],
+        "expected_in_population": False,
+    },
+}
diff --git a/analysis/test_dm_reg_dataset_milan.py b/analysis/test_dm_reg_dataset.py
similarity index 97%
rename from analysis/test_dm_reg_dataset_milan.py
rename to analysis/test_dm_reg_dataset.py
index 8026ac9..587ccf8 100644
--- a/analysis/test_dm_reg_dataset_milan.py
+++ b/analysis/test_dm_reg_dataset.py
@@ -1,11 +1,11 @@
 from datetime import date
 
 # 1) Import dataset from the dataset definition
-from dm_reg_dataset_milan import dataset
+from dm_reg_dataset import dataset
 
 # Patient data for the FY23/24 with index date = "2024-03-31"
 # Run the tests with the following command:
-# opensafely exec ehrql:v1 assure analysis/test_dm_reg_dataset_milan.py
+# opensafely exec ehrql:v1 assure analysis/test_dm_reg_dataset.py
 
 # 2) Add test cases to test the dataset definition
 
diff --git a/project.yaml b/project.yaml
index 575c327..668b9d0 100644
--- a/project.yaml
+++ b/project.yaml
@@ -15,18 +15,22 @@ actions:
         highly_sensitive:
           cohort: output/hyp/hyp001.csv.gz
 
-  generate_dm_reg_dataset_milan:
+  generate_dm_reg_dataset:
     run: >
       ehrql:v1 
-        generate-dataset analysis/dm_reg_dataset_milan.py
-        --test-data-file analysis/test_dm_reg_dataset_milan.py
-        --output output/dm/dm017_milan.csv.gz
+        generate-dataset analysis/dm_reg_dataset.py
+        --test-data-file analysis/test_dm_reg_dataset.py
+        --output output/dm/dm017.csv.gz
     outputs:
       highly_sensitive:
-        cohort: output/dm/dm017_milan.csv.gz
+        cohort: output/dm/dm017.csv.gz
 
   generate_dep_reg_dataset:
-    run: ehrql:v1 generate-dataset analysis/dep_reg_dataset.py --output output/dep/dep_reg_dataset.csv.gz
+    run: 
+      ehrql:v1 
+        generate-dataset analysis/dep_reg_dataset.py
+        --test-data-file analysis/test_dep_reg_dataset.py 
+        --output output/dep/dep_reg_dataset.csv.gz
     outputs:
       highly_sensitive:
         dataset: output/dep/dep_reg_dataset.csv.gz