From 9623cfaacc58872dc4eef23a61333e20c36f5de7 Mon Sep 17 00:00:00 2001 From: viv3ckj Date: Fri, 1 Aug 2025 10:26:50 +0100 Subject: [PATCH 1/7] Add comments to improve clarity of code --- analysis/dm_reg_dataset_milan.py | 41 +++++++++++++++++++------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/analysis/dm_reg_dataset_milan.py b/analysis/dm_reg_dataset_milan.py index ec67fd8..80c2a99 100644 --- a/analysis/dm_reg_dataset_milan.py +++ b/analysis/dm_reg_dataset_milan.py @@ -1,8 +1,11 @@ +# Objective: Create a dataset with columns of interest, taken from TPP tables + from pathlib import Path -from ehrql import create_dataset -from ehrql.codes import codelist_from_csv +# 1) Import the required ehrql functions +from ehrql import create_dataset, codelist_from_csv +# 2) Import relevant tables from the TPP backend (OpenSAFELY-TPP) from ehrql.tables.tpp import ( clinical_events, patients, @@ -11,6 +14,8 @@ CODELIST_DIR = Path("codelists") +# 3) Load SNOMED-CT codelists from CSV files + # Cluster name: DM_COD # Description: Diabetes mellitus codes # Refset ID: ^999004691000230108 @@ -27,8 +32,9 @@ column="code", ) +# 4) Define variables and the study population -# Helper function for finding the last matching event +# Utility function: return the most recent event (if any) for a patient matching a given codelist def last_matching_event(events, codelist, where=True): return ( events.where(where) @@ -37,44 +43,45 @@ def last_matching_event(events, codelist, where=True): .last_for_patient() ) - +# Define the index date for this dataset (e.g. 
reporting period end) index_date = "2024-03-31" +# Create a dataset object dataset = create_dataset() +# Identify patients who were registered on the index date has_registration = practice_registrations.for_patient_on( index_date ).exists_for_patient() -# Extract prior events for further use in variable definitions below +# Filter clinical events to only those on or before the index date prior_events = clinical_events.where(clinical_events.date.is_on_or_before(index_date)) # Field number: 4 -# PAT_AGE: The age of the patient in full years at the achievement date. +# PAT_AGE: The age of the patient in full years at the index date. dataset.pat_age = patients.age_on(index_date) # Field number: 6 -# DMLAT_DAT: Date of the most recent diabetes diagnosis up to and -# including the achievement date. +# DMLAT_DAT: Date of the most recent diabetes diagnosis on or before the index date dataset.dmlat_dat = last_matching_event(prior_events, dm_cod).date # Field number: 7 # DMRES_DAT: Date of the most recent diabetes diagnosis resolved code -# recorded after the most recent diabetes diagnosis and up to and -# including the achievement date. +# recorded on or before the index date (if any) dataset.dmres_dat = last_matching_event(prior_events, dm_res_cod).date # DM_REG rule 1: -# Pass to the next rule all patients from the specified population who meet -# both of the criteria below: Have a diabetes diagnosis in the patient record -# up to and including the achievement date. Latest diabetes diagnosis is not -# followed by a diabetes resolved code. +# Include patients with a diabetes diagnosis and no subsequent resolution +# (i.e. latest resolved code is earlier than diagnosis, or no resolved code recorded) dataset.dm_reg_r1 = (dataset.dmres_dat < dataset.dmlat_dat) | ( dataset.dmlat_dat.is_not_null() & dataset.dmres_dat.is_null() ) # DM_REG rule 2: -# Reject patients passed to this rule who are aged under 17 years old on the -# achievement date. 
+# Flag patients aged under 17 on the index date (excluded from the population below) dataset.dm_reg_r2 = dataset.pat_age < 17 -dataset.define_population(has_registration & dataset.dm_reg_r1 & ~dataset.dm_reg_r2) +# Define the final population: +# - Registered on index date +# - Meet rule 1 (active diabetes diagnosis) +# - Do not meet rule 2 (i.e. are 17 or older) +dataset.define_population(has_registration & dataset.dm_reg_r1 & ~dataset.dm_reg_r2) \ No newline at end of file From ad1159a5851ab43cc5d6d3931d5b20cd7eeacfa1 Mon Sep 17 00:00:00 2001 From: viv3ckj Date: Fri, 1 Aug 2025 10:34:08 +0100 Subject: [PATCH 2/7] Add a few comments to test script --- analysis/test_dm_reg_dataset_milan.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/analysis/test_dm_reg_dataset_milan.py b/analysis/test_dm_reg_dataset_milan.py index 08e1aeb..8026ac9 100644 --- a/analysis/test_dm_reg_dataset_milan.py +++ b/analysis/test_dm_reg_dataset_milan.py @@ -1,10 +1,14 @@ from datetime import date + +# 1) Import dataset from the dataset definition from dm_reg_dataset_milan import dataset # Patient data for the FY23/24 with index date = "2024-03-31" # Run the tests with the following command: # opensafely exec ehrql:v1 assure analysis/test_dm_reg_dataset_milan.py +# 2) Add test cases to test the dataset definition + test_data = { # Correctly not expected in population # No clinical events From b66cd4a0fb856bf301655ce89c607a92940fe51a Mon Sep 17 00:00:00 2001 From: viv3ckj Date: Fri, 1 Aug 2025 10:35:56 +0100 Subject: [PATCH 3/7] Remove other QOF files --- analysis/dem_reg_measures.py | 100 ------------ analysis/dem_reg_plots.R | 36 ----- analysis/dep_reg_dataset.py | 62 -------- analysis/dep_reg_measures_jaidip.py | 73 --------- analysis/dm_reg_dataset_atamborska.py | 87 ----------- analysis/dm_reg_dataset_jaidip_gill.py | 31 ---- analysis/dm_reg_dataset_viveck.py | 72 --------- analysis/hyp_reg_dataset.py | 53 ------- analysis/test_dm_reg_dataset_atamborska.py | 156 ------------------ 
analysis/test_hyp_reg_dataset.py | 174 --------------------- 10 files changed, 844 deletions(-) delete mode 100644 analysis/dem_reg_measures.py delete mode 100644 analysis/dem_reg_plots.R delete mode 100644 analysis/dep_reg_dataset.py delete mode 100644 analysis/dep_reg_measures_jaidip.py delete mode 100644 analysis/dm_reg_dataset_atamborska.py delete mode 100644 analysis/dm_reg_dataset_jaidip_gill.py delete mode 100644 analysis/dm_reg_dataset_viveck.py delete mode 100644 analysis/hyp_reg_dataset.py delete mode 100644 analysis/test_dm_reg_dataset_atamborska.py delete mode 100644 analysis/test_hyp_reg_dataset.py diff --git a/analysis/dem_reg_measures.py b/analysis/dem_reg_measures.py deleted file mode 100644 index debf786..0000000 --- a/analysis/dem_reg_measures.py +++ /dev/null @@ -1,100 +0,0 @@ -# Task: Calulcate the monthly prevalence trends for the dementia QOF - -# 1) Import the required functions from ehrql -from ehrql import codelist_from_csv -from ehrql import INTERVAL, case, create_measures, months, when - -# 2) Import the tables of interest from TPP -from ehrql.tables.tpp import ( - patients, - practice_registrations, - clinical_events, - addresses, -) - -# 3) Import the codelists of interest -DEM_COD = codelist_from_csv( - "codelists/nhsd-primary-care-domain-refsets-dm_cod.csv", - column="code", -) - -# 4) Create series for numerator and denominator for each interval -## Numerator - number of people with diagnosis of dementia before the end of elegible period -selected_events = clinical_events.where( - clinical_events.date.is_on_or_before(INTERVAL.end_date) -) - -latest_dementia_date = ( - selected_events.where(selected_events.snomedct_code.is_in(DEM_COD)) - .sort_by(selected_events.date) - .last_for_patient() - .date -) - -# Simpler version - -latest_dementia_date_dementia = ( - clinical_events.where( - (clinical_events.date.is_on_or_before(INTERVAL.end_date)) - & (clinical_events.snomedct_code.is_in(DEM_COD)) - ) - .sort_by(clinical_events.date) - 
.last_for_patient() - .date -) - - -## Denominator - number of people with practice registration before the beginning of the interval -## and with no deregsotration or with reregistration after the end of each interval -has_registration_date = ( - practice_registrations.where( - practice_registrations.start_date.is_on_or_before(INTERVAL.start_date) - & ( - practice_registrations.end_date.is_null() - | practice_registrations.end_date.is_on_or_after(INTERVAL.end_date) - ) - ) - .sort_by(practice_registrations.start_date) - .first_for_patient() - .start_date -) - -# Simpler -has_registration_date = practice_registrations.spanning( - INTERVAL.start_date, INTERVAL.end_date -).exists_for_patient() - -# Create measures framework -measures = create_measures() - -measures.define_measure( - name="dem_reg", - numerator=latest_dementia_date.is_not_null(), - denominator=has_registration_date.is_not_null(), - intervals=months(12).starting_on("2023-04-01"), -) - -# The output is a dataframe of measure name, interval_start, -# interval_end, ratio, numerator, denominator. 
- -# Analyse dementia rates by subrgoup (socio-economic status) - - -imd_rounded = addresses.for_patient_on(INTERVAL.start_date).imd_rounded -max_imd = 32844 - -imd_quintile = case( - when(imd_rounded < int(max_imd * 1 / 5)).then(1), - when(imd_rounded < int(max_imd * 2 / 5)).then(2), - when(imd_rounded < int(max_imd * 3 / 5)).then(3), - when(imd_rounded < int(max_imd * 4 / 5)).then(4), - when(imd_rounded <= max_imd).then(5), -) - -measures.define_measure( - name="dem_qof_monthy_imd", - numerator=latest_dementia_date.is_not_null(), - denominator=has_registration_date, - group_by={"imd": imd_quintile}, - intervals=months(12).starting_on("2023-04-01"), -) diff --git a/analysis/dem_reg_plots.R b/analysis/dem_reg_plots.R deleted file mode 100644 index ba69aff..0000000 --- a/analysis/dem_reg_plots.R +++ /dev/null @@ -1,36 +0,0 @@ -# Set up -library(tidyverse) -library(lubridate) -df <- read.csv("output/dem/dem_reg_measures.csv") - -# summary(df) - confirms that there are 37 missing ratios and 24 missing imds - -# Dementia prevalence plot, stratified by IMD -df %>% - filter(!is.na(imd)) %>% - ggplot(aes (x = ymd(interval_start), y = ratio, color = as.factor(imd))) + - geom_line()+ - labs ( - title = "Dementia prevalence over time, per IMD group", - x = NULL, - y = "Prevalence of dementia", - color = "Index of multiple depravation" - ) + - theme_light() - -ggsave("output/dem/dem_reg_rates_imd.png") - -# Dementia prevalence, unstratified - -df %>% -filter(is.na(imd)) %>% -ggplot(aes (x = ymd(interval_start), y = ratio)) + -geom_line() + -labs ( - title = "Dementia prevalence over time, unstratfied", - x = NULL, - y = "Prevalence of dementia" - ) + -theme_light() - -ggsave("output/dem/dem_reg_rates.png") \ No newline at end of file diff --git a/analysis/dep_reg_dataset.py b/analysis/dep_reg_dataset.py deleted file mode 100644 index 95bba39..0000000 --- a/analysis/dep_reg_dataset.py +++ /dev/null @@ -1,62 +0,0 @@ -from ehrql import create_dataset -from ehrql.codes import 
codelist_from_csv - -from ehrql.tables.tpp import ( - clinical_events, - patients, - practice_registrations, -) - -dataset = create_dataset() -# Quality Service Start Date -qs_start_date = '2024-04-01' -# Fixed date used in DEPR1_REG and DEPCC01 logic. -# Depression register: Patients aged at least 18 years old whose latest -# unresolved episode of depression is since 1 st April 2006. -min_date = '2006-04-01' - -# import codelists -dep_codelist = codelist_from_csv( - 'codelists/nhsd-primary-care-domain-refsets-depr_cod.csv', - column='code') -depres_codelist = codelist_from_csv( - 'codelists/nhsd-primary-care-domain-refsets-depres_cod.csv', - column='code') - -# inclusion criteria -has_registration = (practice_registrations - .for_patient_on(qs_start_date) - .exists_for_patient()) - -# Create disease variables -# - DEPR_DAT = Latest date of depression diagnosis -# - DEPRES_DAT = Latest depression resolved date -# - PAT_AGE = Age at quality service start date - -dataset.depr_dat = ((clinical_events. - where(clinical_events - .snomedct_code - .is_in(dep_codelist)) - ).sort_by(clinical_events.date) - .last_for_patient() - .date) -dataset.depres_dat = (clinical_events - .where(clinical_events - .snomedct_code - .is_in(depres_codelist)) - .sort_by(clinical_events.date) - .last_for_patient() - .date) -dataset.pat_age = (patients - .age_on(qs_start_date)) - -# --- Creating rules --- -# Have their latest first or new episode of depression on or after 1 st April 2006. -# Latest episode of depression is not followed by a depression resolved code. 
-# Must be aged 18 or over on the achievement date - -dep1_reg_r1 = ((dataset.depr_dat.is_on_or_after(min_date)) - & (dataset.depres_dat.is_null())) -dep2_reg_r2 = dataset.pat_age >= 18 - -dataset.define_population(has_registration & dep1_reg_r1 & dep2_reg_r2) diff --git a/analysis/dep_reg_measures_jaidip.py b/analysis/dep_reg_measures_jaidip.py deleted file mode 100644 index c646c98..0000000 --- a/analysis/dep_reg_measures_jaidip.py +++ /dev/null @@ -1,73 +0,0 @@ -from ehrql import INTERVAL, create_measures, months, codelist_from_csv, case, when -from ehrql.tables.tpp import practice_registrations, clinical_events, patients - -# Instantiate measures -measures = create_measures() - -# Import codelists -dep_codelist = codelist_from_csv( - "codelists/nhsd-primary-care-domain-refsets-depr_cod.csv", - column="code", -) - -depres_codelist = codelist_from_csv( - "codelists/nhsd-primary-care-domain-refsets-depres_cod.csv", - column="code", -) - -# Important dates -# Quality Service Start Date -qs_start_date = "2023-04-01" - -# Have their latest first or new episode of depression on or after 1 st April 2006 -min_date = "2006-04-01" - -# Inclusion criteria -has_registration = practice_registrations.spanning( - INTERVAL.start_date, INTERVAL.end_date -).exists_for_patient() - -# Create disease variables -# - depr_dat = Latest date of depression diagnosis -# - depres_dat = Latest depression resolved date -# - pat_age = Age at quality service start date -depr_dat = ( - (clinical_events.where(clinical_events.snomedct_code.is_in(dep_codelist))) - .sort_by(clinical_events.date) - .last_for_patient() - .date -) -depres_dat = ( - clinical_events.where(clinical_events.snomedct_code.is_in(depres_codelist)) - .sort_by(clinical_events.date) - .last_for_patient() - .date -) -pat_age = patients.age_on(INTERVAL.start_date) - - -# Creating rules -dep1_reg_r1 = (depr_dat.is_on_or_after(min_date)) & (depres_dat.is_null()) -dep1_reg_r2 = pat_age >= 18 - -# Define measures -dep_in_interval = 
depr_dat.is_during(INTERVAL) - -# Stratification groups -age_band = case( - when((pat_age >= 18) & (pat_age <= 21)).then("Young adult"), - when((pat_age >= 22) & (pat_age <= 64)).then("Adult"), - when(pat_age >= 65).then("Elderly"), -) - -# Create measures -measures.define_measure( - name="monthly_prevalence", - numerator=dep_in_interval, - denominator=has_registration & dep1_reg_r1 & dep1_reg_r2, - intervals=months(12).starting_on(qs_start_date), - group_by={ - "sex": patients.sex, - "age": age_band, - }, -) diff --git a/analysis/dm_reg_dataset_atamborska.py b/analysis/dm_reg_dataset_atamborska.py deleted file mode 100644 index 81f2ae3..0000000 --- a/analysis/dm_reg_dataset_atamborska.py +++ /dev/null @@ -1,87 +0,0 @@ -# Import functions that will be used for dataset definition -from ehrql import create_dataset -from ehrql import codelist_from_csv -from ehrql import Path - -# Import the tables of the variables which you need to define the dataset -from ehrql.tables.tpp import patients, practice_registrations, clinical_events - -## could use from ehrql.tables.core or from ehrql.tables.emis - -# Create an empty dataset - we will be adding columns to this, based on which we will filter data -dataset = create_dataset() - -# Create objects containing codelists - these will be used to define the dataset -codelist_path = Path("codelists") -dm_cod = codelist_from_csv( - codelist_path / "nhsd-primary-care-domain-refsets-dm_cod.csv", - column="code", -) -dmres_cod = codelist_from_csv( - codelist_path / "nhsd-primary-care-domain-refsets-dmres_cod.csv", - column="code", -) - -# Simpler: -dm_cod = codelist_from_csv( - "codelists/nhsd-primary-care-domain-refsets-dm_cod.csv", - column="code", -) -dmres_cod = codelist_from_csv( - "codelists/nhsd-primary-care-domain-refsets-dmres_cod.csv", - column="code", -) - -# Create index date -index_date = "2024-03-31" - -# Create a boolean vector for which patients were registered up until the index date -registered = 
practice_registrations.for_patient_on(index_date).exists_for_patient() - -# However, unless we add this information to the dataset, we won't be able to filter based on it, using dataset.define_population -##So first, we need to add a column to the dataset which defines which patients are registered -dataset.registered = practice_registrations.for_patient_on( - index_date -).exists_for_patient() - -# Derive patient age from the DOB and add to the dataset -dataset.pat_age = patients.age_on(index_date) - -# Derive latest dates for Dx DM and Dx DM res for each patient, and add them to the dataset - -## Select the latest date with the diabetes code and add it to the dataset -dataset.latest_dm = ( - clinical_events.where(clinical_events.snomedct_code.is_in(dm_cod)) - .sort_by(clinical_events.date) - .last_for_patient() - .date -) - - -## Select the latest date with the diabetes resolved code and add it to the dataset -dataset.latest_dmres = ( - clinical_events.where(clinical_events.snomedct_code.is_in(dmres_cod)) - .sort_by(clinical_events.date) - .last_for_patient() - .date -) - -# Define the dataset (=cohort) - using the rules - -##Rule 1 -### Has a diabetes diagnosis before the index date -### Latest diabetes diagnosis is not followed by a diabetes resolved code. 
- -dm_reg_r1 = ( - (dataset.latest_dm.is_not_null()) - & (dataset.latest_dm < index_date) - & ((dataset.latest_dmres < dataset.latest_dm) | dataset.latest_dmres.is_null()) -) - -##Rule 2 -### Needs to be minimim 17 years old on the index date - -dm_reg_r2 = dataset.pat_age >= 17 - -# Create the population -dataset.define_population(registered & dm_reg_r1 & dm_reg_r2) diff --git a/analysis/dm_reg_dataset_jaidip_gill.py b/analysis/dm_reg_dataset_jaidip_gill.py deleted file mode 100644 index 0a82504..0000000 --- a/analysis/dm_reg_dataset_jaidip_gill.py +++ /dev/null @@ -1,31 +0,0 @@ -from codelists import * -from ehrql import create_dataset -from ehrql.tables.tpp import patients, practice_registrations, clinical_events - -dataset = create_dataset() - -index_date = "2024-03-31" -has_registration = practice_registrations.for_patient_on(index_date).exists_for_patient() -''' -Extract the following columns: -1. age at achievement date -2. earliest diabetes resolved date -''' -dataset.age = patients.age_on(index_date) -prev_events = clinical_events.where(clinical_events.date.is_on_or_before(index_date)) -dataset.dmlate_date = prev_events.where(clinical_events.snomedct_code.is_in(diabetes_codelist)).sort_by(clinical_events.date).last_for_patient().date -dataset.dmreso_date = prev_events.where(clinical_events.snomedct_code.is_in(diabetes_resolved_codelist)).sort_by(clinical_events.date).last_for_patient().date - -''' -Filter each of the columns: -1. dmreso_date < dmlate_date or dmreso_date doesn't exist -2. 
age >= 17 -''' -dataset.dm_reg_r1 = ( - ((dataset.dmlate_date > dataset.dmreso_date) | dataset.dmreso_date.is_null()) - & dataset.dmlate_date.is_not_null() -) -dataset.dm_reg_r2 = dataset.age >= 17 - -# Filter the dataset -dataset.define_population(has_registration & dataset.dm_reg_r1 & dataset.dm_reg_r2) diff --git a/analysis/dm_reg_dataset_viveck.py b/analysis/dm_reg_dataset_viveck.py deleted file mode 100644 index 9e6d863..0000000 --- a/analysis/dm_reg_dataset_viveck.py +++ /dev/null @@ -1,72 +0,0 @@ -from ehrql import codelist_from_csv -from ehrql import create_dataset -from ehrql.tables.tpp import ( - clinical_events, - patients, - practice_registrations, -) - -### codelists ### - -# Diabetes mellitus codes -dm_cod = codelist_from_csv( - "codelists/nhsd-primary-care-domain-refsets-dm_cod.csv", - column="code" -) -# Diabetes resolved codes -dmres_cod = codelist_from_csv( - "codelists/nhsd-primary-care-domain-refsets-dmres_cod.csv", - column="code" -) - -### create dataset and constant variable ### - -index_date = "2024-03-31" -dataset = create_dataset() - -### variables ### - -# VARIABLE 1: pat_age -# age of the patient in full years at the achievement date -dataset.pat_age = patients.age_on(index_date) - -# VARIABLE 2: dmlat_dat -# date of the most recent diabetes diagnosis up to and including the achievement date -dataset.dmlat_dat = ( - clinical_events.where(clinical_events.snomedct_code.is_in(dm_cod) & clinical_events.date.is_on_or_before(index_date)) - .sort_by(clinical_events.date) - .last_for_patient() - .date -) - -# VARIABLE 3: dmres_dat -# date of the most recent diabetes diagnosis resolved code recorded after the most recent diabetes diagnosis and up to and including the achievement date -dataset.dmres_dat = ( - clinical_events.where(clinical_events.snomedct_code.is_in(dmres_cod) & clinical_events.date.is_on_or_before(index_date)) - .sort_by(clinical_events.date) - .last_for_patient() - .date -) - -### rules ### - -# RULE 1: dm_reg_r1 -# Pass to the 
next rule all patients from the specified population who meet both of the criteria below: -# Have a diabetes diagnosis in the patient record up to and including the achievement date. -# Latest diabetes diagnosis is not followed by a diabetes resolved code. -dataset.dm_reg_r1 = dataset.dmres_dat.is_null() & dataset.dmlat_dat.is_not_null() | (dataset.dmlat_dat > dataset.dmres_dat) - -# RULE 2: dm_reg_r2 -# Reject patients passed to this rule who are aged under 17 years old on the -# achievement date. -dataset.dm_reg_r2 = dataset.pat_age < 17 - -### defining population ### - -# Applying both rules and registration on index date to define population -has_registration = practice_registrations.for_patient_on(index_date).exists_for_patient() -dataset.define_population((has_registration) & ~(dataset.dm_reg_r2) & (dataset.dm_reg_r1)) - - - - diff --git a/analysis/hyp_reg_dataset.py b/analysis/hyp_reg_dataset.py deleted file mode 100644 index c9eb072..0000000 --- a/analysis/hyp_reg_dataset.py +++ /dev/null @@ -1,53 +0,0 @@ -from ehrql import codelist_from_csv -from ehrql import create_dataset -from ehrql.tables.tpp import ( - clinical_events, - patients, - practice_registrations, -) - -# Hypertension codes -hyp_cod = codelist_from_csv( - "codelists/nhsd-primary-care-domain-refsets-hyp_cod.csv", - column="code" -) -# Hypertension resolved codes -hypres_cod = codelist_from_csv( - "codelists/nhsd-primary-care-domain-refsets-hypres_cod.csv", - column="code" -) - -index_date = "2024-03-31" -dataset = create_dataset() - -### variables ### - -# date of the most recent hypertension diagnosis up to and including the achievement date -dataset.hyplat_dat = ( - clinical_events.where(clinical_events.snomedct_code.is_in(hyp_cod) & clinical_events.date.is_on_or_before(index_date)) - .sort_by(clinical_events.date) - .last_for_patient() - .date -) - -# date of the most recent hypertension diagnosis resolved code recorded after the most recent hypertension diagnosis and up to and including 
the achievement date -dataset.hypres_dat = ( - clinical_events.where(clinical_events.snomedct_code.is_in(hypres_cod) & clinical_events.date.is_on_or_before(index_date)) - .sort_by(clinical_events.date) - .last_for_patient() - .date -) - - -### rules ### - -# Pass to the next rule all patients from the specified population who meet both of the criteria below: -# Have a hypertension diagnosis in the patient record up to and including the achievement date. -# Latest hypertension diagnosis is not followed by a hypertension resolved code. -dataset.hyp_reg_r1 = dataset.hyplat_dat.is_not_null() & dataset.hypres_dat.is_null() | (dataset.hyplat_dat > dataset.hypres_dat) - -### defining population ### -has_registration = practice_registrations.for_patient_on(index_date).exists_for_patient() -dataset.define_population( - (has_registration) & dataset.hyp_reg_r1 - ) diff --git a/analysis/test_dm_reg_dataset_atamborska.py b/analysis/test_dm_reg_dataset_atamborska.py deleted file mode 100644 index c99b03e..0000000 --- a/analysis/test_dm_reg_dataset_atamborska.py +++ /dev/null @@ -1,156 +0,0 @@ -# Set up -from datetime import date -from dm_reg_dataset_atamborska import dataset - -test_data = { - # Patient accepted in the dataset - 1: { - "patients": {"date_of_birth": date(1992, 1, 1)}, - "practice_registrations": [ - { - # First registration - "start_date": date(1994, 1, 1), - "end_date": date(1999, 1, 1), - }, - { - # Second registration - "start_date": date(1999, 1, 2), - }, - ], - "clinical_events": [ - # First clinical event - { - "snomedct_code": "313435000", - "date": date(2000, 4, 13), - }, - # Second clinical event - { - "snomedct_code": "420270002", - "date": date(2004, 1, 17), - }, - ], - "expected_in_population": True, - "expected_columns": { - "pat_age": 32, - "registered": True, - "latest_dm": date(2004, 1, 17), - "latest_dmres": None, - }, - }, - # Patient not accepted in the dataset: no active registration - 2: { - "patients": {"date_of_birth": date(1992, 1, 1)}, - 
"practice_registrations": [ - { - # First registration - "start_date": date(1994, 1, 1), - "end_date": date(1999, 1, 1), - }, - { - # Second registration - "start_date": date(1999, 1, 2), - "end_date": date(2005, 1, 1), - }, - ], - "clinical_events": [ - # First clinical event - { - "snomedct_code": "313435000", - "date": date(2000, 4, 13), - }, - # Second clinical event - { - "snomedct_code": "420270002", - "date": date(2004, 1, 17), - }, - ], - "expected_in_population": False, - }, - # Patient not accepted in the dataset: diabetes resolved - 3: { - "patients": {"date_of_birth": date(1992, 1, 1)}, - "practice_registrations": [ - { - # First registration - "start_date": date(1994, 1, 1), - "end_date": date(1999, 1, 1), - }, - { - # Second registration - "start_date": date(1999, 1, 2), - }, - ], - "clinical_events": [ - # First clinical event - { - "snomedct_code": "313435000", - "date": date(2000, 4, 13), - }, - # Second clinical event - { - "snomedct_code": "315051004", - "date": date(2004, 1, 17), - }, - ], - "expected_in_population": False, - }, - # Patient accepted in the dataset: diabetes resolved in the past, then another episode - 4: { - "patients": {"date_of_birth": date(1992, 1, 1)}, - "practice_registrations": [ - { - # First registration - "start_date": date(1994, 1, 1), - "end_date": date(1999, 1, 1), - }, - { - # Second registration - "start_date": date(1999, 1, 2), - }, - ], - "clinical_events": [ - # First clinical event - { - "snomedct_code": "313435000", - "date": date(2000, 4, 13), - }, - # Second clinical event - { - "snomedct_code": "315051004", - "date": date(2004, 1, 17), - }, - # Third clinical event - {"snomedct_code": "313436004", "date": date(2015, 1, 1)}, - ], - "expected_in_population": True, - "expected_columns": { - "pat_age": 32, - "registered": True, - "latest_dm": date(2015, 1, 1), - "latest_dmres": date(2004, 1, 17), - }, - }, - # Patient not accepted in the dataset: too young - 5: { - "patients": {"date_of_birth": date(2008, 1, 
1)}, - "practice_registrations": [ - { - # First registration - "start_date": date(2017, 1, 1), - "end_date": date(2022, 1, 1), - }, - { - # Second registration - "start_date": date(2022, 1, 2), - }, - ], - "clinical_events": [ - # First clinical event - { - "snomedct_code": "420270002", - "date": date(2018, 1, 1), - }, - ], - "expected_in_population": False, - }, -} diff --git a/analysis/test_hyp_reg_dataset.py b/analysis/test_hyp_reg_dataset.py deleted file mode 100644 index fd847b0..0000000 --- a/analysis/test_hyp_reg_dataset.py +++ /dev/null @@ -1,174 +0,0 @@ -from datetime import date -from hyp_reg_dataset import dataset - -test_data = { - - # Correctly not expected in population - # No clinical events - 1: { - "patients": {"date_of_birth": date(1950, 1, 1)}, - "practice_registrations": [ - { - "start_date": date(2010, 1, 1), - }, - ], - "clinical_events": [{}], - "expected_in_population": False, - }, - - # Correctly expected in population - # Most recent hypertension diagnosis before index date - 2: { - "patients": {"date_of_birth": date(1950,1,1)}, - "practice_registrations": [ - { - "start_date": date(2010,1,1) - } - ], - "clinical_events": [ - { - # First hypertension diagnosis - "date": date(2000,1,1), - "snomedct_code": "38341003", - }, - { - # Second hypertension diagnosis - "date": date(2020,3,3), - "snomedct_code": "38341003", - } - ], - "expected_in_population": True, - "expected_columns": { - "hyplat_dat": date(2020,3,3), - "hypres_dat": None, - "hyp_reg_r1": True, - }, - - }, - - # Correctly expected in population - # Most recent resolved hypertension diagnosis before index date - 3: { - "patients": {"date_of_birth": date(1950,1,1)}, - "practice_registrations": [ - { - "start_date": date(2010,1,1) - } - ], - "clinical_events": [ - { - # First hypertension diagnosis - "date": date(2000,1,1), - "snomedct_code": "38341003", - }, - { - # Second hypertension diagnosis - "date": date(2020,3,3), - "snomedct_code": "38341003", - }, - { - # Third 
hypertension diagnosis - "date": date(2024,2,2), - "snomedct_code": "38341003" - }, - { - # First resolved diagnosis - "date": date(2000,2,1), - "snomedct_code": "162659009" - }, - { - # Second resolved diagnosis - "date": date(2020,3,4), - "snomedct_code": "162659009" - }, - ], - "expected_in_population": True, - "expected_columns": { - "hyplat_dat": date(2024,2,2), - "hypres_dat": date(2020,3,4), - "hyp_reg_r1": True, - }, - - }, - # Correctly not expected in population - # Resolved diagnosis before index date - 4: { - "patients": {"date_of_birth": date(1950, 1, 1)}, - "practice_registrations": [ - { - "start_date": date(2010, 1, 1), - }, - ], - "clinical_events": [ - { - # First hypertension diagnosis - "date": date(2000,1,1), - "snomedct_code": "38341003", - }, - { - # First resolved diagnosis - "date": date(2000,2,1), - "snomedct_code": "162659009" - }, - ], - "expected_in_population": False, - }, - - # Correctly not expected in population - # Patient not registered at practice - 5: { - "patients": {"date_of_birth": date(1950, 1, 1)}, - "practice_registrations": [ - { - "start_date": date(2010, 1, 1), - "end_date": date(2020,1,2), - }, - ], - "clinical_events": [ - { - # First hypertension diagnosis - "date": date(2000,1,1), - "snomedct_code": "38341003", - }, - ], - "expected_in_population": False, - }, - - # Correctly not expected in population - # Patient diagnosed after index date - 6: { - "patients": {"date_of_birth": date(1950, 1, 1)}, - "practice_registrations": [ - { - "start_date": date(2010, 1, 1), - }, - ], - "clinical_events": [ - { - # First hypertension diagnosis - "date": date(2024,4,1), - "snomedct_code": "38341003", - }, - ], - "expected_in_population": False, - }, - - # Correctly not expected in population - # Patient diagnosed with a cold before index date - 7: { - "patients": {"date_of_birth": date(1960, 1, 1)}, - "practice_registrations": [ - { - "start_date": date(1960, 1, 1), - }, - ], - "clinical_events": [ - { - # First cold diagnosis 
- "date": date(2000, 6, 1), - "snomedct_code": "82272006", - }, - ], - "expected_in_population": False, - }, - } \ No newline at end of file From dc9a5b9c7f719c826d9bfa4246961117e080634c Mon Sep 17 00:00:00 2001 From: viv3ckj Date: Tue, 5 Aug 2025 16:51:41 +0100 Subject: [PATCH 4/7] Make dataset definitions consistent --- analysis/dep_reg_dataset.py | 76 +++++++++++++++++++++++++++++++++++++ analysis/hyp_reg_dataset.py | 67 ++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+) create mode 100644 analysis/dep_reg_dataset.py create mode 100644 analysis/hyp_reg_dataset.py diff --git a/analysis/dep_reg_dataset.py b/analysis/dep_reg_dataset.py new file mode 100644 index 0000000..d66c3e3 --- /dev/null +++ b/analysis/dep_reg_dataset.py @@ -0,0 +1,76 @@ +# 1) Import the required ehrql functions +from ehrql import create_dataset, codelist_from_csv + +# 2) Import relevant tables from the TPP backend (OpenSAFELY-TPP) +from ehrql.tables.tpp import ( + clinical_events, + patients, + practice_registrations, +) + +# 3) Load SNOMED-CT codelists from CSV files + +# Depression codes +dep_codelist = codelist_from_csv( + 'codelists/nhsd-primary-care-domain-refsets-depr_cod.csv', + column='code') +# Resolved depression codes +depres_codelist = codelist_from_csv( + 'codelists/nhsd-primary-care-domain-refsets-depres_cod.csv', + column='code') + +# 4) Define variables and the study population + +# Utility function: return the most recent event (if any) for a patient matching a given codelist +def last_matching_event(events, codelist, where=True): + return ( + events.where(where) + .where(events.snomedct_code.is_in(codelist)) + .sort_by(events.date) + .last_for_patient() + ) + +# Quality Service Start Date (Index date) +qs_start_date = '2024-04-01' + +# Fixed date used in DEPR1_REG and DEPCC01 logic. +# Depression register: Patients aged at least 18 years old whose latest unresolved episode of depression is since 1st April 2006. 
+min_date = '2006-04-01' +# Create a dataset object +dataset = create_dataset() + +# Identify patients who were registered on the index date +has_registration = practice_registrations.for_patient_on( + qs_start_date +).exists_for_patient() + +# Filter clinical events to only those on or before the index date +prior_events = clinical_events.where(clinical_events.date.is_on_or_before(qs_start_date)) + +# Field number: 4 +# PAT_AGE: The age of the patient in full years at the index date. +dataset.pat_age = (patients + .age_on(qs_start_date)) + +# Field number 5: +# DEPR_DAT: Date of the latest depression episode on or before the index date +dataset.depr_dat = last_matching_event(prior_events, dep_codelist).date + +# Field number 6: +# DEPRES_DAT: Date of the most recent depression resolved code on or before the index date +dataset.depres_dat = last_matching_event(prior_events, depres_codelist).date + +# DEP_REG rule 1: +# Include patients that have their latest episode of unresolved depression on or after 1st April 2006. +dep1_reg_r1 = ((dataset.depr_dat.is_on_or_after(min_date)) + & (dataset.depres_dat.is_null())) + +# DEP_REG rule 2: +# Flag patients under the age of 18 on the index date (rejected from the population) +dep2_reg_r2 = dataset.pat_age < 18 + +# Define the final population: +# - Registered on index date +# - Meet rule 1 (active depression diagnosis) +# - Do not meet rule 2 (i.e. 
are 18 or older) +dataset.define_population(has_registration & dep1_reg_r1 & ~dep2_reg_r2) diff --git a/analysis/hyp_reg_dataset.py b/analysis/hyp_reg_dataset.py new file mode 100644 index 0000000..5aa256d --- /dev/null +++ b/analysis/hyp_reg_dataset.py @@ -0,0 +1,67 @@ +# 1) Import the required ehrql functions +from ehrql import create_dataset, codelist_from_csv + +# 2) Import relevant tables from the TPP backend (OpenSAFELY-TPP) +from ehrql.tables.tpp import ( + clinical_events, + practice_registrations, +) + +# 3) Load SNOMED-CT codelists from CSV files + +# Hypertension codes +hyp_cod = codelist_from_csv( + "codelists/nhsd-primary-care-domain-refsets-hyp_cod.csv", + column="code" +) +# Hypertension resolved codes +hypres_cod = codelist_from_csv( + "codelists/nhsd-primary-care-domain-refsets-hypres_cod.csv", + column="code" +) + +# 4) Define variables and the study population + +# Utility function: return the most recent event (if any) for a patient matching a given codelist +def last_matching_event(events, codelist, where=True): + return ( + events.where(where) + .where(events.snomedct_code.is_in(codelist)) + .sort_by(events.date) + .last_for_patient() + ) + +# Define the index date for this dataset (e.g. 
reporting period end) +index_date = "2024-03-31" +# Create a dataset object +dataset = create_dataset() + +# Identify patients who were registered on the index date +has_registration = practice_registrations.for_patient_on( + index_date +).exists_for_patient() + +# Filter clinical events to only those on or before the index date +prior_events = clinical_events.where(clinical_events.date.is_on_or_before(index_date)) + +# Field number: 6 +# Date of the most recent hypertension diagnosis up to and including the achievement date +dataset.hyplat_dat = last_matching_event(prior_events, hyp_cod).date + +# Field number: 7 +# Date of the most recent hypertension resolved code up to and including the achievement date +dataset.hypres_dat = last_matching_event(prior_events, hypres_cod).date + + +# HYP_REG rule: + +# Include patients with a hypertension diagnosis on or before index date. +# Latest hypertension diagnosis is not followed by a hypertension resolved code. 
+dataset.hyp_reg_r1 = dataset.hyplat_dat.is_not_null() & dataset.hypres_dat.is_null() | (dataset.hyplat_dat > dataset.hypres_dat) + +# Define the final population: +# - Registered on index date +# - Meet the rule (active hypertension diagnosis) +dataset.define_population( + (has_registration) & dataset.hyp_reg_r1 + ) From 0b216a69fa689168365910bd9181d535cd8bfd10 Mon Sep 17 00:00:00 2001 From: viv3ckj Date: Wed, 6 Aug 2025 13:40:53 +0100 Subject: [PATCH 5/7] Make test script consistent with diabetes test script --- analysis/test_hyp_reg_dataset.py | 181 +++++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 analysis/test_hyp_reg_dataset.py diff --git a/analysis/test_hyp_reg_dataset.py b/analysis/test_hyp_reg_dataset.py new file mode 100644 index 0000000..8d60a29 --- /dev/null +++ b/analysis/test_hyp_reg_dataset.py @@ -0,0 +1,181 @@ +from datetime import date + +# 1) Import dataset from the dataset definition +from hyp_reg_dataset import dataset + +# Patient data for the FY23/24 with index date = "2024-03-31" +# Run the tests with the following command: +# opensafely exec ehrql:v1 assure analysis/test_hyp_reg_dataset.py + +# 2) Add test cases to test the dataset definition + +test_data = { + # Correctly not expected in population + # No clinical events + 1: { + "patients": {"date_of_birth": date(1950, 1, 1)}, + "practice_registrations": [ + { + "start_date": date(2010, 1, 1), + }, + ], + "clinical_events": [{}], + "expected_in_population": False, + }, + + # Correctly expected in population + # Most recent hypertension diagnosis before index date + 2: { + "patients": {"date_of_birth": date(1950,1,1)}, + "practice_registrations": [ + { + "start_date": date(2010,1,1) + } + ], + "clinical_events": [ + { + # First hypertension diagnosis + "date": date(2000,1,1), + "snomedct_code": "38341003", + }, + { + # Second hypertension diagnosis + "date": date(2020,3,3), + "snomedct_code": "38341003", + } + ], + "expected_in_population": True, + 
"expected_columns": { + "hyplat_dat": date(2020,3,3), + "hypres_dat": None, + "hyp_reg_r1": True, + }, + + }, + + # Correctly expected in population + # Most recent resolved hypertension diagnosis before index date + 3: { + "patients": {"date_of_birth": date(1950,1,1)}, + "practice_registrations": [ + { + "start_date": date(2010,1,1) + } + ], + "clinical_events": [ + { + # First hypertension diagnosis + "date": date(2000,1,1), + "snomedct_code": "38341003", + }, + { + # Second hypertension diagnosis + "date": date(2020,3,3), + "snomedct_code": "38341003", + }, + { + # Third hypertension diagnosis + "date": date(2024,2,2), + "snomedct_code": "38341003" + }, + { + # First resolved diagnosis + "date": date(2000,2,1), + "snomedct_code": "162659009" + }, + { + # Second resolved diagnosis + "date": date(2020,3,4), + "snomedct_code": "162659009" + }, + ], + "expected_in_population": True, + "expected_columns": { + "hyplat_dat": date(2024,2,2), + "hypres_dat": date(2020,3,4), + "hyp_reg_r1": True, + }, + + }, + # Correctly not expected in population + # Resolved diagnosis before index date + 4: { + "patients": {"date_of_birth": date(1950, 1, 1)}, + "practice_registrations": [ + { + "start_date": date(2010, 1, 1), + }, + ], + "clinical_events": [ + { + # First hypertension diagnosis + "date": date(2000,1,1), + "snomedct_code": "38341003", + }, + { + # First resolved diagnosis + "date": date(2000,2,1), + "snomedct_code": "162659009" + }, + ], + "expected_in_population": False, + }, + + # Correctly not expected in population + # Patient not registered at practice + 5: { + "patients": {"date_of_birth": date(1950, 1, 1)}, + "practice_registrations": [ + { + "start_date": date(2010, 1, 1), + "end_date": date(2020,1,2), + }, + ], + "clinical_events": [ + { + # First hypertension diagnosis + "date": date(2000,1,1), + "snomedct_code": "38341003", + }, + ], + "expected_in_population": False, + }, + + # Correctly not expected in population + # Patient diagnosed after index date + 6: 
{ + "patients": {"date_of_birth": date(1950, 1, 1)}, + "practice_registrations": [ + { + "start_date": date(2010, 1, 1), + }, + ], + "clinical_events": [ + { + # First hypertension diagnosis + "date": date(2024,4,1), + "snomedct_code": "38341003", + }, + ], + "expected_in_population": False, + }, + + # Correctly not expected in population + # Patient diagnosed with a cold before index date + 7: { + "patients": {"date_of_birth": date(1960, 1, 1)}, + "practice_registrations": [ + { + "start_date": date(1960, 1, 1), + }, + ], + "clinical_events": [ + { + # First cold diagnosis + "date": date(2000, 6, 1), + "snomedct_code": "82272006", + }, + ], + "expected_in_population": False, + }, + } \ No newline at end of file From 9f9f9ac7085a4179f3dee6c2dcd3cdb9bc713ca0 Mon Sep 17 00:00:00 2001 From: viv3ckj Date: Wed, 6 Aug 2025 13:47:01 +0100 Subject: [PATCH 6/7] Update project yaml to contain only the remaining script actions --- project.yaml | 53 ++-------------------------------------------------- 1 file changed, 2 insertions(+), 51 deletions(-) diff --git a/project.yaml b/project.yaml index f1f6faa..575c327 100644 --- a/project.yaml +++ b/project.yaml @@ -5,38 +5,15 @@ expectations: population_size: 1000 actions: - generate_dm_reg_atamborska: - run: ehrql:v1 - generate-dataset analysis/dm_reg_dataset_atamborska.py - --output output/dm/dm_reg_atamborska.csv.gz - outputs: - highly_sensitive: - dataset: output/dm/dm_reg_atamborska.csv.gz - - generate_registry_dm_jaidip: - run: ehrql:v1 generate-dataset analysis/dm_reg_dataset_jaidip_gill.py --output output/dm/registry_dm_jaidip.csv.gz - outputs: - highly_sensitive: - dataset: output/dm/registry_dm_jaidip.csv.gz - generate_hyp_reg_dataset: run: > ehrql:v1 generate-dataset analysis/hyp_reg_dataset.py --test-data-file analysis/test_hyp_reg_dataset.py - --output output/hyp/hyp001_viveck.csv.gz + --output output/hyp/hyp001.csv.gz outputs: highly_sensitive: - cohort: output/hyp/hyp001_viveck.csv.gz - - 
generate_dm_reg_dataset_viveck: - run: > - ehrql:v1 - generate-dataset analysis/dm_reg_dataset_viveck.py - --output output/dm/dm017_viveck.csv.gz - outputs: - highly_sensitive: - cohort: output/dm/dm017_viveck.csv.gz + cohort: output/hyp/hyp001.csv.gz generate_dm_reg_dataset_milan: run: > @@ -48,32 +25,6 @@ actions: highly_sensitive: cohort: output/dm/dm017_milan.csv.gz - generate_dem_reg_measures: - run: > - ehrql:v1 - generate-measures analysis/dem_reg_measures.py - --output output/dem/dem_reg_measures.csv - outputs: - moderately_sensitive: - dem_reg_measures: output/dem/dem_reg_measures.csv - - visualise_dem_rates: - run: > - r:latest - analysis/dem_reg_plots.R - --output output/dem/dem_reg_rates.png - --output output/dem/dem_reg_rates_imd.png - needs: [generate_dem_reg_measures] - outputs: - moderately_sensitive: - dem_reg_rates_unstratified: output/dem/dem_reg_rates.png - dem_reg_rates_imd: output/dem/dem_reg_rates_imd.png - - generate_dep_reg_measures_jaidip: - run: ehrql:v1 generate-measures analysis/dep_reg_measures_jaidip.py --output output/dep/dep_jaidip.csv.gz - outputs: - highly_sensitive: - measure: output/dep/dep_jaidip.csv.gz generate_dep_reg_dataset: run: ehrql:v1 generate-dataset analysis/dep_reg_dataset.py --output output/dep/dep_reg_dataset.csv.gz outputs: From 88ac1f9ae0a8de9030513a341d12ee9958bdb411 Mon Sep 17 00:00:00 2001 From: viv3ckj Date: Wed, 6 Aug 2025 14:30:21 +0100 Subject: [PATCH 7/7] Added depression test script and removed names from scripts --- ...reg_dataset_milan.py => dm_reg_dataset.py} | 0 analysis/test_dep_reg_dataset.py | 162 ++++++++++++++++++ ...ataset_milan.py => test_dm_reg_dataset.py} | 4 +- project.yaml | 16 +- 4 files changed, 174 insertions(+), 8 deletions(-) rename analysis/{dm_reg_dataset_milan.py => dm_reg_dataset.py} (100%) create mode 100644 analysis/test_dep_reg_dataset.py rename analysis/{test_dm_reg_dataset_milan.py => test_dm_reg_dataset.py} (97%) diff --git a/analysis/dm_reg_dataset_milan.py 
b/analysis/dm_reg_dataset.py similarity index 100% rename from analysis/dm_reg_dataset_milan.py rename to analysis/dm_reg_dataset.py diff --git a/analysis/test_dep_reg_dataset.py b/analysis/test_dep_reg_dataset.py new file mode 100644 index 0000000..2937f53 --- /dev/null +++ b/analysis/test_dep_reg_dataset.py @@ -0,0 +1,162 @@ +from datetime import date + +# 1) Import dataset from the dataset definition +from dep_reg_dataset import dataset + +# Patient data for the FY23/24 with index date = "2024-04-01" +# Run the tests with the following command: +# opensafely exec ehrql:v1 assure analysis/test_dep_reg_dataset.py + +# 2) Add test cases to test the dataset definition + +test_data = { + # Correctly not expected in population + # No clinical events + 1: { + "patients": {"date_of_birth": date(1950, 1, 1)}, + "practice_registrations": [ + { + "start_date": date(2010, 1, 1), + }, + ], + "clinical_events": [{}], + "expected_in_population": False, + }, + # Correctly not expected in population + # Diagnosis after the index date + 2: { + "patients": {"date_of_birth": date(1950, 1, 1)}, + "practice_registrations": [ + { + "start_date": date(2010, 1, 1), + }, + ], + "clinical_events": [ + { + # First depression diagnosis (dep_codelist) + "date": date(2024, 8, 1), + "snomedct_code": "191601008", + }, + ], + "expected_in_population": False, + }, + # Correctly not expected in population + # Younger than 18yo at index date + 3: { + "patients": {"date_of_birth": date(2010, 1, 1)}, + "practice_registrations": [ + { + "start_date": date(2010, 1, 1), + }, + ], + "clinical_events": [ + { + # First depression diagnosis (dep_codelist) + "date": date(2022, 8, 1), + "snomedct_code": "191601008", + }, + ], + "expected_in_population": False, + }, + # Correctly not expected in population + # Not registered at index date + 4: { + "patients": {"date_of_birth": date(1960, 1, 1)}, + "practice_registrations": [ + { + "start_date": date(1960, 1, 1), + "end_date": date(2020, 1, 1), + }, + ], + 
"clinical_events": [ + { + # First depression diagnosis (dep_codelist) + "date": date(2022, 8, 1), + "snomedct_code": "191601008", + }, + ], + "expected_in_population": False, + }, + # Correctly not expected in population + # Depression diagnosis resolved before index date + 5: { + "patients": {"date_of_birth": date(1960, 1, 1)}, + "practice_registrations": [ + { + "start_date": date(1960, 1, 1), + }, + ], + "clinical_events": [ + { + # First depression diagnosis (dep_codelist) + "date": date(2008, 6, 1), + "snomedct_code": "191601008", + }, + { + # depression diagnosis resolved (depres_codelist) + "date": date(2023, 1, 1), + "snomedct_code": "196381000000100", + }, + ], + "expected_in_population": False, + }, + # Correctly expected in population + # depression diagnosis before index date + 6: { + "patients": {"date_of_birth": date(1960, 1, 1)}, + "practice_registrations": [ + { + "start_date": date(1960, 1, 1), + }, + ], + "clinical_events": [ + { + # First depression diagnosis (dep_codelist) + "date": date(2015, 6, 1), + "snomedct_code": "191601008", + }, + ], + "expected_in_population": True, + "expected_columns": { + "pat_age": 64, + "depr_dat": date(2015, 6, 1), + "depres_dat": None, + }, + }, + # Correctly not expected in population + # Code not from dep_codelist + 7: { + "patients": {"date_of_birth": date(1960, 1, 1)}, + "practice_registrations": [ + { + "start_date": date(1960, 1, 1), + }, + ], + "clinical_events": [ + { + # Code not in the codelist + "date": date(2000, 6, 1), + "snomedct_code": "1111111111", + }, + ], + "expected_in_population": False, + }, + # Correctly not expected in population + # Depression diagnosis recorded before min date (2006-04-01) + 8: { + "patients": {"date_of_birth": date(1960, 1, 1)}, + "practice_registrations": [ + { + "start_date": date(1960, 1, 1), + }, + ], + "clinical_events": [ + { + # Diagnosis before min date + "date": date(2000, 6, 1), + "snomedct_code": "191601008", + }, + ], + "expected_in_population": 
False, + }, +} diff --git a/analysis/test_dm_reg_dataset_milan.py b/analysis/test_dm_reg_dataset.py similarity index 97% rename from analysis/test_dm_reg_dataset_milan.py rename to analysis/test_dm_reg_dataset.py index 8026ac9..587ccf8 100644 --- a/analysis/test_dm_reg_dataset_milan.py +++ b/analysis/test_dm_reg_dataset.py @@ -1,11 +1,11 @@ from datetime import date # 1) Import dataset from the dataset definition -from dm_reg_dataset_milan import dataset +from dm_reg_dataset import dataset # Patient data for the FY23/24 with index date = "2024-03-31" # Run the tests with the following command: -# opensafely exec ehrql:v1 assure analysis/test_dm_reg_dataset_milan.py +# opensafely exec ehrql:v1 assure analysis/test_dm_reg_dataset.py # 2) Add test cases to test the dataset definition diff --git a/project.yaml b/project.yaml index 575c327..668b9d0 100644 --- a/project.yaml +++ b/project.yaml @@ -15,18 +15,22 @@ actions: highly_sensitive: cohort: output/hyp/hyp001.csv.gz - generate_dm_reg_dataset_milan: + generate_dm_reg_dataset: run: > ehrql:v1 - generate-dataset analysis/dm_reg_dataset_milan.py - --test-data-file analysis/test_dm_reg_dataset_milan.py - --output output/dm/dm017_milan.csv.gz + generate-dataset analysis/dm_reg_dataset.py + --test-data-file analysis/test_dm_reg_dataset.py + --output output/dm/dm017.csv.gz outputs: highly_sensitive: - cohort: output/dm/dm017_milan.csv.gz + cohort: output/dm/dm017.csv.gz generate_dep_reg_dataset: - run: ehrql:v1 generate-dataset analysis/dep_reg_dataset.py --output output/dep/dep_reg_dataset.csv.gz + run: + ehrql:v1 + generate-dataset analysis/dep_reg_dataset.py + --test-data-file analysis/test_dep_reg_dataset.py + --output output/dep/dep_reg_dataset.csv.gz outputs: highly_sensitive: dataset: output/dep/dep_reg_dataset.csv.gz