Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: ukbrapR
Title: R functions to use in the UK Biobank Research Analysis Platform (RAP)
Version: 0.3.14
Version: 0.3.15
Authors@R: c(person("Luke", "Pilling",
email = "L.Pilling@exeter.ac.uk",
role = c("aut", "cre"),
Expand All @@ -24,7 +24,7 @@ Imports:
haven (>= 2.5.0)
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.3.2
RoxygenNote: 7.3.3
BugReports: https://github.com/lcpilling/ukbrapR/issues
Suggests:
knitr,
Expand Down
15 changes: 15 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
# ukbrapR v0.3.15 (21st April 2026)

### Changes
- (PR #41) Cancer registry data did not include ICD9 codes. Thanks to @hdg204 for the suggestions. Changes are:
- `export_tables_cancer_registry` now exports field 40013 (cancer registry ICD9 codes)
- `get_diagnoses` now detects when the user has provided an ICD9 code in the range 140 to 208 and flags that the cancer registry should be searched
- `get_cancer_registry` differentiates between ICD9 and ICD10 when searching the raw data
- If user has an older table (without ICD9) then only ICD10 codes are searched
- `get_df` now correctly handles cases where ICD9 and/or ICD10 diagnoses are available

### Bug fixes
- `fields_to_phenos` now better handles some instanced fields (like 40006) where the Schema has instanced == 2
- Known bug: some arrayed fields (like 20001) are not always handled correctly - the cause has not yet been identified


# ukbrapR v0.3.14 (24th Feb 2026)

### Bug fixes
Expand Down
14 changes: 3 additions & 11 deletions R/export_tables.R
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,6 @@ export_tables_selfrep_illness <- function(
#'
#' @noRd
export_tables_cancer_registry <- function(
n_cancer_arrays = 21,
dataset = NULL,
submit = FALSE,
verbose = FALSE
Expand All @@ -291,23 +290,16 @@ export_tables_cancer_registry <- function(

# RAP stores arrays as separate variables
if (verbose) cli::cli_alert("Determine field names to request")
if (verbose) cli::cli_alert(c("n_cancer_arrays = ", n_cancer_arrays))
# date vars = 40005
# cancer vars = 40006
# cancer ICD10 vars = 40006
# age vars = 40008
# histology vars = 40011
# behaviour vars = 40012
# cancer ICD9 vars = 40013

# get field names
names = "eid"

# phenotypes
for (p in c(40005, 40006, 40008, 40011, 40012)) {

# instances 0:n_instances
for (i in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p", p, "_i", i))

}
names <- c(names, fields_to_phenos(as.character(c(40005, 40006, 40008, 40011, 40012, 40013))))

if (verbose) print(names)

Expand Down
6 changes: 3 additions & 3 deletions R/fields_to_phenos.R
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ field_to_phenos <- function(
valid_fields <- NULL

# Check if the field is instanced and generate instances if true
if (field_info$instanced == 1) {
if (field_info$instanced != 0) {
instances <- seq(field_info$instance_min, field_info$instance_max, 1)
if (verbose) cli::cli_alert(stringr::str_c("Is instaced [", stringr::str_c(instances, collapse=","), "]"))
}
Expand All @@ -152,7 +152,7 @@ field_to_phenos <- function(
}

# Generate valid fields for instanced arrayed fields
if (field_info$instanced == 1) {
if (field_info$instanced != 0) {
for (ii in 1:length(instances)) {
for (aa in 1:length(arrays)) {
valid_fields <- c(valid_fields, stringr::str_c(p_field_id, "_i", instances[ii], "_a", arrays[aa]))
Expand All @@ -163,7 +163,7 @@ field_to_phenos <- function(
} else {

# Generate valid fields for instanced non-arrayed fields
if (field_info$instanced == 1) {
if (field_info$instanced != 0) {
for (ii in 1:length(instances)) {
valid_fields <- c(valid_fields, stringr::str_c(p_field_id, "_i", instances[ii]))
}
Expand Down
35 changes: 30 additions & 5 deletions R/get_cancer_registry.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,21 @@
#'
#' @noRd
get_cancer_registry <- function(
codes,
ICD9s,
ICD10s,
ukb_dat,
verbose = FALSE
) {

start_time <- Sys.time()

# Check input
if (verbose) cli::cli_alert_info("Searching cancer registry data for {length(unique(codes))} ICD10 codes")
if (verbose & ICD9s[1]!="") cli::cli_alert_info("Searching cancer registry data for {length(unique(codes))} ICD9 codes")
if (verbose & ICD10s[1]!="") cli::cli_alert_info("Searching cancer registry data for {length(unique(codes))} ICD10 codes")

# if "missing" (empty string) replace with impossible code so grep doesn't catch all rows
if (ICD9s[1]=="") ICD9s <- "not_a_code"
if (ICD10s[1]=="") ICD10s <- "not_a_code"

# remove rows where participant has no cancer data
ukb_dat = ukb_dat |> dplyr::filter(
Expand All @@ -33,6 +39,7 @@ get_cancer_registry <- function(
# behaviour vars = 40012

# variable prefix
v_icd9 <- "p40013_"
v_icd10 <- "p40006_"
v_date <- "p40005_"
v_age <- "p40008_"
Expand All @@ -46,31 +53,49 @@ get_cancer_registry <- function(
dplyr::select(eid, dplyr::contains(v)) |>
tidyr::pivot_longer(!eid, names_to = "instance", names_prefix = v, values_to = n)
}

ukb_dat_icd10 <- pivot_cancer(ukb_dat, v_icd10, "icd10")
ukb_dat_date <- pivot_cancer(ukb_dat, v_date, "date")
ukb_dat_age <- pivot_cancer(ukb_dat, v_age, "age")
ukb_dat_histology <- pivot_cancer(ukb_dat, v_histology, "histology")
ukb_dat_behaviour <- pivot_cancer(ukb_dat, v_behaviour, "behaviour")

# some older exports may not have icd9
ukb_dat_icd9 <- NULL
if ("" %in% colnames(ukb_dat)) {
ukb_dat_icd9 <- pivot_cancer(ukb_dat, v_icd9, "icd9")
} else {
cli::cli_alert_warning("'icd9' not in exported cancer registry data. Consider re-exporting raw tables with `export_tables()`")
}

# join tables
if (verbose) cli::cli_alert("Join cancer registry data")
ukb_dat_cr = purrr::reduce(list(ukb_dat_icd10, ukb_dat_date, ukb_dat_age, ukb_dat_histology, ukb_dat_behaviour), dplyr::full_join, by = c("eid"="eid", "instance"="instance"))
if (!is.null(ukb_dat_icd9)) {
ukb_dat_cr <- dplyr::full_join(ukb_dat_icd9, ukb_dat_cr)
} else {
ukb_dat_cr$icd9 <- NA
}

# remove rows where participant has no cancer data
ukb_dat_cr = ukb_dat_cr |> dplyr::filter(
dplyr::if_any(
c("icd10","date","age","histology","behaviour"),
c("icd9","icd10","date","age","histology","behaviour"),
~!is.na(.)
)
)

# subset to ICD10s in provided codes
# subset to ICD9s/ICD10s in provided codes
if (verbose) cli::cli_alert("Identify matching codes")
ukb_dat_cr = ukb_dat_cr |>
dplyr::filter(
stringr::str_detect(
icd9,
stringr::str_flatten(ICD9s, collapse = "|")
) |
stringr::str_detect(
icd10,
stringr::str_flatten(codes, collapse = "|")
stringr::str_flatten(ICD10s, collapse = "|")
)
)

Expand Down
85 changes: 59 additions & 26 deletions R/get_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,18 @@ get_df <- function(
ICD10_search = stringr::str_flatten(ICD10s, collapse = "|")
}

# create ICD9 search string
if (any(codes_sub$vocab_id == "ICD9")) {
ICD9s <- codes_sub |>
dplyr::filter(vocab_id == "ICD9") |>
dplyr::select(code) |>
dplyr::pull() |>
unique() |>
stringr::str_remove(stringr::fixed(".")) |>
stringr::str_sub(1, 5)
ICD9_search = stringr::str_flatten(ICD9s, collapse = "|")
}

## hesin_diag
hesin_diag_sub = NULL
if (!is.null(diagnosis_list_sub$hesin_diag) & any(codes_sub$vocab_id %in% c("ICD10","ICD9"))) {
Expand All @@ -229,17 +241,6 @@ get_df <- function(
}

if (any(codes_sub$vocab_id == "ICD9")) {
ICD9s = ""
if (any(codes_sub$vocab_id == "ICD9")) {
ICD9s <- codes_sub |>
dplyr::filter(vocab_id == "ICD9") |>
dplyr::select(code) |>
dplyr::pull() |>
unique() |>
stringr::str_remove(stringr::fixed(".")) |>
stringr::str_sub(1, 5)
}
ICD9_search = stringr::str_flatten(ICD9s, collapse = "|")
colnames(diagnosis_list_sub$hesin_diag) = tolower(colnames(diagnosis_list_sub$hesin_diag))
hesin_diag_sub = rbind(hesin_diag_sub, diagnosis_list_sub$hesin_diag |> dplyr::filter(stringr::str_starts(diag_icd9, !! ICD9_search)))
}
Expand All @@ -255,9 +256,17 @@ get_df <- function(
diagnosis_list_sub$death_cause <- death_cause_sub

## cancer_registry
cancer_registry_sub <- NULL
cancer_registry_sub <- cancer_registry_sub_icd10 <- NULL
if (!is.null(diagnosis_list_sub$cancer_registry) & any(codes_sub$vocab_id == "ICD9")) {
cancer_registry_sub <- diagnosis_list_sub$cancer_registry |> dplyr::filter(stringr::str_detect( icd9, !! ICD9_search))
}
if (!is.null(diagnosis_list_sub$cancer_registry) & any(codes_sub$vocab_id == "ICD10")) {
cancer_registry_sub <- diagnosis_list_sub$cancer_registry |> dplyr::filter(stringr::str_detect( icd10, !! ICD10_search))
cancer_registry_sub_icd10 <- diagnosis_list_sub$cancer_registry |> dplyr::filter(stringr::str_detect( icd10, !! ICD10_search))
if (is.null(cancer_registry_sub)) {
cancer_registry_sub <- cancer_registry_sub_icd10
} else {
cancer_registry_sub <- rbind(cancer_registry_sub, cancer_registry_sub_icd10)
}
}
diagnosis_list_sub$cancer_registry <- cancer_registry_sub

Expand Down Expand Up @@ -704,19 +713,36 @@ get_cancer_registry_df <- function(
start_time <- Sys.time()

if (verbose) cat("Getting cancer registry data\n")

# create ICD9 search string
ICD9_search <- ""
if (any(codes_df$vocab_id == "ICD9")) {
ICD9s <- codes_df |>
dplyr::filter(vocab_id == "ICD9") |>
dplyr::select(code) |>
dplyr::pull() |>
unique() |>
stringr::str_remove(stringr::fixed(".")) |>
stringr::str_sub(1, 5)
ICD9_search = stringr::str_flatten(ICD9s, collapse = "|")
}

# format codes
vocab_col = "vocab_id"
codes_col = "code"
# create ICD10 search string
ICD10_search <- ""
if (any(codes_df$vocab_id == "ICD10")) {
ICD10s <- codes_df |>
dplyr::filter(vocab_id == "ICD10") |>
dplyr::select(code) |>
dplyr::pull() |>
unique() |>
stringr::str_remove(stringr::fixed(".")) |>
stringr::str_sub(1, 5)
ICD10_search = stringr::str_flatten(ICD10s, collapse = "|")
}

codes <- codes_df |>
dplyr::filter(!!rlang::sym(vocab_col) == "ICD10") |>
dplyr::select(!!rlang::sym(codes_col)) |>
dplyr::pull() |>
unique() |>
stringr::str_remove(stringr::fixed(".")) |>
stringr::str_sub(1, 5)
codes_string = stringr::str_flatten(codes, collapse = "|")
# if "missing" (empty string) replace with impossible code so grep doesn't catch all rows
if (ICD9_search=="") ICD9_search <- "not_a_code"
if (ICD10_search=="") ICD10_search <- "not_a_code"

# create empty vars in ukb_dat to modify
ukb_dat$canreg <- 0
Expand All @@ -734,8 +760,15 @@ get_cancer_registry_df <- function(

# Update where the code matches
ukb_dat <- ukb_dat |> dplyr::mutate(
canreg_df = dplyr::if_else(canreg == 0 & stringr::str_detect(icd10, codes_string), date, canreg_df, canreg_df),
canreg = dplyr::if_else(canreg == 0 & stringr::str_detect(icd10, codes_string), 1, canreg, canreg)
canreg_df = dplyr::if_else(
canreg == 0 &
( stringr::str_detect(icd9, ICD9_search) | stringr::str_detect(icd10, ICD10_search) ),
date, canreg_df, canreg_df
),
canreg = dplyr::if_else(
canreg == 0 &
( stringr::str_detect(icd9, ICD9_search) | stringr::str_detect(icd10, ICD10_search) ),
1, canreg, canreg)
)
}

Expand Down
50 changes: 25 additions & 25 deletions R/get_diagnoses.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#'
#' - ICD10 (for `hesin`, `death_cause` and `cancer_registry` searches) - fuzzy matching
#'
#' - ICD9 (for `hesin` searches) - fuzzy matching
#' - ICD9 (for `hesin` searches and `cancer_registry` searches) - fuzzy matching
#'
#' - Read2 / CTV3 (for `gp_clinical`) - exact matches on first 5 characters
#'
Expand All @@ -22,7 +22,7 @@
#'
#' @name get_diagnoses
#'
#' @param codes_df A data frame. Contains two columns: `code` and `vocab_id` i.e., a list of diagnostic codes, and an indicator of the vocabulary (ICD10, Read2, CTV3, OPCS3, OPCS4, ukb_cancer, and ukb_noncancer are recognised). Other columns are ignored.
#' @param codes_df A data frame. Contains two columns: `code` and `vocab_id` i.e., a list of diagnostic codes, and an indicator of the vocabulary (ICD9, ICD10, Read2, CTV3, OPCS3, OPCS4, ukb_cancer, and ukb_noncancer are recognised). Other columns are ignored.
#' @param file_paths A data frame. Columns must be `object` and `path` containing paths to required files. Default assumes you have the tables exported in the RAP environment from
#' ukbrapR::export_tables()
#' \code{default=ukbrapR:::ukbrapr_paths}
Expand Down Expand Up @@ -53,7 +53,7 @@ get_diagnoses <- function(
# start up messages
pkg_version <- utils::packageVersion("ukbrapR")
cli::cli_alert_info("{.pkg ukbrapR} v{pkg_version}")
.ukbrapr_startup_notice()
#.ukbrapr_startup_notice()

start_time <- Sys.time()

Expand Down Expand Up @@ -147,6 +147,7 @@ get_diagnoses <- function(
stringr::str_sub(1, 5)
hyphen_check(ICD9s, "ICD9")
cat(" - N unique ICD9 codes:", length(ICD9s), "\n")
if (any(dplyr::between(as.numeric(ICD9s), 140, 208))) get_canreg <- TRUE
}

# get Read2 and CTV3s. First 5 characters only.
Expand Down Expand Up @@ -343,28 +344,6 @@ get_diagnoses <- function(

if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))

#
# cancer registry ####################################
#

# do any ICD10s start with a C? Skip if not.
if (get_canreg) {

cli::cli_alert("Ascertaining cancer registry data.")

# load data
cancer_registry_dat <- suppressWarnings(readr::read_tsv(file_paths$path[ file_paths$object=="cancer_registry" ], show_col_types = FALSE, progress = FALSE))

# get cancer registry data for these ICD10s
cancer_registry_tbl <- ukbrapR:::get_cancer_registry(codes = ICD10s, ukb_dat = cancer_registry_dat, verbose = verbose)
cli::cli_alert_success("Loaded {.var cancer_registry} with {nrow(cancer_registry_tbl)} matched rows.")

rm(cancer_registry_dat)

if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))

}

#
# HES diagnosis data (ICD10s) ###########################################
#
Expand Down Expand Up @@ -407,6 +386,27 @@ get_diagnoses <- function(

}

#
# cancer registry ####################################
#

if (get_canreg) {

cli::cli_alert("Ascertaining cancer registry data.")

# load data
cancer_registry_dat <- suppressWarnings(readr::read_tsv(file_paths$path[ file_paths$object=="cancer_registry" ], show_col_types = FALSE, progress = FALSE))

# get cancer registry data for these ICD9s and ICD10s
cancer_registry_tbl <- ukbrapR:::get_cancer_registry(ICD9s = ICD9s, ICD10s = ICD10s, ukb_dat = cancer_registry_dat, verbose = verbose)
cli::cli_alert_success("Loaded {.var cancer_registry} with {nrow(cancer_registry_tbl)} matched rows.")

rm(cancer_registry_dat)

if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))

}

#
# ICD9 HES diagnosis data ###########################################
#
Expand Down
Loading