diff --git a/DESCRIPTION b/DESCRIPTION
index 63eac6b..274c5d7 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
Package: ukbrapR
Title: R functions to use in the UK Biobank Research Analysis Platform (RAP)
-Version: 0.3.14
+Version: 0.3.15
Authors@R: c(person("Luke", "Pilling",
email = "L.Pilling@exeter.ac.uk",
role = c("aut", "cre"),
@@ -24,7 +24,7 @@ Imports:
haven (>= 2.5.0)
Encoding: UTF-8
LazyData: true
-RoxygenNote: 7.3.2
+RoxygenNote: 7.3.3
BugReports: https://github.com/lcpilling/ukbrapR/issues
Suggests:
knitr,
diff --git a/NEWS.md b/NEWS.md
index b10360f..5baf8fe 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,18 @@
+# ukbrapR v0.3.15 (21st April 2026)
+
+### Changes
+ - (PR #41) Cancer registry data did not include ICD9 codes. Thanks to @hdg204 for the suggestions. Changes are:
+ - `export_tables_cancer_registry` now exports fields 40013 (cancer registry ICD9 codes)
+ - `get_diagnoses` now detects when the user has provided an ICD9 code in the range 140 to 208 and flags the cancer registry to be searched
+ - `get_cancer_registry` differentiates between ICD9 and ICD10 when searching the raw data
+ - If user has an older table (without ICD9) then only ICD10 codes are searched
+ - `get_df` correctly handles if ICD9 and/or ICD10 diagnoses are available
+
+### Bug fixes
+ - `fields_to_phenos` now better handles some instanced fields (like 40006) where the Schema has instanced == 2
+ - Known bug: some arrayed fields (like 20001) are not always handled correctly - the cause has not yet been identified
+
+
# ukbrapR v0.3.14 (24th Feb 2026)
### Bug fixes
diff --git a/R/export_tables.R b/R/export_tables.R
index 736af93..78772bb 100644
--- a/R/export_tables.R
+++ b/R/export_tables.R
@@ -281,7 +281,6 @@ export_tables_selfrep_illness <- function(
#'
#' @noRd
export_tables_cancer_registry <- function(
- n_cancer_arrays = 21,
dataset = NULL,
submit = FALSE,
verbose = FALSE
@@ -291,23 +290,16 @@ export_tables_cancer_registry <- function(
# RAP stores arrays as separate variables
if (verbose) cli::cli_alert("Determine field names to request")
- if (verbose) cli::cli_alert(c("n_cancer_arrays = ", n_cancer_arrays))
# date vars = 40005
- # cancer vars = 40006
+ # cancer ICD10 vars = 40006
# age vars = 40008
# histology vars = 40011
# behaviour vars = 40012
+ # cancer ICD9 vars = 40013
# get field names
names = "eid"
-
- # phenotypes
- for (p in c(40005, 40006, 40008, 40011, 40012)) {
-
- # instances 0:n_instances
- for (i in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p", p, "_i", i))
-
- }
+ names <- c(names, fields_to_phenos(as.character(c(40005, 40006, 40008, 40011, 40012, 40013))))
if (verbose) print(names)
diff --git a/R/fields_to_phenos.R b/R/fields_to_phenos.R
index 4262525..39bcee8 100644
--- a/R/fields_to_phenos.R
+++ b/R/fields_to_phenos.R
@@ -133,7 +133,7 @@ field_to_phenos <- function(
valid_fields <- NULL
# Check if the field is instanced and generate instances if true
- if (field_info$instanced == 1) {
+ if (field_info$instanced != 0) {
instances <- seq(field_info$instance_min, field_info$instance_max, 1)
if (verbose) cli::cli_alert(stringr::str_c("Is instaced [", stringr::str_c(instances, collapse=","), "]"))
}
@@ -152,7 +152,7 @@ field_to_phenos <- function(
}
# Generate valid fields for instanced arrayed fields
- if (field_info$instanced == 1) {
+ if (field_info$instanced != 0) {
for (ii in 1:length(instances)) {
for (aa in 1:length(arrays)) {
valid_fields <- c(valid_fields, stringr::str_c(p_field_id, "_i", instances[ii], "_a", arrays[aa]))
@@ -163,7 +163,7 @@ field_to_phenos <- function(
} else {
# Generate valid fields for instanced non-arrayed fields
- if (field_info$instanced == 1) {
+ if (field_info$instanced != 0) {
for (ii in 1:length(instances)) {
valid_fields <- c(valid_fields, stringr::str_c(p_field_id, "_i", instances[ii]))
}
diff --git a/R/get_cancer_registry.R b/R/get_cancer_registry.R
index 06e1e9d..315611f 100644
--- a/R/get_cancer_registry.R
+++ b/R/get_cancer_registry.R
@@ -6,7 +6,8 @@
#'
#' @noRd
get_cancer_registry <- function(
- codes,
+ ICD9s,
+ ICD10s,
ukb_dat,
verbose = FALSE
) {
@@ -14,7 +15,12 @@ get_cancer_registry <- function(
start_time <- Sys.time()
# Check input
- if (verbose) cli::cli_alert_info("Searching cancer registry data for {length(unique(codes))} ICD10 codes")
+ if (verbose && ICD9s[1] != "") cli::cli_alert_info("Searching cancer registry data for {length(unique(ICD9s))} ICD9 codes")  # count the ICD9s argument (the old `codes` argument no longer exists)
+ if (verbose && ICD10s[1] != "") cli::cli_alert_info("Searching cancer registry data for {length(unique(ICD10s))} ICD10 codes")  # count the ICD10s argument
+
+ # if "missing" (empty string) replace with impossible code so grep doesn't catch all rows
+ if (ICD9s[1]=="") ICD9s <- "not_a_code"
+ if (ICD10s[1]=="") ICD10s <- "not_a_code"
# remove rows where participant has no cancer data
ukb_dat = ukb_dat |> dplyr::filter(
@@ -33,6 +39,7 @@ get_cancer_registry <- function(
# behaviour vars = 40012
# variable prefix
+ v_icd9 <- "p40013_"
v_icd10 <- "p40006_"
v_date <- "p40005_"
v_age <- "p40008_"
@@ -46,31 +53,49 @@ get_cancer_registry <- function(
dplyr::select(eid, dplyr::contains(v)) |>
tidyr::pivot_longer(!eid, names_to = "instance", names_prefix = v, values_to = n)
}
+
ukb_dat_icd10 <- pivot_cancer(ukb_dat, v_icd10, "icd10")
ukb_dat_date <- pivot_cancer(ukb_dat, v_date, "date")
ukb_dat_age <- pivot_cancer(ukb_dat, v_age, "age")
ukb_dat_histology <- pivot_cancer(ukb_dat, v_histology, "histology")
ukb_dat_behaviour <- pivot_cancer(ukb_dat, v_behaviour, "behaviour")
+ # some older exports may not have icd9: only pivot when at least one column carries the `v_icd9` prefix
+ ukb_dat_icd9 <- NULL
+ if (any(grepl(v_icd9, colnames(ukb_dat), fixed = TRUE))) {
+ ukb_dat_icd9 <- pivot_cancer(ukb_dat, v_icd9, "icd9")
+ } else {
+ cli::cli_alert_warning("'icd9' not in exported cancer registry data. Consider re-exporting raw tables with `export_tables()`")
+ }
+
# join tables
if (verbose) cli::cli_alert("Join cancer registry data")
ukb_dat_cr = purrr::reduce(list(ukb_dat_icd10, ukb_dat_date, ukb_dat_age, ukb_dat_histology, ukb_dat_behaviour), dplyr::full_join, by = c("eid"="eid", "instance"="instance"))
+ if (!is.null(ukb_dat_icd9)) {
+ ukb_dat_cr <- dplyr::full_join(ukb_dat_icd9, ukb_dat_cr, by = c("eid", "instance"))  # explicit keys, matching the join above; avoids the "Joining with" message
+ } else {
+ ukb_dat_cr$icd9 <- NA_character_  # typed NA so `icd9` is a character column even when the export has no ICD9 data
+ }
# remove rows where participant has no cancer data
ukb_dat_cr = ukb_dat_cr |> dplyr::filter(
dplyr::if_any(
- c("icd10","date","age","histology","behaviour"),
+ c("icd9","icd10","date","age","histology","behaviour"),
~!is.na(.)
)
)
- # subset to ICD10s in provided codes
+ # subset to ICD9s/ICD10s in provided codes
if (verbose) cli::cli_alert("Identify matching codes")
ukb_dat_cr = ukb_dat_cr |>
dplyr::filter(
+ stringr::str_detect(
+ icd9,
+ stringr::str_flatten(ICD9s, collapse = "|")
+ ) |
stringr::str_detect(
icd10,
- stringr::str_flatten(codes, collapse = "|")
+ stringr::str_flatten(ICD10s, collapse = "|")
)
)
diff --git a/R/get_df.R b/R/get_df.R
index 92c2c05..557b0d6 100644
--- a/R/get_df.R
+++ b/R/get_df.R
@@ -219,6 +219,18 @@ get_df <- function(
ICD10_search = stringr::str_flatten(ICD10s, collapse = "|")
}
+ # create ICD9 search string
+ if (any(codes_sub$vocab_id == "ICD9")) {
+ ICD9s <- codes_sub |>
+ dplyr::filter(vocab_id == "ICD9") |>
+ dplyr::select(code) |>
+ dplyr::pull() |>
+ unique() |>
+ stringr::str_remove(stringr::fixed(".")) |>
+ stringr::str_sub(1, 5)
+ ICD9_search = stringr::str_flatten(ICD9s, collapse = "|")
+ }
+
## hesin_diag
hesin_diag_sub = NULL
if (!is.null(diagnosis_list_sub$hesin_diag) & any(codes_sub$vocab_id %in% c("ICD10","ICD9"))) {
@@ -229,17 +241,6 @@ get_df <- function(
}
if (any(codes_sub$vocab_id == "ICD9")) {
- ICD9s = ""
- if (any(codes_sub$vocab_id == "ICD9")) {
- ICD9s <- codes_sub |>
- dplyr::filter(vocab_id == "ICD9") |>
- dplyr::select(code) |>
- dplyr::pull() |>
- unique() |>
- stringr::str_remove(stringr::fixed(".")) |>
- stringr::str_sub(1, 5)
- }
- ICD9_search = stringr::str_flatten(ICD9s, collapse = "|")
colnames(diagnosis_list_sub$hesin_diag) = tolower(colnames(diagnosis_list_sub$hesin_diag))
hesin_diag_sub = rbind(hesin_diag_sub, diagnosis_list_sub$hesin_diag |> dplyr::filter(stringr::str_starts(diag_icd9, !! ICD9_search)))
}
@@ -255,9 +256,17 @@ get_df <- function(
diagnosis_list_sub$death_cause <- death_cause_sub
## cancer_registry
- cancer_registry_sub <- NULL
+ cancer_registry_sub <- cancer_registry_sub_icd10 <- NULL
+ if (!is.null(diagnosis_list_sub$cancer_registry) && any(codes_sub$vocab_id == "ICD9") && "icd9" %in% colnames(diagnosis_list_sub$cancer_registry)) {  # skip the ICD9 search when the table has no `icd9` column (e.g. a diagnosis list saved by an older version)
+ cancer_registry_sub <- diagnosis_list_sub$cancer_registry |> dplyr::filter(stringr::str_detect( icd9, !! ICD9_search))
+ }
if (!is.null(diagnosis_list_sub$cancer_registry) & any(codes_sub$vocab_id == "ICD10")) {
- cancer_registry_sub <- diagnosis_list_sub$cancer_registry |> dplyr::filter(stringr::str_detect( icd10, !! ICD10_search))
+ cancer_registry_sub_icd10 <- diagnosis_list_sub$cancer_registry |> dplyr::filter(stringr::str_detect( icd10, !! ICD10_search))
+ if (is.null(cancer_registry_sub)) {
+ cancer_registry_sub <- cancer_registry_sub_icd10
+ } else {
+ cancer_registry_sub <- rbind(cancer_registry_sub, cancer_registry_sub_icd10)
+ }
}
diagnosis_list_sub$cancer_registry <- cancer_registry_sub
@@ -704,19 +713,36 @@ get_cancer_registry_df <- function(
start_time <- Sys.time()
if (verbose) cat("Getting cancer registry data\n")
+
+ # create ICD9 search string
+ ICD9_search <- ""
+ if (any(codes_df$vocab_id == "ICD9")) {
+ ICD9s <- codes_df |>
+ dplyr::filter(vocab_id == "ICD9") |>
+ dplyr::select(code) |>
+ dplyr::pull() |>
+ unique() |>
+ stringr::str_remove(stringr::fixed(".")) |>
+ stringr::str_sub(1, 5)
+ ICD9_search = stringr::str_flatten(ICD9s, collapse = "|")
+ }
- # format codes
- vocab_col = "vocab_id"
- codes_col = "code"
+ # create ICD10 search string
+ ICD10_search <- ""
+ if (any(codes_df$vocab_id == "ICD10")) {
+ ICD10s <- codes_df |>
+ dplyr::filter(vocab_id == "ICD10") |>
+ dplyr::select(code) |>
+ dplyr::pull() |>
+ unique() |>
+ stringr::str_remove(stringr::fixed(".")) |>
+ stringr::str_sub(1, 5)
+ ICD10_search = stringr::str_flatten(ICD10s, collapse = "|")
+ }
- codes <- codes_df |>
- dplyr::filter(!!rlang::sym(vocab_col) == "ICD10") |>
- dplyr::select(!!rlang::sym(codes_col)) |>
- dplyr::pull() |>
- unique() |>
- stringr::str_remove(stringr::fixed(".")) |>
- stringr::str_sub(1, 5)
- codes_string = stringr::str_flatten(codes, collapse = "|")
+ # if "missing" (empty string) replace with impossible code so grep doesn't catch all rows
+ if (ICD9_search=="") ICD9_search <- "not_a_code"
+ if (ICD10_search=="") ICD10_search <- "not_a_code"
# create empty vars in ukb_dat to modify
ukb_dat$canreg <- 0
@@ -734,8 +760,15 @@ get_cancer_registry_df <- function(
# Update where the code matches
ukb_dat <- ukb_dat |> dplyr::mutate(
- canreg_df = dplyr::if_else(canreg == 0 & stringr::str_detect(icd10, codes_string), date, canreg_df, canreg_df),
- canreg = dplyr::if_else(canreg == 0 & stringr::str_detect(icd10, codes_string), 1, canreg, canreg)
+ canreg_df = dplyr::if_else(
+ canreg == 0 &
+ ( stringr::str_detect(icd9, ICD9_search) | stringr::str_detect(icd10, ICD10_search) ),
+ date, canreg_df, canreg_df
+ ),
+ canreg = dplyr::if_else(
+ canreg == 0 &
+ ( stringr::str_detect(icd9, ICD9_search) | stringr::str_detect(icd10, ICD10_search) ),
+ 1, canreg, canreg)
)
}
diff --git a/R/get_diagnoses.R b/R/get_diagnoses.R
index 9ad6871..4102c9b 100644
--- a/R/get_diagnoses.R
+++ b/R/get_diagnoses.R
@@ -6,7 +6,7 @@
#'
#' - ICD10 (for `hesin`, `death_cause` and `cancer_registry` searches) - fuzzy matching
#'
-#' - ICD9 (for `hesin` searches) - fuzzy matching
+#' - ICD9 (for `hesin` searches and `cancer_registry` searches) - fuzzy matching
#'
#' - Read2 / CTV3 (for `gp_clinical`) - exact matches on first 5 characters
#'
@@ -22,7 +22,7 @@
#'
#' @name get_diagnoses
#'
-#' @param codes_df A data frame. Contains two columns: `code` and `vocab_id` i.e., a list of diagnostic codes, and an indicator of the vocabulary (ICD10, Read2, CTV3, OPCS3, OPCS4, ukb_cancer, and ukb_noncancer are recognised). Other columns are ignored.
+#' @param codes_df A data frame. Contains two columns: `code` and `vocab_id` i.e., a list of diagnostic codes, and an indicator of the vocabulary (ICD9, ICD10, Read2, CTV3, OPCS3, OPCS4, ukb_cancer, and ukb_noncancer are recognised). Other columns are ignored.
#' @param file_paths A data frame. Columns must be `object` and `path` containing paths to required files. Default assumes you have the tables exported in the RAP environment from
#' ukbrapR::export_tables()
#' \code{default=ukbrapR:::ukbrapr_paths}
@@ -53,7 +53,7 @@ get_diagnoses <- function(
# start up messages
pkg_version <- utils::packageVersion("ukbrapR")
cli::cli_alert_info("{.pkg ukbrapR} v{pkg_version}")
- .ukbrapr_startup_notice()
+ #.ukbrapr_startup_notice()
start_time <- Sys.time()
@@ -147,6 +147,7 @@ get_diagnoses <- function(
stringr::str_sub(1, 5)
hyphen_check(ICD9s, "ICD9")
cat(" - N unique ICD9 codes:", length(ICD9s), "\n")
+ if (any(dplyr::between(suppressWarnings(as.numeric(stringr::str_sub(ICD9s, 1, 3))), 140, 208), na.rm = TRUE)) get_canreg <- TRUE  # compare the 3-character category (codes are dot-stripped, e.g. "1749"); non-numeric codes become NA and are ignored
}
# get Read2 and CTV3s. First 5 characters only.
@@ -343,28 +344,6 @@ get_diagnoses <- function(
if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))
- #
- # cancer registry ####################################
- #
-
- # do any ICD10s start with a C? Skip if not.
- if (get_canreg) {
-
- cli::cli_alert("Ascertaining cancer registry data.")
-
- # load data
- cancer_registry_dat <- suppressWarnings(readr::read_tsv(file_paths$path[ file_paths$object=="cancer_registry" ], show_col_types = FALSE, progress = FALSE))
-
- # get cancer registry data for these ICD10s
- cancer_registry_tbl <- ukbrapR:::get_cancer_registry(codes = ICD10s, ukb_dat = cancer_registry_dat, verbose = verbose)
- cli::cli_alert_success("Loaded {.var cancer_registry} with {nrow(cancer_registry_tbl)} matched rows.")
-
- rm(cancer_registry_dat)
-
- if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))
-
- }
-
#
# HES diagnosis data (ICD10s) ###########################################
#
@@ -407,6 +386,27 @@ get_diagnoses <- function(
}
+ #
+ # cancer registry ####################################
+ #
+
+ if (get_canreg) {
+
+ cli::cli_alert("Ascertaining cancer registry data.")
+
+ # load data
+ cancer_registry_dat <- suppressWarnings(readr::read_tsv(file_paths$path[ file_paths$object=="cancer_registry" ], show_col_types = FALSE, progress = FALSE))
+
+ # get cancer registry data for these ICD9s and ICD10s
+ cancer_registry_tbl <- ukbrapR:::get_cancer_registry(ICD9s = ICD9s, ICD10s = ICD10s, ukb_dat = cancer_registry_dat, verbose = verbose)
+ cli::cli_alert_success("Loaded {.var cancer_registry} with {nrow(cancer_registry_tbl)} matched rows.")
+
+ rm(cancer_registry_dat)
+
+ if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))
+
+ }
+
#
# ICD9 HES diagnosis data ###########################################
#
diff --git a/README.md b/README.md
index e0dabb8..fdc37e1 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
# ukbrapR
-[](https://github.com/lcpilling/ukbrapR)
+[](https://github.com/lcpilling/ukbrapR)
[](https://github.com/lcpilling/ukbrapR/commits/main)
[](https://www.tidyverse.org/lifecycle/#experimental)
[](https://zenodo.org/doi/10.5281/zenodo.11517716)
@@ -11,7 +11,7 @@ ukbrapR (phonetically: 'U-K-B-wrapper') is an R package for working in the UK Bi
Please consider starring the repo and citing it in your papers:
-> Pilling LC (2026). ukbrapR (v0.3.14). doi:[10.5281/zenodo.18836738](https://doi.org/10.5281/zenodo.18836738), https://github.com/lcpilling/ukbrapR
+> Pilling LC (2026). ukbrapR (v0.3.15). doi:[10.5281/zenodo.18836738](https://doi.org/10.5281/zenodo.18836738), https://github.com/lcpilling/ukbrapR
Wrapped server icon by DALL-E
@@ -127,7 +127,7 @@ For a given set of diagnostic codes get the participant Electronic Medical Recor
Codes need to be provided as a data frame with two fields: `vocab_id` and `code`. Valid code vocabularies are:
- `ICD10` (for searching HES diagnoses, cause of death, and cancer registry)
- - `ICD9` (for searching older HES diagnosis data)
+ - `ICD9` (for searching older HES diagnosis and cancer registry data)
- `Read2` and `CTV3` (for GP clinical events)
- `OPCS3` and `OPCS4` (for HES operations)
- `ukb_cancer` and `ukb_noncancer` (for self-reported illness at UK Biobank assessments - all instances will be searched)
diff --git a/man/get_diagnoses.Rd b/man/get_diagnoses.Rd
index 9364440..1e6d046 100644
--- a/man/get_diagnoses.Rd
+++ b/man/get_diagnoses.Rd
@@ -7,7 +7,7 @@
get_diagnoses(codes_df, file_paths = NULL, verbose = FALSE)
}
\arguments{
-\item{codes_df}{A data frame. Contains two columns: `code` and `vocab_id` i.e., a list of diagnostic codes, and an indicator of the vocabulary (ICD10, Read2, CTV3, OPCS3, OPCS4, ukb_cancer, and ukb_noncancer are recognised). Other columns are ignored.}
+\item{codes_df}{A data frame. Contains two columns: `code` and `vocab_id` i.e., a list of diagnostic codes, and an indicator of the vocabulary (ICD9, ICD10, Read2, CTV3, OPCS3, OPCS4, ukb_cancer, and ukb_noncancer are recognised). Other columns are ignored.}
\item{file_paths}{A data frame. Columns must be `object` and `path` containing paths to required files. Default assumes you have the tables exported in the RAP environment from
ukbrapR::export_tables()
@@ -26,7 +26,7 @@ Valid code vocabularies are:
- ICD10 (for `hesin`, `death_cause` and `cancer_registry` searches) - fuzzy matching
- - ICD9 (for `hesin` searches) - fuzzy matching
+ - ICD9 (for `hesin` searches and `cancer_registry` searches) - fuzzy matching
- Read2 / CTV3 (for `gp_clinical`) - exact matches on first 5 characters