diff --git a/DESCRIPTION b/DESCRIPTION index 63eac6b..274c5d7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: ukbrapR Title: R functions to use in the UK Biobank Research Analysis Platform (RAP) -Version: 0.3.14 +Version: 0.3.15 Authors@R: c(person("Luke", "Pilling", email = "L.Pilling@exeter.ac.uk", role = c("aut", "cre"), @@ -24,7 +24,7 @@ Imports: haven (>= 2.5.0) Encoding: UTF-8 LazyData: true -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 BugReports: https://github.com/lcpilling/ukbrapR/issues Suggests: knitr, diff --git a/NEWS.md b/NEWS.md index b10360f..5baf8fe 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,18 @@ +# ukbrapR v0.3.15 (21st April 2026) + +### Changes + - (PR #41) Cancer registry data did not include ICD9 codes. Thanks to @hdg204 for the suggestions. Changes are: + - `export_tables_cancer_registry` now exports fields 40013 (cancer registry ICD9 codes) + - `get_diagnoses` notices if user has provided an ICD9 in range 140 to 208, flag to search cancer registry + - `get_cancer_registry` differentiates between ICD9 and ICD10 when searching the raw data + - If user has an older table (without ICD9) then only ICD10 codes are searched + - `get_df` correctly handles if ICD9 and/or ICD10 diagnoses are available + +### Bug fixes + - `fields_to_phenos` now better handles some instanced fields (like 40006) where the Schema has instanced == 2 + - Known bug: some arrayed fields (like 20001) are not always handled correctly - not worked this out yet + + # ukbrapR v0.3.14 (24th Feb 2026) ### Bug fixes diff --git a/R/export_tables.R b/R/export_tables.R index 736af93..78772bb 100644 --- a/R/export_tables.R +++ b/R/export_tables.R @@ -281,7 +281,6 @@ export_tables_selfrep_illness <- function( #' #' @noRd export_tables_cancer_registry <- function( - n_cancer_arrays = 21, dataset = NULL, submit = FALSE, verbose = FALSE @@ -291,23 +290,16 @@ export_tables_cancer_registry <- function( # RAP stores arrays as separate variables if (verbose) cli::cli_alert("Determine field names to request") - if (verbose) cli::cli_alert(c("n_cancer_arrays = ", n_cancer_arrays)) # date vars = 40005 - # cancer vars = 40006 + # cancer ICD10 vars = 40006 # age vars = 40008 # histology vars = 40011 # behaviour vars = 40012 + # cancer ICD9 vars = 40013 # get field names names = "eid" - - # phenotypes - for (p in c(40005, 40006, 40008, 40011, 40012)) { - - # instances 0:n_instances - for (i in c(0:n_cancer_arrays)) names <- c(names, stringr::str_c("p", p, "_i", i)) - - } + names <- c(names, fields_to_phenos(as.character(c(40005, 40006, 40008, 40011, 40012, 40013)))) if (verbose) print(names) diff --git a/R/fields_to_phenos.R b/R/fields_to_phenos.R index 4262525..39bcee8 100644 --- a/R/fields_to_phenos.R +++ b/R/fields_to_phenos.R @@ -133,7 +133,7 @@ field_to_phenos <- function( valid_fields <- NULL # Check if the field is instanced and generate instances if true - if (field_info$instanced == 1) { + if (field_info$instanced != 0) { instances <- seq(field_info$instance_min, field_info$instance_max, 1) if (verbose) cli::cli_alert(stringr::str_c("Is instaced [", stringr::str_c(instances, collapse=","), "]")) } @@ -152,7 +152,7 @@ field_to_phenos <- function( } # Generate valid fields for instanced arrayed fields - if (field_info$instanced == 1) { + if (field_info$instanced != 0) { for (ii in 1:length(instances)) { for (aa in 1:length(arrays)) { valid_fields <- c(valid_fields, stringr::str_c(p_field_id, "_i", instances[ii], "_a", arrays[aa])) @@ -163,7 +163,7 @@ field_to_phenos <- function( } else { # Generate valid fields for instanced non-arrayed fields - if (field_info$instanced == 1) { + if (field_info$instanced != 0) { for (ii in 1:length(instances)) { valid_fields <- c(valid_fields, stringr::str_c(p_field_id, "_i", instances[ii])) } diff --git a/R/get_cancer_registry.R b/R/get_cancer_registry.R index 06e1e9d..315611f 100644 --- a/R/get_cancer_registry.R +++ b/R/get_cancer_registry.R @@ -6,7 +6,8 @@ #' #' @noRd get_cancer_registry <- function( - codes, + ICD9s, + ICD10s, ukb_dat, verbose = FALSE ) { @@ -14,7 +15,12 @@ get_cancer_registry <- function( start_time <- Sys.time() # Check input - if (verbose) cli::cli_alert_info("Searching cancer registry data for {length(unique(codes))} ICD10 codes") + if (verbose & ICD9s[1]!="") cli::cli_alert_info("Searching cancer registry data for {length(unique(codes))} ICD9 codes") + if (verbose & ICD10s[1]!="") cli::cli_alert_info("Searching cancer registry data for {length(unique(codes))} ICD10 codes") + + # if "missing" (empty string) replace with impossible code so grep doesn't catch all rows + if (ICD9s[1]=="") ICD9s <- "not_a_code" + if (ICD10s[1]=="") ICD10s <- "not_a_code" # remove rows where participant has no cancer data ukb_dat = ukb_dat |> dplyr::filter( @@ -33,6 +39,7 @@ get_cancer_registry <- function( # behaviour vars = 40012 # variable prefix + v_icd9 <- "p40013_" v_icd10 <- "p40006_" v_date <- "p40005_" v_age <- "p40008_" @@ -46,31 +53,49 @@ get_cancer_registry <- function( dplyr::select(eid, dplyr::contains(v)) |> tidyr::pivot_longer(!eid, names_to = "instance", names_prefix = v, values_to = n) } + ukb_dat_icd10 <- pivot_cancer(ukb_dat, v_icd10, "icd10") ukb_dat_date <- pivot_cancer(ukb_dat, v_date, "date") ukb_dat_age <- pivot_cancer(ukb_dat, v_age, "age") ukb_dat_histology <- pivot_cancer(ukb_dat, v_histology, "histology") ukb_dat_behaviour <- pivot_cancer(ukb_dat, v_behaviour, "behaviour") + # some older exports may not have icd9 + ukb_dat_icd9 <- NULL + if ("" %in% colnames(ukb_dat)) { + ukb_dat_icd9 <- pivot_cancer(ukb_dat, v_icd9, "icd9") + } else { + cli::cli_alert_warning("'icd9' not in exported cancer registry data. Consider re-exporting raw tables with `export_tables()`") + } + # join tables if (verbose) cli::cli_alert("Join cancer registry data") ukb_dat_cr = purrr::reduce(list(ukb_dat_icd10, ukb_dat_date, ukb_dat_age, ukb_dat_histology, ukb_dat_behaviour), dplyr::full_join, by = c("eid"="eid", "instance"="instance")) + if (!is.null(ukb_dat_icd9)) { + ukb_dat_cr <- dplyr::full_join(ukb_dat_icd9, ukb_dat_cr) + } else { + ukb_dat_cr$icd9 <- NA + } # remove rows where participant has no cancer data ukb_dat_cr = ukb_dat_cr |> dplyr::filter( dplyr::if_any( - c("icd10","date","age","histology","behaviour"), + c("icd9","icd10","date","age","histology","behaviour"), ~!is.na(.) ) ) - # subset to ICD10s in provided codes + # subset to ICD9s/ICD10s in provided codes if (verbose) cli::cli_alert("Identify matching codes") ukb_dat_cr = ukb_dat_cr |> dplyr::filter( + stringr::str_detect( + icd9, + stringr::str_flatten(ICD9s, collapse = "|") + ) | stringr::str_detect( icd10, - stringr::str_flatten(codes, collapse = "|") + stringr::str_flatten(ICD10s, collapse = "|") ) ) diff --git a/R/get_df.R b/R/get_df.R index 92c2c05..557b0d6 100644 --- a/R/get_df.R +++ b/R/get_df.R @@ -219,6 +219,18 @@ get_df <- function( ICD10_search = stringr::str_flatten(ICD10s, collapse = "|") } + # create ICD9 search string + if (any(codes_sub$vocab_id == "ICD9")) { + ICD9s <- codes_sub |> + dplyr::filter(vocab_id == "ICD9") |> + dplyr::select(code) |> + dplyr::pull() |> + unique() |> + stringr::str_remove(stringr::fixed(".")) |> + stringr::str_sub(1, 5) + ICD9_search = stringr::str_flatten(ICD9s, collapse = "|") + } + ## hesin_diag hesin_diag_sub = NULL if (!is.null(diagnosis_list_sub$hesin_diag) & any(codes_sub$vocab_id %in% c("ICD10","ICD9"))) { @@ -229,17 +241,6 @@ get_df <- function( } if (any(codes_sub$vocab_id == "ICD9")) { - ICD9s = "" - if (any(codes_sub$vocab_id == "ICD9")) { - ICD9s <- codes_sub |> - dplyr::filter(vocab_id == "ICD9") |> - dplyr::select(code) |> - dplyr::pull() |> - unique() |> - stringr::str_remove(stringr::fixed(".")) |> - stringr::str_sub(1, 5) - } - ICD9_search = stringr::str_flatten(ICD9s, collapse = "|") colnames(diagnosis_list_sub$hesin_diag) = tolower(colnames(diagnosis_list_sub$hesin_diag)) hesin_diag_sub = rbind(hesin_diag_sub, diagnosis_list_sub$hesin_diag |> dplyr::filter(stringr::str_starts(diag_icd9, !! ICD9_search))) } @@ -255,9 +256,17 @@ get_df <- function( diagnosis_list_sub$death_cause <- death_cause_sub ## cancer_registry - cancer_registry_sub <- NULL + cancer_registry_sub <- cancer_registry_sub_icd10 <- NULL + if (!is.null(diagnosis_list_sub$cancer_registry) & any(codes_sub$vocab_id == "ICD9")) { + cancer_registry_sub <- diagnosis_list_sub$cancer_registry |> dplyr::filter(stringr::str_detect( icd9, !! ICD9_search)) + } if (!is.null(diagnosis_list_sub$cancer_registry) & any(codes_sub$vocab_id == "ICD10")) { - cancer_registry_sub <- diagnosis_list_sub$cancer_registry |> dplyr::filter(stringr::str_detect( icd10, !! ICD10_search)) + cancer_registry_sub_icd10 <- diagnosis_list_sub$cancer_registry |> dplyr::filter(stringr::str_detect( icd10, !! ICD10_search)) + if (is.null(cancer_registry_sub)) { + cancer_registry_sub <- cancer_registry_sub_icd10 + } else { + cancer_registry_sub <- rbind(cancer_registry_sub, cancer_registry_sub_icd10) + } } diagnosis_list_sub$cancer_registry <- cancer_registry_sub @@ -704,19 +713,36 @@ get_cancer_registry_df <- function( start_time <- Sys.time() if (verbose) cat("Getting cancer registry data\n") + + # create ICD9 search string + ICD9_search <- "" + if (any(codes_df$vocab_id == "ICD9")) { + ICD9s <- codes_df |> + dplyr::filter(vocab_id == "ICD9") |> + dplyr::select(code) |> + dplyr::pull() |> + unique() |> + stringr::str_remove(stringr::fixed(".")) |> + stringr::str_sub(1, 5) + ICD9_search = stringr::str_flatten(ICD9s, collapse = "|") + } - # format codes - vocab_col = "vocab_id" - codes_col = "code" + # create ICD10 search string + ICD10_search <- "" + if (any(codes_df$vocab_id == "ICD10")) { + ICD10s <- codes_df |> + dplyr::filter(vocab_id == "ICD10") |> + dplyr::select(code) |> + dplyr::pull() |> + unique() |> + stringr::str_remove(stringr::fixed(".")) |> + stringr::str_sub(1, 5) + ICD10_search = stringr::str_flatten(ICD10s, collapse = "|") + } - codes <- codes_df |> - dplyr::filter(!!rlang::sym(vocab_col) == "ICD10") |> - dplyr::select(!!rlang::sym(codes_col)) |> - dplyr::pull() |> - unique() |> - stringr::str_remove(stringr::fixed(".")) |> - stringr::str_sub(1, 5) - codes_string = stringr::str_flatten(codes, collapse = "|") + # if "missing" (empty string) replace with impossible code so grep doesn't catch all rows + if (ICD9_search=="") ICD9_search <- "not_a_code" + if (ICD10_search=="") ICD10_search <- "not_a_code" # create empty vars in ukb_dat to modify ukb_dat$canreg <- 0 @@ -734,8 +760,15 @@ get_cancer_registry_df <- function( # Update where the code matches ukb_dat <- ukb_dat |> dplyr::mutate( - canreg_df = dplyr::if_else(canreg == 0 & stringr::str_detect(icd10, codes_string), date, canreg_df, canreg_df), - canreg = dplyr::if_else(canreg == 0 & stringr::str_detect(icd10, codes_string), 1, canreg, canreg) + canreg_df = dplyr::if_else( + canreg == 0 & + ( stringr::str_detect(icd9, ICD9_search) | stringr::str_detect(icd10, ICD10_search) ), + date, canreg_df, canreg_df + ), + canreg = dplyr::if_else( + canreg == 0 & + ( stringr::str_detect(icd9, ICD9_search) | stringr::str_detect(icd10, ICD10_search) ), + 1, canreg, canreg) ) } diff --git a/R/get_diagnoses.R b/R/get_diagnoses.R index 9ad6871..4102c9b 100644 --- a/R/get_diagnoses.R +++ b/R/get_diagnoses.R @@ -6,7 +6,7 @@ #' #' - ICD10 (for `hesin`, `death_cause` and `cancer_registry` searches) - fuzzy matching #' -#' - ICD9 (for `hesin` searches) - fuzzy matching +#' - ICD9 (for `hesin` searches and `cancer_registry` searches) - fuzzy matching #' #' - Read2 / CTV3 (for `gp_clinical`) - exact matches on first 5 characters #' @@ -22,7 +22,7 @@ #' #' @name get_diagnoses #' -#' @param codes_df A data frame. Contains two columns: `code` and `vocab_id` i.e., a list of diagnostic codes, and an indicator of the vocabulary (ICD10, Read2, CTV3, OPCS3, OPCS4, ukb_cancer, and ukb_noncancer are recognised). Other columns are ignored. +#' @param codes_df A data frame. Contains two columns: `code` and `vocab_id` i.e., a list of diagnostic codes, and an indicator of the vocabulary (ICD9, ICD10, Read2, CTV3, OPCS3, OPCS4, ukb_cancer, and ukb_noncancer are recognised). Other columns are ignored. #' @param file_paths A data frame. Columns must be `object` and `path` containing paths to required files. Default assumes you have the tables exported in the RAP environment from #' ukbrapR::export_tables() #' \code{default=ukbrapR:::ukbrapr_paths} @@ -53,7 +53,7 @@ get_diagnoses <- function( # start up messages pkg_version <- utils::packageVersion("ukbrapR") cli::cli_alert_info("{.pkg ukbrapR} v{pkg_version}") - .ukbrapr_startup_notice() + #.ukbrapr_startup_notice() start_time <- Sys.time() @@ -147,6 +147,7 @@ get_diagnoses <- function( stringr::str_sub(1, 5) hyphen_check(ICD9s, "ICD9") cat(" - N unique ICD9 codes:", length(ICD9s), "\n") + if (any(dplyr::between(as.numeric(ICD9s), 140, 208))) get_canreg <- TRUE } # get Read2 and CTV3s. First 5 characters only. @@ -343,28 +344,6 @@ get_diagnoses <- function( if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}.")) - # - # cancer registry #################################### - # - - # do any ICD10s start with a C? Skip if not. - if (get_canreg) { - - cli::cli_alert("Ascertaining cancer registry data.") - - # load data - cancer_registry_dat <- suppressWarnings(readr::read_tsv(file_paths$path[ file_paths$object=="cancer_registry" ], show_col_types = FALSE, progress = FALSE)) - - # get cancer registry data for these ICD10s - cancer_registry_tbl <- ukbrapR:::get_cancer_registry(codes = ICD10s, ukb_dat = cancer_registry_dat, verbose = verbose) - cli::cli_alert_success("Loaded {.var cancer_registry} with {nrow(cancer_registry_tbl)} matched rows.") - - rm(cancer_registry_dat) - - if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}.")) - - } - # # HES diagnosis data (ICD10s) ########################################### # @@ -407,6 +386,27 @@ get_diagnoses <- function( } + # + # cancer registry #################################### + # + + if (get_canreg) { + + cli::cli_alert("Ascertaining cancer registry data.") + + # load data + cancer_registry_dat <- suppressWarnings(readr::read_tsv(file_paths$path[ file_paths$object=="cancer_registry" ], show_col_types = FALSE, progress = FALSE)) + + # get cancer registry data for these ICD10s + cancer_registry_tbl <- ukbrapR:::get_cancer_registry(ICD9s = ICD9s, ICD10s = ICD10s, ukb_dat = cancer_registry_dat, verbose = verbose) + cli::cli_alert_success("Loaded {.var cancer_registry} with {nrow(cancer_registry_tbl)} matched rows.") + + rm(cancer_registry_dat) + + if (verbose) cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}.")) + + } + # # ICD9 HES diagnosis data ########################################### # diff --git a/README.md b/README.md index e0dabb8..fdc37e1 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # ukbrapR -[![](https://img.shields.io/badge/version-0.3.14-informational.svg)](https://github.com/lcpilling/ukbrapR) +[![](https://img.shields.io/badge/version-0.3.15-informational.svg)](https://github.com/lcpilling/ukbrapR) [![](https://img.shields.io/github/last-commit/lcpilling/ukbrapR.svg)](https://github.com/lcpilling/ukbrapR/commits/main) [![](https://img.shields.io/badge/lifecycle-experimental-orange)](https://www.tidyverse.org/lifecycle/#experimental) [![DOI](https://zenodo.org/badge/709765135.svg)](https://zenodo.org/doi/10.5281/zenodo.11517716) @@ -11,7 +11,7 @@ ukbrapR (phonetically: 'U-K-B-wrapper') is an R package for working in the UK Bi Please consider starring the repo and citing it in your papers: -> Pilling LC (2026). ukbrapR (v0.3.14). doi:[10.5281/zenodo.18836738](https://doi.org/10.5281/zenodo.18836738), https://github.com/lcpilling/ukbrapR +> Pilling LC (2026). ukbrapR (v0.3.15). doi:[10.5281/zenodo.18836738](https://doi.org/10.5281/zenodo.18836738), https://github.com/lcpilling/ukbrapR Wrapped server icon by DALL-E @@ -127,7 +127,7 @@ For a given set of diagnostic codes get the participant Electronic Medical Recor Codes need to be provided as a data frame with two fields: `vocab_id` and `code`. Valid code vocabularies are: - `ICD10` (for searching HES diagnoses, cause of death, and cancer registry) - - `ICD9` (for searching older HES diagnosis data) + - `ICD9` (for searching older HES diagnosis and cancer registry data) - `Read2` and `CTV3` (for GP clinical events) - `OPCS3` and `OPCS4` (for HES operations) - `ukb_cancer` and `ukb_noncancer` (for self-reported illness at UK Biobank assessments - all instances will be searched) diff --git a/man/get_diagnoses.Rd b/man/get_diagnoses.Rd index 9364440..1e6d046 100644 --- a/man/get_diagnoses.Rd +++ b/man/get_diagnoses.Rd @@ -7,7 +7,7 @@ get_diagnoses(codes_df, file_paths = NULL, verbose = FALSE) } \arguments{ -\item{codes_df}{A data frame. Contains two columns: `code` and `vocab_id` i.e., a list of diagnostic codes, and an indicator of the vocabulary (ICD10, Read2, CTV3, OPCS3, OPCS4, ukb_cancer, and ukb_noncancer are recognised). Other columns are ignored.} +\item{codes_df}{A data frame. Contains two columns: `code` and `vocab_id` i.e., a list of diagnostic codes, and an indicator of the vocabulary (ICD9, ICD10, Read2, CTV3, OPCS3, OPCS4, ukb_cancer, and ukb_noncancer are recognised). Other columns are ignored.} \item{file_paths}{A data frame. Columns must be `object` and `path` containing paths to required files. Default assumes you have the tables exported in the RAP environment from ukbrapR::export_tables() @@ -26,7 +26,7 @@ Valid code vocabularies are: - ICD10 (for `hesin`, `death_cause` and `cancer_registry` searches) - fuzzy matching - - ICD9 (for `hesin` searches) - fuzzy matching + - ICD9 (for `hesin` searches and `cancer_registry` searches) - fuzzy matching - Read2 / CTV3 (for `gp_clinical`) - exact matches on first 5 characters