lcpilling · hdg204 · Apr 21, 2026 · Apr 21, 2026 · Apr 21, 2026 · Apr 21, 2026
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: ukbrapR
 Title: R functions to use in the UK Biobank Research Analysis Platform (RAP)
-Version: 0.3.14
+Version: 0.3.15
 Authors@R: c(person("Luke", "Pilling", 
                     email = "L.Pilling@exeter.ac.uk", 
                     role = c("aut", "cre"),
@@ -24,7 +24,7 @@ Imports:
   haven (>= 2.5.0)
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.3.2
+RoxygenNote: 7.3.3
 BugReports: https://github.com/lcpilling/ukbrapR/issues
 Suggests: 
     knitr,

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,18 @@
+# ukbrapR v0.3.15 (21st April 2026)
+
+### Changes
+ - (PR #41) Cancer registry data did not include ICD9 codes. Thanks to @hdg204 for the suggestions. Changes are:
+   - `export_tables_cancer_registry` now exports field 40013 (cancer registry ICD9 codes)
+   - `get_diagnoses` now detects when the user has provided an ICD9 code in the range 140 to 208 and flags the cancer registry to be searched
+   - `get_cancer_registry` now differentiates between ICD9 and ICD10 codes when searching the raw data
+     - If the user has an older exported table (without ICD9), only ICD10 codes are searched
+   - `get_df` now correctly handles cases where ICD9 and/or ICD10 diagnoses are available
+
+### Bug fixes
+ - `fields_to_phenos` now better handles some instanced fields (like 40006) where the Schema has instanced == 2
+   - Known bug: some arrayed fields (like 20001) are not always handled correctly; the cause has not yet been identified
+
+
 # ukbrapR v0.3.14 (24th Feb 2026)
 
 ### Bug fixes

diff --git a/R/export_tables.R b/R/export_tables.R
@@ -281,7 +281,6 @@ export_tables_selfrep_illness <- function(
 #'
 #' @noRd
 export_tables_cancer_registry <- function(
-	n_cancer_arrays = 21,
 	dataset = NULL,
 	submit = FALSE,
 	verbose = FALSE
@@ -291,23 +290,16 @@ export_tables_cancer_registry <- function(
 
 	# RAP stores arrays as separate variables
 	if (verbose) cli::cli_alert("Determine field names to request")
-	if (verbose) cli::cli_alert(c("n_cancer_arrays = ", n_cancer_arrays))
 	#   date vars = 40005
-	#   cancer vars = 40006
+	#   cancer ICD10 vars = 40006
 	#   age vars = 40008
 	#   histology vars = 40011
 	#   behaviour vars = 40012
+	#   cancer ICD9 vars = 40013
 
 	# get field names
 	names = "eid"
-
-	# phenotypes
-	for (p in c(40005, 40006, 40008, 40011, 40012))  {
-
-		# instances 0:n_instances
-		for (i in c(0:n_cancer_arrays))  names <- c(names, stringr::str_c("p", p, "_i", i))
-
-	}
+	names <- c(names, fields_to_phenos(as.character(c(40005, 40006, 40008, 40011, 40012, 40013))))
 
 	if (verbose) print(names)
 

diff --git a/R/fields_to_phenos.R b/R/fields_to_phenos.R
@@ -133,7 +133,7 @@ field_to_phenos <- function(
   valid_fields <- NULL
 
   # Check if the field is instanced and generate instances if true
-  if (field_info$instanced == 1)  {
+  if (field_info$instanced != 0)  {
     instances <- seq(field_info$instance_min, field_info$instance_max, 1)
     if (verbose)  cli::cli_alert(stringr::str_c("Is instaced [", stringr::str_c(instances, collapse=","), "]"))
   }
@@ -152,7 +152,7 @@ field_to_phenos <- function(
     }
 
     # Generate valid fields for instanced arrayed fields
-    if (field_info$instanced == 1)  {
+    if (field_info$instanced != 0)  {
       for (ii in 1:length(instances))  {
         for (aa in 1:length(arrays))  {
           valid_fields <- c(valid_fields, stringr::str_c(p_field_id, "_i", instances[ii], "_a", arrays[aa]))
@@ -163,7 +163,7 @@ field_to_phenos <- function(
   }  else  {
 
     # Generate valid fields for instanced non-arrayed fields
-    if (field_info$instanced == 1)  {
+    if (field_info$instanced != 0)  {
       for (ii in 1:length(instances))  {
         valid_fields <- c(valid_fields, stringr::str_c(p_field_id, "_i", instances[ii]))
       }

diff --git a/R/get_cancer_registry.R b/R/get_cancer_registry.R
@@ -6,15 +6,21 @@
 #'
 #' @noRd
 get_cancer_registry <- function(
-	codes,
+	ICD9s,
+	ICD10s,
 	ukb_dat,
 	verbose = FALSE
 )  {
 
 	start_time <- Sys.time()
 
 	# Check input
-	if (verbose) cli::cli_alert_info("Searching cancer registry data for {length(unique(codes))} ICD10 codes")
+	if (verbose && ICD9s[1]!="")  cli::cli_alert_info("Searching cancer registry data for {length(unique(ICD9s))} ICD9 codes")
+	if (verbose && ICD10s[1]!="") cli::cli_alert_info("Searching cancer registry data for {length(unique(ICD10s))} ICD10 codes")
+
+	# an empty string means "no codes of this type": swap in an impossible code so the regex cannot match every row
+	if (ICD9s[1]=="")   ICD9s <- "not_a_code"
+	if (ICD10s[1]=="")  ICD10s <- "not_a_code"
 
 	# remove rows where participant has no cancer data 
 	ukb_dat = ukb_dat |> dplyr::filter(
@@ -33,6 +39,7 @@ get_cancer_registry <- function(
 	#   behaviour vars = 40012
 
 	# variable prefix 
+	v_icd9      <- "p40013_"
 	v_icd10     <- "p40006_"
 	v_date      <- "p40005_"
 	v_age       <- "p40008_"
@@ -46,31 +53,49 @@ get_cancer_registry <- function(
 			dplyr::select(eid, dplyr::contains(v)) |>
 			tidyr::pivot_longer(!eid, names_to = "instance", names_prefix = v, values_to = n)
 	}
+
 	ukb_dat_icd10     <- pivot_cancer(ukb_dat, v_icd10, "icd10")
 	ukb_dat_date      <- pivot_cancer(ukb_dat, v_date, "date")
 	ukb_dat_age       <- pivot_cancer(ukb_dat, v_age, "age")
 	ukb_dat_histology <- pivot_cancer(ukb_dat, v_histology, "histology")
 	ukb_dat_behaviour <- pivot_cancer(ukb_dat, v_behaviour, "behaviour")
 
+	# some older exports may not have icd9 columns (prefix v_icd9), so only pivot them when present
+	ukb_dat_icd9 <- NULL
+	if (any(grepl(v_icd9, colnames(ukb_dat), fixed = TRUE)))  {
+		ukb_dat_icd9      <- pivot_cancer(ukb_dat, v_icd9, "icd9")
+	} else {
+		cli::cli_alert_warning("'icd9' not in exported cancer registry data. Consider re-exporting raw tables with `export_tables()`")
+	}
+
 	# join tables
 	if (verbose) cli::cli_alert("Join cancer registry data")
 	ukb_dat_cr = purrr::reduce(list(ukb_dat_icd10, ukb_dat_date, ukb_dat_age, ukb_dat_histology, ukb_dat_behaviour), dplyr::full_join, by = c("eid"="eid", "instance"="instance"))
+	if (!is.null(ukb_dat_icd9))  {
+		ukb_dat_cr <- dplyr::full_join(ukb_dat_icd9, ukb_dat_cr, by = c("eid"="eid", "instance"="instance"))  # same keys as the join above
+	}  else  {
+		ukb_dat_cr$icd9 <- NA_character_  # keep the column (as character) so later filters can always reference it
+	}
 
 	# remove rows where participant has no cancer data 
 	ukb_dat_cr = ukb_dat_cr |> dplyr::filter(
 		dplyr::if_any(
-			c("icd10","date","age","histology","behaviour"),
+			c("icd9","icd10","date","age","histology","behaviour"),
 			~!is.na(.)
 		)
 	)
 
-	# subset to ICD10s in provided codes
+	# subset to provided codes: ICD9 codes are all digits so are matched from the start of the string (as for hesin diag_icd9); ICD10 matching is unchanged
 	if (verbose) cli::cli_alert("Identify matching codes")
 	ukb_dat_cr = ukb_dat_cr |> 
 		dplyr::filter(
+			stringr::str_starts(
+				icd9,
+				stringr::str_flatten(ICD9s, collapse = "|")
+			) |
 			stringr::str_detect(
 				icd10,
-				stringr::str_flatten(codes, collapse = "|")
+				stringr::str_flatten(ICD10s, collapse = "|")
 			)
 		)
 

diff --git a/R/get_df.R b/R/get_df.R
@@ -219,6 +219,18 @@ get_df <- function(
 				ICD10_search = stringr::str_flatten(ICD10s, collapse = "|")
 			}
 
+			# create ICD9 search string
+			if (any(codes_sub$vocab_id == "ICD9"))  {
+				ICD9s <- codes_sub |>
+					dplyr::filter(vocab_id == "ICD9") |>
+					dplyr::select(code) |>
+					dplyr::pull() |>
+					unique() |>
+					stringr::str_remove(stringr::fixed(".")) |>
+					stringr::str_sub(1, 5)
+				ICD9_search = stringr::str_flatten(ICD9s, collapse = "|")
+			}
+
 			## hesin_diag
 			hesin_diag_sub = NULL
 			if (!is.null(diagnosis_list_sub$hesin_diag) & any(codes_sub$vocab_id %in% c("ICD10","ICD9")))  {
@@ -229,17 +241,6 @@ get_df <- function(
 				}
 
 				if (any(codes_sub$vocab_id == "ICD9"))  {
-					ICD9s = ""
-					if (any(codes_sub$vocab_id == "ICD9"))  {
-						ICD9s <- codes_sub |>
-							dplyr::filter(vocab_id == "ICD9") |>
-							dplyr::select(code) |>
-							dplyr::pull() |>
-							unique() |>
-							stringr::str_remove(stringr::fixed(".")) |>
-							stringr::str_sub(1, 5)
-					}
-					ICD9_search = stringr::str_flatten(ICD9s, collapse = "|")
 					colnames(diagnosis_list_sub$hesin_diag) = tolower(colnames(diagnosis_list_sub$hesin_diag))
 					hesin_diag_sub = rbind(hesin_diag_sub, diagnosis_list_sub$hesin_diag |> dplyr::filter(stringr::str_starts(diag_icd9, !! ICD9_search)))
 				}
@@ -255,9 +256,17 @@ get_df <- function(
 			diagnosis_list_sub$death_cause <- death_cause_sub
 
 			## cancer_registry
-			cancer_registry_sub <- NULL
+			cancer_registry_sub <- cancer_registry_sub_icd10 <- NULL
+			if (!is.null(diagnosis_list_sub$cancer_registry) && "icd9" %in% colnames(diagnosis_list_sub$cancer_registry) && any(codes_sub$vocab_id == "ICD9"))  {
+				cancer_registry_sub <- diagnosis_list_sub$cancer_registry |> dplyr::filter(stringr::str_starts( icd9, !! ICD9_search))  # anchored: ICD9 codes are all digits
+			}
 			if (!is.null(diagnosis_list_sub$cancer_registry) & any(codes_sub$vocab_id == "ICD10"))  {
-				cancer_registry_sub <- diagnosis_list_sub$cancer_registry |> dplyr::filter(stringr::str_detect( icd10, !! ICD10_search))
+				cancer_registry_sub_icd10 <- diagnosis_list_sub$cancer_registry |> dplyr::filter(stringr::str_detect( icd10, !! ICD10_search))
+				if (is.null(cancer_registry_sub))  {
+					cancer_registry_sub <- cancer_registry_sub_icd10
+				} else {
+					cancer_registry_sub <- rbind(cancer_registry_sub, cancer_registry_sub_icd10)
+				}
 			}
 			diagnosis_list_sub$cancer_registry <- cancer_registry_sub
 
@@ -704,19 +713,36 @@ get_cancer_registry_df <- function(
 	start_time <- Sys.time()
 
 	if (verbose) cat("Getting cancer registry data\n")
+
+	# create ICD9 search string
+	ICD9_search <- ""
+	if (any(codes_df$vocab_id == "ICD9"))  {
+		ICD9s <- codes_df |>
+			dplyr::filter(vocab_id == "ICD9") |>
+			dplyr::select(code) |>
+			dplyr::pull() |>
+			unique() |>
+			stringr::str_remove(stringr::fixed(".")) |>
+			stringr::str_sub(1, 5)
+		ICD9_search = stringr::str_flatten(ICD9s, collapse = "|")
+	}
 
-	# format codes
-	vocab_col = "vocab_id"
-	codes_col = "code"
+	# create ICD10 search string
+	ICD10_search <- ""
+	if (any(codes_df$vocab_id == "ICD10"))  {
+		ICD10s <- codes_df |>
+			dplyr::filter(vocab_id == "ICD10") |>
+			dplyr::select(code) |>
+			dplyr::pull() |>
+			unique() |>
+			stringr::str_remove(stringr::fixed(".")) |>
+			stringr::str_sub(1, 5)
+		ICD10_search = stringr::str_flatten(ICD10s, collapse = "|")
+	}
 
-	codes <- codes_df |>
-		dplyr::filter(!!rlang::sym(vocab_col) == "ICD10") |>
-		dplyr::select(!!rlang::sym(codes_col)) |>
-		dplyr::pull() |>
-		unique() |>
-		stringr::str_remove(stringr::fixed(".")) |>
-		stringr::str_sub(1, 5)
-	codes_string = stringr::str_flatten(codes, collapse = "|")
+	# if "missing" (empty string) replace with impossible code so grep doesn't catch all rows 
+	if (ICD9_search=="")   ICD9_search <- "not_a_code"
+	if (ICD10_search=="")  ICD10_search <- "not_a_code"
 
 	# create empty vars in ukb_dat to modify
 	ukb_dat$canreg    <- 0
@@ -734,8 +760,15 @@ get_cancer_registry_df <- function(
 
 		# Update where the code matches
 		ukb_dat <- ukb_dat |> dplyr::mutate(
-			canreg_df = dplyr::if_else(canreg == 0 & stringr::str_detect(icd10, codes_string), date, canreg_df, canreg_df),
-			canreg    = dplyr::if_else(canreg == 0 & stringr::str_detect(icd10, codes_string), 1, canreg, canreg)
+			canreg_df = dplyr::if_else(
+				canreg == 0 &
+				( stringr::str_starts(icd9, ICD9_search) | stringr::str_detect(icd10, ICD10_search) ),  # ICD9 anchored at start: codes are all digits
+				date, canreg_df, canreg_df
+			),
+			canreg    = dplyr::if_else(
+				canreg == 0 &
+				( stringr::str_starts(icd9, ICD9_search) | stringr::str_detect(icd10, ICD10_search) ),
+				1, canreg, canreg)
 			)
 	}
 

diff --git a/R/get_diagnoses.R b/R/get_diagnoses.R
@@ -6,7 +6,7 @@
 #'
 #'  - ICD10 (for `hesin`, `death_cause` and `cancer_registry` searches) - fuzzy matching
 #'
-#'  - ICD9 (for `hesin` searches) - fuzzy matching
+#'  - ICD9 (for `hesin` searches and `cancer_registry` searches) - fuzzy matching
 #'
 #'  - Read2 / CTV3 (for `gp_clinical`) - exact matches on first 5 characters
 #'
@@ -22,7 +22,7 @@
 #'
 #' @name get_diagnoses
 #'
-#' @param codes_df A data frame. Contains two columns: `code` and `vocab_id` i.e., a list of diagnostic codes, and an indicator of the vocabulary (ICD10, Read2, CTV3, OPCS3, OPCS4, ukb_cancer, and ukb_noncancer are recognised). Other columns are ignored.
+#' @param codes_df A data frame. Contains two columns: `code` and `vocab_id` i.e., a list of diagnostic codes, and an indicator of the vocabulary (ICD9, ICD10, Read2, CTV3, OPCS3, OPCS4, ukb_cancer, and ukb_noncancer are recognised). Other columns are ignored.
 #' @param file_paths A data frame. Columns must be `object` and `path` containing paths to required files. Default assumes you have the tables exported in the RAP environment from
 #'        ukbrapR::export_tables()
 #'        \code{default=ukbrapR:::ukbrapr_paths}
@@ -53,7 +53,7 @@ get_diagnoses <- function(
 	# start up messages
   pkg_version <- utils::packageVersion("ukbrapR")
   cli::cli_alert_info("{.pkg ukbrapR} v{pkg_version}")
-  .ukbrapr_startup_notice()
+  #.ukbrapr_startup_notice()
 
 	start_time <- Sys.time()
 
@@ -147,6 +147,7 @@ get_diagnoses <- function(
 			stringr::str_sub(1, 5)
 		hyphen_check(ICD9s, "ICD9")
 		cat(" - N unique ICD9 codes:", length(ICD9s), "\n")
+		if (any(dplyr::between(suppressWarnings(as.numeric(stringr::str_sub(ICD9s, 1, 3))), 140, 208), na.rm = TRUE))  get_canreg <- TRUE  # compare the 3-digit category; non-numeric (V/E) codes become NA and are ignored
 	}
 
 	# get Read2 and CTV3s. First 5 characters only.
@@ -343,28 +344,6 @@ get_diagnoses <- function(
 
 		if (verbose)  cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))
 
-		#
-		# cancer registry ####################################
-		#
-
-		# do any ICD10s start with a C? Skip if not.
-		if (get_canreg)  {
-
-			cli::cli_alert("Ascertaining cancer registry data.")
-
-			# load data
-			cancer_registry_dat <- suppressWarnings(readr::read_tsv(file_paths$path[ file_paths$object=="cancer_registry" ], show_col_types = FALSE, progress = FALSE))
-
-			# get cancer registry data for these ICD10s
-			cancer_registry_tbl <- ukbrapR:::get_cancer_registry(codes = ICD10s, ukb_dat = cancer_registry_dat, verbose = verbose)
-			cli::cli_alert_success("Loaded {.var cancer_registry} with {nrow(cancer_registry_tbl)} matched rows.")
-
-			rm(cancer_registry_dat)
-
-			if (verbose)  cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))
-
-		}
-
 		#
 		# HES diagnosis data (ICD10s) ###########################################
 		#
@@ -407,6 +386,27 @@ get_diagnoses <- function(
 
 	}
 
+	#
+	# cancer registry ####################################
+	#
+
+	if (get_canreg)  {
+
+		cli::cli_alert("Ascertaining cancer registry data.")
+
+		# load data
+		cancer_registry_dat <- suppressWarnings(readr::read_tsv(file_paths$path[ file_paths$object=="cancer_registry" ], show_col_types = FALSE, progress = FALSE))
+
+		# get cancer registry data for these ICD9s and ICD10s
+		cancer_registry_tbl <- ukbrapR:::get_cancer_registry(ICD9s = ICD9s, ICD10s = ICD10s, ukb_dat = cancer_registry_dat, verbose = verbose)
+		cli::cli_alert_success("Loaded {.var cancer_registry} with {nrow(cancer_registry_tbl)} matched rows.")
+
+		rm(cancer_registry_dat)
+
+		if (verbose)  cli::cli_alert_info(c("Time taken so far: ", "{prettyunits::pretty_sec(as.numeric(difftime(Sys.time(), start_time, units=\"secs\")))}."))
+
+	}
+
 	#
 	# ICD9 HES diagnosis data ###########################################
 	#