From 577710ef36318e5508aeccb81af15df26a6a5366 Mon Sep 17 00:00:00 2001 From: Julianne Clina Date: Mon, 15 Dec 2025 15:31:57 -0600 Subject: [PATCH] Create read_proteomics.R --- R/read_proteomics.R | 147 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 R/read_proteomics.R diff --git a/R/read_proteomics.R b/R/read_proteomics.R new file mode 100644 index 0000000..91bc313 --- /dev/null +++ b/R/read_proteomics.R @@ -0,0 +1,147 @@ +#' @title read_proteomics +#' @description Reads the proteomics results file from the directory +#' for the participants. +#' @param directory A path name containing the downloaded ABC-DS data from the +#' University of Southern California Laboratory of Neuro Imaging's (LONI) Image +#' and Data Archive +#' @param include_demographics An optional parameter to merge demographic information +#' before returning the data, Default: FALSE +#' @param wide_format An optional parameter to return data in wide format with proteins as columns, +#' Default: TRUE +#' @return A data frame containing cleaned proteomics data. If wide_format is TRUE, +#' returns data with one row per subject/event and proteins as columns. If FALSE, returns long format +#' with one row per subject/event/protein combination. +#' @details Reads the proteomics results file from the directory. The function automatically +#' filters for plasma samples, removes pooled and serum samples, handles +#' missing value codes (-999999, -777777, -99999, -9999, -999, -99), and +#' cleans protein names for consistency. End users also have an +#' option to include the demographics with the `include_demographics` argument +#' and can choose between wide and long format output. 
#' @rdname read_proteomics
#' @export

read_proteomics <- function(
  directory,
  include_demographics = FALSE,
  wide_format = TRUE
) {
  # Purpose: read the proteomics results export found in `directory`, keep
  # plasma (non-pooled, non-serum) rows with a valid concentration, and return
  # them either as one row per subject/event/protein (long) or as one row per
  # subject/event with one column per protein (wide).
  #
  # Arguments:
  #   directory            Folder searched for a file whose name matches
  #                        "Proteomics.*Results".
  #   include_demographics If TRUE, merge the output of `read_demographics()`
  #                        onto the proteomics data by subject and event.
  #   wide_format          If TRUE, pivot proteins into columns.
  #
  # Returns: a data frame carrying the classes "tbl_df", "tbl", "data.frame",
  # ordered by `subject_label` and `event_sequence`.
  #
  # Errors: when no file or more than one file matches, when a required column
  # cannot be identified, or when no rows survive cleaning.

  proteo_file <- list.files(
    directory,
    pattern = "Proteomics.*Results",
    full.names = TRUE
  )

  if (length(proteo_file) == 0) {
    stop("Did not find any proteomics files")
  }

  # read.csv() accepts exactly one path; fail with a readable message instead
  # of the connection error a vector of paths would trigger.
  if (length(proteo_file) > 1) {
    stop(
      "Found more than one proteomics results file: ",
      paste(basename(proteo_file), collapse = ", "),
      call. = FALSE
    )
  }

  proteo_raw <- utils::read.csv(proteo_file)

  # Resolve each required role to the first column whose name matches the
  # role's (case-insensitive) pattern. Names of `col_patterns` are the
  # standardised column names used for the rest of the function.
  col_names <- colnames(proteo_raw)
  col_patterns <- c(
    subject_label = "subject|subid",
    event_sequence = "event|visit",
    tissue = "tissue|sample",
    marker = "marker|protein|analyte",
    concentrationmeanconc = "concentration.*mean|meanconc"
  )
  col_lookup <- vapply(
    col_patterns,
    function(pattern) {
      # character(0)[1] is NA, which flags an unmatched role below.
      grep(pattern, col_names, ignore.case = TRUE, value = TRUE)[1]
    },
    character(1)
  )

  unresolved <- names(col_lookup)[is.na(col_lookup)]
  if (length(unresolved) > 0) {
    stop(
      "Could not identify proteomics column(s) for: ",
      paste(unresolved, collapse = ", "),
      call. = FALSE
    )
  }

  if (anyDuplicated(col_lookup) > 0) {
    stop(
      "The same proteomics column matched more than one role: ",
      paste(unique(col_lookup[duplicated(col_lookup)]), collapse = ", "),
      call. = FALSE
    )
  }

  # A named character vector (new = "old") is the documented way to rename
  # from strings.
  proteo_raw <- dplyr::rename(proteo_raw, dplyr::all_of(col_lookup))

  missing_codes <- c(-999999, -777777, -99999, -9999, -999, -99)

  proteo_clean <- proteo_raw %>%
    dplyr::mutate(tissue = stringr::str_to_lower(tissue)) %>%
    dplyr::filter(
      stringr::str_detect(tissue, "plasma"),
      !stringr::str_detect(tissue, "pool|serum")
    ) %>%
    dplyr::mutate(
      # Coerce before masking so integer, character or factor columns cannot
      # cause a type mismatch inside if_else().
      concentration = as.numeric(as.character(concentrationmeanconc)),
      concentration = dplyr::if_else(
        concentration < 0 | concentration %in% missing_codes,
        NA_real_,
        concentration
      )
    ) %>%
    dplyr::filter(!is.na(concentration)) %>%
    dplyr::mutate(
      protein_name = stringr::str_trim(marker),
      protein_name = stringr::str_replace(
        protein_name,
        "(?i)_plasma_",
        "_Plasma_"
      ),
      protein_name = stringr::str_replace(protein_name, "(?i)_Simoa_", "_"),
      subject_label = as.character(subject_label),
      # Converted here (not only in the wide branch) so that long and wide
      # output share the same key type for the merge below.
      event_sequence = as.numeric(as.character(event_sequence))
    ) %>%
    dplyr::select(subject_label, event_sequence, protein_name, concentration)

  if (nrow(proteo_clean) == 0) {
    stop("No proteomics data found after cleaning")
  }

  if (wide_format) {
    # Replace separators/brackets with dots, collapse runs of dots, and drop a
    # trailing dot so protein names become tidy column names.
    tidy_column_names <- function(x) {
      x <- gsub("[- /()]", ".", x)
      x <- gsub("\\.+", ".", x)
      gsub("\\.$", "", x)
    }

    proteo_clean <- proteo_clean %>%
      # Average repeated measurements; afterwards each subject/event/protein
      # is unique, so pivot_wider() needs no values_fn.
      dplyr::group_by(subject_label, event_sequence, protein_name) %>%
      dplyr::summarise(
        concentration = mean(concentration, na.rm = TRUE),
        .groups = "drop"
      ) %>%
      tidyr::pivot_wider(
        names_from = protein_name,
        values_from = concentration
      ) %>%
      dplyr::rename_with(tidy_column_names)
  }

  if (include_demographics) {
    # merge() defaults to an inner join: rows without a demographics match on
    # both keys are dropped.
    proteo_clean <- merge(
      read_demographics(
        directory,
        person = "participants"
      ),
      proteo_clean,
      by = c("subject_label", "event_sequence")
    )
  }

  # NOTE(review): add_latency() is defined elsewhere in the package; it is
  # assumed to return the data with latency information attached - confirm.
  proteo_clean <- add_latency(
    directory,
    data = proteo_clean,
    person = "participants",
    visit = "clinical"
  )
  proteo_clean <- proteo_clean[
    order(proteo_clean$subject_label, proteo_clean$event_sequence),
  ]
  class(proteo_clean) <- c("tbl_df", "tbl", "data.frame")

  proteo_clean
}