From 577710ef36318e5508aeccb81af15df26a6a5366 Mon Sep 17 00:00:00 2001
From: Julianne Clina <jclina@kumc.edu>
Date: Mon, 15 Dec 2025 15:31:57 -0600
Subject: [PATCH] Create read_proteomics.R

---
 R/read_proteomics.R | 147 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 147 insertions(+)
 create mode 100644 R/read_proteomics.R

diff --git a/R/read_proteomics.R b/R/read_proteomics.R
new file mode 100644
index 0000000..91bc313
--- /dev/null
+++ b/R/read_proteomics.R
@@ -0,0 +1,147 @@
+#' @title read_proteomics
+#' @description Reads the proteomics results file from the directory
+#'   for the participants.
+#' @param directory A path name containing the downloaded ABC-DS data from the
+#'   University of South Carolina Laboratory of Neuro Imaging's (LONI) Image
+#'   and Data Archive
+#' @param include_demographics An optional parameter to merge demographic
+#'   information before returning the data, Default: FALSE
+#' @param wide_format An optional parameter to return data in wide format with
+#'   proteins as columns, Default: TRUE
+#' @return A data frame containing cleaned proteomics data. If wide_format is
+#'   TRUE, returns one row per subject/event with proteins as columns. If
+#'   FALSE, returns one row per subject/event/protein combination.
+#' @details Keeps plasma samples only (pooled and serum samples are removed),
+#'   drops negative, non-numeric, and missing value coded concentrations
+#'   (-999999, -777777, -99999, -9999, -999, -99), and cleans protein names for
+#'   consistency. Stops if no file or more than one file matches, if a required
+#'   column cannot be identified, or if no rows remain after cleaning. End
+#'   users can merge demographics with the `include_demographics` argument and
+#'   choose between wide and long format output.
+#' @rdname read_proteomics
+#' @export
+read_proteomics <- function(
+  directory,
+  include_demographics = FALSE,
+  wide_format = TRUE
+) {
+  proteo_file <- list.files(
+    directory,
+    pattern = "Proteomics.*Results",
+    full.names = TRUE
+  )
+
+  if (length(proteo_file) == 0) {
+    stop("Did not find any proteomics files", call. = FALSE)
+  }
+  # read.csv() takes a single path, so an ambiguous match must not reach it.
+  if (length(proteo_file) > 1) {
+    found <- paste(basename(proteo_file), collapse = ", ")
+    stop("Found more than one proteomics file: ", found, call. = FALSE)
+  }
+
+  proteo_raw <- utils::read.csv(proteo_file)
+  col_names <- colnames(proteo_raw)
+
+  # Returns the first column whose name matches `pattern` (case-insensitive);
+  # stops here rather than handing an NA column name to rename().
+  find_col <- function(pattern, label) {
+    hits <- grep(pattern, col_names, ignore.case = TRUE, value = TRUE)
+    if (length(hits) == 0) {
+      stop("No ", label, " column in the proteomics file", call. = FALSE)
+    }
+    hits[1]
+  }
+
+  # Names are the standardised column names, values the matched file columns.
+  col_map <- c(
+    subject_label = find_col("subject|subid", "subject"),
+    event_sequence = find_col("event|visit", "event"),
+    tissue = find_col("tissue|sample", "tissue"),
+    marker = find_col("marker|protein|analyte", "marker"),
+    concentrationmeanconc = find_col(
+      "concentration.*mean|meanconc",
+      "mean concentration"
+    )
+  )
+  proteo_raw <- dplyr::rename(proteo_raw, dplyr::all_of(col_map))
+
+  missing_codes <- c(-999999, -777777, -99999, -9999, -999, -99)
+
+  proteo_clean <- proteo_raw %>%
+    dplyr::mutate(
+      tissue = stringr::str_to_lower(tissue),
+      # Coerce before comparing: text values would otherwise be compared as
+      # strings. as.character() first so a factor yields labels, not codes.
+      concentration = suppressWarnings(
+        as.numeric(as.character(concentrationmeanconc))
+      ),
+      subject_label = as.character(subject_label),
+      # Same key type in long and wide output, and for the demographics merge.
+      event_sequence = as.numeric(event_sequence)
+    ) %>%
+    dplyr::filter(
+      stringr::str_detect(tissue, "plasma"),
+      !stringr::str_detect(tissue, "pool|serum"),
+      !is.na(concentration),
+      concentration >= 0,
+      !concentration %in% missing_codes
+    ) %>%
+    dplyr::mutate(
+      protein_name = stringr::str_trim(marker),
+      protein_name = stringr::str_replace(
+        protein_name,
+        "(?i)_plasma_",
+        "_Plasma_"
+      ),
+      protein_name = stringr::str_replace(protein_name, "(?i)_Simoa_", "_")
+    ) %>%
+    dplyr::select(subject_label, event_sequence, protein_name, concentration)
+
+  if (nrow(proteo_clean) == 0) {
+    stop("No proteomics data found after cleaning", call. = FALSE)
+  }
+
+  if (wide_format) {
+    # Replace separators with ".", collapse repeated dots, drop a trailing dot.
+    clean_names <- function(x) {
+      x <- gsub("[- /()]", ".", x)
+      x <- gsub("\\.+", ".", x)
+      gsub("\\.$", "", x)
+    }
+
+    proteo_clean <- proteo_clean %>%
+      dplyr::group_by(subject_label, event_sequence, protein_name) %>%
+      # Repeated measurements of one protein at one visit are averaged.
+      dplyr::summarise(
+        concentration = mean(concentration, na.rm = TRUE),
+        .groups = "drop"
+      ) %>%
+      tidyr::pivot_wider(
+        names_from = protein_name,
+        values_from = concentration
+      ) %>%
+      dplyr::rename_with(clean_names)
+  }
+
+  if (include_demographics) {
+    proteo_clean <- merge(
+      read_demographics(directory, person = "participants"),
+      proteo_clean,
+      by = c("subject_label", "event_sequence")
+    )
+  }
+
+  proteo_clean <- add_latency(
+    directory,
+    data = proteo_clean,
+    person = "participants",
+    visit = "clinical"
+  )
+  proteo_clean <- proteo_clean[
+    order(proteo_clean$subject_label, proteo_clean$event_sequence),
+  ]
+  class(proteo_clean) <- c("tbl_df", "tbl", "data.frame")
+
+  return(proteo_clean)
+}