Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 147 additions & 0 deletions R/read_proteomics.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
#' @title read_proteomics
#' @description Reads the proteomics results file from the directory
#'   for the participants.
#' @param directory A path name containing the downloaded ABC-DS data from the
#'   University of Southern California Laboratory of Neuro Imaging's (LONI)
#'   Image and Data Archive
#' @param include_demographics An optional parameter to merge demographic
#'   information before returning the data, Default: FALSE
#' @param wide_format An optional parameter to return data in wide format with
#'   proteins as columns, Default: TRUE
#' @return A data frame containing cleaned proteomics data. If wide_format is
#'   TRUE, returns data with one row per subject/event and proteins as columns.
#'   If FALSE, returns long format with one row per retained measurement and
#'   the columns `subject_label`, `event_sequence`, `protein_name` and
#'   `concentration` (plus whatever the demographics merge and
#'   `add_latency()` contribute).
#' @details Reads the single file in `directory` whose name matches
#'   `"Proteomics.*Results"`. The subject, event, tissue, marker and mean
#'   concentration columns are located by case-insensitive name matching, so
#'   the function errors if any of them cannot be found.
#'
#'   Only plasma samples are kept; pooled and serum samples are removed.
#'   Concentrations are coerced to numeric, and any negative value (this covers
#'   the missing value codes -999999, -777777, -99999, -9999, -999, -99) or
#'   non-numeric entry is treated as missing. Rows with a missing concentration
#'   are dropped. Protein names are trimmed, `_plasma_` is normalised to
#'   `_Plasma_`, and `_Simoa_` is collapsed to `_`.
#'
#'   In wide format, repeated measurements of the same protein for a
#'   subject/event are averaged and the protein column names are sanitised
#'   (spaces, `-`, `/`, `(` and `)` become `.`).
#'
#'   End users also have an option to include the demographics with the
#'   `include_demographics` argument; this is an inner merge on
#'   `subject_label` and `event_sequence`, so rows without matching
#'   demographics are not returned. The result is passed through
#'   `add_latency()` and sorted by subject and event before being returned.
#' @rdname read_proteomics
#' @export
read_proteomics <- function(
  directory,
  include_demographics = FALSE,
  wide_format = TRUE
) {
  proteo_file <- list.files(
    directory,
    pattern = "Proteomics.*Results",
    full.names = TRUE
  )

  if (length(proteo_file) == 0) {
    stop("Did not find any proteomics files")
  }
  # read.csv() accepts exactly one path; fail with a readable message instead
  # of the opaque connection error a length > 1 vector would trigger.
  if (length(proteo_file) > 1) {
    stop(
      "Found more than one proteomics results file: ",
      paste(basename(proteo_file), collapse = ", "),
      call. = FALSE
    )
  }

  proteo_raw <- utils::read.csv(proteo_file)
  col_names <- colnames(proteo_raw)

  # Return the first column whose name matches `pattern`, or stop with a
  # message naming the role that could not be located.
  find_column <- function(pattern, role) {
    hits <- grep(pattern, col_names, ignore.case = TRUE, value = TRUE)
    if (length(hits) == 0) {
      stop(
        "Could not find a ", role, " column in the proteomics file",
        call. = FALSE
      )
    }
    hits[1]
  }

  # Lookup vector in the form new_name = "existing_name".
  column_map <- c(
    subject_label = find_column("subject|subid", "subject"),
    event_sequence = find_column("event|visit", "event"),
    tissue = find_column("tissue|sample", "tissue"),
    marker = find_column("marker|protein|analyte", "marker"),
    concentrationmeanconc = find_column(
      "concentration.*mean|meanconc",
      "mean concentration"
    )
  )

  # A single column cannot be renamed to two different targets.
  if (anyDuplicated(column_map) > 0) {
    stop(
      "Could not map proteomics columns unambiguously; column '",
      column_map[anyDuplicated(column_map)],
      "' matched more than one role",
      call. = FALSE
    )
  }

  proteo_raw <- proteo_raw %>%
    dplyr::rename(dplyr::all_of(column_map))

  # Every code is negative, so `< 0` already covers them; the explicit list is
  # kept to document which sentinel values the source data uses.
  missing_codes <- c(-999999, -777777, -99999, -9999, -999, -99)

  proteo_clean <- proteo_raw %>%
    dplyr::filter(
      !stringr::str_detect(stringr::str_to_lower(tissue), "pool|serum"),
      stringr::str_detect(stringr::str_to_lower(tissue), "plasma")
    ) %>%
    dplyr::mutate(
      # Coerce before comparing: a character column would otherwise be
      # compared lexically and clash with NA_real_ inside if_else().
      # Non-numeric entries become NA and are dropped with the other missings.
      concentration = suppressWarnings(
        as.numeric(as.character(concentrationmeanconc))
      ),
      concentration = dplyr::if_else(
        concentration < 0 | concentration %in% missing_codes,
        NA_real_,
        concentration
      )
    ) %>%
    dplyr::filter(!is.na(concentration)) %>%
    dplyr::mutate(
      protein_name = stringr::str_trim(marker),
      protein_name = stringr::str_replace(
        protein_name,
        "(?i)_plasma_",
        "_Plasma_"
      ),
      protein_name = stringr::str_replace(protein_name, "(?i)_Simoa_", "_"),
      subject_label = as.character(subject_label),
      # Coerced here rather than only in the wide branch so both output
      # formats (and the demographics merge) see the same key type.
      event_sequence = as.numeric(event_sequence)
    ) %>%
    dplyr::select(subject_label, event_sequence, protein_name, concentration)

  if (nrow(proteo_clean) == 0) {
    stop("No proteomics data found after cleaning")
  }

  if (wide_format) {
    # Turn protein names into syntactic-looking column names: replace
    # separators with ".", collapse runs of ".", and drop a trailing ".".
    clean_column_names <- function(x) {
      x <- gsub("[- /()]", ".", x)
      x <- gsub("\\.+", ".", x)
      gsub("\\.$", "", x)
    }

    proteo_clean <- proteo_clean %>%
      # Average replicates so each subject/event/protein fills one cell.
      dplyr::group_by(subject_label, event_sequence, protein_name) %>%
      dplyr::summarise(
        concentration = mean(concentration, na.rm = TRUE),
        .groups = "drop"
      ) %>%
      tidyr::pivot_wider(
        names_from = protein_name,
        values_from = concentration
      ) %>%
      dplyr::rename_with(clean_column_names)
  }

  if (include_demographics) {
    # merge() defaults to an inner join on the two key columns.
    proteo_clean <- merge(
      read_demographics(
        directory,
        person = "participants"
      ),
      proteo_clean,
      by = c("subject_label", "event_sequence")
    )
  }

  proteo_clean <- add_latency(
    directory,
    data = proteo_clean,
    person = "participants",
    visit = "clinical"
  )
  proteo_clean <- proteo_clean[
    order(proteo_clean$subject_label, proteo_clean$event_sequence),
  ]
  class(proteo_clean) <- c("tbl_df", "tbl", "data.frame")

  proteo_clean
}