diff --git a/config/config.yml b/config/config.yml index 7105367..f7a5484 100644 --- a/config/config.yml +++ b/config/config.yml @@ -16,3 +16,4 @@ comparisons: gene_features: Null enh_features: Null enh_assays: Null + auc_left_rectangle: FALSE diff --git a/workflow/rules/crispr_comparison.smk b/workflow/rules/crispr_comparison.smk index 814ea87..54972ca 100644 --- a/workflow/rules/crispr_comparison.smk +++ b/workflow/rules/crispr_comparison.smk @@ -96,7 +96,8 @@ rule comparePredictionsToExperiment: pos_col = "Regulated", min_sensitivity = 0.7, dist_bins_kb = lambda wildcards: config["comparisons"][wildcards.comparison]["dist_bins_kb"], - include_col = lambda wildcards: get_optional_parameter(wildcards, "include_col", None) + include_col = lambda wildcards: get_optional_parameter(wildcards, "include_col", None), + auc_left_rectangle = lambda wildcards: get_optional_parameter(wildcards, "auc_left_rectangle", None) conda: "../envs/r_crispr_comparison.yml" resources: mem_mb = 32000, diff --git a/workflow/scripts/comparePredictionsToExperiment.Rmd b/workflow/scripts/comparePredictionsToExperiment.Rmd index ff3cc62..8febb1e 100644 --- a/workflow/scripts/comparePredictionsToExperiment.Rmd +++ b/workflow/scripts/comparePredictionsToExperiment.Rmd @@ -123,7 +123,7 @@ write_tsv(pr_table, file = file.path(dirname(snakemake@output[[1]]), "pr_table.t # create performance summary tables perf_summary <- applyCellTypes(merged, .fun = makePRSummaryTableBS, pred_config = pred_config, - pos_col = pos_col) + pos_col = pos_col, auc_left_rectangle = snakemake@params$auc_left_rectangle) # convert performance summaries to one table and add full predictor names to performance summary perf_summary <- perf_summary %>% @@ -152,7 +152,8 @@ pr_plots <- mapply(FUN = makePRCurvePlot, pr_df = pr, n_pos = n_pos, pct_pos = p MoreArgs = list(pred_config = pred_config, min_sensitivity = snakemake@params$min_sensitivity, line_width = 0.8, point_size = 3, - text_size = 13, colors = pred_colors), + 
text_size = 13, colors = pred_colors, + auc_left_rectangle = snakemake@params$auc_left_rectangle), SIMPLIFY = FALSE) # save plots to files @@ -375,7 +376,8 @@ if (length(invalid_bins) > 0) { # compute performance as a function of distance to TSS dist_perf <- applyCellTypes(merged_bins_filt, .fun = computePerfSubsets, pred_config = pred_config, subset_col = "distToTSS (bins)", metric = "auprc", pos_col = pos_col, - bs_iter = 1000) + bs_iter = 1000, + auc_left_rectangle = snakemake@params$auc_left_rectangle) # create table of performance across distance dist_perf_tbl <- bind_rows(dist_perf, .id = "cell_type") @@ -410,7 +412,8 @@ dist_prc_plots <- applyCellTypes(merged_bins_filt, .fun = makePRCurveSubsets, pos_col = pos_col, min_sensitivity = snakemake@params$min_sensitivity, line_width = 0.8, point_size = 3, text_size = 13, nrow = nrow, - colors = pred_colors) + colors = pred_colors, + auc_left_rectangle = snakemake@params$auc_left_rectangle) # calculate plot dimensions based on number of features plot_height <- nrow * 4 @@ -544,7 +547,8 @@ if (length(gene_feat_cols) > 0) { pred_config = pred_config, pos_col = pos_col, min_sensitivity = snakemake@params$min_sensitivity, line_width = 1.2, point_size = 3.5, text_size = 13, - colors = pred_colors) + colors = pred_colors, + auc_left_rectangle = snakemake@params$auc_left_rectangle) # calculate plot dimensions based on number of features n_features <- length(gene_feat_cols) @@ -574,7 +578,8 @@ if (length(enh_feat_cols) > 0) { pred_config = pred_config, pos_col = pos_col, min_sensitivity = snakemake@params$min_sensitivity, line_width = 1.2, point_size = 3.5, text_size = 13, - colors = pred_colors) + colors = pred_colors, + auc_left_rectangle = snakemake@params$auc_left_rectangle) # calculate plot dimensions based on number of features n_features <- length(enh_feat_cols) diff --git a/workflow/scripts/crisprComparisonBootstrapFunctions.R b/workflow/scripts/crisprComparisonBootstrapFunctions.R index 640fa90..5dc56f5 100644 --- 
a/workflow/scripts/crisprComparisonBootstrapFunctions.R +++ b/workflow/scripts/crisprComparisonBootstrapFunctions.R @@ -106,7 +106,8 @@ getThresholdValues <- function(pred_config, predictors = NULL, threshold_col = " bootstrapPerformanceIntervals <- function(data, metric = c("auprc", "precision", "recall"), predictors = NULL, thresholds = NULL, weighted = FALSE, R = 10000, conf = 0.95, ncpus = 1, - ci_type = c("perc", "norm", "basic", "bca")) { + ci_type = c("perc", "norm", "basic", "bca"), + auc_left_rectangle = FALSE) { # parse input arguments metric <- match.arg(metric) @@ -134,7 +135,8 @@ bootstrapPerformanceIntervals <- function(data, metric = c("auprc", "precision", # bootstrap performance message("Running bootstraps...") bs_perf <- boot(data, statistic = calculate_performance, metric = metric, R = R, - parallel = parallel, ncpus = ncpus, thresholds = thresholds, weighted = weighted) + parallel = parallel, ncpus = ncpus, thresholds = thresholds, weighted = weighted, + auc_left_rectangle = auc_left_rectangle) # set up parallel backend for computing confidence intervals if specified if (ncpus > 1) { @@ -188,7 +190,8 @@ bootstrapPerformanceIntervals <- function(data, metric = c("auprc", "precision", bootstrapDeltaPerformance <- function(data, metric = c("auprc", "precision", "recall"), comparisons = NULL, thresholds = NULL, weighted = FALSE, R = 10000, conf = 0.95, - ci_type = c("perc", "norm", "basic", "bca"), ncpus = 1) { + ci_type = c("perc", "norm", "basic", "bca"), ncpus = 1, + auc_left_rectangle = FALSE) { # parse input arguments metric <- match.arg(metric) @@ -223,7 +226,8 @@ bootstrapDeltaPerformance <- function(data, metric = c("auprc", "precision", "re message("Running bootstraps...") bs_delta <- boot(data, statistic = calc_delta_performance, metric = metric, R = R, parallel = parallel, ncpus = ncpus, thresholds = thresholds, - comparisons = comparisons, weighted = weighted) + comparisons = comparisons, weighted = weighted, + 
auc_left_rectangle = auc_left_rectangle) # set up parallel backend for computing confidence intervals if specified (useful for 'bca') if (ncpus > 1) { @@ -282,7 +286,8 @@ bootstrapDeltaPerformanceDatasets <- function(data1, data2, metric = c("auprc", "precision", "recall"), predictors = NULL, thresholds = NULL, weighted = FALSE, R = 10000, conf = 0.95, ncpus = 1, - ci_type = c("perc", "norm", "basic", "bca")) { + ci_type = c("perc", "norm", "basic", "bca"), + auc_left_rectangle = FALSE) { # parse input arguments metric <- match.arg(metric) @@ -323,7 +328,8 @@ bootstrapDeltaPerformanceDatasets <- function(data1, data2, message("Running bootstraps...") bs_delta <- boot(data, statistic = calc_delta_performance_datasets, metric = metric, R = R, strata = data$dataset, parallel = parallel, ncpus = ncpus, - thresholds = thresholds, weighted = weighted) + thresholds = thresholds, weighted = weighted, + auc_left_rectangle = auc_left_rectangle) # set up parallel backend for computing confidence intervals if specified (useful for 'bca') if (ncpus > 1) { @@ -395,7 +401,8 @@ plotBootstrappedIntervals <- function(results, title = NULL) { ## FUNCTIONS TO COMPUTE PERFORMANCE AND DELTA PERFORMANCE ========================================== # calculate performance AUPRC, or precision or recall at threshold -calculate_performance <- function(data, indices, metric, thresholds, weighted) { +calculate_performance <- function(data, indices, metric, thresholds, weighted, + auc_left_rectangle = FALSE) { # select bootstrap sample data <- data[indices, ] @@ -412,7 +419,8 @@ calculate_performance <- function(data, indices, metric, thresholds, weighted) { # calculate performance for all predictors performance <- mapply(FUN = calculate_performance_one_pred, pred = preds, threshold = thresholds, - MoreArgs = list(data = data, metric = metric, weighted = weighted), + MoreArgs = list(data = data, metric = metric, weighted = weighted, + auc_left_rectangle = auc_left_rectangle), SIMPLIFY = TRUE) 
return(performance) @@ -421,11 +429,13 @@ calculate_performance <- function(data, indices, metric, thresholds, weighted) { # function to calculate delta AUPRC, or precision or recall at threshold between pairwise # predictor combinations -calc_delta_performance <- function(data, indices, metric, thresholds, comparisons, weighted) { +calc_delta_performance <- function(data, indices, metric, thresholds, comparisons, weighted, + auc_left_rectangle = FALSE) { # calculate bootstrapped performance perf <- calculate_performance(data, indices = indices, metric = metric, thresholds = thresholds, - weighted = weighted) + weighted = weighted, + auc_left_rectangle = auc_left_rectangle) # calculate delta performance for all specified comparisons delta_perf <- vapply(comparisons, FUN = function(comp, perf) { @@ -437,7 +447,8 @@ calc_delta_performance <- function(data, indices, metric, thresholds, comparison } # function to calculate delta AUPRC, or precision or recall at threshold between 2 datasets -calc_delta_performance_datasets <- function(data, indices, metric, thresholds, weighted) { +calc_delta_performance_datasets <- function(data, indices, metric, thresholds, weighted, + auc_left_rectangle = FALSE) { # select bootstrap sample data <- data[indices, ] @@ -446,9 +457,11 @@ calc_delta_performance_datasets <- function(data, indices, metric, thresholds, w data1 <- data[data$dataset == "1", ] data2 <- data[data$dataset == "2", ] perf1 <- calculate_performance(data1, indices = seq_len(nrow(data1)), metric = metric, - thresholds = thresholds, weighted = weighted) + thresholds = thresholds, weighted = weighted, + auc_left_rectangle = auc_left_rectangle) perf2 <- calculate_performance(data2, indices = seq_len(nrow(data2)), metric = metric, - thresholds = thresholds, weighted = weighted) + thresholds = thresholds, weighted = weighted, + auc_left_rectangle = auc_left_rectangle) # calculate delta performance for all predictors delta_perf <- perf1 - perf2 @@ -461,7 +474,8 @@ 
calc_delta_performance_datasets <- function(data, indices, metric, thresholds, w ## HELPER FUNCTIONS ================================================================================ # calculate performance auprc, or precision or recall at threshold for one predictor -calculate_performance_one_pred <- function(data, pred, threshold, metric, weighted) { +calculate_performance_one_pred <- function(data, pred, threshold, metric, weighted, + auc_left_rectangle = FALSE) { # return NA if 'Regulated' column does not contain at least one positive and negative if (length(unique(data$Regulated)) != 2) { @@ -492,7 +506,7 @@ calculate_performance_one_pred <- function(data, pred, threshold, metric, weight if (metric %in% c("precision", "recall")) { performance <- calculate_performance_at_threshold(pr, threshold = threshold, metric = metric) } else if (metric == "auprc") { - performance <- calculate_auprc(pr) + performance <- calculate_auprc(pr, auc_left_rectangle = auc_left_rectangle) } else { stop("Invalid 'metric' argument", call. = FALSE) } @@ -502,7 +516,7 @@ calculate_performance_one_pred <- function(data, pred, threshold, metric, weight } # calculate area-under-the-precision-recall-curve (AUPRC) -calculate_auprc <- function(pr) { +calculate_auprc <- function(pr, auc_left_rectangle = FALSE) { # the head() calls here remove the last element of the vector. 
# The point is that performance objects produced by ROCR always include a Recall = 100% point even @@ -511,17 +511,24 @@ calculate_auprc <- function(pr) { pr <- head(pr, -1) # compute auprc - auprc <- compute_auc(x_vals = pr$recall, y_vals = pr$precision) + auprc <- compute_auc(x_vals = pr$recall, y_vals = pr$precision, + auc_left_rectangle = auc_left_rectangle) return(auprc) } # try to compute area under the curve -compute_auc <- function(x_vals, y_vals) { +compute_auc <- function(x_vals, y_vals, + auc_left_rectangle = FALSE) { good.idx <- which(!is.na(x_vals) & !is.na(y_vals)) if (length(good.idx) > 0) { auc <- trapz(x_vals[good.idx], y_vals[good.idx]) + # Add the area of the rectangle formed by the left end + # of the curve and the origin. + if (isTRUE(auc_left_rectangle)) { + auc <- auc + x_vals[good.idx[1]] * y_vals[good.idx[1]] + } } else { auc <- NA_real_ } diff --git a/workflow/scripts/crisprComparisonPlotFunctions.R b/workflow/scripts/crisprComparisonPlotFunctions.R index c822649..ef83c8c 100644 --- a/workflow/scripts/crisprComparisonPlotFunctions.R +++ b/workflow/scripts/crisprComparisonPlotFunctions.R @@ -292,7 +292,8 @@ calcPRCurves <- function(df, pred_config, pos_col) { # create bootstrapped performance summary table for all predictors in a PR table makePRSummaryTableBS <- function(merged, pred_config, pos_col, threshold_col = "alpha", - min_sensitivity = 0.7, R = 1000, conf = 0.95, ncpus = 1) { + min_sensitivity = 0.7, R = 1000, conf = 0.95, ncpus = 1, + auc_left_rectangle = FALSE) { # convert merged to wide format for bootstrapping merged_bs <- convertMergedForBootstrap(merged, pred_config = pred_config, pos_col = pos_col) @@ -304,7 +305,8 @@ makePRSummaryTableBS <- function(merged, pred_config, pos_col, threshold_col = " # bootstrap overall performance (AUPRC) and reformat for performance summary table message("Bootstrapping AUPRC:") perf <- bootstrapPerformanceIntervals(merged_bs, metric = "auprc", R = R, conf = conf, - ci_type = "perc", ncpus = 
ncpus) + ci_type = "perc", ncpus = ncpus, + auc_left_rectangle = auc_left_rectangle) perf <- select(perf, pred_uid = id, AUPRC = full, AUPRC_lowerCi = lower, AUPRC_upperCi = upper) # get performance at minimum sensitivity @@ -396,11 +398,13 @@ makePRSummaryTableBS <- function(merged, pred_config, pos_col, threshold_col = " makePRCurvePlot <- function(pr_df, pred_config, n_pos, pct_pos, min_sensitivity = 0.7, plot_name = "PRC full experimental data", line_width = 1, point_size = 3, text_size = 15, colors = NULL, plot_thresholds = TRUE, - na_color = "gray66", na_size_factor = 0.5) { + na_color = "gray66", na_size_factor = 0.5, + auc_left_rectangle = FALSE) { # create performance summary perf_summary <- makePRSummaryTable(pr_df, pred_config = pred_config, - min_sensitivity = min_sensitivity) + min_sensitivity = min_sensitivity, + auc_left_rectangle = auc_left_rectangle) # get PR values at threshold pr_threshold <- perf_summary %>% @@ -685,7 +689,7 @@ plotPairsFeatures <- function(n_pairs_features, # calculate and plot PR curves for a given subset makePRCurveSubset <- function(merged, subset_col, pred_config, pos_col, min_sensitivity = 0.7, line_width = 1, point_size = 3, text_size = 15, nrow = 1, - colors = NULL) { + colors = NULL, auc_left_rectangle = FALSE) { # split df into subsets based on provided column merged_split <- split(merged, f = merged[[subset_col]]) @@ -723,7 +727,8 @@ makePRCurveSubset <- function(merged, subset_col, pred_config, pos_col, min_sens pr_plots <- mapply(FUN = makePRCurvePlot, pr_df = prc, n_pos = n_pos, pct_pos = pct_pos, plot_name = titles, MoreArgs = list(pred_config = pred_config, min_sensitivity = min_sensitivity, line_width = line_width, point_size = point_size, - text_size = text_size, colors = colors), + text_size = text_size, colors = colors, + auc_left_rectangle = auc_left_rectangle), SIMPLIFY = FALSE) # add empty plots for subsets without both positives and negatives @@ -745,7 +750,8 @@ makePRCurveSubset <- function(merged, 
subset_col, pred_config, pos_col, min_sens # cell type makePRCurveSubsets <- function(merged, subset_cols, pred_config, pos_col, cell_type = "combined", min_sensitivity = 0.7, line_width = 1, point_size = 3, - text_size = 15, nrow = 1, colors = NULL) { + text_size = 15, nrow = 1, colors = NULL, + auc_left_rectangle = FALSE) { # return NULL if no subsets are available if (length(subset_cols) == 0) { @@ -756,7 +762,8 @@ makePRCurveSubsets <- function(merged, subset_cols, pred_config, pos_col, cell_t pr_plots <- lapply(subset_cols, FUN = makePRCurveSubset, merged = merged, pred_config = pred_config, pos_col = pos_col, min_sensitivity = min_sensitivity, line_width = line_width, - point_size = point_size, text_size = text_size, nrow = nrow, colors = colors) + point_size = point_size, text_size = text_size, nrow = nrow, colors = colors, + auc_left_rectangle = auc_left_rectangle) # create one figure with all plots plot_grid(plotlist = pr_plots, nrow = length(subset_cols)) @@ -1013,10 +1020,16 @@ pr2df <- function(pr, calc_f1 = TRUE) { } # try to compute AUC -computeAUC <- function(x_vals, y_vals) { +computeAUC <- function(x_vals, y_vals, + auc_left_rectangle = FALSE) { good.idx <- which(!is.na(x_vals) & !is.na(y_vals)) if (length(good.idx) > 0) { auc <- trapz(x_vals[good.idx], y_vals[good.idx]) + # Add the area of the rectangle formed by the left end + # of the curve and the origin. 
+ if (isTRUE(auc_left_rectangle)) { + auc <- auc + x_vals[good.idx[1]] * y_vals[good.idx[1]] + } } else { auc <- NA_real_ } @@ -1210,7 +1223,8 @@ get_alpha_min <- function(merged, predictor, inverse = FALSE) { ## DEPRECATED ====================================================================================== # create performance summary table for all predictors in a PR table -makePRSummaryTable <- function(pr_df, pred_config, min_sensitivity = 0.7) { +makePRSummaryTable <- function(pr_df, pred_config, min_sensitivity = 0.7, + auc_left_rectangle = FALSE) { # remove any boolean predictors since the following metrics don't make sense for them bool_preds <- pull(filter(pred_config, boolean == TRUE), pred_uid) @@ -1219,7 +1233,9 @@ makePRSummaryTable <- function(pr_df, pred_config, min_sensitivity = 0.7) { # compute performance summary for each predictor perf_summary <- pr_df %>% group_split(pred_uid) %>% - lapply(calcPerfSummaryOnePred, pred_config = pred_config, min_sensitivity = min_sensitivity) %>% + lapply(calcPerfSummaryOnePred, pred_config = pred_config, + min_sensitivity = min_sensitivity, + auc_left_rectangle = auc_left_rectangle) %>% bind_rows() # add predictor information from pred_config @@ -1241,7 +1257,8 @@ makePRSummaryTable <- function(pr_df, pred_config, min_sensitivity = 0.7) { # create performance summary for one predictor -calcPerfSummaryOnePred <- function(pr_df, pred_config, min_sensitivity) { +calcPerfSummaryOnePred <- function(pr_df, pred_config, min_sensitivity, + auc_left_rectangle = FALSE) { # check that pr_df contains data on only one predictor predictor <- unique(pr_df$pred_uid) @@ -1259,7 +1276,8 @@ calcPerfSummaryOnePred <- function(pr_df, pred_config, min_sensitivity) { # (1,0) on the PR curve. This should not be included in the AUC computation. 
auprc <- pr_df %>% head(-1) %>% - summarize(AUPRC = computeAUC(x_vals = recall, y_vals = precision), + summarize(AUPRC = computeAUC(x_vals = recall, y_vals = precision, + auc_left_rectangle = auc_left_rectangle), max_F1 = computeMaxF1(.)) # compute performance at min sensitivity