Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ comparisons:
gene_features: Null
enh_features: Null
enh_assays: Null
auc_left_rectangle: FALSE
3 changes: 2 additions & 1 deletion workflow/rules/crispr_comparison.smk
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,8 @@ rule comparePredictionsToExperiment:
pos_col = "Regulated",
min_sensitivity = 0.7,
dist_bins_kb = lambda wildcards: config["comparisons"][wildcards.comparison]["dist_bins_kb"],
include_col = lambda wildcards: get_optional_parameter(wildcards, "include_col", None)
include_col = lambda wildcards: get_optional_parameter(wildcards, "include_col", None),
auc_left_rectangle = lambda wildcards: get_optional_parameter(wildcards, "auc_left_rectangle", None)
conda: "../envs/r_crispr_comparison.yml"
resources:
mem_mb = 32000,
Expand Down
17 changes: 11 additions & 6 deletions workflow/scripts/comparePredictionsToExperiment.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ write_tsv(pr_table, file = file.path(dirname(snakemake@output[[1]]), "pr_table.t

# create performance summary tables
perf_summary <- applyCellTypes(merged, .fun = makePRSummaryTableBS, pred_config = pred_config,
pos_col = pos_col)
pos_col = pos_col, auc_left_rectangle = snakemake@params$auc_left_rectangle)

# convert performance summaries to one table and add full predictor names to performance summary
perf_summary <- perf_summary %>%
Expand Down Expand Up @@ -152,7 +152,8 @@ pr_plots <- mapply(FUN = makePRCurvePlot, pr_df = pr, n_pos = n_pos, pct_pos = p
MoreArgs = list(pred_config = pred_config,
min_sensitivity = snakemake@params$min_sensitivity,
line_width = 0.8, point_size = 3,
text_size = 13, colors = pred_colors),
text_size = 13, colors = pred_colors,
auc_left_rectangle = snakemake@params$auc_left_rectangle),
SIMPLIFY = FALSE)

# save plots to files
Expand Down Expand Up @@ -375,7 +376,8 @@ if (length(invalid_bins) > 0) {
# compute performance as a function of distance to TSS
dist_perf <- applyCellTypes(merged_bins_filt, .fun = computePerfSubsets, pred_config = pred_config,
subset_col = "distToTSS (bins)", metric = "auprc", pos_col = pos_col,
bs_iter = 1000)
bs_iter = 1000,
auc_left_rectangle = snakemake@params$auc_left_rectangle)

# create table of performance across distance
dist_perf_tbl <- bind_rows(dist_perf, .id = "cell_type")
Expand Down Expand Up @@ -410,7 +412,8 @@ dist_prc_plots <- applyCellTypes(merged_bins_filt, .fun = makePRCurveSubsets,
pos_col = pos_col,
min_sensitivity = snakemake@params$min_sensitivity,
line_width = 0.8, point_size = 3, text_size = 13, nrow = nrow,
colors = pred_colors)
colors = pred_colors,
auc_left_rectangle = snakemake@params$auc_left_rectangle)

# calculate plot dimensions based on number of features
plot_height <- nrow * 4
Expand Down Expand Up @@ -544,7 +547,8 @@ if (length(gene_feat_cols) > 0) {
pred_config = pred_config, pos_col = pos_col,
min_sensitivity = snakemake@params$min_sensitivity,
line_width = 1.2, point_size = 3.5, text_size = 13,
colors = pred_colors)
colors = pred_colors,
auc_left_rectangle = snakemake@params$auc_left_rectangle)

# calculate plot dimensions based on number of features
n_features <- length(gene_feat_cols)
Expand Down Expand Up @@ -574,7 +578,8 @@ if (length(enh_feat_cols) > 0) {
pred_config = pred_config, pos_col = pos_col,
min_sensitivity = snakemake@params$min_sensitivity,
line_width = 1.2, point_size = 3.5, text_size = 13,
colors = pred_colors)
colors = pred_colors,
auc_left_rectangle = snakemake@params$auc_left_rectangle)

# calculate plot dimensions based on number of features
n_features <- length(enh_feat_cols)
Expand Down
57 changes: 39 additions & 18 deletions workflow/scripts/crisprComparisonBootstrapFunctions.R
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,8 @@ getThresholdValues <- function(pred_config, predictors = NULL, threshold_col = "
bootstrapPerformanceIntervals <- function(data, metric = c("auprc", "precision", "recall"),
predictors = NULL, thresholds = NULL, weighted = FALSE,
R = 10000, conf = 0.95, ncpus = 1,
ci_type = c("perc", "norm", "basic", "bca")) {
ci_type = c("perc", "norm", "basic", "bca"),
auc_left_rectangle = FALSE) {

# parse input arguments
metric <- match.arg(metric)
Expand Down Expand Up @@ -134,7 +135,8 @@ bootstrapPerformanceIntervals <- function(data, metric = c("auprc", "precision",
# bootstrap performance
message("Running bootstraps...")
bs_perf <- boot(data, statistic = calculate_performance, metric = metric, R = R,
parallel = parallel, ncpus = ncpus, thresholds = thresholds, weighted = weighted)
parallel = parallel, ncpus = ncpus, thresholds = thresholds, weighted = weighted,
auc_left_rectangle = auc_left_rectangle)

# set up parallel backend for computing confidence intervals if specified
if (ncpus > 1) {
Expand Down Expand Up @@ -188,7 +190,8 @@ bootstrapPerformanceIntervals <- function(data, metric = c("auprc", "precision",
bootstrapDeltaPerformance <- function(data, metric = c("auprc", "precision", "recall"),
comparisons = NULL, thresholds = NULL, weighted = FALSE,
R = 10000, conf = 0.95,
ci_type = c("perc", "norm", "basic", "bca"), ncpus = 1) {
ci_type = c("perc", "norm", "basic", "bca"), ncpus = 1,
auc_left_rectangle = auc_left_rectangle) {

# parse input arguments
metric <- match.arg(metric)
Expand Down Expand Up @@ -223,7 +226,8 @@ bootstrapDeltaPerformance <- function(data, metric = c("auprc", "precision", "re
message("Running bootstraps...")
bs_delta <- boot(data, statistic = calc_delta_performance, metric = metric, R = R,
parallel = parallel, ncpus = ncpus, thresholds = thresholds,
comparisons = comparisons, weighted = weighted)
comparisons = comparisons, weighted = weighted,
auc_left_rectangle = auc_left_rectangle)

# set up parallel backend for computing confidence intervals if specified (useful for 'bca')
if (ncpus > 1) {
Expand Down Expand Up @@ -282,7 +286,8 @@ bootstrapDeltaPerformanceDatasets <- function(data1, data2,
metric = c("auprc", "precision", "recall"),
predictors = NULL, thresholds = NULL,
weighted = FALSE, R = 10000, conf = 0.95, ncpus = 1,
ci_type = c("perc", "norm", "basic", "bca")) {
ci_type = c("perc", "norm", "basic", "bca"),
auc_left_rectangle = FALSE) {

# parse input arguments
metric <- match.arg(metric)
Expand Down Expand Up @@ -323,7 +328,8 @@ bootstrapDeltaPerformanceDatasets <- function(data1, data2,
message("Running bootstraps...")
bs_delta <- boot(data, statistic = calc_delta_performance_datasets, metric = metric, R = R,
strata = data$dataset, parallel = parallel, ncpus = ncpus,
thresholds = thresholds, weighted = weighted)
thresholds = thresholds, weighted = weighted,
auc_left_rectangle = auc_left_rectangle)

# set up parallel backend for computing confidence intervals if specified (useful for 'bca')
if (ncpus > 1) {
Expand Down Expand Up @@ -395,7 +401,8 @@ plotBootstrappedIntervals <- function(results, title = NULL) {
## FUNCTIONS TO COMPUTE PERFORMANCE AND DELTA PERFORMANCE ==========================================

# calculate performance AUPRC, or precision or recall at threshold
calculate_performance <- function(data, indices, metric, thresholds, weighted) {
calculate_performance <- function(data, indices, metric, thresholds, weighted,
auc_left_rectangle = FALSE) {

# select bootstrap sample
data <- data[indices, ]
Expand All @@ -412,7 +419,8 @@ calculate_performance <- function(data, indices, metric, thresholds, weighted) {

# calculate performance for all predictors
performance <- mapply(FUN = calculate_performance_one_pred, pred = preds, threshold = thresholds,
MoreArgs = list(data = data, metric = metric, weighted = weighted),
MoreArgs = list(data = data, metric = metric, weighted = weighted,
auc_left_rectangle = auc_left_rectangle),
SIMPLIFY = TRUE)

return(performance)
Expand All @@ -421,11 +429,13 @@ calculate_performance <- function(data, indices, metric, thresholds, weighted) {

# function to calculate delta AUPRC, or precision or recall at threshold between pairwise
# predictor combinations
calc_delta_performance <- function(data, indices, metric, thresholds, comparisons, weighted) {
calc_delta_performance <- function(data, indices, metric, thresholds, comparisons, weighted,
auc_left_rectangle = FALSE) {

# calculate bootstrapped performance
perf <- calculate_performance(data, indices = indices, metric = metric, thresholds = thresholds,
weighted = weighted)
weighted = weighted,
auc_left_rectangle = auc_left_rectangle)

# calculate delta performance for all specified comparisons
delta_perf <- vapply(comparisons, FUN = function(comp, perf) {
Expand All @@ -437,7 +447,8 @@ calc_delta_performance <- function(data, indices, metric, thresholds, comparison
}

# function to calculate delta AUPRC, or precision or recall at threshold between 2 datasets
calc_delta_performance_datasets <- function(data, indices, metric, thresholds, weighted) {
calc_delta_performance_datasets <- function(data, indices, metric, thresholds, weighted,
auc_left_rectangle = FALSE) {

# select bootstrap sample
data <- data[indices, ]
Expand All @@ -446,9 +457,11 @@ calc_delta_performance_datasets <- function(data, indices, metric, thresholds, w
data1 <- data[data$dataset == "1", ]
data2 <- data[data$dataset == "2", ]
perf1 <- calculate_performance(data1, indices = seq_len(nrow(data1)), metric = metric,
thresholds = thresholds, weighted = weighted)
thresholds = thresholds, weighted = weighted,
auc_left_rectangle = auc_left_rectangle)
perf2 <- calculate_performance(data2, indices = seq_len(nrow(data2)), metric = metric,
thresholds = thresholds, weighted = weighted)
thresholds = thresholds, weighted = weighted,
auc_left_rectangle = auc_left_rectangle)

# calculate delta performance for all predictors
delta_perf <- perf1 - perf2
Expand All @@ -461,7 +474,8 @@ calc_delta_performance_datasets <- function(data, indices, metric, thresholds, w
## HELPER FUNCTIONS ================================================================================

# calculate performance auprc, or precision or recall at threshold for one predictor
calculate_performance_one_pred <- function(data, pred, threshold, metric, weighted) {
calculate_performance_one_pred <- function(data, pred, threshold, metric, weighted,
auc_left_rectangle = FALSE) {

# return NA if 'Regulated' column does not contain at least one positive and negative
if (length(unique(data$Regulated)) != 2) {
Expand Down Expand Up @@ -492,7 +506,7 @@ calculate_performance_one_pred <- function(data, pred, threshold, metric, weight
if (metric %in% c("precision", "recall")) {
performance <- calculate_performance_at_threshold(pr, threshold = threshold, metric = metric)
} else if (metric == "auprc") {
performance <- calculate_auprc(pr)
performance <- calculate_auprc(pr, auc_left_rectangle = auc_left_rectangle)
} else {
stop("Invalid 'metric' argument", call. = FALSE)
}
Expand All @@ -502,7 +516,7 @@ calculate_performance_one_pred <- function(data, pred, threshold, metric, weight
}

# calculate area-under-the-precision-recall-curve (AUPRC)
#
# pr: precision/recall values (with numeric 'recall' and 'precision'
#     components) derived from a ROCR performance object.
# auc_left_rectangle: if TRUE, compute_auc() additionally counts the rectangle
#     between the origin and the left-most point of the curve (forwarded
#     unchanged; FALSE reproduces the previous trapezoid-only behavior).
calculate_auprc <- function(pr, auc_left_rectangle = FALSE) {
  
  # the head() call here removes the last element of the vector.
  # The point is that performance objects produced by ROCR always include a Recall = 100% point even
  # when it is not supported by the data, which would distort the area
  pr <- head(pr, -1)
  
  # compute auprc, forwarding the optional left-rectangle correction
  auprc <- compute_auc(x_vals = pr$recall, y_vals = pr$precision,
                       auc_left_rectangle = auc_left_rectangle)
  
  return(auprc)
  
}

# try to compute area under the curve
#
# x_vals, y_vals: coordinates of the curve (e.g. recall and precision);
#     pairs where either value is NA are dropped before integration.
# auc_left_rectangle: if TRUE, add the rectangle spanned by the origin and the
#     left-most retained point of the curve to the trapezoidal area.
compute_auc <- function(x_vals, y_vals,
                        auc_left_rectangle = FALSE) {
  good.idx <- which(!is.na(x_vals) & !is.na(y_vals))
  if (length(good.idx) > 0) {
    auc <- trapz(x_vals[good.idx], y_vals[good.idx])
    # Add the area of the rectangle formed by the left end
    # of the curve and the origin. isTRUE() is used because the flag arrives
    # as an R logical (and may be NULL when the optional parameter is unset);
    # comparing against the string "True" would never match TRUE and would
    # error on NULL.
    if (isTRUE(auc_left_rectangle)) {
      auc <- auc + x_vals[good.idx[1]] * y_vals[good.idx[1]]
    }
  } else {
    auc <- NA_real_
  }
  return(auc)
}
Expand Down
Loading