Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ comparisons:
gene_features: Null
enh_features: Null
enh_assays: Null
auc_left_rectangle: FALSE
3 changes: 2 additions & 1 deletion workflow/rules/crispr_comparison.smk
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,8 @@ rule comparePredictionsToExperiment:
pos_col = "Regulated",
min_sensitivity = 0.7,
dist_bins_kb = lambda wildcards: config["comparisons"][wildcards.comparison]["dist_bins_kb"],
include_col = lambda wildcards: get_optional_parameter(wildcards, "include_col", None)
include_col = lambda wildcards: get_optional_parameter(wildcards, "include_col", None),
auc_left_rectangle = lambda wildcards: get_optional_parameter(wildcards, "auc_left_rectangle", None)
conda: "../envs/r_crispr_comparison.yml"
resources:
mem_mb = 32000,
Expand Down
17 changes: 11 additions & 6 deletions workflow/scripts/comparePredictionsToExperiment.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ write_tsv(pr_table, file = file.path(dirname(snakemake@output[[1]]), "pr_table.t

# create performance summary tables
perf_summary <- applyCellTypes(merged, .fun = makePRSummaryTableBS, pred_config = pred_config,
pos_col = pos_col)
pos_col = pos_col, auc_left_rectangle = snakemake@params$auc_left_rectangle)

# convert performance summaries to one table and add full predictor names to performance summary
perf_summary <- perf_summary %>%
Expand Down Expand Up @@ -152,7 +152,8 @@ pr_plots <- mapply(FUN = makePRCurvePlot, pr_df = pr, n_pos = n_pos, pct_pos = p
MoreArgs = list(pred_config = pred_config,
min_sensitivity = snakemake@params$min_sensitivity,
line_width = 0.8, point_size = 3,
text_size = 13, colors = pred_colors),
text_size = 13, colors = pred_colors,
auc_left_rectangle = snakemake@params$auc_left_rectangle),
SIMPLIFY = FALSE)

# save plots to files
Expand Down Expand Up @@ -375,7 +376,8 @@ if (length(invalid_bins) > 0) {
# compute performance as a function of distance to TSS
dist_perf <- applyCellTypes(merged_bins_filt, .fun = computePerfSubsets, pred_config = pred_config,
subset_col = "distToTSS (bins)", metric = "auprc", pos_col = pos_col,
bs_iter = 1000)
bs_iter = 1000,
auc_left_rectangle = snakemake@params$auc_left_rectangle)

# create table of performance across distance
dist_perf_tbl <- bind_rows(dist_perf, .id = "cell_type")
Expand Down Expand Up @@ -410,7 +412,8 @@ dist_prc_plots <- applyCellTypes(merged_bins_filt, .fun = makePRCurveSubsets,
pos_col = pos_col,
min_sensitivity = snakemake@params$min_sensitivity,
line_width = 0.8, point_size = 3, text_size = 13, nrow = nrow,
colors = pred_colors)
colors = pred_colors,
auc_left_rectangle = snakemake@params$auc_left_rectangle)

# calculate plot dimensions based on number of features
plot_height <- nrow * 4
Expand Down Expand Up @@ -544,7 +547,8 @@ if (length(gene_feat_cols) > 0) {
pred_config = pred_config, pos_col = pos_col,
min_sensitivity = snakemake@params$min_sensitivity,
line_width = 1.2, point_size = 3.5, text_size = 13,
colors = pred_colors)
colors = pred_colors,
auc_left_rectangle = snakemake@params$auc_left_rectangle)

# calculate plot dimensions based on number of features
n_features <- length(gene_feat_cols)
Expand Down Expand Up @@ -574,7 +578,8 @@ if (length(enh_feat_cols) > 0) {
pred_config = pred_config, pos_col = pos_col,
min_sensitivity = snakemake@params$min_sensitivity,
line_width = 1.2, point_size = 3.5, text_size = 13,
colors = pred_colors)
colors = pred_colors,
auc_left_rectangle = snakemake@params$auc_left_rectangle)

# calculate plot dimensions based on number of features
n_features <- length(enh_feat_cols)
Expand Down
57 changes: 39 additions & 18 deletions workflow/scripts/crisprComparisonBootstrapFunctions.R
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,8 @@ getThresholdValues <- function(pred_config, predictors = NULL, threshold_col = "
bootstrapPerformanceIntervals <- function(data, metric = c("auprc", "precision", "recall"),
predictors = NULL, thresholds = NULL, weighted = FALSE,
R = 10000, conf = 0.95, ncpus = 1,
ci_type = c("perc", "norm", "basic", "bca")) {
ci_type = c("perc", "norm", "basic", "bca"),
auc_left_rectangle = FALSE) {

# parse input arguments
metric <- match.arg(metric)
Expand Down Expand Up @@ -134,7 +135,8 @@ bootstrapPerformanceIntervals <- function(data, metric = c("auprc", "precision",
# bootstrap performance
message("Running bootstraps...")
bs_perf <- boot(data, statistic = calculate_performance, metric = metric, R = R,
parallel = parallel, ncpus = ncpus, thresholds = thresholds, weighted = weighted)
parallel = parallel, ncpus = ncpus, thresholds = thresholds, weighted = weighted,
auc_left_rectangle = auc_left_rectangle)

# set up parallel backend for computing confidence intervals if specified
if (ncpus > 1) {
Expand Down Expand Up @@ -188,7 +190,8 @@ bootstrapPerformanceIntervals <- function(data, metric = c("auprc", "precision",
bootstrapDeltaPerformance <- function(data, metric = c("auprc", "precision", "recall"),
comparisons = NULL, thresholds = NULL, weighted = FALSE,
R = 10000, conf = 0.95,
ci_type = c("perc", "norm", "basic", "bca"), ncpus = 1) {
ci_type = c("perc", "norm", "basic", "bca"), ncpus = 1,
auc_left_rectangle = auc_left_rectangle) {

# parse input arguments
metric <- match.arg(metric)
Expand Down Expand Up @@ -223,7 +226,8 @@ bootstrapDeltaPerformance <- function(data, metric = c("auprc", "precision", "re
message("Running bootstraps...")
bs_delta <- boot(data, statistic = calc_delta_performance, metric = metric, R = R,
parallel = parallel, ncpus = ncpus, thresholds = thresholds,
comparisons = comparisons, weighted = weighted)
comparisons = comparisons, weighted = weighted,
auc_left_rectangle = auc_left_rectangle)

# set up parallel backend for computing confidence intervals if specified (useful for 'bca')
if (ncpus > 1) {
Expand Down Expand Up @@ -282,7 +286,8 @@ bootstrapDeltaPerformanceDatasets <- function(data1, data2,
metric = c("auprc", "precision", "recall"),
predictors = NULL, thresholds = NULL,
weighted = FALSE, R = 10000, conf = 0.95, ncpus = 1,
ci_type = c("perc", "norm", "basic", "bca")) {
ci_type = c("perc", "norm", "basic", "bca"),
auc_left_rectangle = FALSE) {

# parse input arguments
metric <- match.arg(metric)
Expand Down Expand Up @@ -323,7 +328,8 @@ bootstrapDeltaPerformanceDatasets <- function(data1, data2,
message("Running bootstraps...")
bs_delta <- boot(data, statistic = calc_delta_performance_datasets, metric = metric, R = R,
strata = data$dataset, parallel = parallel, ncpus = ncpus,
thresholds = thresholds, weighted = weighted)
thresholds = thresholds, weighted = weighted,
auc_left_rectangle = auc_left_rectangle)

# set up parallel backend for computing confidence intervals if specified (useful for 'bca')
if (ncpus > 1) {
Expand Down Expand Up @@ -395,7 +401,8 @@ plotBootstrappedIntervals <- function(results, title = NULL) {
## FUNCTIONS TO COMPUTE PERFORMANCE AND DELTA PERFORMANCE ==========================================

# calculate performance AUPRC, or precision or recall at threshold
calculate_performance <- function(data, indices, metric, thresholds, weighted) {
calculate_performance <- function(data, indices, metric, thresholds, weighted,
auc_left_rectangle = FALSE) {

# select bootstrap sample
data <- data[indices, ]
Expand All @@ -412,7 +419,8 @@ calculate_performance <- function(data, indices, metric, thresholds, weighted) {

# calculate performance for all predictors
performance <- mapply(FUN = calculate_performance_one_pred, pred = preds, threshold = thresholds,
MoreArgs = list(data = data, metric = metric, weighted = weighted),
MoreArgs = list(data = data, metric = metric, weighted = weighted,
auc_left_rectangle = auc_left_rectangle),
SIMPLIFY = TRUE)

return(performance)
Expand All @@ -421,11 +429,13 @@ calculate_performance <- function(data, indices, metric, thresholds, weighted) {

# function to calculate delta AUPRC, or precision or recall at threshold between pairwise
# predictor combinations
calc_delta_performance <- function(data, indices, metric, thresholds, comparisons, weighted) {
calc_delta_performance <- function(data, indices, metric, thresholds, comparisons, weighted,
auc_left_rectangle = FALSE) {

# calculate bootstrapped performance
perf <- calculate_performance(data, indices = indices, metric = metric, thresholds = thresholds,
weighted = weighted)
weighted = weighted,
auc_left_rectangle = auc_left_rectangle)

# calculate delta performance for all specified comparisons
delta_perf <- vapply(comparisons, FUN = function(comp, perf) {
Expand All @@ -437,7 +447,8 @@ calc_delta_performance <- function(data, indices, metric, thresholds, comparison
}

# function to calculate delta AUPRC, or precision or recall at threshold between 2 datasets
calc_delta_performance_datasets <- function(data, indices, metric, thresholds, weighted) {
calc_delta_performance_datasets <- function(data, indices, metric, thresholds, weighted,
auc_left_rectangle = FALSE) {

# select bootstrap sample
data <- data[indices, ]
Expand All @@ -446,9 +457,11 @@ calc_delta_performance_datasets <- function(data, indices, metric, thresholds, w
data1 <- data[data$dataset == "1", ]
data2 <- data[data$dataset == "2", ]
perf1 <- calculate_performance(data1, indices = seq_len(nrow(data1)), metric = metric,
thresholds = thresholds, weighted = weighted)
thresholds = thresholds, weighted = weighted,
auc_left_rectangle = auc_left_rectangle)
perf2 <- calculate_performance(data2, indices = seq_len(nrow(data2)), metric = metric,
thresholds = thresholds, weighted = weighted)
thresholds = thresholds, weighted = weighted,
auc_left_rectangle = auc_left_rectangle)

# calculate delta performance for all predictors
delta_perf <- perf1 - perf2
Expand All @@ -461,7 +474,8 @@ calc_delta_performance_datasets <- function(data, indices, metric, thresholds, w
## HELPER FUNCTIONS ================================================================================

# calculate performance auprc, or precision or recall at threshold for one predictor
calculate_performance_one_pred <- function(data, pred, threshold, metric, weighted) {
calculate_performance_one_pred <- function(data, pred, threshold, metric, weighted,
auc_left_rectangle = FALSE) {

# return NA if 'Regulated' column does not contain at least one positive and negative
if (length(unique(data$Regulated)) != 2) {
Expand Down Expand Up @@ -492,7 +506,7 @@ calculate_performance_one_pred <- function(data, pred, threshold, metric, weight
if (metric %in% c("precision", "recall")) {
performance <- calculate_performance_at_threshold(pr, threshold = threshold, metric = metric)
} else if (metric == "auprc") {
performance <- calculate_auprc(pr)
performance <- calculate_auprc(pr, auc_left_rectangle = auc_left_rectangle)
} else {
stop("Invalid 'metric' argument", call. = FALSE)
}
Expand All @@ -502,7 +516,7 @@ calculate_performance_one_pred <- function(data, pred, threshold, metric, weight
}

# calculate area-under-the-precision-recall-curve (AUPRC)
#
# pr: precision/recall values (with numeric 'recall' and 'precision'
#     components) derived from a ROCR performance object.
# auc_left_rectangle: if TRUE, compute_auc() additionally counts the rectangle
#     between the origin and the left-most point of the curve (forwarded
#     unchanged; FALSE reproduces the previous trapezoid-only behavior).
calculate_auprc <- function(pr, auc_left_rectangle = FALSE) {
  
  # the head() call here removes the last element of the vector.
  # The point is that performance objects produced by ROCR always include a Recall = 100% point even
  # when it is not supported by the data, which would distort the area
  pr <- head(pr, -1)
  
  # compute auprc, forwarding the optional left-rectangle correction
  auprc <- compute_auc(x_vals = pr$recall, y_vals = pr$precision,
                       auc_left_rectangle = auc_left_rectangle)
  
  return(auprc)
  
}

# try to compute area under the curve
#
# x_vals, y_vals: coordinates of the curve (e.g. recall and precision);
#     pairs where either value is NA are dropped before integration.
# auc_left_rectangle: if TRUE, add the rectangle spanned by the origin and the
#     left-most retained point of the curve to the trapezoidal area.
compute_auc <- function(x_vals, y_vals,
                        auc_left_rectangle = FALSE) {
  good.idx <- which(!is.na(x_vals) & !is.na(y_vals))
  if (length(good.idx) > 0) {
    auc <- trapz(x_vals[good.idx], y_vals[good.idx])
    # Add the area of the rectangle formed by the left end
    # of the curve and the origin. isTRUE() is used because the flag arrives
    # as an R logical (and may be NULL when the optional parameter is unset);
    # comparing against the string "True" would never match TRUE and would
    # error on NULL.
    if (isTRUE(auc_left_rectangle)) {
      auc <- auc + x_vals[good.idx[1]] * y_vals[good.idx[1]]
    }
  } else {
    auc <- NA_real_
  }
  return(auc)
}
Expand Down
Loading