diff --git a/examples_not_exhibited/plot_fig_1_nguyen_et_al.py b/examples_not_exhibited/plot_fig_1_nguyen_et_al.py index 14f5ee9..26aeb58 100644 --- a/examples_not_exhibited/plot_fig_1_nguyen_et_al.py +++ b/examples_not_exhibited/plot_fig_1_nguyen_et_al.py @@ -11,6 +11,8 @@ To reduce the script runtime it is desirable to increase n_jobs parameter. """ import matplotlib.pyplot as plt +import matplotlib +matplotlib.use('Agg') import numpy as np from hidimstat.knockoffs import knockoff_aggregation, model_x_knockoff from hidimstat.knockoffs.data_simulation import simu_data @@ -20,10 +22,11 @@ color_blue = '#1f77b4' color_teal = '#1fbecf' +color_green = '#31a354' def one_inference(n, p, snr, rho, sparsity, n_bootstraps=25, gamma=0.3, - n_jobs=1, offset=1, fdr=0.1, seed=None): + n_jobs=50, offset=1, fdr=0.1, seed=None): # Simulate data following autoregressive structure, seed is fixed to ensure # doing inference on only 1 simulation @@ -51,36 +54,61 @@ def one_inference(n, p, snr, rho, sparsity, n_bootstraps=25, gamma=0.3, ako_fdp, ako_power = cal_fdp_power(ako_selected, non_zero_index) - return ko_fdps, ako_fdp, ko_powers, ako_power + # Aggregation via e-values + + eval_selected = knockoff_aggregation(X, y, fdr=fdr, + method='e-values', + offset=offset, + n_jobs=n_jobs, gamma=gamma, + n_bootstraps=n_bootstraps, + random_state=seed*2) + + eval_fdp, eval_power = cal_fdp_power(eval_selected, non_zero_index) + + return ko_fdps, ako_fdp, eval_fdp, ko_powers, ako_power, eval_power def plot(results, n_simu, fdr): ko_fdps = np.array([results[i][0] for i in range(n_simu)]).ravel() ako_fdps = np.array([results[i][1] for i in range(n_simu)]).ravel() - ko_powers = np.array([results[i][2] for i in range(n_simu)]).ravel() - ako_powers = np.array([results[i][3] for i in range(n_simu)]).ravel() - + eval_fdps = np.array([results[i][2] for i in range(n_simu)]).ravel() + ko_powers = np.array([results[i][3] for i in range(n_simu)]).ravel() + ako_powers = np.array([results[i][4] for i in 
range(n_simu)]).ravel() + eval_powers = np.array([results[i][5] for i in range(n_simu)]).ravel() # Plotting n_bins = 30 - fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(7, 4)) + fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2, figsize=(7, 4)) ax1.tick_params(labelsize=14) ax1.hist(ko_fdps, edgecolor='k', range=[0.0, 1.0], bins=n_bins, color=color_blue) ax1.axvline(x=fdr, linestyle='--', color='r', linewidth=1.0) + ax1.title.set_text('FDP (KO)') ax2.tick_params(labelsize=14) ax2.hist(ko_powers, edgecolor='k', range=[0.0, 1.0], bins=n_bins, color=color_blue) + ax2.title.set_text('Power (KO)') ax3.tick_params(labelsize=14) ax3.hist(ako_fdps, edgecolor='k', range=[0.0, 1.0], bins=n_bins, color=color_teal) + ax3.title.set_text('FDP (AKO)') ax3.axvline(x=fdr, linestyle='--', color='r', linewidth=1.0) ax4.tick_params(labelsize=14) ax4.hist(ako_powers, edgecolor='k', range=[0.0, 1.0], bins=n_bins, color=color_teal) + ax4.title.set_text('Power (AKO)') + ax5.tick_params(labelsize=14) + ax5.hist(eval_fdps, edgecolor='k', + range=[0.0, 1.0], bins=n_bins, color=color_green) + ax5.axvline(x=fdr, linestyle='--', color='r', linewidth=1.0) + ax5.title.set_text('FDP (e-values)') + ax6.tick_params(labelsize=14) + ax6.hist(eval_powers, edgecolor='k', + range=[0.0, 1.0], bins=n_bins, color=color_green) + ax6.title.set_text('Power (e-values)') plt.tight_layout() - figname = 'figures/histogram_ko_vs_ako.png' + figname = 'histogram_ko_vs_ako.png' plt.savefig(figname) print(f'Save figure to {figname}') @@ -94,8 +122,8 @@ def main(): offset = 1 fdr = 0.05 gamma = 0.3 - n_bootstraps = 10 - n_simu = 10 + n_bootstraps = 25 + n_simu = 100 offset = 1 results = Parallel(n_jobs=1)( @@ -109,6 +137,6 @@ def main(): plot(results, n_simu, fdr) print('Done!') - +main() # if __name__ == '__main__': # main() diff --git a/hidimstat/knockoffs/knockoff_aggregation.py b/hidimstat/knockoffs/knockoff_aggregation.py index 14f9047..f8de8c5 100644 --- 
a/hidimstat/knockoffs/knockoff_aggregation.py +++ b/hidimstat/knockoffs/knockoff_aggregation.py @@ -8,13 +8,13 @@ from .gaussian_knockoff import (_estimate_distribution, gaussian_knockoff_generation) -from .stat_coef_diff import stat_coef_diff +from .stat_coef_diff import stat_coef_diff, _coef_diff_threshold from .utils import fdr_threshold, quantile_aggregation def knockoff_aggregation(X, y, centered=True, shrink=False, construct_method='equi', fdr=0.1, fdr_control='bhq', - reshaping_function=None, offset=1, + method='quantile', reshaping_function=None, offset=1, statistic='lasso_cv', cov_estimator='ledoit_wolf', joblib_verbose=0, n_bootstraps=25, n_jobs=1, adaptive_aggregation=False, gamma=0.5, gamma_min=0.05, @@ -64,21 +64,35 @@ def knockoff_aggregation(X, y, centered=True, shrink=False, ko_stats = parallel(delayed(stat_coef_diff_cached)( X, X_tildes[i], y, method=statistic) for i in range(n_bootstraps)) - pvals = np.array([_empirical_pval(ko_stats[i], offset) - for i in range(n_bootstraps)]) + if method == 'e-values': + evals = np.array([_empirical_eval(ko_stats[i], fdr/2, offset) + for i in range(n_bootstraps)]) - aggregated_pval = quantile_aggregation( - pvals, gamma=gamma, gamma_min=gamma_min, - adaptive=adaptive_aggregation) + aggregated_eval = np.mean(evals, axis=0) + threshold = fdr_threshold(aggregated_eval, fdr=fdr, method='ebh') + selected = np.where(aggregated_eval >= threshold)[0] - threshold = fdr_threshold(aggregated_pval, fdr=fdr, method=fdr_control, - reshaping_function=reshaping_function) - selected = np.where(aggregated_pval <= threshold)[0] + if verbose: + return selected, aggregated_eval, evals + + return selected - if verbose: - return selected, aggregated_pval, pvals + if method == 'quantile': + pvals = np.array([_empirical_pval(ko_stats[i], offset) + for i in range(n_bootstraps)]) - return selected + aggregated_pval = quantile_aggregation( + pvals, gamma=gamma, gamma_min=gamma_min, + adaptive=adaptive_aggregation) + + threshold = 
fdr_threshold(aggregated_pval, fdr=fdr, method=fdr_control, + reshaping_function=reshaping_function) + selected = np.where(aggregated_pval <= threshold)[0] + + if verbose: + return selected, aggregated_pval, pvals + + return selected def _empirical_pval(test_score, offset=1): @@ -100,3 +114,25 @@ def _empirical_pval(test_score, offset=1): ) return np.array(pvals) + + +def _empirical_eval(test_score, fdr=0.1, offset=1): + + evals = [] + n_features = test_score.size + + if offset not in (0, 1): + raise ValueError("'offset' must be either 0 or 1") + + ko_thr = _coef_diff_threshold(test_score, fdr=fdr, offset=offset) + + for i in range(n_features): + if test_score[i] < ko_thr: + evals.append(0) + else: + evals.append( + n_features / + (offset + np.sum(test_score <= - ko_thr)) + ) + + return np.array(evals) diff --git a/hidimstat/knockoffs/stat_coef_diff.py b/hidimstat/knockoffs/stat_coef_diff.py index 095d60f..f9a2e4e 100644 --- a/hidimstat/knockoffs/stat_coef_diff.py +++ b/hidimstat/knockoffs/stat_coef_diff.py @@ -67,13 +67,13 @@ def stat_coef_diff(X, X_tilde, y, method='lasso_cv', n_splits=5, n_jobs=1, estimator = { 'lasso_cv': LassoCV(alphas=lambdas, n_jobs=n_jobs, - verbose=joblib_verbose, max_iter=1e4, cv=cv), + verbose=joblib_verbose, max_iter=int(1e4), cv=cv), 'logistic_l1': LogisticRegressionCV( - penalty='l1', max_iter=1e4, + penalty='l1', max_iter=int(1e4), solver=solver, cv=cv, n_jobs=n_jobs, tol=1e-8), 'logistic_l2': LogisticRegressionCV( - penalty='l2', max_iter=1e4, n_jobs=n_jobs, + penalty='l2', max_iter=int(1e4), n_jobs=n_jobs, verbose=joblib_verbose, cv=cv, tol=1e-8), } diff --git a/hidimstat/knockoffs/tests/test_knockoff_aggregation.py b/hidimstat/knockoffs/tests/test_knockoff_aggregation.py index 9471478..383e906 100644 --- a/hidimstat/knockoffs/tests/test_knockoff_aggregation.py +++ b/hidimstat/knockoffs/tests/test_knockoff_aggregation.py @@ -34,3 +34,15 @@ def test_knockoff_aggregation(): assert fdp < 0.5 assert power > 0.1 + + # Using e-values 
aggregation + + selected, aggregated_pval, pvals = knockoff_aggregation( + X, y, fdr=fdr, n_bootstraps=n_bootstraps, method='e-values', + verbose=True, random_state=0) + + fdp, power = cal_fdp_power(selected, non_zero_index) + + assert pvals.shape == (n_bootstraps, p) + assert fdp < 0.5 + assert power > 0.1 diff --git a/hidimstat/knockoffs/utils.py b/hidimstat/knockoffs/utils.py index 044f4c3..31fc566 100644 --- a/hidimstat/knockoffs/utils.py +++ b/hidimstat/knockoffs/utils.py @@ -16,6 +16,8 @@ def fdr_threshold(pvals, fdr=0.1, method='bhq', reshaping_function=None): elif method == 'bhy': return _bhy_threshold( pvals, fdr=fdr, reshaping_function=reshaping_function) + elif method == 'ebh': + return _ebh_threshold(pvals, fdr=fdr) else: raise ValueError( '{} is not support FDR control method'.format(method)) @@ -69,6 +71,21 @@ def _bhq_threshold(pvals, fdr=0.1): else: return -1.0 +def _ebh_threshold(evals, fdr=0.1): + """e-BH procedure for FDR control (see Wang and Ramdas 2021) + """ + n_features = len(evals) + evals_sorted = -np.sort(-evals) # sort in descending order + selected_index = 2 * n_features + for i in range(n_features - 1, -1, -1): + if evals_sorted[i] >= n_features / (fdr * (i + 1)): + selected_index = i + break + if selected_index <= n_features: + return evals_sorted[selected_index] + else: + return np.inf + def _bhy_threshold(pvals, reshaping_function=None, fdr=0.1): """Benjamini-Hochberg-Yekutieli procedure for controlling FDR, with input