-
Notifications
You must be signed in to change notification settings - Fork 7
WIP add e-values support #33
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,6 +11,8 @@ | |
| To reduce the script runtime it is desirable to increase n_jobs parameter. | ||
| """ | ||
| import matplotlib.pyplot as plt | ||
| import matplotlib | ||
| matplotlib.use('Agg') | ||
| import numpy as np | ||
| from hidimstat.knockoffs import knockoff_aggregation, model_x_knockoff | ||
| from hidimstat.knockoffs.data_simulation import simu_data | ||
|
|
@@ -20,10 +22,11 @@ | |
|
|
||
| color_blue = '#1f77b4' | ||
| color_teal = '#1fbecf' | ||
| color_green = '#31a354' | ||
|
|
||
|
|
||
| def one_inference(n, p, snr, rho, sparsity, n_bootstraps=25, gamma=0.3, | ||
| n_jobs=1, offset=1, fdr=0.1, seed=None): | ||
| n_jobs=50, offset=1, fdr=0.1, seed=None): | ||
|
|
||
| # Simulate data following autoregressive structure, seed is fixed to ensure | ||
| # doing inference on only 1 simulation | ||
|
|
@@ -51,36 +54,61 @@ def one_inference(n, p, snr, rho, sparsity, n_bootstraps=25, gamma=0.3, | |
|
|
||
| ako_fdp, ako_power = cal_fdp_power(ako_selected, non_zero_index) | ||
|
|
||
| return ko_fdps, ako_fdp, ko_powers, ako_power | ||
| # Aggregation via e-values | ||
|
|
||
| eval_selected = knockoff_aggregation(X, y, fdr=fdr, | ||
| method='e-values', | ||
| offset=offset, | ||
| n_jobs=n_jobs, gamma=gamma, | ||
| n_bootstraps=n_bootstraps, | ||
| random_state=seed*2) | ||
|
|
||
| eval_fdp, eval_power = cal_fdp_power(eval_selected, non_zero_index) | ||
|
|
||
| return ko_fdps, ako_fdp, eval_fdp, ko_powers, ako_power, eval_power | ||
|
|
||
|
|
||
| def plot(results, n_simu, fdr): | ||
|
|
||
| ko_fdps = np.array([results[i][0] for i in range(n_simu)]).ravel() | ||
| ako_fdps = np.array([results[i][1] for i in range(n_simu)]).ravel() | ||
| ko_powers = np.array([results[i][2] for i in range(n_simu)]).ravel() | ||
| ako_powers = np.array([results[i][3] for i in range(n_simu)]).ravel() | ||
|
|
||
| eval_fdps = np.array([results[i][2] for i in range(n_simu)]).ravel() | ||
| ko_powers = np.array([results[i][3] for i in range(n_simu)]).ravel() | ||
| ako_powers = np.array([results[i][4] for i in range(n_simu)]).ravel() | ||
| eval_powers = np.array([results[i][5] for i in range(n_simu)]).ravel() | ||
| # Plotting | ||
| n_bins = 30 | ||
| fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(7, 4)) | ||
| fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2, figsize=(7, 4)) | ||
| ax1.tick_params(labelsize=14) | ||
| ax1.hist(ko_fdps, edgecolor='k', | ||
| range=[0.0, 1.0], bins=n_bins, color=color_blue) | ||
| ax1.axvline(x=fdr, linestyle='--', color='r', linewidth=1.0) | ||
| ax1.title.set_text('FDP (KO)') | ||
| ax2.tick_params(labelsize=14) | ||
| ax2.hist(ko_powers, edgecolor='k', | ||
| range=[0.0, 1.0], bins=n_bins, color=color_blue) | ||
| ax2.title.set_text('Power (KO)') | ||
| ax3.tick_params(labelsize=14) | ||
| ax3.hist(ako_fdps, edgecolor='k', | ||
| range=[0.0, 1.0], bins=n_bins, color=color_teal) | ||
| ax3.title.set_text('FDP (AKO)') | ||
| ax3.axvline(x=fdr, linestyle='--', color='r', linewidth=1.0) | ||
| ax4.tick_params(labelsize=14) | ||
| ax4.hist(ako_powers, edgecolor='k', | ||
| range=[0.0, 1.0], bins=n_bins, color=color_teal) | ||
| ax4.title.set_text('Power (AKO)') | ||
| ax5.tick_params(labelsize=14) | ||
| ax5.hist(eval_fdps, edgecolor='k', | ||
| range=[0.0, 1.0], bins=n_bins, color=color_green) | ||
| ax5.axvline(x=fdr, linestyle='--', color='r', linewidth=1.0) | ||
| ax5.title.set_text('FDP (e-values)') | ||
| ax6.tick_params(labelsize=14) | ||
| ax6.hist(eval_powers, edgecolor='k', | ||
| range=[0.0, 1.0], bins=n_bins, color=color_green) | ||
| ax6.title.set_text('Power (e-values)') | ||
| plt.tight_layout() | ||
|
|
||
| figname = 'figures/histogram_ko_vs_ako.png' | ||
| figname = 'histogram_ko_vs_ako.png' | ||
| plt.savefig(figname) | ||
| print(f'Save figure to {figname}') | ||
|
|
||
|
|
@@ -94,8 +122,8 @@ def main(): | |
| offset = 1 | ||
| fdr = 0.05 | ||
| gamma = 0.3 | ||
| n_bootstraps = 10 | ||
| n_simu = 10 | ||
| n_bootstraps = 25 | ||
| n_simu = 100 | ||
| offset = 1 | ||
|
|
||
| results = Parallel(n_jobs=1)( | ||
|
|
@@ -109,6 +137,6 @@ def main(): | |
| plot(results, n_simu, fdr) | ||
| print('Done!') | ||
|
|
||
|
|
||
| main() | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We avoid this structure in examples to improve readability.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, the all the examples need rework IMO. |
||
| # if __name__ == '__main__': | ||
| # main() | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,13 +8,13 @@ | |
|
|
||
| from .gaussian_knockoff import (_estimate_distribution, | ||
| gaussian_knockoff_generation) | ||
| from .stat_coef_diff import stat_coef_diff | ||
| from .stat_coef_diff import stat_coef_diff, _coef_diff_threshold | ||
| from .utils import fdr_threshold, quantile_aggregation | ||
|
|
||
|
|
||
| def knockoff_aggregation(X, y, centered=True, shrink=False, | ||
| construct_method='equi', fdr=0.1, fdr_control='bhq', | ||
| reshaping_function=None, offset=1, | ||
| method='quantile', reshaping_function=None, offset=1, | ||
| statistic='lasso_cv', cov_estimator='ledoit_wolf', | ||
| joblib_verbose=0, n_bootstraps=25, n_jobs=1, | ||
| adaptive_aggregation=False, gamma=0.5, gamma_min=0.05, | ||
|
|
@@ -64,21 +64,35 @@ def knockoff_aggregation(X, y, centered=True, shrink=False, | |
| ko_stats = parallel(delayed(stat_coef_diff_cached)( | ||
| X, X_tildes[i], y, method=statistic) for i in range(n_bootstraps)) | ||
|
|
||
| pvals = np.array([_empirical_pval(ko_stats[i], offset) | ||
| for i in range(n_bootstraps)]) | ||
| if method == 'e-values': | ||
| evals = np.array([_empirical_eval(ko_stats[i], fdr/2, offset) | ||
| for i in range(n_bootstraps)]) | ||
|
|
||
| aggregated_pval = quantile_aggregation( | ||
| pvals, gamma=gamma, gamma_min=gamma_min, | ||
| adaptive=adaptive_aggregation) | ||
| aggregated_eval = np.mean(evals, axis=0) | ||
| threshold = fdr_threshold(aggregated_eval, fdr=fdr, method='ebh') | ||
| selected = np.where(aggregated_eval >= threshold)[0] | ||
|
|
||
| threshold = fdr_threshold(aggregated_pval, fdr=fdr, method=fdr_control, | ||
| reshaping_function=reshaping_function) | ||
| selected = np.where(aggregated_pval <= threshold)[0] | ||
| if verbose: | ||
| return selected, aggregated_eval, evals | ||
|
|
||
| return selected | ||
|
|
||
| if verbose: | ||
| return selected, aggregated_pval, pvals | ||
| if method == 'quantile': | ||
| pvals = np.array([_empirical_pval(ko_stats[i], offset) | ||
| for i in range(n_bootstraps)]) | ||
|
|
||
| return selected | ||
| aggregated_pval = quantile_aggregation( | ||
| pvals, gamma=gamma, gamma_min=gamma_min, | ||
| adaptive=adaptive_aggregation) | ||
|
|
||
| threshold = fdr_threshold(aggregated_pval, fdr=fdr, method=fdr_control, | ||
| reshaping_function=reshaping_function) | ||
| selected = np.where(aggregated_pval <= threshold)[0] | ||
|
|
||
| if verbose: | ||
| return selected, aggregated_pval, pvals | ||
|
|
||
| return selected | ||
|
|
||
|
|
||
| def _empirical_pval(test_score, offset=1): | ||
|
|
@@ -100,3 +114,25 @@ def _empirical_pval(test_score, offset=1): | |
| ) | ||
|
|
||
| return np.array(pvals) | ||
|
|
||
|
|
||
| def _empirical_eval(test_score, fdr=0.1, offset=1): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do you expose the offset parameter ? I think it should be fixed.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is shown in the rest of the code (i.e. it is a parameter of knockoff_aggregation etc), should we remove this altogether?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would, as I don't see any use case for changing it. @tbng any opinion on this ?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, this is done long before, anyway I think it's ok we remove the offset and simple use |
||
|
|
||
| evals = [] | ||
| n_features = test_score.size | ||
|
|
||
| if offset not in (0, 1): | ||
| raise ValueError("'offset' must be either 0 or 1") | ||
|
|
||
| ko_thr = _coef_diff_threshold(test_score, fdr=fdr, offset=offset) | ||
|
|
||
| for i in range(n_features): | ||
| if test_score[i] < ko_thr: | ||
| evals.append(0) | ||
| else: | ||
| evals.append( | ||
| n_features / | ||
| (offset + np.sum(test_score <= - ko_thr)) | ||
| ) | ||
|
|
||
| return np.array(evals) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,6 +16,8 @@ def fdr_threshold(pvals, fdr=0.1, method='bhq', reshaping_function=None): | |
| elif method == 'bhy': | ||
| return _bhy_threshold( | ||
| pvals, fdr=fdr, reshaping_function=reshaping_function) | ||
| elif method == 'ebh': | ||
| return _ebh_threshold(pvals, fdr=fdr) | ||
| else: | ||
| raise ValueError( | ||
| '{} is not support FDR control method'.format(method)) | ||
|
|
@@ -69,6 +71,21 @@ def _bhq_threshold(pvals, fdr=0.1): | |
| else: | ||
| return -1.0 | ||
|
|
||
| def _ebh_threshold(evals, fdr=0.1): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this function should have a test.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should i add a test_utils file to the test section? the utils file is not tested as of now
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, please. |
||
| """e-BH procedure for FDR control (see Wang and Ramdas 2021) | ||
| """ | ||
| n_features = len(evals) | ||
| evals_sorted = -np.sort(-evals) # sort in descending order | ||
| selected_index = 2 * n_features | ||
| for i in range(n_features - 1, -1, -1): | ||
| if evals_sorted[i] >= n_features / (fdr * (i + 1)): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that you can avoid the for loop. |
||
| selected_index = i | ||
| break | ||
| if selected_index <= n_features: | ||
| return evals_sorted[selected_index] | ||
| else: | ||
| return np.infty | ||
|
|
||
|
|
||
| def _bhy_threshold(pvals, reshaping_function=None, fdr=0.1): | ||
| """Benjamini-Hochberg-Yekutieli procedure for controlling FDR, with input | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is the status of this 'examples_not_exhibited' directory ? if these are non-working examples it should be removed ;-)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is one of my TODOs for the knockoff module: after refactoring knockoffs to the main branch using different example for a fast build (the current one is with n=500, p=1000 and 2500 simulations, hence not really the most friendly example to run).