Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 38 additions & 10 deletions examples_not_exhibited/plot_fig_1_nguyen_et_al.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
To reduce the script runtime it is desirable to increase n_jobs parameter.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the status of this 'examples_not_exhibited' directory? If these are non-working examples, they should be removed ;-)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is one of my TODOs for the knockoff module: after refactoring knockoffs to the main branch using different example for a fast build (the current one is with n=500, p=1000 and 2500 simulations, hence not really the most friendly example to run).

"""
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg')
import numpy as np
from hidimstat.knockoffs import knockoff_aggregation, model_x_knockoff
from hidimstat.knockoffs.data_simulation import simu_data
Expand All @@ -20,10 +22,11 @@

color_blue = '#1f77b4'
color_teal = '#1fbecf'
color_green = '#31a354'


def one_inference(n, p, snr, rho, sparsity, n_bootstraps=25, gamma=0.3,
n_jobs=1, offset=1, fdr=0.1, seed=None):
n_jobs=50, offset=1, fdr=0.1, seed=None):

# Simulate data following autoregressive structure, seed is fixed to ensure
# doing inference on only 1 simulation
Expand Down Expand Up @@ -51,36 +54,61 @@ def one_inference(n, p, snr, rho, sparsity, n_bootstraps=25, gamma=0.3,

ako_fdp, ako_power = cal_fdp_power(ako_selected, non_zero_index)

return ko_fdps, ako_fdp, ko_powers, ako_power
# Aggregation via e-values

eval_selected = knockoff_aggregation(X, y, fdr=fdr,
method='e-values',
offset=offset,
n_jobs=n_jobs, gamma=gamma,
n_bootstraps=n_bootstraps,
random_state=seed*2)

eval_fdp, eval_power = cal_fdp_power(eval_selected, non_zero_index)

return ko_fdps, ako_fdp, eval_fdp, ko_powers, ako_power, eval_power


def plot(results, n_simu, fdr):
    """Plot FDP and power histograms for the three selection procedures.

    One row per procedure (vanilla knockoffs, aggregated knockoffs,
    e-value aggregation), one column each for FDP and power, with the
    nominal FDR drawn as a dashed red line on the FDP panels.

    Parameters
    ----------
    results : sequence of tuples
        Per-simulation outputs of ``one_inference``, ordered as
        (ko_fdps, ako_fdp, eval_fdp, ko_powers, ako_power, eval_power).
    n_simu : int
        Number of simulations stored in ``results``.
    fdr : float
        Nominal FDR level used for the reference line.
    """
    ko_fdps = np.array([results[i][0] for i in range(n_simu)]).ravel()
    ako_fdps = np.array([results[i][1] for i in range(n_simu)]).ravel()
    eval_fdps = np.array([results[i][2] for i in range(n_simu)]).ravel()
    ko_powers = np.array([results[i][3] for i in range(n_simu)]).ravel()
    ako_powers = np.array([results[i][4] for i in range(n_simu)]).ravel()
    eval_powers = np.array([results[i][5] for i in range(n_simu)]).ravel()

    # Plotting: 3 rows (KO, AKO, e-values) x 2 columns (FDP, power)
    n_bins = 30
    fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(
        3, 2, figsize=(7, 4))
    ax1.tick_params(labelsize=14)
    ax1.hist(ko_fdps, edgecolor='k',
             range=[0.0, 1.0], bins=n_bins, color=color_blue)
    ax1.axvline(x=fdr, linestyle='--', color='r', linewidth=1.0)
    ax1.title.set_text('FDP (KO)')
    ax2.tick_params(labelsize=14)
    ax2.hist(ko_powers, edgecolor='k',
             range=[0.0, 1.0], bins=n_bins, color=color_blue)
    ax2.title.set_text('Power (KO)')
    ax3.tick_params(labelsize=14)
    ax3.hist(ako_fdps, edgecolor='k',
             range=[0.0, 1.0], bins=n_bins, color=color_teal)
    ax3.title.set_text('FDP (AKO)')
    ax3.axvline(x=fdr, linestyle='--', color='r', linewidth=1.0)
    ax4.tick_params(labelsize=14)
    ax4.hist(ako_powers, edgecolor='k',
             range=[0.0, 1.0], bins=n_bins, color=color_teal)
    ax4.title.set_text('Power (AKO)')
    ax5.tick_params(labelsize=14)
    ax5.hist(eval_fdps, edgecolor='k',
             range=[0.0, 1.0], bins=n_bins, color=color_green)
    ax5.axvline(x=fdr, linestyle='--', color='r', linewidth=1.0)
    ax5.title.set_text('FDP (e-values)')
    ax6.tick_params(labelsize=14)
    ax6.hist(eval_powers, edgecolor='k',
             range=[0.0, 1.0], bins=n_bins, color=color_green)
    ax6.title.set_text('Power (e-values)')
    plt.tight_layout()

    figname = 'histogram_ko_vs_ako.png'
    plt.savefig(figname)
    print(f'Save figure to {figname}')

Expand All @@ -94,8 +122,8 @@ def main():
offset = 1
fdr = 0.05
gamma = 0.3
n_bootstraps = 10
n_simu = 10
n_bootstraps = 25
n_simu = 100
offset = 1

results = Parallel(n_jobs=1)(
Expand All @@ -109,6 +137,6 @@ def main():
plot(results, n_simu, fdr)
print('Done!')


main()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We avoid this structure in examples to improve readability.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, all the examples need rework IMO.

# if __name__ == '__main__':
# main()
62 changes: 49 additions & 13 deletions hidimstat/knockoffs/knockoff_aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@

from .gaussian_knockoff import (_estimate_distribution,
gaussian_knockoff_generation)
from .stat_coef_diff import stat_coef_diff
from .stat_coef_diff import stat_coef_diff, _coef_diff_threshold
from .utils import fdr_threshold, quantile_aggregation


def knockoff_aggregation(X, y, centered=True, shrink=False,
construct_method='equi', fdr=0.1, fdr_control='bhq',
reshaping_function=None, offset=1,
method='quantile', reshaping_function=None, offset=1,
statistic='lasso_cv', cov_estimator='ledoit_wolf',
joblib_verbose=0, n_bootstraps=25, n_jobs=1,
adaptive_aggregation=False, gamma=0.5, gamma_min=0.05,
Expand Down Expand Up @@ -64,21 +64,35 @@ def knockoff_aggregation(X, y, centered=True, shrink=False,
ko_stats = parallel(delayed(stat_coef_diff_cached)(
X, X_tildes[i], y, method=statistic) for i in range(n_bootstraps))

pvals = np.array([_empirical_pval(ko_stats[i], offset)
for i in range(n_bootstraps)])
if method == 'e-values':
evals = np.array([_empirical_eval(ko_stats[i], fdr/2, offset)
for i in range(n_bootstraps)])

aggregated_pval = quantile_aggregation(
pvals, gamma=gamma, gamma_min=gamma_min,
adaptive=adaptive_aggregation)
aggregated_eval = np.mean(evals, axis=0)
threshold = fdr_threshold(aggregated_eval, fdr=fdr, method='ebh')
selected = np.where(aggregated_eval >= threshold)[0]

threshold = fdr_threshold(aggregated_pval, fdr=fdr, method=fdr_control,
reshaping_function=reshaping_function)
selected = np.where(aggregated_pval <= threshold)[0]
if verbose:
return selected, aggregated_eval, evals

return selected

if verbose:
return selected, aggregated_pval, pvals
if method == 'quantile':
pvals = np.array([_empirical_pval(ko_stats[i], offset)
for i in range(n_bootstraps)])

return selected
aggregated_pval = quantile_aggregation(
pvals, gamma=gamma, gamma_min=gamma_min,
adaptive=adaptive_aggregation)

threshold = fdr_threshold(aggregated_pval, fdr=fdr, method=fdr_control,
reshaping_function=reshaping_function)
selected = np.where(aggregated_pval <= threshold)[0]

if verbose:
return selected, aggregated_pval, pvals

return selected


def _empirical_pval(test_score, offset=1):
Expand All @@ -100,3 +114,25 @@ def _empirical_pval(test_score, offset=1):
)

return np.array(pvals)


def _empirical_eval(test_score, fdr=0.1, offset=1):
    """Compute knockoff e-values from the knockoff test statistics.

    Features whose statistic reaches the knockoff threshold receive the
    e-value ``n_features / (offset + #{j : W_j <= -threshold})``; all
    other features receive an e-value of 0.  (E-value construction used
    for derandomizing knockoffs — presumably following Ren & Barber;
    confirm against the module's references.)

    Parameters
    ----------
    test_score : ndarray of shape (n_features,)
        Knockoff test statistics ``W_j``.
    fdr : float, default=0.1
        Target FDR level used to compute the knockoff threshold.
    offset : int, 0 or 1, default=1
        Offset of the knockoff threshold; 1 corresponds to knockoff+.

    Returns
    -------
    evals : ndarray of shape (n_features,)
        The e-value of each feature.

    Raises
    ------
    ValueError
        If ``offset`` is not 0 or 1.
    """
    if offset not in (0, 1):
        raise ValueError("'offset' must be either 0 or 1")

    n_features = test_score.size
    ko_thr = _coef_diff_threshold(test_score, fdr=fdr, offset=offset)

    # The denominator does not depend on the feature index: compute it
    # once instead of re-summing inside a per-feature loop (the original
    # recomputed this sum n_features times, i.e. O(n^2) overall).
    denominator = offset + np.sum(test_score <= -ko_thr)

    # Features below the threshold get 0; the rest share the same
    # positive e-value.
    return np.where(test_score < ko_thr, 0.0, n_features / denominator)
6 changes: 3 additions & 3 deletions hidimstat/knockoffs/stat_coef_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,13 +67,13 @@ def stat_coef_diff(X, X_tilde, y, method='lasso_cv', n_splits=5, n_jobs=1,

estimator = {
'lasso_cv': LassoCV(alphas=lambdas, n_jobs=n_jobs,
verbose=joblib_verbose, max_iter=1e4, cv=cv),
verbose=joblib_verbose, max_iter=int(1e4), cv=cv),
'logistic_l1': LogisticRegressionCV(
penalty='l1', max_iter=1e4,
penalty='l1', max_iter=int(1e4),
solver=solver, cv=cv,
n_jobs=n_jobs, tol=1e-8),
'logistic_l2': LogisticRegressionCV(
penalty='l2', max_iter=1e4, n_jobs=n_jobs,
penalty='l2', max_iter=int(1e4), n_jobs=n_jobs,
verbose=joblib_verbose, cv=cv, tol=1e-8),
}

Expand Down
12 changes: 12 additions & 0 deletions hidimstat/knockoffs/tests/test_knockoff_aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,15 @@ def test_knockoff_aggregation():

assert fdp < 0.5
assert power > 0.1

# Using e-values aggregation

selected, aggregated_pval, pvals = knockoff_aggregation(
X, y, fdr=fdr, n_bootstraps=n_bootstraps, method='e-values',
verbose=True, random_state=0)

fdp, power = cal_fdp_power(selected, non_zero_index)

assert pvals.shape == (n_bootstraps, p)
assert fdp < 0.5
assert power > 0.1
17 changes: 17 additions & 0 deletions hidimstat/knockoffs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ def fdr_threshold(pvals, fdr=0.1, method='bhq', reshaping_function=None):
elif method == 'bhy':
return _bhy_threshold(
pvals, fdr=fdr, reshaping_function=reshaping_function)
elif method == 'ebh':
return _ebh_threshold(pvals, fdr=fdr)
else:
raise ValueError(
'{} is not support FDR control method'.format(method))
Expand Down Expand Up @@ -69,6 +71,21 @@ def _bhq_threshold(pvals, fdr=0.1):
else:
return -1.0

def _ebh_threshold(evals, fdr=0.1):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this function should have a test.

Copy link
Author

@alexblnn alexblnn Aug 3, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should i add a test_utils file to the test section? the utils file is not tested as of now

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, please.

"""e-BH procedure for FDR control (see Wang and Ramdas 2021)
"""
n_features = len(evals)
evals_sorted = -np.sort(-evals) # sort in descending order
selected_index = 2 * n_features
for i in range(n_features - 1, -1, -1):
if evals_sorted[i] >= n_features / (fdr * (i + 1)):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that you can avoid the for loop.

selected_index = i
break
if selected_index <= n_features:
return evals_sorted[selected_index]
else:
return np.infty


def _bhy_threshold(pvals, reshaping_function=None, fdr=0.1):
"""Benjamini-Hochberg-Yekutieli procedure for controlling FDR, with input
Expand Down