diff --git a/.gitignore b/.gitignore
index 5751abe..a0e1a41 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ _build
 build
 **.ipynb_checkpoints
 src/pted/_version.py
+.idea
diff --git a/README.md b/README.md
index 6493799..fcddd20 100644
--- a/README.md
+++ b/README.md
@@ -269,6 +269,7 @@ def pted(
     chunk_size: Optional[int] = None,
     chunk_iter: Optional[int] = None,
     two_tailed: bool = True,
+    prog_bar: bool = False,
 ) -> Union[float, tuple[float, np.ndarray, float]]:
 ```
 
@@ -280,6 +281,7 @@ def pted(
 * **chunk_size** *(Optional[int])*: if not None, use chunked energy distance estimation. This is useful for large datasets. The chunk size is the number of samples to use for each chunk. If None, use the full dataset.
 * **chunk_iter** *(Optional[int])*: The chunk iter is the number of iterations to use with the given chunk size.
 * **two_tailed** *(bool)*: if True, compute a two-tailed p-value. This is useful if you want to reject the null hypothesis when x and y are either too similar or too different. If False, only checks for dissimilarity but is more sensitive. Default is True.
+* **prog_bar** *(bool)*: if True, show a progress bar to track the progress of permutation tests. Default is False.
 
 ### Coverage test
 
@@ -295,6 +297,7 @@ def pted_coverage_test(
     chunk_iter: Optional[int] = None,
     sbc_histogram: Optional[str] = None,
     sbc_bins: Optional[int] = None,
+    prog_bar: bool = False,
 ) -> Union[float, tuple[np.ndarray, np.ndarray, float]]:
 ```
 
@@ -307,6 +310,7 @@ def pted_coverage_test(
 * **chunk_iter** *(Optional[int])*: The chunk iter is the number of iterations to use with the given chunk size.
 * **sbc_histogram** *(Optional[str])*: If given, the path/filename to save a Simulation-Based-Calibration histogram.
 * **sbc_bins** *(Optional[int])*: If given, force the histogram to have the provided number of bins. Otherwise, select an appropriate size: ~sqrt(N).
+* **prog_bar** *(bool)*: if True, show a progress bar to track the progress of simulations. Default is False.
 
 ## GPU Compatibility
 
diff --git a/requirements.txt b/requirements.txt
index 5576e19..53e852f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 numpy
-scipy
\ No newline at end of file
+scipy
+tqdm
\ No newline at end of file
diff --git a/src/pted/pted.py b/src/pted/pted.py
index f92fcf6..9d401ee 100644
--- a/src/pted/pted.py
+++ b/src/pted/pted.py
@@ -1,3 +1,4 @@
+from tqdm.auto import trange
 from typing import Union, Optional
 
 import numpy as np
@@ -24,6 +25,7 @@ def pted(
     chunk_size: Optional[int] = None,
     chunk_iter: Optional[int] = None,
     two_tailed: bool = True,
+    prog_bar: bool = False,
 ) -> Union[float, tuple[float, np.ndarray, float]]:
     """
     Two sample null hypothesis test using a permutation test on the energy
@@ -90,6 +92,9 @@ def pted(
         two_tailed (bool): if True, compute a two-tailed p-value. This is
             useful if you want to reject the null hypothesis when x and y are
            either too similar or too different. Default is True.
+        prog_bar (bool): if True, show a progress bar to track the progress
+            of permutation tests. Default is False.
+
 
     Note
     ----
@@ -131,9 +136,10 @@ def pted(
             metric=metric,
             chunk_size=int(chunk_size),
             chunk_iter=int(chunk_iter),
+            prog_bar=prog_bar,
         )
     elif is_torch_tensor(x):
-        test, permute = pted_torch(x, y, permutations=permutations, metric=metric)
+        test, permute = pted_torch(x, y, permutations=permutations, metric=metric, prog_bar=prog_bar)
     elif chunk_size is not None:
         test, permute = pted_chunk_numpy(
             x,
@@ -142,9 +148,10 @@ def pted(
             metric=metric,
             chunk_size=int(chunk_size),
             chunk_iter=int(chunk_iter),
+            prog_bar=prog_bar,
         )
     else:
-        test, permute = pted_numpy(x, y, permutations=permutations, metric=metric)
+        test, permute = pted_numpy(x, y, permutations=permutations, metric=metric, prog_bar=prog_bar)
 
     permute = np.array(permute)
 
@@ -173,6 +180,7 @@ def pted_coverage_test(
     chunk_iter: Optional[int] = None,
     sbc_histogram: Optional[str] = None,
     sbc_bins: Optional[int] = None,
+    prog_bar: bool = False,
 ) -> Union[float, tuple[np.ndarray, np.ndarray, float]]:
     """
     Coverage test using a permutation test on the energy distance.
@@ -231,7 +239,7 @@ def pted_coverage_test(
         return_all (bool): if True, return the test statistic and the permuted
             statistics with the p-value. If False, just return the p-value.
             bool (default: False)
-        chunk_size (Optional[int]): if not None, use chunked energy distance
+        chunk_size (Optional[int]): If not None, use chunked energy distance
             estimation. This is useful for large datasets. The chunk size is
             the number of samples to use for each chunk. If None, use the
             full dataset.
@@ -241,6 +249,8 @@ def pted_coverage_test(
             Simulation-Based-Calibration histogram.
         sbc_bins (Optional[int]): If given, force the histogram to have the
             provided number of bins. Otherwise, select an appropriate size: ~sqrt(N).
+        prog_bar (bool): If True, show a progress bar to track the progress
+            of simulations. Default is False.
 
     Note
     ----
@@ -272,7 +282,7 @@ def pted_coverage_test(
     test_stats = []
     permute_stats = []
     pvals = []
-    for i in range(nsim):
+    for i in trange(nsim, disable=not prog_bar):
         test, permute, p = pted(
             g[:, i],
             s[:, i],
diff --git a/src/pted/utils.py b/src/pted/utils.py
index b166656..d00e0ec 100644
--- a/src/pted/utils.py
+++ b/src/pted/utils.py
@@ -5,6 +5,7 @@
 from scipy.spatial.distance import cdist
 from scipy.stats import chi2 as chi2_dist, binom
 from scipy.optimize import root_scalar
+from tqdm.auto import trange
 
 try:
     import torch
@@ -116,13 +117,14 @@ def pted_chunk_numpy(
     metric: str = "euclidean",
     chunk_size: int = 100,
     chunk_iter: int = 10,
+    prog_bar: bool = False,
 ) -> tuple[float, list[float]]:
     assert np.all(np.isfinite(x)) and np.all(np.isfinite(y)), "Input contains NaN or Inf!"
     nx = len(x)
     test_stat = _energy_distance_estimate_numpy(x, y, chunk_size, chunk_iter, metric=metric)
 
     permute_stats = []
-    for _ in range(permutations):
+    for _ in trange(permutations, disable=not prog_bar):
         z = np.concatenate((x, y), axis=0)
         z = z[np.random.permutation(len(z))]
         x, y = z[:nx], z[nx:]
@@ -139,6 +141,7 @@ def pted_chunk_torch(
     metric: Union[str, float] = "euclidean",
     chunk_size: int = 100,
     chunk_iter: int = 10,
+    prog_bar: bool = False,
 ) -> tuple[float, list[float]]:
     assert torch.__version__ != "null", "PyTorch is not installed! try: `pip install torch`"
     assert torch.all(torch.isfinite(x)) and torch.all(
@@ -148,7 +151,7 @@ def pted_chunk_torch(
     test_stat = _energy_distance_estimate_torch(x, y, chunk_size, chunk_iter, metric=metric)
 
     permute_stats = []
-    for _ in range(permutations):
+    for _ in trange(permutations, disable=not prog_bar):
         z = torch.cat((x, y), dim=0)
         z = z[torch.randperm(len(z))]
         x, y = z[:nx], z[nx:]
@@ -159,7 +162,7 @@
 
 
 def pted_numpy(
-    x: np.ndarray, y: np.ndarray, permutations: int = 100, metric: str = "euclidean"
+    x: np.ndarray, y: np.ndarray, permutations: int = 100, metric: str = "euclidean", prog_bar: bool = False,
 ) -> tuple[float, list[float]]:
     z = np.concatenate((x, y), axis=0)
     assert np.all(np.isfinite(z)), "Input contains NaN or Inf!"
@@ -172,7 +175,7 @@
     test_stat = _energy_distance_precompute(dmatrix, nx, ny)
 
     permute_stats = []
-    for _ in range(permutations):
+    for _ in trange(permutations, disable=not prog_bar):
         I = np.random.permutation(len(z))
         dmatrix = dmatrix[I][:, I]
         permute_stats.append(_energy_distance_precompute(dmatrix, nx, ny))
@@ -184,6 +187,7 @@ def pted_torch(
     y: torch.Tensor,
     permutations: int = 100,
     metric: Union[str, float] = "euclidean",
+    prog_bar: bool = False,
 ) -> tuple[float, list[float]]:
     assert torch.__version__ != "null", "PyTorch is not installed! try: `pip install torch`"
     z = torch.cat((x, y), dim=0)
@@ -199,7 +203,7 @@
     test_stat = _energy_distance_precompute(dmatrix, nx, ny).item()
 
     permute_stats = []
-    for _ in range(permutations):
+    for _ in trange(permutations, disable=not prog_bar):
         I = torch.randperm(len(z))
         dmatrix = dmatrix[I][:, I]
         permute_stats.append(_energy_distance_precompute(dmatrix, nx, ny).item())
diff --git a/tests/test_pted.py b/tests/test_pted.py
index 2cc2795..abaaa04 100644
--- a/tests/test_pted.py
+++ b/tests/test_pted.py
@@ -32,6 +32,16 @@ def test_pted_main():
     pted.test()
 
 
+def test_pted_progress_bar(capsys):
+    pted.pted(np.array([[1,2],[3,4]]), np.array([[3,2],[1,4]]), permutations=42)
+    captured = capsys.readouterr().err
+    assert "42/42" not in captured, "progress bar showed up when prog_bar is set to False by default"
+
+    pted.pted(np.array([[1,2],[3,4]]), np.array([[3,2],[1,4]]), permutations=42, prog_bar=True)
+    captured = capsys.readouterr().err
+    assert "42/42" in captured, "progress bar did not show when prog_bar is set to True"
+
+
 def test_pted_torch():
     if torch is None:
         pytest.skip("torch not installed")
@@ -112,6 +122,18 @@ def test_pted_coverage_edgecase():
     assert p > 1e-2 and p < 0.99, f"p-value {p} is not in the expected range (U(0,1))"
 
 
+def test_pted_coverage_progress_bar(capsys):
+    g = np.random.normal(size=(42, 10))
+    s = np.random.normal(size=(100, 42, 10))
+    pted.pted_coverage_test(g, s)
+    captured = capsys.readouterr().err
+    assert "42/42" not in captured, "progress bar showed up when prog_bar is set to False by default"
+
+    pted.pted_coverage_test(g, s, prog_bar=True)
+    captured = capsys.readouterr().err
+    assert "42/42" in captured, "progress bar did not show when prog_bar is set to True"
+
+
 def test_pted_coverage_overunder():
     if torch is None:
         pytest.skip("torch not installed")
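Usage sketch (reviewer note, not part of the patch): the bar comes from `tqdm.auto.trange`, which writes to stderr by default, which is why the tests above inspect `capsys.readouterr().err`. The array shapes below mirror the tests and are otherwise illustrative.

```python
# Illustrative only: exercises the new prog_bar flag added in this patch.
import numpy as np
import pted

# Two-sample test: the progress bar tracks the permutation loop.
x = np.random.normal(size=(200, 5))
y = np.random.normal(size=(200, 5))
p = pted.pted(x, y, permutations=1000, prog_bar=True)

# Coverage test: the bar tracks the simulations (42 here), one pted call each.
g = np.random.normal(size=(42, 10))       # ground-truth draws, one per simulation
s = np.random.normal(size=(100, 42, 10))  # 100 posterior samples per simulation
p_cov = pted.pted_coverage_test(g, s, prog_bar=True)
```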