Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ _build
build
**.ipynb_checkpoints
src/pted/_version.py
.idea
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,7 @@ def pted(
chunk_size: Optional[int] = None,
chunk_iter: Optional[int] = None,
two_tailed: bool = True,
prog_bar: bool = False,
) -> Union[float, tuple[float, np.ndarray, float]]:
```

Expand All @@ -280,6 +281,7 @@ def pted(
* **chunk_size** *(Optional[int])*: if not None, use chunked energy distance estimation. This is useful for large datasets. The chunk size is the number of samples to use for each chunk. If None, use the full dataset.
* **chunk_iter** *(Optional[int])*: The chunk iter is the number of iterations to use with the given chunk size.
* **two_tailed** *(bool)*: if True, compute a two-tailed p-value. This is useful if you want to reject the null hypothesis when x and y are either too similar or too different. If False, only checks for dissimilarity but is more sensitive. Default is True.
* **prog_bar** *(bool)*: if True, display a progress bar while the permutation tests run. Default is False.

### Coverage test

Expand All @@ -295,6 +297,7 @@ def pted_coverage_test(
chunk_iter: Optional[int] = None,
sbc_histogram: Optional[str] = None,
sbc_bins: Optional[int] = None,
prog_bar: bool = False,
) -> Union[float, tuple[np.ndarray, np.ndarray, float]]:
```

Expand All @@ -307,6 +310,7 @@ def pted_coverage_test(
* **chunk_iter** *(Optional[int])*: The chunk iter is the number of iterations to use with the given chunk size.
* **sbc_histogram** *(Optional[str])*: If given, the path/filename to save a Simulation-Based-Calibration histogram.
* **sbc_bins** *(Optional[int])*: If given, force the histogram to have the provided number of bins. Otherwise, select an appropriate size: ~sqrt(N).
* **prog_bar** *(bool)*: if True, display a progress bar while the simulations run. Default is False.

## GPU Compatibility

Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
numpy
scipy
scipy
tqdm
18 changes: 14 additions & 4 deletions src/pted/pted.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from tqdm.auto import trange
from typing import Union, Optional
import numpy as np

Expand All @@ -24,6 +25,7 @@ def pted(
chunk_size: Optional[int] = None,
chunk_iter: Optional[int] = None,
two_tailed: bool = True,
prog_bar: bool = False,
) -> Union[float, tuple[float, np.ndarray, float]]:
"""
Two sample null hypothesis test using a permutation test on the energy
Expand Down Expand Up @@ -90,6 +92,9 @@ def pted(
two_tailed (bool): if True, compute a two-tailed p-value. This is useful
if you want to reject the null hypothesis when x and y are either
too similar or too different. Default is True.
prog_bar (bool): if True, show a progress bar to track the progress
of permutation tests. Default is False.


Note
----
Expand Down Expand Up @@ -131,9 +136,10 @@ def pted(
metric=metric,
chunk_size=int(chunk_size),
chunk_iter=int(chunk_iter),
prog_bar=prog_bar,
)
elif is_torch_tensor(x):
test, permute = pted_torch(x, y, permutations=permutations, metric=metric)
test, permute = pted_torch(x, y, permutations=permutations, metric=metric, prog_bar=prog_bar)
elif chunk_size is not None:
test, permute = pted_chunk_numpy(
x,
Expand All @@ -142,9 +148,10 @@ def pted(
metric=metric,
chunk_size=int(chunk_size),
chunk_iter=int(chunk_iter),
prog_bar=prog_bar,
)
else:
test, permute = pted_numpy(x, y, permutations=permutations, metric=metric)
test, permute = pted_numpy(x, y, permutations=permutations, metric=metric, prog_bar=prog_bar)

permute = np.array(permute)

Expand Down Expand Up @@ -173,6 +180,7 @@ def pted_coverage_test(
chunk_iter: Optional[int] = None,
sbc_histogram: Optional[str] = None,
sbc_bins: Optional[int] = None,
prog_bar: bool = False,
) -> Union[float, tuple[np.ndarray, np.ndarray, float]]:
"""
Coverage test using a permutation test on the energy distance.
Expand Down Expand Up @@ -231,7 +239,7 @@ def pted_coverage_test(
return_all (bool): if True, return the test statistic and the permuted
statistics with the p-value. If False, just return the p-value.
Default is False.
chunk_size (Optional[int]): if not None, use chunked energy distance
chunk_size (Optional[int]): If not None, use chunked energy distance
estimation. This is useful for large datasets. The chunk size is the
number of samples to use for each chunk. If None, use the full
dataset.
Expand All @@ -241,6 +249,8 @@ def pted_coverage_test(
Simulation-Based-Calibration histogram.
sbc_bins (Optional[int]): If given, force the histogram to have the provided
number of bins. Otherwise, select an appropriate size: ~sqrt(N).
prog_bar (bool): If True, show a progress bar to track the progress
of simulations. Default is False.

Note
----
Expand Down Expand Up @@ -272,7 +282,7 @@ def pted_coverage_test(
test_stats = []
permute_stats = []
pvals = []
for i in range(nsim):
for i in trange(nsim, disable=not prog_bar):
test, permute, p = pted(
g[:, i],
s[:, i],
Expand Down
14 changes: 9 additions & 5 deletions src/pted/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from scipy.spatial.distance import cdist
from scipy.stats import chi2 as chi2_dist, binom
from scipy.optimize import root_scalar
from tqdm.auto import trange

try:
import torch
Expand Down Expand Up @@ -116,13 +117,14 @@ def pted_chunk_numpy(
metric: str = "euclidean",
chunk_size: int = 100,
chunk_iter: int = 10,
prog_bar: bool = False,
) -> tuple[float, list[float]]:
assert np.all(np.isfinite(x)) and np.all(np.isfinite(y)), "Input contains NaN or Inf!"
nx = len(x)

test_stat = _energy_distance_estimate_numpy(x, y, chunk_size, chunk_iter, metric=metric)
permute_stats = []
for _ in range(permutations):
for _ in trange(permutations, disable=not prog_bar):
z = np.concatenate((x, y), axis=0)
z = z[np.random.permutation(len(z))]
x, y = z[:nx], z[nx:]
Expand All @@ -139,6 +141,7 @@ def pted_chunk_torch(
metric: Union[str, float] = "euclidean",
chunk_size: int = 100,
chunk_iter: int = 10,
prog_bar: bool = False,
) -> tuple[float, list[float]]:
assert torch.__version__ != "null", "PyTorch is not installed! try: `pip install torch`"
assert torch.all(torch.isfinite(x)) and torch.all(
Expand All @@ -148,7 +151,7 @@ def pted_chunk_torch(

test_stat = _energy_distance_estimate_torch(x, y, chunk_size, chunk_iter, metric=metric)
permute_stats = []
for _ in range(permutations):
for _ in trange(permutations, disable=not prog_bar):
z = torch.cat((x, y), dim=0)
z = z[torch.randperm(len(z))]
x, y = z[:nx], z[nx:]
Expand All @@ -159,7 +162,7 @@ def pted_chunk_torch(


def pted_numpy(
x: np.ndarray, y: np.ndarray, permutations: int = 100, metric: str = "euclidean"
x: np.ndarray, y: np.ndarray, permutations: int = 100, metric: str = "euclidean", prog_bar: bool = False,
) -> tuple[float, list[float]]:
z = np.concatenate((x, y), axis=0)
assert np.all(np.isfinite(z)), "Input contains NaN or Inf!"
Expand All @@ -172,7 +175,7 @@ def pted_numpy(

test_stat = _energy_distance_precompute(dmatrix, nx, ny)
permute_stats = []
for _ in range(permutations):
for _ in trange(permutations, disable=not prog_bar):
I = np.random.permutation(len(z))
dmatrix = dmatrix[I][:, I]
permute_stats.append(_energy_distance_precompute(dmatrix, nx, ny))
Expand All @@ -184,6 +187,7 @@ def pted_torch(
y: torch.Tensor,
permutations: int = 100,
metric: Union[str, float] = "euclidean",
prog_bar: bool = False,
) -> tuple[float, list[float]]:
assert torch.__version__ != "null", "PyTorch is not installed! try: `pip install torch`"
z = torch.cat((x, y), dim=0)
Expand All @@ -199,7 +203,7 @@ def pted_torch(

test_stat = _energy_distance_precompute(dmatrix, nx, ny).item()
permute_stats = []
for _ in range(permutations):
for _ in trange(permutations, disable=not prog_bar):
I = torch.randperm(len(z))
dmatrix = dmatrix[I][:, I]
permute_stats.append(_energy_distance_precompute(dmatrix, nx, ny).item())
Expand Down
22 changes: 22 additions & 0 deletions tests/test_pted.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,16 @@ def test_pted_main():
pted.test()


def test_pted_progress_bar(capsys):
    """Check that ``pted.pted`` only emits a progress bar when asked to.

    The test runs the same two-sample comparison twice with 42 permutations
    and inspects captured stderr for the ``"42/42"`` completion marker: it
    must be absent when ``prog_bar`` is left at its default and present when
    ``prog_bar=True``.
    """
    first_rows = [[1, 2], [3, 4]]
    second_rows = [[3, 2], [1, 4]]
    done_marker = "42/42"

    # Default call: prog_bar is not passed, so nothing should reach stderr.
    pted.pted(np.array(first_rows), np.array(second_rows), permutations=42)
    stderr_text = capsys.readouterr().err
    assert (
        done_marker not in stderr_text
    ), "progress bar showed up when prog_bar is set to False by default"

    # Explicit opt-in: the finished bar should now appear in stderr.
    pted.pted(
        np.array(first_rows),
        np.array(second_rows),
        permutations=42,
        prog_bar=True,
    )
    stderr_text = capsys.readouterr().err
    assert (
        done_marker in stderr_text
    ), "progress bar did not show when prog_bar is set to True"


def test_pted_torch():
if torch is None:
pytest.skip("torch not installed")
Expand Down Expand Up @@ -112,6 +122,18 @@ def test_pted_coverage_edgecase():
assert p > 1e-2 and p < 0.99, f"p-value {p} is not in the expected range (U(0,1))"


def test_pted_coverage_progress_bar(capsys):
    """Check that ``pted.pted_coverage_test`` only shows a bar on request.

    Random normal inputs of shape ``(42, 10)`` and ``(100, 42, 10)`` are fed
    to the coverage test twice. Captured stderr must not contain the
    ``"42/42"`` completion marker by default, and must contain it when
    ``prog_bar=True``.
    """
    # NOTE(review): the "42/42" marker presumably corresponds to 42
    # simulations (the size-42 axis of both inputs) -- confirm against
    # pted_coverage_test.
    truths = np.random.normal(size=(42, 10))
    posterior_draws = np.random.normal(size=(100, 42, 10))
    done_marker = "42/42"

    # Default call: prog_bar is not passed, so stderr should stay clean.
    pted.pted_coverage_test(truths, posterior_draws)
    stderr_text = capsys.readouterr().err
    assert (
        done_marker not in stderr_text
    ), "progress bar showed up when prog_bar is set to False by default"

    # Explicit opt-in: the finished bar should now appear in stderr.
    pted.pted_coverage_test(truths, posterior_draws, prog_bar=True)
    stderr_text = capsys.readouterr().err
    assert (
        done_marker in stderr_text
    ), "progress bar did not show when prog_bar is set to True"


def test_pted_coverage_overunder():
if torch is None:
pytest.skip("torch not installed")
Expand Down
Loading