Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
332 changes: 89 additions & 243 deletions README.md

Large diffs are not rendered by default.

10 changes: 7 additions & 3 deletions docs/data_installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@ Data Installation
=================

``nSTAT-python`` does not bundle raw example data in the Git tree.
The canonical paper-example dataset is downloaded automatically the first
time a paper example or dataset helper requires it.

Use one of the supported Python-native installation paths instead:
Use one of the supported Python-native prefetch paths if you want the cache
materialized ahead of time:

Command line
------------
Expand All @@ -25,6 +28,7 @@ Python API
Notes
-----

- Example data is cached under ``data_cache/`` by default.
- The dataset source is figshare DOI ``10.6084/m9.figshare.4834640.v3``.
- Source checkouts cache data under ``data_cache/nstat_data`` by default.
- Set ``NSTAT_DATA_DIR`` to point at an existing dataset cache if needed.
- The repository intentionally ignores ``data/`` so local example-data installs are not committed.
- The repository intentionally ignores ``data/`` and ``data_cache/`` so local downloads are not committed.
40 changes: 21 additions & 19 deletions nstat/data_manager.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,12 @@
"""Resolve and materialize the external nSTAT example-data package.

This mirrors the MATLAB-side `nSTAT_ExampleDataInfo` / `nSTAT_Install`
workflow added in the upstream toolbox while keeping raw example data out of
the Python Git tree.
"""
"""Resolve and materialize the standalone nSTAT-python example dataset."""

from __future__ import annotations

import json
import os
import re
import shutil
import ssl
import tempfile
import time
import urllib.request
Expand All @@ -19,19 +15,23 @@
from pathlib import Path
from typing import Final

import certifi


FIGSHARE_API_URL: Final[str] = "https://api.figshare.com/v2/articles/4834640"
FIGSHARE_DOI_URL: Final[str] = "https://doi.org/10.6084/m9.figshare.4834640.v3"
PAPER_DOI_URL: Final[str] = "https://doi.org/10.1016/j.jneumeth.2012.08.009"
SENTINEL_NAME: Final[str] = ".nstat_data_ok.json"
USER_AGENT: Final[str] = "nSTAT-python-data-manager/1.0 (+https://github.com/cajigaslab/nSTAT-python)"
SSL_CONTEXT: Final[ssl.SSLContext] = ssl.create_default_context(cafile=certifi.where())
DOWNLOAD_URL_RE: Final[re.Pattern[str]] = re.compile(
r"https?://(?:www\.)?(?:ndownloader|figshare\.com/ndownloader)/files/\d+"
)


@dataclass(frozen=True)
class ExampleDataInfo:
"""Python analogue of MATLAB `nSTAT_ExampleDataInfo`."""
"""Resolved on-disk metadata for the canonical example dataset."""

root_dir: Path
data_dir: Path
Expand All @@ -56,11 +56,15 @@ def _default_cache_dir() -> Path:
return (_repo_root() / "data_cache" / "nstat_data").resolve()


def get_example_data_info(root_dir: str | Path | None = None) -> ExampleDataInfo:
"""Return dataset metadata using MATLAB-compatible file requirements."""
def get_example_data_info(
root_dir: str | Path | None = None,
*,
treat_as_data_dir: bool = False,
) -> ExampleDataInfo:
"""Return dataset metadata for a repo root or explicit dataset cache path."""

raw_root = _repo_root() if root_dir is None else Path(root_dir).expanduser().resolve()
if (raw_root / "mEPSCs").exists() or raw_root.name == "data":
if treat_as_data_dir or (raw_root / "mEPSCs").exists() or raw_root.name == "data":
data_dir = raw_root
root = raw_root.parent if raw_root.name == "data" else raw_root
else:
Expand Down Expand Up @@ -97,11 +101,9 @@ def _write_sentinel(data_dir: Path, *, source_url: str) -> None:
def _http_get(url: str, *, timeout: float = 60.0) -> tuple[str, bytes]:
    """Fetch ``url`` and return the final URL together with the response body.

    Args:
        url: Address to request.
        timeout: Timeout in seconds handed to ``urllib.request.urlopen``.

    Returns:
        ``(final_url, body)`` where ``final_url`` is the string form of
        ``resp.geturl()`` and ``body`` is the complete payload read into
        memory.
    """
    # A single shared User-Agent constant replaces the previously duplicated
    # inline header literal.
    req = urllib.request.Request(
        url,
        headers={"User-Agent": USER_AGENT},
    )
    # SSL_CONTEXT is the module-level context created from certifi's CA bundle.
    with urllib.request.urlopen(req, timeout=timeout, context=SSL_CONTEXT) as resp:
        final_url = str(resp.geturl())
        body = resp.read()
    return final_url, body
Expand Down Expand Up @@ -145,11 +147,11 @@ def _stream_download(url: str, destination: Path, *, retries: int = 3) -> None:
try:
req = urllib.request.Request(
url,
headers={
"User-Agent": "nSTAT-python-data-manager/1.0 (+https://github.com/cajigaslab/nSTAT-python)"
},
headers={"User-Agent": USER_AGENT},
)
with urllib.request.urlopen(req, timeout=180.0) as resp, destination.open("wb") as out:
with urllib.request.urlopen(req, timeout=180.0, context=SSL_CONTEXT) as resp, destination.open(
"wb"
) as out:
shutil.copyfileobj(resp, out, length=1024 * 1024)
return
except Exception as exc: # pragma: no cover - network timing dependent
Expand Down Expand Up @@ -216,7 +218,7 @@ def get_data_dir() -> Path:
def data_is_present(data_dir: Path) -> bool:
    """Return True when the required MATLAB-mirrored example files exist.

    ``data_dir`` is passed to ``get_example_data_info`` with
    ``treat_as_data_dir=True`` so it is taken as the dataset directory
    itself, not subjected to the repo-root heuristics.
    """
    return get_example_data_info(data_dir, treat_as_data_dir=True).is_installed


def ensure_example_data(download: bool = True) -> Path:
Expand Down
8 changes: 4 additions & 4 deletions nstat/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,14 @@ def list_datasets() -> list[str]:
return sorted(_load_manifest().keys())


def _resolve_dataset_target(rel_path: str) -> Path:
def _resolve_dataset_target(rel_path: str, *, download: bool) -> Path:
repo_root = _repo_root()
rel = Path(rel_path)
if not rel.parts:
return repo_root / rel
if rel.parts[0] == "data":
try:
data_dir = ensure_example_data(download=False)
data_dir = ensure_example_data(download=download)
except FileNotFoundError as exc:
raise DataNotFoundError(str(exc)) from exc
return data_dir.joinpath(*rel.parts[1:])
Expand All @@ -60,7 +60,7 @@ def get_dataset_path(name: str) -> Path:
if name not in entries:
raise DataNotFoundError(f"Unknown dataset '{name}'. Available: {', '.join(sorted(entries))}")

path = _resolve_dataset_target(entries[name]["path"])
path = _resolve_dataset_target(entries[name]["path"], download=True)
if not path.exists():
raise DataNotFoundError(f"Dataset '{name}' not found at expected path: {path}")
return path
Expand All @@ -71,7 +71,7 @@ def verify_checksums() -> dict[str, bool]:
result: dict[str, bool] = {}
for name, item in entries.items():
try:
path = _resolve_dataset_target(item["path"])
path = _resolve_dataset_target(item["path"], download=True)
except DataNotFoundError:
result[name] = False
continue
Expand Down
35 changes: 23 additions & 12 deletions nstat/install.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@ def _should_prompt_for_example_data(info: dict[str, Any]) -> bool:
return answer.strip().lower() in {"y", "yes"}


def _apply_example_data_info(report: dict[str, Any], info: Any) -> None:
    """Copy resolved dataset metadata from ``info`` into ``report`` in place.

    Args:
        report: Install report; its ``"example_data"`` entry must already be
            a dict. Keys other than the three written here are left alone.
        info: Object exposing ``data_dir``, ``is_installed`` and
            ``required_files`` attributes.
    """
    example_data = report["example_data"]
    # Paths are stored as strings and the flag as a plain bool.
    example_data["data_dir"] = str(info.data_dir)
    example_data["is_installed"] = bool(info.is_installed)
    example_data["required_files"] = [str(path) for path in info.required_files]


def nstat_install(
*,
rebuild_doc_search: bool = True,
Expand All @@ -52,8 +58,9 @@ def nstat_install(

mode = _normalize_download_mode(download_example_data)
repo_root = Path(__file__).resolve().parents[1]
info = get_example_data_info(repo_root)
repo_info = get_example_data_info(repo_root)
data_dir = get_data_dir()
data_info = get_example_data_info(data_dir, treat_as_data_dir=True)

report: dict[str, Any] = {
"repo_root": str(repo_root),
Expand All @@ -63,34 +70,38 @@ def nstat_install(
"download_example_data": mode,
"example_data": {
"data_dir": str(data_dir),
"is_installed": bool(info.is_installed or get_example_data_info(data_dir).is_installed),
"is_installed": bool(repo_info.is_installed or data_info.is_installed),
"figshare_doi": FIGSHARE_DOI_URL,
"paper_doi": PAPER_DOI_URL,
"required_files": [str(path) for path in info.required_files],
"required_files": [str(path) for path in data_info.required_files],
},
"notes": [],
}

try:
if info.is_installed:
report["example_data"]["is_installed"] = True
report["example_data"]["data_dir"] = str(info.data_dir)
if repo_info.is_installed:
_apply_example_data_info(report, repo_info)
report["notes"].append("Example data already present.")
elif data_info.is_installed:
_apply_example_data_info(report, data_info)
report["notes"].append("Example data already present.")
elif mode == "always":
path = ensure_example_data(download=True)
report["example_data"]["is_installed"] = True
report["example_data"]["data_dir"] = str(path)
_apply_example_data_info(report, get_example_data_info(path, treat_as_data_dir=True))
report["notes"].append("Downloaded example data.")
elif mode == "prompt":
if _should_prompt_for_example_data(report["example_data"]):
path = ensure_example_data(download=True)
report["example_data"]["is_installed"] = True
report["example_data"]["data_dir"] = str(path)
_apply_example_data_info(report, get_example_data_info(path, treat_as_data_dir=True))
report["notes"].append("Downloaded example data after prompt.")
else:
report["notes"].append("Example data not installed; run with download_example_data=True to install.")
report["notes"].append(
"Example data was not preinstalled; paper-example and dataset APIs will download it on first use."
)
else:
report["notes"].append("Example data not installed; run with download_example_data=True to install.")
report["notes"].append(
"Example data was not preinstalled; paper-example and dataset APIs will download it on first use."
)
except Exception as exc: # noqa: BLE001
report["example_data"]["error"] = str(exc)
report["notes"].append("Example data installation failed.")
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ dependencies = [
"numpy>=1.24",
"scipy>=1.10",
"matplotlib>=3.7",
"certifi>=2024.0.0",
"PyYAML>=6.0",
"nbformat>=5.10",
"nbclient>=0.10"
Expand Down
47 changes: 35 additions & 12 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,46 @@
from __future__ import annotations

from pathlib import Path

import nstat
from nstat.errors import DataNotFoundError
import nstat.datasets


def test_dataset_manifest_and_checksums() -> None:
    """The dataset listing is non-empty and returned in sorted order."""
    names = nstat.list_datasets()
    assert names
    assert names == sorted(names)


def test_get_dataset_path_triggers_download_when_data_is_external(monkeypatch, tmp_path: Path) -> None:
    """``get_dataset_path`` must request the data with ``download=True``."""
    # Build a minimal fake dataset tree so the resolved path really exists.
    data_root = tmp_path / "nstat_data"
    dataset_path = data_root / "mEPSCs" / "epsc2.txt"
    dataset_path.parent.mkdir(parents=True, exist_ok=True)
    dataset_path.write_text("header\n0 0\n", encoding="utf-8")

    calls: list[bool] = []

    def fake_ensure_example_data(*, download: bool = True) -> Path:
        # Record the flag instead of touching the network.
        calls.append(download)
        return data_root

    monkeypatch.setattr(nstat.datasets, "ensure_example_data", fake_ensure_example_data)

    resolved = nstat.get_dataset_path("mepcs_epsc2")
    assert resolved == dataset_path
    assert calls == [True]


def test_verify_checksums_triggers_download_when_data_is_external(monkeypatch, tmp_path: Path) -> None:
calls: list[bool] = []

check = nstat.verify_checksums()
assert set(check.keys()) == set(names)
assert all(isinstance(v, bool) for v in check.values())
def fake_ensure_example_data(*, download: bool = True) -> Path:
calls.append(download)
return tmp_path / "nstat_data"

monkeypatch.setattr(nstat.datasets, "ensure_example_data", fake_ensure_example_data)

def test_get_dataset_path() -> None:
name = nstat.list_datasets()[0]
try:
path = nstat.get_dataset_path(name)
except DataNotFoundError:
# Standalone checkouts may intentionally omit large datasets.
return
assert path.exists()
result = nstat.verify_checksums()
assert result
assert all(isinstance(value, bool) for value in result.values())
assert calls and all(call is True for call in calls)
6 changes: 6 additions & 0 deletions tests/test_install_and_compat.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import annotations

from pathlib import Path

from nstat.compat.matlab import CIF, Covariate, SignalObj, nspikeTrain, nstColl
from nstat.install import nstat_install

Expand All @@ -18,3 +20,7 @@ def test_nstat_install_report_without_download() -> None:
assert "example_data" in report
assert report["download_example_data"] == "never"
assert "required_files" in report["example_data"]
data_dir = Path(report["example_data"]["data_dir"])
required = [Path(path) for path in report["example_data"]["required_files"]]
assert required
assert all(data_dir in path.parents or path == data_dir for path in required)
Loading