diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..ef5bcc4 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,40 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + test: + name: Test (Python ${{ matrix.python-version }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.11", "3.12"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + + - name: Install package with test dependencies + run: | + pip install --upgrade pip + pip install -e ".[dev]" + + - name: Run tests + run: pytest src/tests -v --tb=short --no-header --junitxml=test-results.xml + + - name: Upload test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: test-results-${{ matrix.python-version }} + path: test-results.xml diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..abaa89d --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,57 @@ +name: Release + +on: + push: + branches: [main] + +permissions: + contents: write # push tags / GitHub release + id-token: write # OIDC token for trusted PyPI publishing + +jobs: + release: + name: Semantic release & publish + runs-on: ubuntu-latest + # NOTE: CI runs in a separate workflow file, so a job-level needs: cannot + # gate on it; protect main with required status checks (or workflow_run) instead. + + concurrency: + group: release + cancel-in-progress: false + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 # full history so PSR can read all commits + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: pip + + - name: Install build tools + run: pip install python-semantic-release build + + # python-semantic-release reads commit history, bumps the version in + # setup.cfg, creates a tag, a GitHub release, and builds
the package. + - name: Run semantic-release + id: semrel + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: semantic-release version + + # Build sdist + wheel only when a new version was actually released + - name: Build distribution + if: steps.semrel.outputs.released == 'true' + run: python -m build + + # Publish to PyPI via OIDC (no API token needed — configure trusted + # publisher on pypi.org: owner=, + # repo=SpectralNet, workflow=release.yml, environment=pypi) + - name: Publish to PyPI + if: steps.semrel.outputs.released == 'true' + uses: pypa/gh-action-pypi-publish@release/v1 + with: + print-hash: true diff --git a/README.md b/README.md index 5c440a4..296a5b1 100644 --- a/README.md +++ b/README.md @@ -8,52 +8,123 @@ This package is based on the following paper - [SpectralNet](https://openreview. ## Installation -You can install the latest package version via +### From PyPI ```bash pip install spectralnet ``` +### From source (with pixi) + +[pixi](https://pixi.sh) is the recommended way to set up a fully reproducible +development environment after cloning the repo. + +```bash +# 1. Install pixi (once, system-wide) +curl -fsSL https://pixi.sh/install.sh | sh + +# 2. Clone and enter the repo +git clone https://github.com/shaham-lab/SpectralNet.git +cd SpectralNet + +# 3. Install all dependencies (conda + PyPI) into an isolated environment +pixi install + +# 4. 
Run the test suite to verify everything works +pixi run test +``` + +After `pixi install` you can prefix any command with `pixi run` to execute it +inside the managed environment, or activate the environment with: + +```bash +pixi shell +``` + ## Usage -### Clustering +### Clustering — small datasets (in-memory tensor) -The basic functionality is quite intuitive and easy to use, e.g., +For datasets that fit in RAM, pass a `torch.Tensor` directly: ```python from spectralnet import SpectralNet spectralnet = SpectralNet(n_clusters=10) -spectralnet.fit(X) # X is the dataset and it should be a torch.Tensor -cluster_assignments = spectralnet.predict(X) # Get the final assignments to clusters +spectralnet.fit(X) # X: torch.Tensor of shape (N, ...) +cluster_assignments = spectralnet.predict(X) ``` -If you have labels to your dataset and you want to measure ACC and NMI you can do the following: +To measure ACC and NMI when labels are available: ```python -from spectralnet import SpectralNet -from spectralnet import Metrics - +from spectralnet import SpectralNet, Metrics spectralnet = SpectralNet(n_clusters=2) -spectralnet.fit(X, y) # X is the dataset and it should be a torch.Tensor -cluster_assignments = spectralnet.predict(X) # Get the final assignments to clusters - -y = y_train.detach().cpu().numpy() # In case your labels are of torch.Tensor type. -acc_score = Metrics.acc_score(cluster_assignments, y, n_clusters=2) -nmi_score = Metrics.nmi_score(cluster_assignments, y) -print(f"ACC: {np.round(acc_score, 3)}") -print(f"NMI: {np.round(nmi_score, 3)}") +spectralnet.fit(X, y) # y: integer label tensor +cluster_assignments = spectralnet.predict(X) + +y_np = y.detach().cpu().numpy() +acc_score = Metrics.acc_score(cluster_assignments, y_np, n_clusters=2) +nmi_score = Metrics.nmi_score(cluster_assignments, y_np) +print(f"ACC: {acc_score:.3f} NMI: {nmi_score:.3f}") +``` + +### Clustering — large datasets (streaming from disk) + +For datasets too large to hold in RAM (e.g. 
millions of images on disk), +define a `torch.utils.data.Dataset` that loads **one sample at a time** +and pass it to `fit()`. Nothing large ever lives in memory at once — every +trainer pulls mini-batches through its own `DataLoader` internally. + +```python +from torch.utils.data import Dataset, DataLoader +from spectralnet import SpectralNet +from PIL import Image +import torchvision.transforms as T +import os + +class ImageFolderDataset(Dataset): + def __init__(self, root): + self.paths = [ + os.path.join(root, f) for f in os.listdir(root) if f.endswith(".jpg") + ] + self.transform = T.Compose([T.Resize(64), T.ToTensor(), T.Normalize(0.5, 0.5)]) + + def __len__(self): + return len(self.paths) + + def __getitem__(self, idx): + return self.transform(Image.open(self.paths[idx]).convert("RGB")) + +dataset = ImageFolderDataset("/path/to/images") + +spectralnet = SpectralNet( + n_clusters=10, + should_use_ae=True, # compress images before clustering + ae_hiddens=[2048, 512, 64, 10], + spectral_hiddens=[512, 512, 10], +) +spectralnet.fit(dataset) + +# predict() also accepts a DataLoader for large test sets +test_loader = DataLoader(dataset, batch_size=512, shuffle=False) +cluster_assignments = spectralnet.predict(test_loader) ``` -You can read the code docs for more information and functionalities
+> **Note on Siamese training with large datasets:** the Siamese network +> builds exact k-NN pairs, which requires loading all features into memory. +> For very large datasets either disable it (`should_use_siamese=False`), +> enable approximate neighbours (`siamese_use_approx=True`), or pass a +> representative subset as the Dataset. -#### Running examples +### Running examples -In order to run the model on twomoons or MNIST datasets, you should first cd to the examples folder and then run:
-`python3 cluster_twomoons.py`
-or
-`python3 cluster_mnist.py` +```bash +cd examples +python3 cluster_twomoons.py +python3 cluster_mnist.py +```