goodfire-ai · mivanit · Oct 10, 2025 · Oct 10, 2025 · Oct 10, 2025 · Oct 10, 2025
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,8 @@
 spd/scripts/sweep_params.yaml
-spd/scripts/sweep_params.yaml
 docs/coverage/**
+artifacts/**
+docs/dep_graph/**
+tests/.temp/**
 
 **/out/
 neuronpedia_outputs/

diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -230,6 +230,38 @@
                 "--model_path",
                 "wandb:goodfire/spd/runs/ioprgffh"
             ]
+        },
+        {
+            "name": "run_clustering example",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${workspaceFolder}/spd/clustering/scripts/run_clustering.py",
+            "args": [
+                "--config",
+                "${workspaceFolder}/spd/clustering/configs/example.yaml"
+            ],
+            "python": "${command:python.interpreterPath}",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "env": {
+                "PYDEVD_DISABLE_FILE_VALIDATION": "1"
+            }
+        },
+        {
+            "name": "clustering pipeline",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${workspaceFolder}/spd/clustering/scripts/run_pipeline.py",
+            "args": [
+                "--config",
+                "${workspaceFolder}/spd/clustering/configs/pipeline_config.yaml"
+            ],
+            "python": "${command:python.interpreterPath}",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "env": {
+                "PYDEVD_DISABLE_FILE_VALIDATION": "1"
+            }
         }
     ]
 }
diff --git a/Makefile b/Makefile
@@ -76,10 +76,23 @@ coverage:
 	uv run python -m coverage report -m > $(COVERAGE_DIR)/coverage.txt
 	uv run python -m coverage html --directory=$(COVERAGE_DIR)/html/
 
+
+.PHONY: clean
+clean:
+	@echo "Cleaning Python cache and build artifacts..."
+	find . -type d -name "__pycache__" -exec rm -rf {} +
+	find . -type d -name "*.egg-info" -exec rm -rf {} +
+	rm -rf build/ dist/ .ruff_cache/ .pytest_cache/ .coverage
+
+
+.PHONY: clustering-dev
+clustering-dev:
+	uv run spd-cluster --local --config spd/clustering/configs/pipeline-dev-simplestories.yaml
+
 .PHONY: app
 app:
 	@uv run python app/run_app.py
 
 .PHONY: install-app
 install-app:
-	(cd app/frontend && npm install)
+	(cd app/frontend && npm install)
diff --git a/TODO.md b/TODO.md
@@ -0,0 +1,73 @@
+# TODO: Cluster Coactivation Matrix Implementation
+
+## What Was Changed
+
+### 1. Added `ClusterActivations` dataclass (`spd/clustering/dashboard/compute_max_act.py`)
+- New dataclass to hold vectorized cluster activations for all clusters
+- Contains `activations` tensor [n_samples, n_clusters] and `cluster_indices` list
+
+### 2. Added `compute_all_cluster_activations()` function
+- Vectorized computation of all cluster activations at once
+- Replaces the per-cluster loop for better performance
+- Returns `ClusterActivations` object
+
+### 3. Added `compute_cluster_coactivations()` function
+- Computes coactivation matrix from list of `ClusterActivations` across batches
+- Binarizes activations (acts > 0) and computes matrix multiplication: `activation_mask.T @ activation_mask`
+- Follows the pattern from `spd/clustering/merge.py:69`
+- Returns tuple of (coactivation_matrix, cluster_indices)
+
+### 4. Modified `compute_max_activations()` function
+- Now accumulates `ClusterActivations` from each batch in `all_cluster_activations` list
+- Calls `compute_cluster_coactivations()` to compute the matrix
+- **Changed return type**: now returns `tuple[DashboardData, np.ndarray, list[int]]`
+  - Added coactivation matrix and cluster_indices to return value
+
+### 5. Modified `spd/clustering/dashboard/run.py`
+- Updated to handle new return value from `compute_max_activations()`
+- Saves coactivation matrix as `coactivations.npz` in the dashboard output directory
+- NPZ file contains:
+  - `coactivations`: the [n_clusters, n_clusters] matrix
+  - `cluster_indices`: array mapping matrix positions to cluster IDs
+
+## What Needs to be Checked
+
+### Testing
+- [ ] **Run the dashboard pipeline** on a real clustering run to verify:
+  - Coactivation computation doesn't crash
+  - Coactivations are saved correctly to NPZ file
+  - Matrix dimensions are correct
+  - `cluster_indices` mapping is correct
+
+### Type Checking
+- [ ] Run `make type` to ensure no type errors were introduced
+- [ ] Verify jaxtyping annotations are correct
+
+### Verification
+- [ ] Load a saved `coactivations.npz` file and verify:
+  ```python
+  data = np.load("coactivations.npz")
+  coact = data["coactivations"]
+  cluster_indices = data["cluster_indices"]
+  # Check: coact should be symmetric
+  # Check: each diagonal entry coact[i, i] should be >= every other entry in row/column i (a cluster is always active when it coactivates with another)
+  # Check: cluster_indices length should match coact.shape[0]
+  ```
+
+### Performance
+- [ ] Check if vectorization actually improved performance
+- [ ] Monitor memory usage with large numbers of clusters
+
+### Edge Cases
+- [ ] Test with clusters that have zero activations
+- [ ] Test with single-batch runs
+- [ ] Test with very large number of clusters
+
+### Integration
+- [ ] Verify the coactivation matrix can be used in downstream analysis
+- [ ] Consider if visualization of coactivations should be added to dashboard
+
+## Notes
+- The coactivation matrix is computed over all samples processed (n_batches * batch_size * seq_len samples)
+- Binarization threshold is currently hardcoded as `> 0` - may want to make this configurable
+- The computation happens in the dashboard pipeline, NOT during the main clustering pipeline
diff --git a/pyproject.toml b/pyproject.toml
@@ -28,6 +28,9 @@ dependencies = [
     # see:  https://github.com/huggingface/datasets/issues/6980  https://github.com/huggingface/datasets/pull/6991  (fixed in https://github.com/huggingface/datasets/releases/tag/2.21.0 )
     "datasets>=2.21.0",
     "simple_stories_train @ git+https://github.com/goodfire-ai/simple_stories_train.git@dev",
+    "scipy>=1.14.1",
+    "muutils",
+    "scikit-learn",
     "fastapi",
     "uvicorn",
 ]
@@ -40,10 +43,12 @@ dev = [
     "ruff",
     "basedpyright<1.32.0", # pyright and wandb issues, see https://github.com/goodfire-ai/spd/pull/232
     "pre-commit",
+    "nbconvert",
 ]
 
 [project.scripts]
 spd-run = "spd.scripts.run:cli"
+spd-cluster = "spd.clustering.scripts.run_pipeline:cli"
 
 [build-system]
 requires = ["setuptools", "wheel"]

diff --git a/spd/base_config.py b/spd/base_config.py
@@ -6,6 +6,14 @@
 from pydantic import BaseModel, ConfigDict
 
 
+class FileTypeError(ValueError):
+    """Error raised when a file has an unsupported type/extension."""
+
+
+class ConfigValidationError(ValueError):
+    """Raised by `from_file` when pydantic validation fails; the original error is chained."""
+
+
 class BaseConfig(BaseModel):
     """Pydantic BaseModel suited for configs.
 
@@ -15,6 +23,8 @@ class BaseConfig(BaseModel):
 
     model_config: ClassVar[ConfigDict] = ConfigDict(extra="forbid", frozen=True)
 
+    # TODO: add a "config_type" field, which is set to the class name, so that when loading a config we can check whether the config type matches the expected class
+
     @classmethod
     def from_file(cls, path: Path | str) -> Self:
         """Load config from path to a JSON or YAML file."""
@@ -27,9 +37,16 @@ def from_file(cls, path: Path | str) -> Self:
             case Path() if path.suffix in [".yaml", ".yml"]:
                 data = yaml.safe_load(path.read_text())
             case _:
-                raise ValueError(f"Only (.json, .yaml, .yml) files are supported, got {path}")
+                raise FileTypeError(f"Only (.json, .yaml, .yml) files are supported, got {path}")
+
+        try:
+            cfg = cls.model_validate(data)
+        except Exception as e:
+            raise ConfigValidationError(
+                f"Error validating config {cls=} from path `{path.as_posix()}`\n{data = }"
+            ) from e
 
-        return cls.model_validate(data)
+        return cfg
 
     def to_file(self, path: Path | str) -> None:
         """Save config to file (format inferred from extension)."""
@@ -43,4 +60,4 @@ def to_file(self, path: Path | str) -> None:
             case ".yaml" | ".yml":
                 path.write_text(yaml.dump(self.model_dump(mode="json")))
             case _:
-                raise ValueError(f"Only (.json, .yaml, .yml) files are supported, got {path}")
+                raise FileTypeError(f"Only (.json, .yaml, .yml) files are supported, got {path}")
diff --git a/spd/clustering/__init__.py b/spd/clustering/__init__.py