ArcInstitute · abhinadduri · Sep 14, 2025 · Sep 14, 2025 · Sep 14, 2025 · Sep 14, 2025
@@ -1,19 +1,19 @@
 [project]
 name = "arc-state"
-version = "0.9.31"
+version = "0.9.32"
 description = "State is a machine learning model that predicts cellular perturbation response across diverse contexts."
 readme = "README.md"
 authors = [
     { name = "Abhinav Adduri", email = "abhinav.adduri@arcinstitute.org" },
     { name = "Yusuf Roohani", email = "yusuf.roohani@arcinstitute.org" },
     { name = "Noam Teyssier", email = "noam.teyssier@arcinstitute.org" },
-    { name = "Rajesh Ilango" },
+    { name = "Rajesh Ilango", email = "rilango@gmail.com" },
     { name = "Dhruv Gautam", email = "dhruvgautam@berkeley.edu" },
 ]
 requires-python = ">=3.10,<3.13"
 dependencies = [
     "anndata>=0.11.4",
-    "cell-load>=0.8.3",
+    "cell-load>=0.8.7",
     "numpy>=2.2.6",
     "pandas>=2.2.3",
     "pyyaml>=6.0.2",
@@ -27,11 +27,15 @@ dependencies = [
     "geomloss>=0.2.6",
     "transformers>=4.52.3",
     "peft>=0.11.0",
-    "cell-eval>=0.5.22",
+    "cell-eval>=0.6.2",
     "ipykernel>=6.30.1",
     "scipy>=1.15.0",
 ]
 
+[tool.uv.sources]
+# cell-load resolves from the package index; never commit a machine-local `path` source (e.g. a /home checkout) because it breaks `uv sync` for everyone else.
+cell-eval = { git = "https://github.com/ArcInstitute/cell-eval", branch = "aadduri/aupr_curves" }
+
 [project.optional-dependencies]
 vectordb = [
     "lancedb>=0.24.0"

@@ -11,6 +11,7 @@
     run_emb_query,
     run_emb_preprocess,
     run_emb_eval,
+    run_tx_combo,
     run_tx_infer,
     run_tx_predict,
     run_tx_preprocess_infer,
@@ -124,6 +125,8 @@ def main():
                 case "infer":
                     # Run inference using argparse, similar to predict
                     run_tx_infer(args)
+                case "combo":
+                    run_tx_combo(args)
                 case "preprocess_train":
                     # Run preprocessing using argparse
                     run_tx_preprocess_train(args.adata, args.output, args.num_hvgs)

@@ -1,6 +1,7 @@
 from ._emb import add_arguments_emb, run_emb_fit, run_emb_transform, run_emb_query, run_emb_preprocess, run_emb_eval
 from ._tx import (
     add_arguments_tx,
+    run_tx_combo,
     run_tx_infer,
     run_tx_predict,
     run_tx_preprocess_infer,
@@ -16,6 +17,7 @@
     "run_tx_infer",
     "run_tx_preprocess_train",
     "run_tx_preprocess_infer",
+    "run_tx_combo",
     "run_emb_fit",
     "run_emb_query",
     "run_emb_transform",

@@ -3,8 +3,16 @@
 
 def add_arguments_transform(parser: ap.ArgumentParser):
     """Add arguments for state embedding CLI."""
-    parser.add_argument("--model-folder", required=True, help="Path to the model checkpoint folder")
-    parser.add_argument("--checkpoint", required=False, help="Path to the specific model checkpoint")
+    parser.add_argument(
+        "--model-folder",
+        required=False,
+        help="Path to the model checkpoint folder (required if --checkpoint is not provided)",
+    )
+    parser.add_argument(
+        "--checkpoint",
+        required=False,
+        help="Path to the specific model checkpoint (required if --model-folder is not provided)",
+    )
     parser.add_argument(
         "--config",
         required=False,
@@ -46,6 +54,7 @@ def run_emb_transform(args: ap.ArgumentParser):
     import glob
     import logging
     import os
+    import numpy as np
 
     import torch
     from omegaconf import OmegaConf
@@ -60,13 +69,19 @@ def run_emb_transform(args: ap.ArgumentParser):
         logger.error("Either --output or --lancedb must be provided")
         raise ValueError("Either --output or --lancedb must be provided")
 
-    # look in the model folder with glob for *.ckpt, get the first one, and print it
-    model_files = glob.glob(os.path.join(args.model_folder, "*.ckpt"))
-    if not model_files:
-        logger.error(f"No model checkpoint found in {args.model_folder}")
-        raise FileNotFoundError(f"No model checkpoint found in {args.model_folder}")
-    if not args.checkpoint:
-        args.checkpoint = model_files[-1]
+    # Resolve checkpoint path, allowing either --checkpoint, --model-folder, or both
+    checkpoint_path = args.checkpoint
+    if args.model_folder:
+        model_files = glob.glob(os.path.join(args.model_folder, "*.ckpt"))
+        if not model_files and not checkpoint_path:
+            logger.error(f"No model checkpoint found in {args.model_folder}")
+            raise FileNotFoundError(f"No model checkpoint found in {args.model_folder}")
+        if not checkpoint_path and model_files:
+            checkpoint_path = model_files[-1]
+    if not checkpoint_path:
+        logger.error("Either --checkpoint or --model-folder must be provided")
+        raise ValueError("Either --checkpoint or --model-folder must be provided")
+    args.checkpoint = checkpoint_path
     logger.info(f"Using model checkpoint: {args.checkpoint}")
 
     # Create inference object
@@ -79,7 +94,7 @@ def run_emb_transform(args: ap.ArgumentParser):
     if args.protein_embeddings:
         logger.info(f"Using protein embeddings override: {args.protein_embeddings}")
         protein_embeds = torch.load(args.protein_embeddings, weights_only=False, map_location="cpu")
-    else:
+    elif args.model_folder:
         # Try auto-detect in model folder
         try:
             exact_path = os.path.join(args.model_folder, "protein_embeddings.pt")
@@ -110,6 +125,12 @@ def run_emb_transform(args: ap.ArgumentParser):
     logger.info(f"Loading model from checkpoint: {args.checkpoint}")
     inferer.load_model(args.checkpoint)
 
+    save_as_npy = False
+    output_target = args.output
+    if args.output:
+        _, ext = os.path.splitext(args.output)
+        save_as_npy = ext.lower() == ".npy"
+
     # Create output directory if it doesn't exist
     if args.output:
         output_dir = os.path.dirname(args.output)
@@ -120,18 +141,28 @@ def run_emb_transform(args: ap.ArgumentParser):
     # Generate embeddings
     logger.info(f"Computing embeddings for {args.input}")
     if args.output:
-        logger.info(f"Output will be saved to {args.output}")
+        if save_as_npy:
+            logger.info(f"Output embeddings will be saved to {args.output} as a NumPy array")
+        else:
+            logger.info(f"Output will be saved to {args.output}")
     if args.lancedb:
         logger.info(f"Embeddings will be saved to LanceDB at {args.lancedb}")
 
-    inferer.encode_adata(
+    embeddings = inferer.encode_adata(
         input_adata_path=args.input,
-        output_adata_path=args.output,
+        output_adata_path=None if save_as_npy else output_target,
         emb_key=args.embed_key,
         batch_size=args.batch_size if getattr(args, "batch_size", None) is not None else None,
         lancedb_path=args.lancedb,
         update_lancedb=args.lancedb_update,
         lancedb_batch_size=args.lancedb_batch_size,
     )
 
+    if save_as_npy:
+        if embeddings is None:
+            logger.error("Failed to generate embeddings for NumPy output")
+            raise RuntimeError("Embedding generation returned no data")
+        np.save(args.output, embeddings)
+        logger.info(f"Saved embeddings matrix with shape {embeddings.shape} to {args.output}")
+
     logger.info("Embedding computation completed successfully!")
@@ -5,13 +5,15 @@
 from ._preprocess_infer import add_arguments_preprocess_infer, run_tx_preprocess_infer
 from ._preprocess_train import add_arguments_preprocess_train, run_tx_preprocess_train
 from ._train import add_arguments_train, run_tx_train
+from ._combo import add_arguments_combo, run_tx_combo
 
 __all__ = [
     "run_tx_train",
     "run_tx_predict",
     "run_tx_infer",
     "run_tx_preprocess_train",
     "run_tx_preprocess_infer",
+    "run_tx_combo",
     "add_arguments_tx",
 ]
 
@@ -24,3 +26,4 @@ def add_arguments_tx(parser: ap.ArgumentParser):
     add_arguments_infer(subparsers.add_parser("infer"))
     add_arguments_preprocess_train(subparsers.add_parser("preprocess_train"))
     add_arguments_preprocess_infer(subparsers.add_parser("preprocess_infer"))
+    add_arguments_combo(subparsers.add_parser("combo"))