Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
191 changes: 56 additions & 135 deletions generate-combinations.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

import itertools
import argparse
import sys
Expand Down Expand Up @@ -42,7 +41,11 @@
invariants["vectorDimension"] = dataset_info["vector_dimension"]
print("sweep: " + sweep)
for param, value in sweeps[sweep].get("common-params", {}).items():
if not isinstance(value, list):
# efSearch is always passed through as a list — Java handles the iteration
if param == 'efSearch':
# Ensure it's always a list
invariants[param] = value if isinstance(value, list) else [value]
elif not isinstance(value, list):
invariants[param] = value
else:
variants[param] = value
Expand All @@ -54,146 +57,64 @@

for param, value in algorithms[algo].items():
if param not in ["params"]:
if not isinstance(value, list):
# efSearch is always passed through as a list — Java handles the iteration
if param == 'efSearch':
algo_invariants[param] = value if isinstance(value, list) else [value]
elif not isinstance(value, list):
algo_invariants[param] = value
else:
algo_variants[param] = value

# Generate all combination of variants. For each combination, generate a hashed ID, and a file with the
# name pattern as <sweep>-<algo>-<hash>.json. The file should contain the invariants as is, and the variants as the current combination.
if algo_variants:
# Separate efSearch from other variants if it exists
efSearch_values = None
other_variant_keys = []
other_variant_values = []

for key, value in algo_variants.items():
if key == 'efSearch':
efSearch_values = value
else:
other_variant_keys.append(key)
other_variant_values.append(value)

# Generate combinations with efSearch at the beginning (innermost loop)
if efSearch_values and other_variant_keys:
# Generate combinations of other parameters first
for other_combination in itertools.product(*other_variant_values):
other_variants = dict(zip(other_variant_keys, other_combination))
# Then iterate through efSearch values
for ef_index, ef_value in enumerate(efSearch_values):
current_variants = other_variants.copy()
current_variants['efSearch'] = ef_value

# Skip if cagraIntermediateDegree < cagraGraphDegree
if 'cagraIntermediateDegree' in current_variants and 'cagraGraphDegree' in current_variants:
if current_variants['cagraIntermediateDegree'] < current_variants['cagraGraphDegree']:
print(f"\t\tSkipping combination: cagraIntermediateDegree ({current_variants['cagraIntermediateDegree']}) < cagraGraphDegree ({current_variants['cagraGraphDegree']})")
continue

# Skip if hnswMaxConn > hnswBeamWidth
if 'hnswMaxConn' in current_variants and 'hnswBeamWidth' in current_variants:
if current_variants['hnswMaxConn'] > current_variants['hnswBeamWidth']:
print(f"\t\tSkipping combination: hnswMaxConn ({current_variants['hnswMaxConn']}) > hnswBeamWidth ({current_variants['hnswBeamWidth']})")
continue

# Generate hash only from other_variants (excluding efSearch)
base_hash = hashlib.md5(json.dumps(other_variants, sort_keys=True).encode()).hexdigest()[:8]
hash_id = f"{base_hash}-ef{ef_value}"

config = algo_invariants.copy()
config.update(current_variants)

# For multiple efSearch combinations: subsequent ones skip indexing
if len(efSearch_values) > 1 and ef_index > 0:
config['skipIndexing'] = True

# Set cleanIndexDirectory based on position
if ef_index == 0:
config['cleanIndexDirectory'] = False
elif ef_index == len(efSearch_values) - 1:
config['cleanIndexDirectory'] = True
else:
config['cleanIndexDirectory'] = False

# Use base_hash for index directory paths
if 'hnswIndexDirPath' in config:
config['hnswIndexDirPath'] = f"hnswIndex-{base_hash}"
if 'cuvsIndexDirPath' in config:
config['cuvsIndexDirPath'] = f"cuvsIndex-{base_hash}"

filename = f"{algo}-{hash_id}.json"
sweep_dir = f"{args.configs_dir}/{sweep}"
filepath = f"{sweep_dir}/{filename}"
os.makedirs(sweep_dir, exist_ok=True)
with open(filepath, 'w') as f:
json.dump(config, f, indent=2)
print(f"\tGenerated config file: {filepath}")
elif efSearch_values:
# Only efSearch values, no other variants
for ef_index, ef_value in enumerate(efSearch_values):
current_variants = {'efSearch': ef_value}
# Generate hash from empty dict since no other variants exist
base_hash = hashlib.md5(json.dumps({}, sort_keys=True).encode()).hexdigest()[:8]
hash_id = f"{base_hash}-ef{ef_value}"

config = algo_invariants.copy()
config.update(current_variants)

# For multiple efSearch combinations: subsequent ones skip indexing
if len(efSearch_values) > 1 and ef_index > 0:
config['skipIndexing'] = True

# Set cleanIndexDirectory based on position
if ef_index == 0:
config['cleanIndexDirectory'] = False
elif ef_index == len(efSearch_values) - 1:
config['cleanIndexDirectory'] = True
else:
config['cleanIndexDirectory'] = False

# Use base_hash for index directory paths
if 'hnswIndexDirPath' in config:
config['hnswIndexDirPath'] = f"hnswIndex-{base_hash}"
if 'cuvsIndexDirPath' in config:
config['cuvsIndexDirPath'] = f"cuvsIndex-{base_hash}"

filename = f"{algo}-{hash_id}.json"
sweep_dir = f"{args.configs_dir}/{sweep}"
filepath = f"{sweep_dir}/{filename}"
os.makedirs(sweep_dir, exist_ok=True)
with open(filepath, 'w') as f:
json.dump(config, f, indent=2)
print(f"\tGenerated config file: {filepath}")
else:
# No efSearch, use original logic
variant_keys = list(algo_variants.keys())
variant_values = list(algo_variants.values())
for combination in itertools.product(*variant_values):
current_variants = dict(zip(variant_keys, combination))

# Skip if cagraIntermediateDegree < cagraGraphDegree
if 'cagraIntermediateDegree' in current_variants and 'cagraGraphDegree' in current_variants:
if current_variants['cagraIntermediateDegree'] < current_variants['cagraGraphDegree']:
print(f"\t\tSkipping combination: cagraIntermediateDegree ({current_variants['cagraIntermediateDegree']}) < cagraGraphDegree ({current_variants['cagraGraphDegree']})")
continue

# Skip if hnswMaxConn > hnswBeamWidth
if 'hnswMaxConn' in current_variants and 'hnswBeamWidth' in current_variants:
if current_variants['hnswMaxConn'] > current_variants['hnswBeamWidth']:
print(f"\t\tSkipping combination: hnswMaxConn ({current_variants['hnswMaxConn']}) > hnswBeamWidth ({current_variants['hnswBeamWidth']})")
continue

hash_id = hashlib.md5(json.dumps(current_variants, sort_keys=True).encode()).hexdigest()[:8]

config = algo_invariants.copy()
config.update(current_variants)
filename = f"{algo}-{hash_id}.json"
sweep_dir = f"{args.configs_dir}/{sweep}"
filepath = f"{sweep_dir}/{filename}"
os.makedirs(sweep_dir, exist_ok=True)
with open(filepath, 'w') as f:
json.dump(config, f, indent=2)
print(f"\tGenerated config file: {filepath}")
variant_keys = list(algo_variants.keys())
variant_values = list(algo_variants.values())
for combination in itertools.product(*variant_values):
current_variants = dict(zip(variant_keys, combination))

hash_id = hashlib.md5(json.dumps(current_variants, sort_keys=True).encode()).hexdigest()[:8]

config = algo_invariants.copy()
config.update(current_variants)

# Skip if cagraIntermediateGraphDegree < cagraGraphDegree
# (CAGRA silently clamps graphDegree down to intermediateGraphDegree,
# which produces duplicate test runs)
if config.get('cagraIntermediateGraphDegree', float('inf')) < config.get('cagraGraphDegree', 0):
print(f"\t\tSkipping combination: cagraIntermediateGraphDegree ({config['cagraIntermediateGraphDegree']}) < cagraGraphDegree ({config['cagraGraphDegree']})")
continue

# Skip if hnswMaxConn > hnswBeamWidth
if config.get('hnswMaxConn', 0) > config.get('hnswBeamWidth', float('inf')):
print(f"\t\tSkipping combination: hnswMaxConn ({config['hnswMaxConn']}) > hnswBeamWidth ({config['hnswBeamWidth']})")
continue

# Set indexDirPath based on hash
config['indexDirPath'] = f"index-{hash_id}"

filename = f"{algo}-{hash_id}.json"
sweep_dir = f"{args.configs_dir}/{sweep}"
filepath = f"{sweep_dir}/{filename}"
os.makedirs(sweep_dir, exist_ok=True)
with open(filepath, 'w') as f:
json.dump(config, f, indent=2)
print(f"\tGenerated config file: {filepath}")
else:
# No variants at all, just generate a single config
hash_id = hashlib.md5(json.dumps(algo_invariants, sort_keys=True).encode()).hexdigest()[:8]
config = algo_invariants.copy()

# Set indexDirPath based on hash
config['indexDirPath'] = f"index-{hash_id}"

filename = f"{algo}-{hash_id}.json"
sweep_dir = f"{args.configs_dir}/{sweep}"
filepath = f"{sweep_dir}/{filename}"
os.makedirs(sweep_dir, exist_ok=True)
with open(filepath, 'w') as f:
json.dump(config, f, indent=2)
print(f"\tGenerated config file: {filepath}")


print("----------------------")
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import com.nvidia.cuvs.CagraIndexParams.CudaDataType;
import com.nvidia.cuvs.CagraIndexParams.CuvsDistanceType;
import com.searchscale.lucene.cuvs.benchmarks.LuceneCuvsBenchmarks.Codex;
import java.util.List;

public class BenchmarkConfiguration {

Expand Down Expand Up @@ -47,7 +48,7 @@ public class BenchmarkConfiguration {
public int cagraITopK;
public int cagraSearchWidth;
public int cagraHnswLayers; // layers in CAGRA->HNSW conversion
public int efSearch;
public List<Integer> efSearch; // e.g. [64] or [64, 128, 256]
public CagraGraphBuildAlgo cagraGraphBuildAlgo;

// CAGRA IVF_PQ parameters
Expand Down Expand Up @@ -89,11 +90,22 @@ public boolean isCagraHNSWScalar() {
return Codex.CAGRA_HNSW_SCALAR.equals(algoToRun);
}

public int getEffectiveEfSearch() {
if (efSearch > 0) {
/**
* Returns the list of efSearch values to use during search.
*
* <p>If {@code efSearch} is set in the config JSON (e.g. [64, 128, 256]),
* those values are returned directly. Otherwise, falls back to a single-element
* list containing a default derived from topK.
*
* <p>The benchmark runner iterates over these values and runs search once per value
* against the <b>same</b> index — no rebuild is needed.
*/
public List<Integer> getEfSearchValues() {
if (efSearch != null && !efSearch.isEmpty()) {
return efSearch;
}
return Math.max(topK, (int) Math.ceil(topK * 1.5));
// Default: 1.5x topK, but at least topK
return List.of(Math.max(topK, (int) Math.ceil(topK * 1.5)));
}

public String prettyString() {
Expand Down Expand Up @@ -128,6 +140,7 @@ public String prettyString() {
sb.append("Enable TieredMerge: ").append(enableTieredMerge).append('\n');
sb.append("Num HNSW merge threads: ").append(hnswMergeThreads).append('\n');
sb.append("enableIndexWriterInfoStream: ").append(enableIndexWriterInfoStream).append('\n');
sb.append("efSearch: ").append(getEfSearchValues()).append('\n');

sb.append("------- algo parameters ------\n");
if (isLucene()) {
Expand Down
Loading