Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
191 changes: 56 additions & 135 deletions generate-combinations.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

import itertools
import argparse
import sys
Expand Down Expand Up @@ -42,7 +41,11 @@
invariants["vectorDimension"] = dataset_info["vector_dimension"]
print("sweep: " + sweep)
for param, value in sweeps[sweep].get("common-params", {}).items():
if not isinstance(value, list):
# efSearch is always passed through as a list — Java handles the iteration
if param == 'efSearch':
# Ensure it's always a list
invariants[param] = value if isinstance(value, list) else [value]
elif not isinstance(value, list):
invariants[param] = value
else:
variants[param] = value
Expand All @@ -54,146 +57,64 @@

for param, value in algorithms[algo].items():
if param not in ["params"]:
if not isinstance(value, list):
# efSearch is always passed through as a list — Java handles the iteration
if param == 'efSearch':
algo_invariants[param] = value if isinstance(value, list) else [value]
elif not isinstance(value, list):
algo_invariants[param] = value
else:
algo_variants[param] = value

# Generate all combination of variants. For each combination, generate a hashed ID, and a file with the
# name pattern as <sweep>-<algo>-<hash>.json. The file should contain the invariants as is, and the variants as the current combination.
if algo_variants:
# Separate efSearch from other variants if it exists
efSearch_values = None
other_variant_keys = []
other_variant_values = []

for key, value in algo_variants.items():
if key == 'efSearch':
efSearch_values = value
else:
other_variant_keys.append(key)
other_variant_values.append(value)

# Generate combinations with efSearch at the beginning (innermost loop)
if efSearch_values and other_variant_keys:
# Generate combinations of other parameters first
for other_combination in itertools.product(*other_variant_values):
other_variants = dict(zip(other_variant_keys, other_combination))
# Then iterate through efSearch values
for ef_index, ef_value in enumerate(efSearch_values):
current_variants = other_variants.copy()
current_variants['efSearch'] = ef_value

# Skip if cagraIntermediateDegree < cagraGraphDegree
if 'cagraIntermediateDegree' in current_variants and 'cagraGraphDegree' in current_variants:
if current_variants['cagraIntermediateDegree'] < current_variants['cagraGraphDegree']:
print(f"\t\tSkipping combination: cagraIntermediateDegree ({current_variants['cagraIntermediateDegree']}) < cagraGraphDegree ({current_variants['cagraGraphDegree']})")
continue

# Skip if hnswMaxConn > hnswBeamWidth
if 'hnswMaxConn' in current_variants and 'hnswBeamWidth' in current_variants:
if current_variants['hnswMaxConn'] > current_variants['hnswBeamWidth']:
print(f"\t\tSkipping combination: hnswMaxConn ({current_variants['hnswMaxConn']}) > hnswBeamWidth ({current_variants['hnswBeamWidth']})")
continue

# Generate hash only from other_variants (excluding efSearch)
base_hash = hashlib.md5(json.dumps(other_variants, sort_keys=True).encode()).hexdigest()[:8]
hash_id = f"{base_hash}-ef{ef_value}"

config = algo_invariants.copy()
config.update(current_variants)

# For multiple efSearch combinations: subsequent ones skip indexing
if len(efSearch_values) > 1 and ef_index > 0:
config['skipIndexing'] = True

# Set cleanIndexDirectory based on position
if ef_index == 0:
config['cleanIndexDirectory'] = False
elif ef_index == len(efSearch_values) - 1:
config['cleanIndexDirectory'] = True
else:
config['cleanIndexDirectory'] = False

# Use base_hash for index directory paths
if 'hnswIndexDirPath' in config:
config['hnswIndexDirPath'] = f"hnswIndex-{base_hash}"
if 'cuvsIndexDirPath' in config:
config['cuvsIndexDirPath'] = f"cuvsIndex-{base_hash}"

filename = f"{algo}-{hash_id}.json"
sweep_dir = f"{args.configs_dir}/{sweep}"
filepath = f"{sweep_dir}/{filename}"
os.makedirs(sweep_dir, exist_ok=True)
with open(filepath, 'w') as f:
json.dump(config, f, indent=2)
print(f"\tGenerated config file: {filepath}")
elif efSearch_values:
# Only efSearch values, no other variants
for ef_index, ef_value in enumerate(efSearch_values):
current_variants = {'efSearch': ef_value}
# Generate hash from empty dict since no other variants exist
base_hash = hashlib.md5(json.dumps({}, sort_keys=True).encode()).hexdigest()[:8]
hash_id = f"{base_hash}-ef{ef_value}"

config = algo_invariants.copy()
config.update(current_variants)

# For multiple efSearch combinations: subsequent ones skip indexing
if len(efSearch_values) > 1 and ef_index > 0:
config['skipIndexing'] = True

# Set cleanIndexDirectory based on position
if ef_index == 0:
config['cleanIndexDirectory'] = False
elif ef_index == len(efSearch_values) - 1:
config['cleanIndexDirectory'] = True
else:
config['cleanIndexDirectory'] = False

# Use base_hash for index directory paths
if 'hnswIndexDirPath' in config:
config['hnswIndexDirPath'] = f"hnswIndex-{base_hash}"
if 'cuvsIndexDirPath' in config:
config['cuvsIndexDirPath'] = f"cuvsIndex-{base_hash}"

filename = f"{algo}-{hash_id}.json"
sweep_dir = f"{args.configs_dir}/{sweep}"
filepath = f"{sweep_dir}/{filename}"
os.makedirs(sweep_dir, exist_ok=True)
with open(filepath, 'w') as f:
json.dump(config, f, indent=2)
print(f"\tGenerated config file: {filepath}")
else:
# No efSearch, use original logic
variant_keys = list(algo_variants.keys())
variant_values = list(algo_variants.values())
for combination in itertools.product(*variant_values):
current_variants = dict(zip(variant_keys, combination))

# Skip if cagraIntermediateDegree < cagraGraphDegree
if 'cagraIntermediateDegree' in current_variants and 'cagraGraphDegree' in current_variants:
if current_variants['cagraIntermediateDegree'] < current_variants['cagraGraphDegree']:
print(f"\t\tSkipping combination: cagraIntermediateDegree ({current_variants['cagraIntermediateDegree']}) < cagraGraphDegree ({current_variants['cagraGraphDegree']})")
continue

# Skip if hnswMaxConn > hnswBeamWidth
if 'hnswMaxConn' in current_variants and 'hnswBeamWidth' in current_variants:
if current_variants['hnswMaxConn'] > current_variants['hnswBeamWidth']:
print(f"\t\tSkipping combination: hnswMaxConn ({current_variants['hnswMaxConn']}) > hnswBeamWidth ({current_variants['hnswBeamWidth']})")
continue

hash_id = hashlib.md5(json.dumps(current_variants, sort_keys=True).encode()).hexdigest()[:8]

config = algo_invariants.copy()
config.update(current_variants)
filename = f"{algo}-{hash_id}.json"
sweep_dir = f"{args.configs_dir}/{sweep}"
filepath = f"{sweep_dir}/{filename}"
os.makedirs(sweep_dir, exist_ok=True)
with open(filepath, 'w') as f:
json.dump(config, f, indent=2)
print(f"\tGenerated config file: {filepath}")
variant_keys = list(algo_variants.keys())
variant_values = list(algo_variants.values())
for combination in itertools.product(*variant_values):
current_variants = dict(zip(variant_keys, combination))

hash_id = hashlib.md5(json.dumps(current_variants, sort_keys=True).encode()).hexdigest()[:8]

config = algo_invariants.copy()
config.update(current_variants)

# Skip if cagraIntermediateGraphDegree < cagraGraphDegree
# (CAGRA silently clamps graphDegree down to intermediateGraphDegree,
# which produces duplicate test runs)
if config.get('cagraIntermediateGraphDegree', float('inf')) < config.get('cagraGraphDegree', 0):
print(f"\t\tSkipping combination: cagraIntermediateGraphDegree ({config['cagraIntermediateGraphDegree']}) < cagraGraphDegree ({config['cagraGraphDegree']})")
continue

# Skip if hnswMaxConn > hnswBeamWidth
if config.get('hnswMaxConn', 0) > config.get('hnswBeamWidth', float('inf')):
print(f"\t\tSkipping combination: hnswMaxConn ({config['hnswMaxConn']}) > hnswBeamWidth ({config['hnswBeamWidth']})")
continue

# Set indexDirPath based on hash
config['indexDirPath'] = f"index-{hash_id}"

filename = f"{algo}-{hash_id}.json"
sweep_dir = f"{args.configs_dir}/{sweep}"
filepath = f"{sweep_dir}/{filename}"
os.makedirs(sweep_dir, exist_ok=True)
with open(filepath, 'w') as f:
json.dump(config, f, indent=2)
print(f"\tGenerated config file: {filepath}")
else:
# No variants at all, just generate a single config
hash_id = hashlib.md5(json.dumps(algo_invariants, sort_keys=True).encode()).hexdigest()[:8]
config = algo_invariants.copy()

# Set indexDirPath based on hash
config['indexDirPath'] = f"index-{hash_id}"

filename = f"{algo}-{hash_id}.json"
sweep_dir = f"{args.configs_dir}/{sweep}"
filepath = f"{sweep_dir}/{filename}"
os.makedirs(sweep_dir, exist_ok=True)
with open(filepath, 'w') as f:
json.dump(config, f, indent=2)
print(f"\tGenerated config file: {filepath}")


print("----------------------")
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import com.nvidia.cuvs.CagraIndexParams.CudaDataType;
import com.nvidia.cuvs.CagraIndexParams.CuvsDistanceType;
import com.searchscale.lucene.cuvs.benchmarks.LuceneCuvsBenchmarks.Codex;
import java.util.List;

public class BenchmarkConfiguration {

Expand Down Expand Up @@ -47,7 +48,7 @@ public class BenchmarkConfiguration {
public int cagraITopK;
public int cagraSearchWidth;
public int cagraHnswLayers; // layers in CAGRA->HNSW conversion
public int efSearch;
public List<Integer> efSearch; // e.g. [64] or [64, 128, 256]
public CagraGraphBuildAlgo cagraGraphBuildAlgo;

// CAGRA IVF_PQ parameters
Expand Down Expand Up @@ -89,11 +90,22 @@ public boolean isCagraHNSWScalar() {
return Codex.CAGRA_HNSW_SCALAR.equals(algoToRun);
}

public int getEffectiveEfSearch() {
if (efSearch > 0) {
/**
* Returns the list of efSearch values to use during search.
*
* <p>If {@code efSearch} is set in the config JSON (e.g. [64, 128, 256]),
* those values are returned directly. Otherwise, falls back to a single-element
* list containing a default derived from topK.
*
* <p>The benchmark runner iterates over these values and runs search once per value
* against the <b>same</b> index — no rebuild is needed.
*/
public List<Integer> getEfSearchValues() {
if (efSearch != null && !efSearch.isEmpty()) {
return efSearch;
}
return Math.max(topK, (int) Math.ceil(topK * 1.5));
// Default: 1.5x topK, but at least topK
return List.of(Math.max(topK, (int) Math.ceil(topK * 1.5)));
}

public String prettyString() {
Expand Down Expand Up @@ -128,6 +140,7 @@ public String prettyString() {
sb.append("Enable TieredMerge: ").append(enableTieredMerge).append('\n');
sb.append("Num HNSW merge threads: ").append(hnswMergeThreads).append('\n');
sb.append("enableIndexWriterInfoStream: ").append(enableIndexWriterInfoStream).append('\n');
sb.append("efSearch: ").append(getEfSearchValues()).append('\n');

sb.append("------- algo parameters ------\n");
if (isLucene()) {
Expand Down
Loading