From 3511498570e3293d63b97d5485536c1585d01be9 Mon Sep 17 00:00:00 2001 From: Adrian Romberg Date: Wed, 3 Dec 2025 16:48:47 +0100 Subject: [PATCH 1/7] add a CLI command to run the full XspecT pipeline --- src/xspect/main.py | 159 +++++++++++++++++++++++++++++++++++++++++++ tests/test_cli.py | 165 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 324 insertions(+) diff --git a/src/xspect/main.py b/src/xspect/main.py index fbf019b..3b53225 100644 --- a/src/xspect/main.py +++ b/src/xspect/main.py @@ -29,6 +29,165 @@ def web(): run(app, host="0.0.0.0", port=8000) +@cli.command( + name="all", + help="Run full classification pipeline: genus filtering, species classification, and MLST (if applicable).", +) +@click.option( + "-g", + "--genus", + "model_genus", + help="Genus of the model to use.", + type=click.Choice(get_models().get("Species", [])), + prompt=True, +) +@click.option( + "-i", + "--input-path", + help="Path to FASTA or FASTQ file for classification.", + type=click.Path(exists=True, dir_okay=True, file_okay=True), + prompt=True, + default=Path("."), +) +@click.option( + "-o", + "--output-dir", + help="Directory for output files (default: auto-generated 'xspect_results_' directory).", + type=click.Path(dir_okay=True, file_okay=False), + default=None, +) +@click.option( + "-t", + "--threshold", + type=click.FloatRange(0, 1), + help="Threshold for genus filtering (default: 0.7).", + default=0.7, +) +@click.option( + "--sparse-sampling-step", + type=int, + help="Sparse sampling step (e. g. 
only every 500th kmer for '--sparse-sampling-step 500').", + default=1, +) +@click.option( + "-n", + "--display-names", + help="Includes the display names next to taxonomy-IDs.", + is_flag=True, +) +@click.option( + "-v", + "--validation", + help="Detects misclassification for small reads or contigs.", + is_flag=True, +) +def all_pipeline( + model_genus, + input_path, + output_dir, + threshold, + sparse_sampling_step, + display_names, + validation, +): + """Run full classification pipeline.""" + import json + from xspect import filter_sequences, classify + from xspect.file_io import fasta_endings, fastq_endings + + run_id = uuid4() + + if output_dir is None: + output_dir = Path(f"xspect_results_{run_id}") + else: + output_dir = Path(output_dir) + + output_dir.mkdir(exist_ok=True, parents=True) + input_path = Path(input_path) + + filtered_dir = output_dir / "filtered_sequences" + filtered_dir.mkdir(exist_ok=True, parents=True) + + genus_filtered_path = filtered_dir / f"genus_filtered_{run_id}.fasta" + genus_classification_path = output_dir / f"genus_classification_{run_id}.json" + species_classification_path = output_dir / f"species_classification_{run_id}.json" + + # Step 1: Genus filtering + click.echo(f"Step 1/3: Filtering for genus {model_genus}...") + filter_sequences.filter_genus( + model_genus, + input_path, + genus_filtered_path, + threshold, + genus_classification_path, + sparse_sampling_step=sparse_sampling_step, + ) + + ending_wildcards = [f"*.{ending}" for ending in fasta_endings + fastq_endings] + filtered_files = [p for e in ending_wildcards for p in filtered_dir.glob(e)] + + if not filtered_files: + click.echo("No sequences passed the genus filter. Pipeline aborted.") + return + + # Step 2: Species classification on filtered sequences + click.echo( + f"Step 2/3: Classifying species for {len(filtered_files)} filtered file(s)..." 
+ ) + classify.classify_species( + model_genus, + filtered_dir, + species_classification_path, + sparse_sampling_step, + display_names, + validation, + None, + ) + + species_results = list(output_dir.glob(f"species_classification_{run_id}*.json")) + + # Step 3: Check if we need to run MLST (if any prediction is "470" for abaumannii) + mlst_needed = False + for species_result_path in species_results: + with open(species_result_path, "r", encoding="utf-8") as f: + species_result = json.load(f) + + prediction = species_result.get("prediction") + if prediction == "470": + mlst_needed = True + click.echo( + f"Species prediction is 470 (abaumannii) in {species_result_path.name}." + ) + + if mlst_needed: + click.echo("Step 3/3: Running MLST classification for abaumannii...") + + mlst_schemes = get_available_mlst_schemes() + if "abaumannii" in mlst_schemes and mlst_schemes["abaumannii"]: + scheme = mlst_schemes["abaumannii"][0] + mlst_output_path = output_dir / f"mlst_classification_{run_id}.json" + + classify.classify_mlst( + filtered_dir, + "abaumannii", + scheme, + mlst_output_path, + False, + ) + click.echo(f"MLST classification completed: {mlst_output_path.name}") + else: + click.echo( + "Warning: No MLST schemes available for abaumannii. Skipping MLST classification." + ) + else: + click.echo( + "Step 3/3: Not running MLST classification (organism is not Acinetobacter baumannii)." 
+ ) + + click.echo("\nPipeline completed successfully!") + click.echo(f"Results saved in: {output_dir}") + + # # # # # # # # # # # # # # # # Model management commands # # # # # # # # # # # # # # # # diff --git a/tests/test_cli.py b/tests/test_cli.py index 670d87b..cfd9518 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -274,3 +274,168 @@ def test_filter_species_max_scoring(mixed_species_assembly_file_path, tmpdir): filtered_content = f.read() assert "Acinetobacter calcoaceticus" in filtered_content assert "Acinetobacter baumannii" not in filtered_content + + +@pytest.mark.parametrize( + ["assembly_file_path", "genus", "species"], + [ + ( + "GCF_000069245.1_ASM6924v1_genomic.fna", + "Acinetobacter", + "470", + ), + ], + indirect=["assembly_file_path"], +) +def test_all_pipeline(assembly_file_path, genus, species, tmpdir): + """Test the complete 'xspect all' pipeline""" + runner = CliRunner() + output_dir = str(tmpdir) + "/all_results" + + result = runner.invoke( + cli, + [ + "all", + "-g", + genus, + "-i", + assembly_file_path, + "-o", + output_dir, + "-t", + "0.7", + ], + ) + assert result.exit_code == 0, f"Error: {result.output}" + + output_path = Path(output_dir) + assert output_path.exists() + + filtered_dir = output_path / "filtered_sequences" + assert filtered_dir.exists() + + filtered_files = list(filtered_dir.glob("genus_filtered_*.fasta")) + assert len(filtered_files) > 0, "No genus filtered files found" + + genus_classification_files = list(output_path.glob("genus_classification_*.json")) + assert len(genus_classification_files) > 0, "No genus classification files found" + + species_classification_files = list( + output_path.glob("species_classification_*.json") + ) + assert ( + len(species_classification_files) > 0 + ), "No species classification files found" + + with open(species_classification_files[0], encoding="utf-8") as f: + result_content = json.load(f) + assert result_content["prediction"] == species + + if species == "470": + 
mlst_classification_files = list(output_path.glob("mlst_classification_*.json")) + assert ( + len(mlst_classification_files) > 0 + ), "No MLST classification files found for abaumannii" + + +@pytest.mark.parametrize( + ["assembly_file_path", "genus", "species"], + [ + ( + "GCF_000006945.2_ASM694v2_genomic.fna", + "Salmonella", + "28901", + ), + ], + indirect=["assembly_file_path"], +) +def test_all_pipeline_no_mlst(assembly_file_path, genus, species, tmpdir): + """Test 'xspect all' pipeline without MLST (non-abaumannii species)""" + runner = CliRunner() + output_dir = str(tmpdir) + "/all_results_no_mlst" + + result = runner.invoke( + cli, + [ + "all", + "-g", + genus, + "-i", + assembly_file_path, + "-o", + output_dir, + "-t", + "0.7", + ], + ) + assert result.exit_code == 0, f"Error: {result.output}" + + # Verify output directory structure + output_path = Path(output_dir) + assert output_path.exists() + + # Check for species classification results + species_classification_files = list( + output_path.glob("species_classification_*.json") + ) + assert len(species_classification_files) > 0 + + # Verify species classification content + with open(species_classification_files[0], encoding="utf-8") as f: + result_content = json.load(f) + assert result_content["prediction"] == species + + # No MLST classification should exist for non-abaumannii species + mlst_classification_files = list(output_path.glob("mlst_classification_*.json")) + assert ( + len(mlst_classification_files) == 0 + ), "MLST classification should not exist for non-abaumannii" + + +@pytest.mark.parametrize( + "assembly_file_path", + [ + "GCF_000069245.1_ASM6924v1_genomic.fna", + ], + indirect=["assembly_file_path"], +) +def test_all_pipeline_auto_directory(assembly_file_path, tmpdir): + """Test 'xspect all' with auto-generated output directory""" + runner = CliRunner() + + # Change to tmpdir to ensure auto-generated directory is created there + import os + + original_dir = os.getcwd() + os.chdir(tmpdir) + 
+ try: + result = runner.invoke( + cli, + [ + "all", + "-g", + "Acinetobacter", + "-i", + assembly_file_path, + "-t", + "0.7", + ], + ) + assert result.exit_code == 0, f"Error: {result.output}" + + # Check that an auto-generated directory was created + result_dirs = [ + d + for d in Path(tmpdir).iterdir() + if d.is_dir() and d.name.startswith("xspect_results_") + ] + assert len(result_dirs) > 0, "No auto-generated output directory found" + + # Verify the directory contains expected files + output_path = result_dirs[0] + assert (output_path / "filtered_sequences").exists() + assert len(list(output_path.glob("genus_classification_*.json"))) > 0 + assert len(list(output_path.glob("species_classification_*.json"))) > 0 + finally: + os.chdir(original_dir) From 53bcb7571af75edfd7b3bce8434595319bc8e76b Mon Sep 17 00:00:00 2001 From: Adrian Romberg Date: Mon, 24 Nov 2025 12:52:29 +0100 Subject: [PATCH 2/7] Train models using gene families from ppanggolin --- scripts/nextflow-utils/environment.yml | 2 +- scripts/pangenome-train/main.nf | 500 ++++++++++++++++++++++++ scripts/pangenome-train/nextflow.config | 6 + 3 files changed, 507 insertions(+), 1 deletion(-) create mode 100644 scripts/pangenome-train/main.nf create mode 100644 scripts/pangenome-train/nextflow.config diff --git a/scripts/nextflow-utils/environment.yml b/scripts/nextflow-utils/environment.yml index 7381a97..f604c56 100644 --- a/scripts/nextflow-utils/environment.yml +++ b/scripts/nextflow-utils/environment.yml @@ -5,4 +5,4 @@ dependencies: - python=3.12 - pip - pip: - - XspecT==0.5.4 \ No newline at end of file + - XspecT==0.7.2 \ No newline at end of file diff --git a/scripts/pangenome-train/main.nf b/scripts/pangenome-train/main.nf new file mode 100644 index 0000000..4c44253 --- /dev/null +++ b/scripts/pangenome-train/main.nf @@ -0,0 +1,500 @@ +#!/usr/bin/env nextflow + +include { strain_species_mapping } from '../nextflow-utils' + +// --------------------- PARAMETERS --------------------- 
+params.dataset_dir = "data/genomes/ncbi_dataset" +params.cpus = 32 + +// --------------------- WORKFLOW ----------------------- +workflow { + + // Build maps + taxon_map = EXTRACT_TAXON(file(params.dataset_dir) + "/data/assembly_data_report.jsonl") + fasta_index = INDEX_FASTAS(file(params.dataset_dir)) + map = JOIN_MAPS(taxon_map, fasta_index) + + tax_report = file("data/aci_species.json") + + tax_mapping_json = strain_species_mapping(tax_report) + reassigned_map = REASSIGN_MAP(map, tax_mapping_json) + + // Exclude taxa with ambiguous or provisional names (e.g., containing "sp." or "Candidatus") + filtered_name_map = FILTER_TAXON_NAMES(reassigned_map, tax_report) + // Filter to only ACB clade species + // acb_map = FILTER_ACB(reassigned_map) + + split_files = TRAIN_TEST_SPLIT(filtered_name_map) + filtered = FILTER_SINGLETON_TAXA(split_files.train.flatten().collect()) + train_taxon_files = filtered.train.flatten() + + pangenomes = PANGENOME(train_taxon_files, file(params.dataset_dir)) + pangenome_fastas = EXTRACT_FASTA(pangenomes) + organized_fasta_folders = ORGANIZE_PANGENOME_FASTAS(pangenome_fastas.collect()) + + fasta_folder_ch = organized_fasta_folders.core.mix( + organized_fasta_folders.softcore_95, + organized_fasta_folders.softcore_90, + organized_fasta_folders.softcore_85, + organized_fasta_folders.softcore_80, + organized_fasta_folders.softcore_75, + organized_fasta_folders.persistent, + organized_fasta_folders.persistent_shell, + organized_fasta_folders.persistent_shell_cloud, + ) + + trained_models = XSPECT_TRAIN(fasta_folder_ch) + UPDATE_MODEL_METADATA(trained_models.collect(), train_taxon_files.collect(), tax_report) +} + +// --------------------- PROCESSES ---------------------- + +process EXTRACT_TAXON { + publishDir "results/pangenome-train/maps", mode: 'copy' + + conda "conda-forge::jq" + + input: + path jsonl + + output: + path "genome_to_taxid.tsv", emit: 'genome_to_taxid_tsv' + + script: + """ + set -euo pipefail + jq -r 
'select(.averageNucleotideIdentity.taxonomyCheckStatus == "OK" and .checkmInfo.completeness >= 95) | [.accession, (.organism.taxId // .taxon.taxId // "Unknown")] | @tsv' "${jsonl}" > genome_to_taxid.tsv + """ +} + +process INDEX_FASTAS { + publishDir "results/pangenome-train/maps", mode: 'copy' + + input: + path dataset_dir + + output: + path "accession_to_fna.tsv", emit: 'accession_to_fna_tsv' + + script: + """ + set -euo pipefail + find "${dataset_dir}/data" -type f '(' -name "*genomic.fna" -o -name "*genomic.fna.gz" ')' | + awk '{ + n = split(\$0, a, "/"); + acc = a[n-1]; + print acc "\t" \$0 + }' | + sort > accession_to_fna.tsv + """ +} + +process JOIN_MAPS { + publishDir "results/pangenome-train/maps", mode: 'copy' + + input: + path genome_to_taxid_tsv + path accession_to_fna_tsv + + output: + path "accession_taxid_fna.tsv", emit: 'accession_taxid_fna_tsv' + + script: + """ + set -euo pipefail + join -t \$'\\t' -1 1 -2 1 <(sort "${genome_to_taxid_tsv}") <(sort "${accession_to_fna_tsv}") > accession_taxid_fna.tsv + """ +} + +process REASSIGN_MAP { + publishDir "results/pangenome-train/maps", mode: 'copy' + conda "conda-forge::jq" + + input: + path accession_taxid_fna_tsv + path tax_mapping_json + + output: + path "accession_taxid_fna_reassigned.tsv", emit: 'accession_taxid_fna_reassigned_tsv' + + script: + """ + set -euo pipefail + awk -F'\\t' 'NR==FNR{map[\$1]=\$2; next}{ + if (\$2 in map) { \$2 = map[\$2] } + print \$0 + }' <(jq -r 'to_entries[] | "\\(.key)\\t\\(.value)"' "${tax_mapping_json}") "${accession_taxid_fna_tsv}" > accession_taxid_fna_reassigned.tsv + """ +} + +process FILTER_TAXON_NAMES { + publishDir "results/pangenome-train/maps", mode: 'copy' + conda "conda-forge::jq" + + input: + path reassigned_tsv + path tax_report + + output: + path "accession_taxid_fna_filtered.tsv", emit: 'accession_taxid_fna_filtered_tsv' + + script: + """ + set -euo pipefail + + # Extract valid species-level tax IDs whose scientific names do NOT contain 'sp.' 
or 'Candidatus' (case-insensitive) + jq -r '.reports[] + | .taxonomy + | select((.current_scientific_name.name // "") | test("(?i)\\\\bCandidatus\\\\b|\\\\bsp\\\\.", "i") | not) + | .tax_id' "${tax_report}" \ + | sort -u > valid_species_taxids.txt + + # Keep only rows where the (possibly reassigned) taxid is in the valid list + awk -F'\t' 'NR==FNR{valid[\$1]=1; next} (\$2 in valid)' valid_species_taxids.txt "${reassigned_tsv}" \ + > accession_taxid_fna_filtered.tsv + """ +} + +process FILTER_ACB { + publishDir "results/pangenome-train/maps", mode: 'copy' + + input: + path reassigned_tsv + + output: + path "accession_taxid_fna_ACB.tsv", emit: 'accession_taxid_fna_acb_tsv' + + script: + """ + set -euo pipefail + awk -F'\t' '(\$2=="470"||\$2=="48296"||\$2=="106654"||\$2=="471"||\$2=="1530123"||\$2=="1785128")' "${reassigned_tsv}" > accession_taxid_fna_ACB.tsv + """ +} + +process TRAIN_TEST_SPLIT { + publishDir "results/pangenome-train/maps/by_taxid", mode: 'copy' + + input: + path accession_taxid_fna_tsv + + output: + path "*_train.tsv", emit: 'train' + path "*_test.tsv", emit: 'test' + + script: + """ + #!/usr/bin/env python + import random + from collections import defaultdict + + # Group rows by taxid + groups = defaultdict(list) + with open("${accession_taxid_fna_tsv}", 'r') as f: + for line in f: + line = line.strip() + if not line: + continue + parts = line.split('\\t', 2) + if len(parts) < 3: + continue + acc = parts[0] + tax = parts[1] + path = parts[2] + groups[tax].append(f"{acc}\\t{path}") + + # For each taxid, shuffle and split into train (80%) and test (20%) + random.seed(42) + for tax, rows in groups.items(): + random.shuffle(rows) + + split_index = max(1, (8 * len(rows)) // 10) + train_rows = rows[:split_index] + test_rows = rows[split_index:] + + with open(f"{tax}_train.tsv", 'w') as f: + f.write('\\n'.join(train_rows) + '\\n') + + with open(f"{tax}_test.tsv", 'w') as f: + f.write('\\n'.join(test_rows) + '\\n') + """ +} + + +process 
FILTER_SINGLETON_TAXA { + publishDir "results/pangenome-train/maps/by_taxid", mode: 'copy', pattern: "*_train.tsv" + publishDir "results/pangenome-train/maps", mode: 'copy', pattern: "singletons/*.tsv" + + input: + path train_file + + output: + path "filtered_train/*_train.tsv", emit: 'train' + path "singletons/*.tsv", emit: 'singletons' + + script: + """ + #!/usr/bin/env python3 + import os + import shutil + + train_files = "${train_file}".split() + for train_file in train_files: + base = os.path.basename(train_file) + taxid = base.split('_', 1)[0] + + # Count non-empty lines + n = 0 + with open(train_file, 'r') as f: + for line in f: + if line.strip(): + n += 1 + + if n > 3: + os.makedirs('filtered_train', exist_ok=True) + shutil.copy(train_file, os.path.join('filtered_train', base)) + else: + os.makedirs('singletons', exist_ok=True) + shutil.copy(train_file, os.path.join('singletons', f"{taxid}.tsv")) + """ +} + + +process PANGENOME { + publishDir "results/pangenome-train/pangenome", mode: 'copy' + conda "bioconda::ppanggolin" + cpus { genomes_list.baseName == "470_train" ? 
params.cpus * 2 : params.cpus } + memory params.cpus * 4 + " GB" + + input: + path genomes_list + path dataset_dir + + output: + path "${genomes_list.baseName.split('_')[0]}.h5", emit: 'pangenome_h5' + + script: + """ + set -euo pipefail + ppanggolin all --fasta "${genomes_list}" --cpu ${task.cpus} + mv **/*.h5 ${genomes_list.baseName.split('_')[0]}.h5 + """ +} + +process EXTRACT_FASTA { + publishDir "results/pangenome-train/fastas", mode: 'copy' + conda "bioconda::ppanggolin" + cpus params.cpus + memory params.cpus * 4 + " GB" + + input: + path pangenome_h5 + + output: + path "${pangenome_h5.baseName}" + + script: + """ + set -euo pipefail + + ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}" --gene_families core -f + ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}" --gene_families softcore -f + ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}/sc90" --gene_families softcore -f --soft_core 0.9 + ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}/sc85" --gene_families softcore -f --soft_core 0.85 + ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}/sc80" --gene_families softcore -f --soft_core 0.8 + ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}/sc75" --gene_families softcore -f --soft_core 0.75 + ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}" --gene_families persistent -f + ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}" --gene_families shell -f + ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}" --gene_families cloud -f + """ +} + +process ORGANIZE_PANGENOME_FASTAS { + publishDir "results/pangenome-train/organized", mode: 'copy' + + input: + path pangenome_fastas + + output: + path "core", emit: core + path "softcore_95", emit: softcore_95 + path "softcore_90", emit: softcore_90 + path "softcore_85", emit: softcore_85 + path "softcore_80", emit: 
softcore_80 + path "softcore_75", emit: softcore_75 + path "persistent", emit: persistent + path "persistent_shell", emit: persistent_shell + path "persistent_shell_cloud", emit: persistent_shell_cloud + + script: + """ + #!/usr/bin/env python + import os + import shutil + import glob + import re + + pangenome_dirs = "${pangenome_fastas}".split() + + for fasta_dir in pangenome_dirs: + if not os.path.isdir(fasta_dir): + continue + + taxid = os.path.basename(fasta_dir) + + # Find all fasta files in the directory and subdirectories + for fasta in glob.glob(f"{fasta_dir}/**/*.fasta", recursive=True): + base = os.path.basename(fasta) + + # Determine partition type from filename + out_dirs = [] + if "core" in base and "softcore" not in base: + out_dirs.append("core") + elif "softcore" in base: + # Determine softcore threshold from path or filename + # Default softcore (0.95) - check if it's in the main directory + parent_dir = os.path.basename(os.path.dirname(fasta)) + + if parent_dir == "sc90": + out_dirs.append("softcore_90") + elif parent_dir == "sc85": + out_dirs.append("softcore_85") + elif parent_dir == "sc80": + out_dirs.append("softcore_80") + elif parent_dir == "sc75": + out_dirs.append("softcore_75") + else: + # Default softcore (0.95) + out_dirs.append("softcore_95") + elif "persistent" in base: + out_dirs.append("persistent") + out_dirs.append("persistent_shell") + out_dirs.append("persistent_shell_cloud") + elif "shell" in base: + out_dirs.append("persistent_shell") + out_dirs.append("persistent_shell_cloud") + elif "cloud" in base: + out_dirs.append("persistent_shell_cloud") + + for out_dir in out_dirs: + dest_dir = f"{out_dir}/cobs/{taxid}" + os.makedirs(dest_dir, exist_ok=True) + dest_path = os.path.join(dest_dir, base) + shutil.copy(fasta, dest_path) + """ +} + +process XSPECT_TRAIN { + conda "./scripts/nextflow-utils/environment.yml" + cpus params.cpus + memory params.cpus * 4 + " GB" + + input: + path organized_fasta_folder + + output: + path 
"model_${organized_fasta_folder.baseName}.txt" + + script: + """ + set -euo pipefail + + xspect models train directory -g "Acinetobacter_${organized_fasta_folder.baseName}" -i "${organized_fasta_folder}" + echo "Acinetobacter_${organized_fasta_folder.baseName}" > "model_${organized_fasta_folder.baseName}.txt" + """ +} + +process UPDATE_MODEL_METADATA { + publishDir "results/pangenome-train/metadata", mode: 'copy' + + input: + path model_names + path train_files + path tax_report + + output: + path "training_accessions.json" + + script: + """ + #!/usr/bin/env python + import json + import os + from pathlib import Path + + # Load taxonomy data to get scientific names + with open("${tax_report}", 'r') as f: + tax_data = json.load(f) + + # Build a mapping from tax_id to scientific name + taxid_to_name = {} + for report in tax_data.get('reports', []): + taxonomy = report.get('taxonomy', {}) + tax_id = taxonomy.get('tax_id') + scientific_name = taxonomy.get('current_scientific_name', {}).get('name') + if tax_id and scientific_name: + taxid_to_name[str(tax_id)] = scientific_name + + # Parse all training files and collect accessions by taxid + training_data = {} + train_files = "${train_files}".split() + + for train_file in train_files: + if not os.path.exists(train_file): + continue + + # Extract taxid from filename (e.g., "470_train.tsv") + taxid = Path(train_file).stem.split('_')[0] + + # Read accessions from the training file + accessions = [] + with open(train_file, 'r') as f: + for line in f: + line = line.strip() + if line: + parts = line.split('\\t') + if parts: + accessions.append(parts[0]) + + training_data[taxid] = accessions + + # Save the training accessions mapping + with open('training_accessions.json', 'w') as f: + json.dump(training_data, f, indent=4) + + # Update model metadata files + model_names = "${model_names}".split() + for model_name_file in model_names: + with open(model_name_file, 'r') as f: + model_name = f.read().strip() + + # Determine model 
path based on naming convention + # Extract the partition type from model name (e.g., "Acinetobacter_core" -> "core") + partition = model_name.split('_', maxsplit=1)[-1].replace('_', '-') + + # Construct model file path + model_file = Path.home() / "xspect-data" / "models" / f"acinetobacter-{partition}-species.json" + + if model_file.exists(): + # Read existing model metadata + with open(model_file, 'r') as f: + metadata = json.load(f) + + # Update with training accessions + metadata['training_accessions'] = training_data + + # Update display_names with tax_id: scientific_name mappings + display_names = {} + for taxid in training_data.keys(): + if taxid in taxid_to_name: + display_names[taxid] = taxid_to_name[taxid] + metadata['display_names'] = display_names + + # Write back to model file + with open(model_file, 'w') as f: + json.dump(metadata, f, indent=4) + + print(f"Updated {model_file}") + else: + print(f"Model file not found: {model_file}") + """ +} diff --git a/scripts/pangenome-train/nextflow.config b/scripts/pangenome-train/nextflow.config new file mode 100644 index 0000000..4867f83 --- /dev/null +++ b/scripts/pangenome-train/nextflow.config @@ -0,0 +1,6 @@ +process.executor = 'slurm' +executor.account = 'intern' +process.queue = 'all' + + +conda.enabled = true From ec83b12c5f0e8aa149e80b92f43ba0fe484fffda Mon Sep 17 00:00:00 2001 From: Adrian Romberg Date: Thu, 18 Dec 2025 12:50:05 +0100 Subject: [PATCH 3/7] update benchmark --- docs/benchmark.md | 87 +++++++- mkdocs.yml | 5 + scripts/benchmark/classify/main.nf | 16 +- scripts/benchmark/environment.yml | 8 - scripts/benchmark/main.nf | 318 ++++++++++++++++------------- scripts/nextflow-utils/main.nf | 121 +++++++++++ 6 files changed, 383 insertions(+), 172 deletions(-) delete mode 100644 scripts/benchmark/environment.yml diff --git a/docs/benchmark.md b/docs/benchmark.md index 51f6cd8..bc2657f 100644 --- a/docs/benchmark.md +++ b/docs/benchmark.md @@ -1,23 +1,78 @@ # Benchmark -XspecT is a tool designed 
for fast and accurate species classification of genome assemblies and simulated reads. To evaluate its classification accuracy, we conducted a benchmark using a set of Acinetobacter genomes. +XspecT is a tool designed for fast and accurate species classification of genome assemblies and simulated reads. To evaluate its classification accuracy, we conducted a benchmark using a set of *Acinetobacter* genomes. -The benchmark was performed by first downloading all available Acinetobacter genomes from RefSeq, filtered on a passed ("OK") taxonomy check status and on them not being part of the training dataset. Genomes assigned to strain IDs were remapped to their respective species IDs, after which genomes with species IDs not contained in XspecT's Acinetobacter model were removed. The remaining genomes were then used to classify both assemblies and simulated reads generated from them. Simulated reads were generated by first filtering on genomes that were categorized as "complete" or "chromosome" by NCBI. The reads were then simulated from the longest contig of each genome (assumed to be the chromosome) using ART. 100 000 reads were simulated for each genome based on the HiSeq 2500 profile, with a read length of 125 bp. The reads were then classified using XspecT with predictions based on the maximum-scoring species. +```mermaid +flowchart TD + + A([Start]) --> DL[Download genomes] + + genomes[(NCBI RefSeq
assemblies
latest, non-atypical)] + meta@{ shape: doc, label: "Assembly metadata" } + tax@{ shape: doc, label: "Taxonomy report" } + xspect@{ shape: doc, label: "XspecT model" } + + genomes --> DL + genomes --> meta + + subgraph Data_preparation[Data Preparation] + DP1[Keep only assemblies with OK taxonomy check status] + DP2[Map taxonomy IDs:
strain → species] + DP3[Remove species IDs not in XspecT model] + DP4[Remove assemblies used for model training] + end + + DL --> DP1 + meta --> DP1 + tax --> DP2 + DP1 --> DP2 + DP2 --> DP3 + DP3 --> DP4 + DP4 --> assemblies_clean@{ shape: docs, label: "Filtered assemblies" } + xspect --> DP3 + xspect --> DP4 + + subgraph Assembly_level_evaluation[Assemblies] + assemblies_clean --> AssClassify[Classify assemblies] + AssClassify --> AssSummary[Summarize assembly classifications] + AssSummary --> AssMatrices[Generate assembly confusion matrices] + end + + xspect --> AssClassify + + subgraph Read_level_evaluation[Reads] + assemblies_clean --> SelectReads[Select assemblies for read generation] + SelectReads --> SimReads[Generate simulated reads] + SimReads --> ReadsClassify[Classify simulated reads] + ReadsClassify --> ReadsSummary[Summarize read classifications] + ReadsSummary --> ReadMatrices[Generate read confusion matrix] + end + + xspect --> ReadsClassify + + AssSummary --> Stats[Calculate overall statistics] + ReadsSummary --> Stats + Stats --> Z([End]) +``` + +The benchmark was performed by first downloading all available *Acinetobacter* genomes from RefSeq (latest version only, excluding atypical), filtered on a passed ("OK") taxonomy check status and on them not being part of the training dataset. Genomes assigned to strain IDs were remapped to their respective species IDs, after which genomes with species IDs not contained in the XspecT *Acinetobacter* model were removed. The remaining genomes were then used to classify both assemblies and simulated reads generated from them. Simulated reads were generated by first filtering on genomes that were categorized as "complete" or "chromosome" by NCBI. The reads were then simulated from the longest contig of each genome (assumed to be the chromosome) using InSilicoSeq. 100 000 reads were simulated for each genome based on the NovaSeq profile, with a read length of 150 bp. 
The reads were then classified using XspecT with predictions based on the maximum-scoring species.
 
 ## Benchmark Results
 
-The benchmark results show that XspecT achieves very high classification accuracy of nearly 100% for whole genomes and strong but reduced accuracy of 70% for simulated reads. However, the low macro-average F1 score (0.21) for the read dataset highlights a substantial class imbalance.
+The benchmark results show that XspecT achieves very high classification accuracy of nearly 100% for whole genomes and strong but reduced accuracy of 73% for simulated reads. However, the low macro-average F1 score (0.21) for the read dataset highlights a substantial class imbalance.
 
 | Dataset   | Total Samples | Matches   | Mismatches | Match Rate | Mismatch Rate | Accuracy | Macro Avg F1 | Weighted Avg F1 |
 |-----------|--------------:|----------:|-----------:|-----------:|--------------:|---------:|-------------:|----------------:|
-| Assembly  | 13,795        | 13,776    | 19         | 99.86%     | 0.14%         | ≈1.00    | 0.96         | ≈1.00           |
-| Reads     | 121,590,139   | 85,679,572| 35,910,567 | 70.47%     | 29.53%        | 0.70     | 0.21         | 0.79            |
+| Assemblies| 13,795        | 13,776    | 19         | 99.86%     | 0.14%         | ≈1.00    | 0.96         | ≈1.00           |
+| Reads     | 121,800,000   | 88,368,547| 33,431,453 | 72.55%     | 27.45%        | 0.73     | 0.21         | 0.81            |
+
+Counting instances in which the highest number of hits are shared by multiple species as abstentions, a selective accuracy of 82.80% is achieved for simulated reads, with a coverage of 87.63%. Rejection recall is 45.09%.
 
 ## Running the benchmark yourself
 
 To benchmark XspecT performance yourself, you can use the Nextflow workflow provided in the `scripts/benchmark` directory. This workflow allows you to run XspecT on a set of samples and measure species classification accuracy on both genome assemblies, as well as on simulated reads.
 
-Before you run the benchmark, you first need to download benchmarking data to the `data` directory, for example from NCBI.
To do so, you can use the bash script in `scripts/benchmark-data` to download the data using the [NCBI Datasets CLI](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/command-line-tools/download-and-install/), which needs to be installed first. The script will download all available Acinetobacter genomes, as well as taxonomic data. +Before you run the benchmark, you first need to download benchmarking data to the `data` directory, for example from NCBI. To do so, you can use the bash script in `scripts/benchmark-data` to download the data using the [NCBI Datasets CLI](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/command-line-tools/download-and-install/), which needs to be installed first. The script will download all available *Acinetobacter* genomes, as well as taxonomic data. To run the benchmark, install [Nextflow](https://www.nextflow.io/docs/latest/install.html) and run the following command: @@ -25,10 +80,20 @@ To run the benchmark, install [Nextflow](https://www.nextflow.io/docs/latest/ins nextflow run scripts/benchmark ``` -This will execute the benchmark workflow, which will classify the samples, as well as reads generated from them, using XspecT. 
The results will be saved in the `results` directory: +The workflow can be parameterized using the following flags/arguments: + +- `--publishDir`: Directory to save benchmark results to (default: `results/benchmark`) +- `--xspectModel`: XspecT model to use for classification (default: `Acinetobacter`) +- `--excludedSpeciesIDs`: Comma-separated list of species IDs to exclude from the benchmark (default: none) +- `--maxForks`: Maximum number of parallel processes to use (default: 50) +- `--validate`: Whether to use mapping-based classification validation (default: false) +- `--seqPlatform`: InSilicoSeq profile to use for read simulation (default: `NovaSeq`) + +Workflow results will be saved in the `results` directory: -- `results/classifications.tsv` for the classifications of the assemblies -- `results/read_classifications.tsv` for the classifications of the simulated reads -- `results/confusion_matrix.png` for the confusion matrix of genome assembly classifications +- `results/classifications.tsv` for classifications of the assemblies +- `results/read_classifications.tsv` for classifications of the simulated reads +- `results/confusion_matrix.png` for a confusion matrix of genome assembly classifications - `results/mismatches_confusion_matrix.png` for a confusion matrix filtered on mismatches of genome assembly classifications -- `results/stats.txt` for the statistics of the benchmark run \ No newline at end of file +- `results/read_confusion_matrix.png` for a confusion matrix of simulated read classifications +- `results/stats.txt` for overall statistics of the benchmark run \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 7b83e42..b118a3e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -10,6 +10,11 @@ plugins: repo_url: https://github.com/BIONF/XspecT markdown_extensions: - attr_list + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format nav: - 
Home: index.md - Quickstart: quickstart.md diff --git a/scripts/benchmark/classify/main.nf b/scripts/benchmark/classify/main.nf index f2b7b0b..a1d7359 100644 --- a/scripts/benchmark/classify/main.nf +++ b/scripts/benchmark/classify/main.nf @@ -1,23 +1,23 @@ process classifySample { - conda "./scripts/benchmark/environment.yml" + conda "./scripts/nextflow-utils/environment.yml" cpus 4 memory '32 GB' + errorStrategy 'retry' + maxRetries 3 + maxForks params.maxForks input: path sample val model + val excludedSpeciesIDs output: path "${sample.baseName}.json" script: + def excludeOptions = excludedSpeciesIDs ? "--exclude-species ${excludedSpeciesIDs}" : '' + def validateFlag = params.validate ? "--validation" : '' """ - xspect classify species -g ${model} -i ${sample} -o ${sample.baseName}.json - """ - - stub: - """ - mkdir -p results - touch results/${sample.baseName}.json + xspect classify species -g ${model} -i ${sample} -o ${sample.baseName}.json ${excludeOptions} ${validateFlag} """ } \ No newline at end of file diff --git a/scripts/benchmark/environment.yml b/scripts/benchmark/environment.yml deleted file mode 100644 index 7997d6c..0000000 --- a/scripts/benchmark/environment.yml +++ /dev/null @@ -1,8 +0,0 @@ -name: xspect-benchmark -channels: - - conda-forge -dependencies: - - python=3.12 - - pip - - pip: - - XspecT==0.5.4 \ No newline at end of file diff --git a/scripts/benchmark/main.nf b/scripts/benchmark/main.nf index 44bd9b7..64c0373 100644 --- a/scripts/benchmark/main.nf +++ b/scripts/benchmark/main.nf @@ -3,10 +3,18 @@ include { classifySample as classifyAssembly } from './classify' include { classifySample as classifyRead } from './classify' include { strain_species_mapping } from '../nextflow-utils' +include { confusionMatrix as assemblyConfusionMatrix } from '../nextflow-utils' +include { mismatchConfusionMatrix as assemblyMismatchConfusionMatrix } from '../nextflow-utils' +include { confusionMatrix as readConfusionMatrix } from '../nextflow-utils' 
+include { mismatchConfusionMatrix as readMismatchConfusionMatrix } from '../nextflow-utils' // --------------------- PARAMETERS --------------------- -params.publishDir = "results/benchmark" -params.xspectModel = "Acinetobacter" +params.publishDir = "results/benchmark" +params.xspectModel = "Acinetobacter" +params.excludedSpeciesIDs = "" +params.maxForks = 50 +params.validate = false +params.seqPlatform = "NovaSeq" // --------------------- WORKFLOW ----------------------- workflow { @@ -15,7 +23,7 @@ workflow { genomes = file("data/genomes") tax_report = file("data/aci_species.json") tax_mapping_json = strain_species_mapping(tax_report) - assemblies = createAssemblyTable(genomes, tax_mapping_json, species_model) + assemblies = createAssemblyTable(genomes, tax_mapping_json, species_model, params.excludedSpeciesIDs) // Whole genome assemblies samples = Channel.fromPath("${genomes}/**/*.fna") @@ -27,10 +35,20 @@ workflow { [sample.baseName.split('_')[0..1].join('_'), sample] }) .map { it[1][1] } - classifications = classifyAssembly(filtered_samples, params.xspectModel) + classifications = classifyAssembly(filtered_samples, params.xspectModel, params.excludedSpeciesIDs) summarizeClassifications(assemblies, classifications.collect()) - confusionMatrix(summarizeClassifications.out, name_mapping) - mismatchConfusionMatrix(summarizeClassifications.out, name_mapping) + assemblyConfusionMatrix( + summarizeClassifications.out, + name_mapping, + 'confusion_matrix.png', + 'Xspect Acinetobacter Confusion Matrix' + ) + assemblyMismatchConfusionMatrix( + summarizeClassifications.out, + name_mapping, + 'mismatches_confusion_matrix.png', + 'Mismatches Confusion Matrix' + ) // Simulated reads selectForReadGen(assemblies, species_model) @@ -43,8 +61,20 @@ workflow { .map { it[1][1] } filterForChromosome(read_assemblies) generateReads(filterForChromosome.out) - read_classifications = classifyRead(generateReads.out, params.xspectModel) + read_classifications = 
classifyRead(generateReads.out, params.xspectModel, params.excludedSpeciesIDs) summarizeReadClassifications(selectForReadGen.out, read_classifications.collect()) + readConfusionMatrix( + summarizeReadClassifications.out, + name_mapping, + 'read_confusion_matrix.png', + 'Xspect Acinetobacter Read Confusion Matrix' + ) + readMismatchConfusionMatrix( + summarizeReadClassifications.out, + name_mapping, + 'read_mismatches_confusion_matrix.png', + 'Read Mismatches Confusion Matrix' + ) calculateStats(summarizeClassifications.out, summarizeReadClassifications.out) } @@ -61,7 +91,7 @@ process getModelJSON { script: """ model_name="${params.xspectModel.toLowerCase().replaceAll('_','-')}-species.json" - cp "$HOME/xspect-data/models/\$model_name" species_model.json + cp "\$HOME/xspect-data/models/\$model_name" species_model.json """ } @@ -78,6 +108,7 @@ process getNameMapping { script: """ + # test jq '.display_names | to_entries | map({key: .key, value: (.value | sub("Acinetobacter"; "A."))}) | from_entries' ${species_model} > name_mapping.json """ @@ -97,14 +128,11 @@ process createAssemblyTable { path genomes path tax_mapping_json path species_model + val excludedSpeciesIDs output: path "assemblies.tsv" - when: - !file("assemblies.tsv").exists() - - script: """ inputfile="${genomes}/ncbi_dataset/data/assembly_data_report.jsonl" @@ -161,6 +189,25 @@ process createAssemblyTable { ' assemblies.tsv > temp_assemblies.tsv mv temp_assemblies.tsv assemblies.tsv rm training_accessions.txt + + # filter out assemblies with excluded species IDs + excluded_species="${excludedSpeciesIDs}" + if [ -n "\$excluded_species" ]; then + awk -F'\t' -v excluded="\$excluded_species" ' + BEGIN { + # split on commas into array arr + n = split(excluded, arr, /,/); + for (i = 1; i <= n; i++) { + if (arr[i] != "") { + excluded_map[arr[i]] = 1; + } + } + } + NR==1 { print; next } + !(\$6 in excluded_map) { print } + ' assemblies.tsv > temp_assemblies.tsv + mv temp_assemblies.tsv assemblies.tsv + fi """ 
stub: @@ -173,7 +220,9 @@ process summarizeClassifications { conda "conda-forge::pandas" cpus 4 memory '16 GB' - publishDir { params.publishDir }, mode: 'copy' + errorStrategy 'retry' + maxRetries 3 + publishDir params.publishDir, mode: 'copy' input: path assemblies @@ -206,7 +255,19 @@ process summarizeClassifications { with open(json_file, 'r') as f: data = json.load(f) - prediction = data.get('prediction', 'unknown') + prediction = data.get('prediction', None) + + # based on max hits if no prediction field + if not prediction: + hits = data.get('hits', {}) + # hits is structured as {subsequence : {species: hits}} + total_hits = { + species: sum(subseq.get(species, 0) for subseq in hits.values()) + for species in {s for subseq in hits.values() for s in subseq} + } + max_hits = max(total_hits.values()) + max_species = [species for species, species_hits in total_hits.items() if species_hits == max_hits] + prediction = max_species[0] if len(max_species) == 1 else "ambiguous" mask = df['Assembly Accession'].str.contains(accession, na=False) df.loc[mask, 'Prediction'] = prediction @@ -259,6 +320,9 @@ process filterForChromosome { conda "bioconda::seqkit" cpus 2 memory '16 GB' + errorStrategy 'retry' + maxRetries 3 + input: path sample @@ -276,9 +340,11 @@ process filterForChromosome { } process generateReads { - conda "bioconda::art" - cpus 2 - memory '16 GB' + conda "bioconda::insilicoseq" + cpus 4 + memory '32 GB' + errorStrategy 'retry' + maxRetries 3 input: path sample @@ -290,22 +356,31 @@ process generateReads { """ set -euo pipefail - art_illumina \ - -ss HS25 \ - -i "${sample}" \ - -l 125 \ - -c 100000 \ - -na \ - -rs 42 \ - -o "${sample.baseName}_simulated" + iss generate \ + --model ${params.seqPlatform} \ + --genomes "${sample}" \ + --n_reads 100000 \ + --seed 42 \ + --cpus ${task.cpus} \ + --output "${sample.baseName}_simulated" + + # InSilicoSeq creates paired-end files by default (_R1.fastq and _R2.fastq) + # Concatenate them into a single file if both 
exist + if [ -f "${sample.baseName}_simulated_R1.fastq" ] && [ -f "${sample.baseName}_simulated_R2.fastq" ]; then + cat "${sample.baseName}_simulated_R1.fastq" "${sample.baseName}_simulated_R2.fastq" > "${sample.baseName}_simulated.fq" + elif [ -f "${sample.baseName}_simulated_R1.fastq" ]; then + mv "${sample.baseName}_simulated_R1.fastq" "${sample.baseName}_simulated.fq" + fi """ } process summarizeReadClassifications { conda "conda-forge::pandas" cpus 4 - memory '64 GB' - publishDir { params.publishDir }, mode: 'copy' + memory '128 GB' + errorStrategy 'retry' + maxRetries 3 + publishDir params.publishDir, mode: 'copy' input: path read_assemblies @@ -342,15 +417,17 @@ process summarizeReadClassifications { for read_name, read_scores in scores.items(): if read_name != 'total': if read_scores: - max_score = max(read_scores.values()) - max_species = [species for species, score in read_scores.items() if score == max_score] + hits = data.get('hits', {}).get(read_name, {}) + max_hits = max(hits.values()) + max_species = [species for species, species_hits in hits.items() if species_hits == max_hits] prediction = max_species[0] if len(max_species) == 1 else "ambiguous" result = { 'Assembly Accession': accession, 'Read': read_name, 'Prediction': prediction, - 'Species ID': species_id + 'Species ID': species_id, + 'Rejected': True if prediction == "ambiguous" else False } for species, score in read_scores.items(): @@ -358,7 +435,29 @@ process summarizeReadClassifications { results.append(result) - + # Reads marked as misclassified + misclassified = data.get('misclassified', {}) + if misclassified: + for misclass_species_id, misclass_reads in misclassified.items(): + for read_name, read_hits in misclass_reads.items(): + if read_hits: + num_kmers = data['num_kmers'][read_name] + read_scores = {} + for species, hits_count in read_hits.items(): + read_scores[species] = round(hits_count / num_kmers, 2) + + result = { + 'Assembly Accession': accession, + 'Read': read_name, + 
'Prediction': misclass_species_id, + 'Species ID': species_id, + 'Rejected': True + } + + for species, score in read_scores.items(): + result[species] = score + + results.append(result) df_results = pd.DataFrame(results) df_results.to_csv('read_classifications.tsv', sep='\\t', index=False, mode='a', header=include_header) @@ -370,7 +469,7 @@ process calculateStats { conda "conda-forge::pandas conda-forge::scikit-learn" cpus 8 memory '256 GB' - publishDir { params.publishDir }, mode: 'copy' + publishDir params.publishDir, mode: 'copy' input: path assembly_classifications @@ -423,6 +522,38 @@ process calculateStats { zero_division=0 ) + # --- Abstaining Metrics (Reads only) --- + # Determine actual misclassification (prediction != ground truth) + df_read['Actually_Misclassified'] = df_read['Species ID'] != df_read['Prediction'] + + # Get rejection status from Rejected column + rejected = df_read['Rejected'] + not_rejected = ~rejected + + # Coverage: proportion of samples that are NOT rejected + coverage = not_rejected.sum() / read_total + + # Selective Accuracy: accuracy on non-rejected samples only + if not_rejected.sum() > 0: + selective_correct = ((df_read['Species ID'] == df_read['Prediction']) & not_rejected).sum() + selective_accuracy = selective_correct / not_rejected.sum() + selective_risk = 1 - selective_accuracy + else: + selective_accuracy = 0.0 + selective_risk = 1.0 + + # Rejection Precision: of all rejected samples, how many were actually misclassified + if rejected.sum() > 0: + rejection_precision = (rejected & df_read['Actually_Misclassified']).sum() / rejected.sum() + else: + rejection_precision = 0.0 + + # Rejection Recall: of all misclassified samples, how many were rejected + if df_read['Actually_Misclassified'].sum() > 0: + rejection_recall = (rejected & df_read['Actually_Misclassified']).sum() / df_read['Actually_Misclassified'].sum() + else: + rejection_recall = 0.0 + # --- Output --- with open('stats.txt', 'w') as f: f.write("=== Assembly 
===\\n") @@ -442,118 +573,15 @@ process calculateStats { f.write(f"Mismatch Rate: {(read_total - read_matches) / read_total * 100:.2f}%\\n\\n") f.write("Classification report (per class):\\n") f.write(read_report + "\\n") - """ -} - -process confusionMatrix { - conda "conda-forge::pandas conda-forge::scikit-learn conda-forge::numpy conda-forge::matplotlib" - cpus 2 - memory '16 GB' - publishDir { params.publishDir }, mode: 'copy' - - input: - path classifications - path name_mapping - - output: - path "confusion_matrix.png" - - script: - """ - #!/usr/bin/env python - import pandas as pd - from sklearn.metrics import confusion_matrix - import matplotlib.pyplot as plt - import numpy as np - import json - - df = pd.read_csv('${classifications}', sep='\\t') - y_true = df["Species ID"].astype(str) - y_pred = df["Prediction"].astype(str) - - with open('${name_mapping}', 'r') as f: - name_mapping_dict = json.load(f) - labels = list(set(y_true) | set(y_pred)) - labels = sorted(labels, key=lambda x: name_mapping_dict.get(x, x)) - display_labels = [name_mapping_dict.get(label, label) for label in labels] - - cm = confusion_matrix(y_true, y_pred, labels=labels) - cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] - - plt.figure(figsize=(30, 30)) - plt.imshow(cm_normalized, interpolation='nearest', cmap=plt.cm.Blues) - plt.colorbar() - - plt.xticks(ticks=np.arange(len(labels)), labels=display_labels, rotation=90, fontsize=12) - plt.yticks(ticks=np.arange(len(labels)), labels=display_labels, fontsize=12) - - plt.title('Xspect Acinetobacter Confusion Matrix', fontsize=24) - plt.xlabel('Predicted Labels', fontsize=20) - plt.ylabel('True Labels', fontsize=20) - - plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight') - """ -} - -process mismatchConfusionMatrix { - conda "conda-forge::pandas conda-forge::scikit-learn conda-forge::numpy conda-forge::matplotlib" - cpus 2 - memory '16 GB' - publishDir { params.publishDir }, mode: 'copy' - - input: - path 
classifications - path name_mapping - - output: - path "mismatches_confusion_matrix.png" - - script: - """ - #!/usr/bin/env python - import pandas as pd - from sklearn.metrics import confusion_matrix - import matplotlib.pyplot as plt - import numpy as np - import json - - - df = pd.read_csv('${classifications}', sep='\\t') - df["Species ID"] = df["Species ID"].astype(str) - df["Prediction"] = df["Prediction"].astype(str) - df_comparison_mismatch = df[df["Species ID"] != df["Prediction"]] - - with open('${name_mapping}', 'r') as f: - name_mapping_dict = json.load(f) - y_true = df_comparison_mismatch["Species ID"] - y_pred = df_comparison_mismatch["Prediction"] - - labels = list(set(y_true) | set(y_pred)) - labels = sorted(labels, key=lambda x: name_mapping_dict.get(x, x)) - display_labels = [name_mapping_dict.get(label, label) for label in labels] - - cm = confusion_matrix(y_true, y_pred, labels=labels) - - plt.figure(figsize=(30, 30)) - plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues) - cbar = plt.colorbar() - cbar.ax.tick_params(labelsize=20) - - plt.xticks(ticks=np.arange(len(labels)), labels=display_labels, rotation=90, fontsize=16) - plt.yticks(ticks=np.arange(len(labels)), labels=display_labels, fontsize=16) - - thresh = cm.max() / 2. 
- for i in range(cm.shape[0]): - for j in range(cm.shape[1]): - plt.text(j, i, format(cm[i, j], 'd'), # 'd' ensures integer formatting - horizontalalignment="center", - color="white" if cm[i, j] > thresh else "black", - fontsize=14) - - plt.title('Mismatches Confusion Matrix', fontsize=30) - plt.xlabel('Predicted Labels', fontsize=24) - plt.ylabel('True Labels', fontsize=24) - - plt.savefig('mismatches_confusion_matrix.png', dpi=300, bbox_inches='tight') + + f.write("\\n=== Abstaining Metrics (Reads) ===\\n") + f.write(f"Total Reads: {read_total}\\n") + f.write(f"Rejected Reads: {rejected.sum()}\\n") + f.write(f"Accepted Reads: {not_rejected.sum()}\\n") + f.write(f"Coverage: {coverage * 100:.2f}% (proportion of non-rejected samples)\\n") + f.write(f"Selective Accuracy: {selective_accuracy * 100:.2f}% (accuracy on non-rejected samples)\\n") + f.write(f"Selective Risk: {selective_risk * 100:.2f}% (error rate on non-rejected samples)\\n") + f.write(f"Rejection Precision: {rejection_precision * 100:.2f}% (of rejected, how many were truly misclassified)\\n") + f.write(f"Rejection Recall: {rejection_recall * 100:.2f}% (of misclassified, how many were rejected)\\n") """ } \ No newline at end of file diff --git a/scripts/nextflow-utils/main.nf b/scripts/nextflow-utils/main.nf index dd86042..19f5727 100644 --- a/scripts/nextflow-utils/main.nf +++ b/scripts/nextflow-utils/main.nf @@ -29,3 +29,124 @@ process strain_species_mapping { """ } +process confusionMatrix { + conda "conda-forge::pandas conda-forge::scikit-learn conda-forge::numpy conda-forge::matplotlib" + cpus 4 + memory '32 GB' + publishDir "${params.publishDir ?: 'results'}", mode: 'copy', enabled: params.publishDir != null + + input: + path classifications + path name_mapping + val output_filename + val title + + output: + path "${output_filename}" + + script: + """ + #!/usr/bin/env python + import pandas as pd + from sklearn.metrics import confusion_matrix + import matplotlib.pyplot as plt + import numpy as np + 
import json + + df = pd.read_csv('${classifications}', sep='\\t', usecols=['Species ID', 'Prediction'], dtype=str) + y_true = df["Species ID"].astype(str) + y_pred = df["Prediction"].astype(str) + + with open('${name_mapping}', 'r') as f: + name_mapping_dict = json.load(f) + labels = list(set(y_true) | set(y_pred)) + labels = sorted(labels, key=lambda x: name_mapping_dict.get(x, x)) + display_labels = [name_mapping_dict.get(label, label) for label in labels] + + cm = confusion_matrix(y_true, y_pred, labels=labels) + cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] + + plt.figure(figsize=(30, 30)) + plt.imshow(cm_normalized, interpolation='nearest', cmap=plt.cm.Blues) + plt.colorbar() + + plt.xticks(ticks=np.arange(len(labels)), labels=display_labels, rotation=90, fontsize=12) + plt.yticks(ticks=np.arange(len(labels)), labels=display_labels, fontsize=12) + + plt.title('${title}', fontsize=24) + plt.xlabel('Predicted Species', fontsize=20) + plt.ylabel('NCBI-annotated Species', fontsize=20) + + plt.savefig('${output_filename}', dpi=300, bbox_inches='tight') + """ +} + +process mismatchConfusionMatrix { + conda "conda-forge::pandas conda-forge::scikit-learn conda-forge::numpy conda-forge::matplotlib" + cpus 4 + memory '32 GB' + publishDir "${params.publishDir ?: 'results'}", mode: 'copy', enabled: params.publishDir != null + + input: + path classifications + path name_mapping + val output_filename + val title + + output: + path "${output_filename}" + + script: + """ + #!/usr/bin/env python + import pandas as pd + from sklearn.metrics import confusion_matrix + import matplotlib.pyplot as plt + import numpy as np + import json + + + df = pd.read_csv('${classifications}', sep='\\t', usecols=['Species ID', 'Prediction'], dtype=str) + + df_comparison_mismatch = df[df["Species ID"] != df["Prediction"]] + if df_comparison_mismatch.empty: + print("No mismatches found. 
Skipping mismatch confusion matrix generation.") + with open('${output_filename}', 'w') as f: + f.write('') + exit(0) + + with open('${name_mapping}', 'r') as f: + name_mapping_dict = json.load(f) + y_true = df_comparison_mismatch["Species ID"] + y_pred = df_comparison_mismatch["Prediction"] + + labels = list(set(y_true) | set(y_pred)) + labels = sorted(labels, key=lambda x: name_mapping_dict.get(x, x)) + display_labels = [name_mapping_dict.get(label, label) for label in labels] + + cm = confusion_matrix(y_true, y_pred, labels=labels) + + plt.figure(figsize=(30, 30)) + plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues) + cbar = plt.colorbar() + cbar.ax.tick_params(labelsize=20) + + plt.xticks(ticks=np.arange(len(labels)), labels=display_labels, rotation=90, fontsize=16) + plt.yticks(ticks=np.arange(len(labels)), labels=display_labels, fontsize=16) + + thresh = cm.max() / 2. + for i in range(cm.shape[0]): + for j in range(cm.shape[1]): + plt.text(j, i, format(cm[i, j], 'd'), # 'd' ensures integer formatting + horizontalalignment="center", + color="white" if cm[i, j] > thresh else "black", + fontsize=14) + + plt.title('${title}', fontsize=30) + plt.xlabel('Predicted Species', fontsize=24) + plt.ylabel('NCBI-annotated Species', fontsize=24) + + plt.savefig('${output_filename}', dpi=300, bbox_inches='tight') + """ +} + From e6c6a54cd5f1619ffb7faeaccd6631a90c5aff40 Mon Sep 17 00:00:00 2001 From: Adrian Romberg Date: Thu, 18 Dec 2025 12:50:33 +0100 Subject: [PATCH 4/7] add svm workflow for score-based validation --- scripts/score-svm/main.nf | 154 ++++++++++++++++++++++++++++++ scripts/score-svm/nextflow.config | 6 ++ 2 files changed, 160 insertions(+) create mode 100644 scripts/score-svm/main.nf create mode 100644 scripts/score-svm/nextflow.config diff --git a/scripts/score-svm/main.nf b/scripts/score-svm/main.nf new file mode 100644 index 0000000..46de2a2 --- /dev/null +++ b/scripts/score-svm/main.nf @@ -0,0 +1,154 @@ +#!/usr/bin/env nextflow + +// 
--------------------- PARAMETERS --------------------- +params.classifications = "results/benchmark/read_classifications.tsv" +params.publishDir = "results/score-svm" +params.test_size = 0.2 +params.random_state = 42 +params.sample_size = 100000 +params.mismatch_fraction = 0.2 + +// --------------------- WORKFLOW ----------------------- +workflow { + classifications_file = file(params.classifications) + + DOWNSAMPLE(classifications_file) + UNSORTED_SVM(DOWNSAMPLE.out.downsampled) +} + +// --------------------- PROCESSES ---------------------- + +process DOWNSAMPLE { + conda "conda-forge::pandas" + cpus 16 + memory '256 GB' + + input: + path classifications + + output: + path "downsampled.tsv", emit: downsampled + + script: + """ + #!/usr/bin/env python + import pandas as pd + + df = pd.read_csv("${classifications}", sep="\\t") + + # apply to_numeric to reduce memory usage + score_cols = [col for col in df.columns if col.isdigit()] + for col in score_cols: + df[col] = pd.to_numeric(df[col], downcast='float') + + # Some rows may already be rejected due to ambiguous classifications - filter them out + df = df[df["Rejected"] == False].copy() + + df["label"] = (df["Species ID"].astype(str) == df["Prediction"].astype(str)).astype(int) + + sample_size = ${params.sample_size} + mismatch_fraction = ${params.mismatch_fraction} + random_state = ${params.random_state} + + # Separate mismatches (label=0) and correct predictions (label=1) + mismatches = df[df["label"] == 0].copy() + correct = df[df["label"] == 1].copy() + + # Calculate target sizes for each class + n_mismatches = int(sample_size * mismatch_fraction) + n_correct = sample_size - n_mismatches + + print(f"Target: {n_mismatches} mismatches, {n_correct} correct predictions") + print(f"Available: {len(mismatches)} mismatches, {len(correct)} correct predictions") + + # Function to sample with even species distribution using groupby + def stratified_sample_by_species(data, n_samples, random_state): + if len(data) <= 
n_samples: + return data + + n_species = data["Species ID"].nunique() + samples_per_species = n_samples // n_species + + # Sample evenly from each species + sampled = data.groupby("Species ID", group_keys=False).apply( + lambda x: x.sample(n=min(samples_per_species, len(x)), random_state=random_state) + ) + + # If we need more samples, sample remaining from the full dataset + if len(sampled) < n_samples: + remaining_needed = n_samples - len(sampled) + additional = data.drop(sampled.index).sample(n=min(remaining_needed, len(data) - len(sampled)), random_state=random_state) + sampled = pd.concat([sampled, additional]) + + return sampled.sample(frac=1, random_state=random_state).reset_index(drop=True) + + # Sample each class with stratification by species + sampled_mismatches = stratified_sample_by_species(mismatches, n_mismatches, random_state) + sampled_correct = stratified_sample_by_species(correct, n_correct, random_state + 1000) + + # Combine and shuffle + df_sampled = pd.concat([sampled_mismatches, sampled_correct], ignore_index=True) + df_sampled = df_sampled.sample(frac=1, random_state=random_state).reset_index(drop=True) + + print(f"Final dataset: {len(df_sampled)} samples") + print(f" Mismatches: {(df_sampled['label'] == 0).sum()} ({(df_sampled['label'] == 0).sum() / len(df_sampled):.2%})") + print(f" Correct: {(df_sampled['label'] == 1).sum()} ({(df_sampled['label'] == 1).sum() / len(df_sampled):.2%})") + print(f" Unique species in mismatches: {df_sampled[df_sampled['label'] == 0]['Species ID'].nunique()}") + print(f" Unique species in correct: {df_sampled[df_sampled['label'] == 1]['Species ID'].nunique()}") + + df_sampled.to_csv("downsampled.tsv", sep="\\t", index=False) + """ +} + +process UNSORTED_SVM { + publishDir params.publishDir, mode: 'copy' + conda "conda-forge::python=3.12 conda-forge::pandas conda-forge::scikit-learn conda-forge::numpy" + cpus 16 + memory '256 GB' + + input: + path classifications + + output: + path "classification_report.txt", 
emit: classification_report + + script: + """ + #!/usr/bin/env python + import pandas as pd + from sklearn.svm import SVC + from sklearn.model_selection import train_test_split + from sklearn.metrics import classification_report + + df = pd.read_csv("${classifications}", sep="\\t") + + # apply to_numeric to reduce memory usage + score_cols = [col for col in df.columns if col.isdigit()] + for col in score_cols: + df[col] = pd.to_numeric(df[col], downcast='float') + + df["label"] = (df["Species ID"].astype(str) == df["Prediction"].astype(str)).astype(int) + + feature_cols = [c for c in df.columns if c.isdigit()] + X = df[feature_cols].values + y = df["label"].values + + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=${params.test_size}, random_state=${params.random_state}, stratify=y + ) + + svm = SVC(kernel="rbf", probability=True, class_weight="balanced", random_state=42) + svm.fit(X_train, y_train) + + y_pred = svm.predict(X_test) + + with open('classification_report.txt', 'w') as f: + f.write(f"SVM Classification Report\\n") + f.write(f"========================\\n\\n") + f.write(f"Total samples used: {len(df)}\\n") + f.write(f"Training samples: {len(X_train)}\\n") + f.write(f"Test samples: {len(X_test)}\\n\\n") + f.write(classification_report(y_test, y_pred, target_names=["Incorrect", "Correct"])) + """ +} + diff --git a/scripts/score-svm/nextflow.config b/scripts/score-svm/nextflow.config new file mode 100644 index 0000000..4867f83 --- /dev/null +++ b/scripts/score-svm/nextflow.config @@ -0,0 +1,6 @@ +process.executor = 'slurm' +executor.account = 'intern' +process.queue = 'all' + + +conda.enabled = true From 2de17f4c66f573ba4546bb3d08046876459cdd8e Mon Sep 17 00:00:00 2001 From: Adrian Romberg Date: Thu, 18 Dec 2025 12:50:57 +0100 Subject: [PATCH 5/7] update pangenome-informed training workflow --- scripts/pangenome-train/main.nf | 94 ++++++++++++++++++++++------- 1 file changed, 75 insertions(+), 19 deletions(-) diff --git 
a/scripts/pangenome-train/main.nf b/scripts/pangenome-train/main.nf index 4c44253..f5a18da 100644 --- a/scripts/pangenome-train/main.nf +++ b/scripts/pangenome-train/main.nf @@ -5,9 +5,15 @@ include { strain_species_mapping } from '../nextflow-utils' // --------------------- PARAMETERS --------------------- params.dataset_dir = "data/genomes/ncbi_dataset" params.cpus = 32 +// Clustering method: either "98" or "GF" +params.clusteringMethod = "98" // --------------------- WORKFLOW ----------------------- workflow { + // Input validation + if (!['98', 'GF'].contains(params.clusteringMethod)) { + throw new IllegalArgumentException("Invalid params.clusteringMethod: '${params.clusteringMethod}'. Must be '98' for 98% similarity clustering or 'GF' for gene family clustering.") + } // Build maps taxon_map = EXTRACT_TAXON(file(params.dataset_dir) + "/data/assembly_data_report.jsonl") @@ -30,7 +36,12 @@ workflow { pangenomes = PANGENOME(train_taxon_files, file(params.dataset_dir)) pangenome_fastas = EXTRACT_FASTA(pangenomes) - organized_fasta_folders = ORGANIZE_PANGENOME_FASTAS(pangenome_fastas.collect()) + if (params.clusteringMethod == '98') { + clustered_fastas = CLUSTER_FASTAS(pangenome_fastas) + } else { + clustered_fastas = pangenome_fastas + } + organized_fasta_folders = ORGANIZE_PANGENOME_FASTAS(clustered_fastas.collect()) fasta_folder_ch = organized_fasta_folders.core.mix( organized_fasta_folders.softcore_95, @@ -281,7 +292,7 @@ process PANGENOME { } process EXTRACT_FASTA { - publishDir "results/pangenome-train/fastas", mode: 'copy' + publishDir "results/pangenome-train/${params.clusteringMethod}-fastas", mode: 'copy' conda "bioconda::ppanggolin" cpus params.cpus memory params.cpus * 4 + " GB" @@ -292,24 +303,66 @@ process EXTRACT_FASTA { output: path "${pangenome_h5.baseName}" + script: + def geneOpt = params.clusteringMethod == '98' ? 
'--genes' : '--gene_families' """ set -euo pipefail - ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}" --gene_families core -f - ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}" --gene_families softcore -f - ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}/sc90" --gene_families softcore -f --soft_core 0.9 - ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}/sc85" --gene_families softcore -f --soft_core 0.85 - ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}/sc80" --gene_families softcore -f --soft_core 0.8 - ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}/sc75" --gene_families softcore -f --soft_core 0.75 - ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}" --gene_families persistent -f - ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}" --gene_families shell -f - ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}" --gene_families cloud -f + ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}" ${geneOpt} core -f + ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}" ${geneOpt} softcore -f + ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}/sc90" ${geneOpt} softcore -f --soft_core 0.9 + ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}/sc85" ${geneOpt} softcore -f --soft_core 0.85 + ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}/sc80" ${geneOpt} softcore -f --soft_core 0.8 + ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}/sc75" ${geneOpt} softcore -f --soft_core 0.75 + ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}" ${geneOpt} persistent -f + ppanggolin fasta -p "${pangenome_h5}" --output "${pangenome_h5.baseName}" ${geneOpt} shell -f + ppanggolin fasta -p "${pangenome_h5}" --output 
"${pangenome_h5.baseName}" ${geneOpt} cloud -f + """ +} + +process CLUSTER_FASTAS { + publishDir "results/pangenome-train/clustered", mode: 'copy' + conda "bioconda::cd-hit bioconda::seqkit" + cpus params.cpus + memory params.cpus * 4 + " GB" + + input: + path fasta_dir + + output: + path "${fasta_dir.baseName}_clustered" + + script: + """ + set -euo pipefail + + mkdir -p "${fasta_dir.baseName}_clustered" + + find -L "${fasta_dir}" -type f -name "*.fna" | while read -r fasta_file; do + rel_path=\${fasta_file#${fasta_dir}/} + output_file="${fasta_dir.baseName}_clustered/\${rel_path}" + temp_file="\${output_file}.temp" + + mkdir -p "\$(dirname "\${output_file}")" + + seqkit rmdup -s "\${fasta_file}" -o "\${temp_file}" + + # Cluster using cd-hit-est + # -c 0.98: 98% sequence identity threshold + # -T 0: use all available threads + # -M 0: unlimited memory + # -d 0: keep full sequence descriptions + cd-hit-est -i "\${temp_file}" -o "\${output_file}" \\ + -c 0.98 -T ${task.cpus} -M 0 -d 0 + + rm -f "\${temp_file}" "\${output_file}.clstr" + done """ } process ORGANIZE_PANGENOME_FASTAS { - publishDir "results/pangenome-train/organized", mode: 'copy' + publishDir "results/pangenome-train/${params.clusteringMethod}-organized", mode: 'copy' input: path pangenome_fastas @@ -339,10 +392,13 @@ process ORGANIZE_PANGENOME_FASTAS { if not os.path.isdir(fasta_dir): continue - taxid = os.path.basename(fasta_dir) + taxid = os.path.basename(fasta_dir).split('_')[0] # Find all fasta files in the directory and subdirectories - for fasta in glob.glob(f"{fasta_dir}/**/*.fasta", recursive=True): + files = [] + files.extend(glob.glob(f"{fasta_dir}/**/*.fna", recursive=True)) + files.extend(glob.glob(f"{fasta_dir}/**/*.fasta", recursive=True)) + for fasta in sorted(set(files)): base = os.path.basename(fasta) # Determine partition type from filename @@ -398,8 +454,8 @@ process XSPECT_TRAIN { """ set -euo pipefail - xspect models train directory -g 
"Acinetobacter_${organized_fasta_folder.baseName}" -i "${organized_fasta_folder}" - echo "Acinetobacter_${organized_fasta_folder.baseName}" > "model_${organized_fasta_folder.baseName}.txt" + xspect models train directory -g "Acinetobacter_${organized_fasta_folder.baseName}_${params.clusteringMethod}" -i "${organized_fasta_folder}" + echo "Acinetobacter_${organized_fasta_folder.baseName}_${params.clusteringMethod}" > "model_${organized_fasta_folder.baseName}.txt" """ } @@ -468,11 +524,11 @@ process UPDATE_MODEL_METADATA { model_name = f.read().strip() # Determine model path based on naming convention - # Extract the partition type from model name (e.g., "Acinetobacter_core" -> "core") - partition = model_name.split('_', maxsplit=1)[-1].replace('_', '-') + # Extract the model suffix from model name (e.g., "Acinetobacter_core_98" or "Acinetobacter_core_GF") + model_suffix = model_name.split('_', maxsplit=1)[-1].replace('_', '-').lower() # Construct model file path - model_file = Path.home() / "xspect-data" / "models" / f"acinetobacter-{partition}-species.json" + model_file = Path.home() / "xspect-data" / "models" / f"acinetobacter-{model_suffix}-species.json" if model_file.exists(): # Read existing model metadata From 228584575d73dee238f349bb9ffab5945fefee64 Mon Sep 17 00:00:00 2001 From: Adrian Romberg Date: Thu, 18 Dec 2025 12:54:34 +0100 Subject: [PATCH 6/7] update gitignore --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index c25ba18..2f01ed7 100644 --- a/.gitignore +++ b/.gitignore @@ -192,3 +192,7 @@ delete.slurm playground.ipynb +old_results/ +svm.ipynb +zip.slurm +scripts/benchmarks.sh \ No newline at end of file From 257dcc29744dcc6b80cd76c5c04d1dd5e291a314 Mon Sep 17 00:00:00 2001 From: Adrian Romberg Date: Thu, 18 Dec 2025 13:27:26 +0100 Subject: [PATCH 7/7] bump version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5a29a10..3c186ec 
100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "XspecT" -version = "0.7.2" +version = "0.7.3" description = "Tool to monitor and characterize pathogens using Bloom filters." readme = {file = "README.md", content-type = "text/markdown"} license = {file = "LICENSE"}