Skip to content

VCF Issue while running PharmCAT pipeline #223

@Sillar-codes

Description

@Sillar-codes

The pharmcat_pipeline triggers an issue when processing my vcf files.

docker run --rm -v /mnt/d/pharmcat-test/data:/pharmcat/data pgkb/pharmcat pharmcat_pipeline /pharmcat/data/out.vcf
PharmCAT version: 3.1.1



Processing /pharmcat/data/out.vcf ...
* Cataloging 1202 missing positions in /pharmcat/data/out.missing_pgx_var.vcf

Running PharmCAT...
Checking files...
* Found 1 VCF file
Saving named allele matcher JSON results to /pharmcat/data/out.match.json
Saving phenotyper JSON results to /pharmcat/data/out.phenotype.json
Saving reporter HTML results to /pharmcat/data/out.report.html

Done.

Issue: Cataloging 1202 missing positions in /pharmcat/data/out.missing_pgx_var.vcf

Here is our VCF generation python code:

import os
import stdpopsim
import random
from typing import Any, List


def parse_chromosome_spec(chromosome_spec: Any) -> List[str]:
    """
    Parse chromosome specification supporting single, multiple, and range formats.

    Supports:
    - Single: "1", "chr1", "X", 1
    - Multiple: "1,2,3", ["1", "2", "3"], "chr1,chr2,chrX"
    - Range: "1-5", "chr1-chr5", "1-22"
    - Mixed: "1-3,5,X" or "chr1-chr3,chr5,chrX"

    Args:
        chromosome_spec: Chromosome specification (int, str, or list)

    Returns:
        List of normalized chromosome strings (without "chr" prefix)
    """
    valid_chroms = [str(i) for i in range(1, 23)] + ["X", "Y", "MT", "M"]
    chromosomes = []

    # Handle list input
    if isinstance(chromosome_spec, list):
        for chrom in chromosome_spec:
            chromosomes.extend(parse_chromosome_spec(chrom))
        return chromosomes

    # Convert to string and normalize
    chrom_str = str(chromosome_spec).strip()
    if not chrom_str:
        return []

    # Remove "chr" prefix for processing
    chrom_str = chrom_str.replace("chr", "")

    # Check for comma-separated values (multiple chromosomes)
    if "," in chrom_str:
        parts = [p.strip() for p in chrom_str.split(",")]
        for part in parts:
            chromosomes.extend(parse_chromosome_spec(part))
        return chromosomes

    # Check for range (e.g., "1-5" or "chr1-chr5")
    if "-" in chrom_str:
        parts = chrom_str.split("-", 1)
        if len(parts) != 2:
            return []

        start_str = parts[0].strip()
        end_str = parts[1].strip()

        # Try to parse as numeric range
        try:
            start = int(start_str)
            end = int(end_str)

            # Validate range
            if start < 1 or end > 22 or start > end:
                return []

            # Generate range
            return [str(i) for i in range(start, end + 1)]
        except ValueError:
            # Not a numeric range, treat as invalid
            return []

    # Single chromosome - validate and return
    if chrom_str in valid_chroms:
        return [chrom_str]

    return []


def simulate(
    vcf_name: str
) -> bool:
    """Run stdpopsim simulation."""
    try:
        # Select a random integer between 10 and 20
        chromosome = random.randint(10, 20)
        chrom_value = parse_chromosome_spec(chromosome)[0]
        chromosome_chr = f"chr{chrom_value}"

        species_name = "HomSap"

        # Get species with caching and specific error handling
        species = stdpopsim.get_species(species_name)

        model = species.demographic_models[random.randint(0, len(species.demographic_models) - 1)]
        population_model = model.id
        genome_model = species.genetic_maps[random.randint(0, len(species.genetic_maps) - 1)].id
        print(chromosome, population_model, genome_model)

        # Get contig with genetic map with caching and specific error handling
        contig = species.get_contig(
            chromosome_chr,
            genetic_map=genome_model,
            mutation_rate=model.mutation_rate,
        )

        # Get engine with caching and specific error handling
        engine = stdpopsim.get_engine("msprime")

        # Try sampling from available populations until simulation succeeds.
        population_names = [p.name for p in model.populations]
        random.shuffle(population_names)
        last_exc = None
        ts = None
        for cand in population_names:
            samples = {cand: 1}
            try:
                ts = engine.simulate(model, contig, samples)
                break
            except Exception as e:
                last_exc = e
                err = str(e).lower()
                # If error indicates non-sampling population, try another candidate
                if "non-sampling population" in err or "non sampling population" in err or "non-sampling" in err:
                    continue
                # Otherwise re-raise as it's an unexpected simulation error
                raise Exception(f"Simulation failed: {e}")

        if ts is None:
            raise Exception(f"Simulation failed for all populations: {last_exc}")

        # Write to VCF with correct chromosome name
        try:
            with open(vcf_name, "w") as out:
                ts.write_vcf(out, contig_id=chromosome_chr)
        except IOError as e:
            raise Exception(f"Failed to write VCF file to {vcf_name}: {e}")
        except Exception as e:
            raise Exception(f"Error writing VCF file: {e}")

        # Verify file was created and has content
        if not os.path.exists(vcf_name) or os.path.getsize(vcf_name) == 0:
            raise Exception(f"VCF file was not created or is empty: {vcf_name}")

        return True

    except Exception as e:
        raise Exception(f"Unexpected simulation error: {e}")


def main():
    vcf_path = os.path.abspath("data/out.vcf")
    simulate(vcf_path)


if __name__ == '__main__':
    main()

I'm not sure why this issue occurs.
Any help or suggestions would be greatly appreciated.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions