-
Notifications
You must be signed in to change notification settings - Fork 53
Open
Description
The pharmcat_pipeline triggers an issue when processing my vcf files.
docker run --rm -v /mnt/d/pharmcat-test/data:/pharmcat/data pgkb/pharmcat pharmcat_pipeline /pharmcat/data/out.vcf
PharmCAT version: 3.1.1
Processing /pharmcat/data/out.vcf ...
* Cataloging 1202 missing positions in /pharmcat/data/out.missing_pgx_var.vcf
Running PharmCAT...
Checking files...
* Found 1 VCF file
Saving named allele matcher JSON results to /pharmcat/data/out.match.json
Saving phenotyper JSON results to /pharmcat/data/out.phenotype.json
Saving reporter HTML results to /pharmcat/data/out.report.html
Done.
Issue: Cataloging 1202 missing positions in /pharmcat/data/out.missing_pgx_var.vcf
Here is our VCF generation python code:
import os
import stdpopsim
import random
from typing import Any, List
def parse_chromosome_spec(chromosome_spec: Any) -> List[str]:
"""
Parse chromosome specification supporting single, multiple, and range formats.
Supports:
- Single: "1", "chr1", "X", 1
- Multiple: "1,2,3", ["1", "2", "3"], "chr1,chr2,chrX"
- Range: "1-5", "chr1-chr5", "1-22"
- Mixed: "1-3,5,X" or "chr1-chr3,chr5,chrX"
Args:
chromosome_spec: Chromosome specification (int, str, or list)
Returns:
List of normalized chromosome strings (without "chr" prefix)
"""
valid_chroms = [str(i) for i in range(1, 23)] + ["X", "Y", "MT", "M"]
chromosomes = []
# Handle list input
if isinstance(chromosome_spec, list):
for chrom in chromosome_spec:
chromosomes.extend(parse_chromosome_spec(chrom))
return chromosomes
# Convert to string and normalize
chrom_str = str(chromosome_spec).strip()
if not chrom_str:
return []
# Remove "chr" prefix for processing
chrom_str = chrom_str.replace("chr", "")
# Check for comma-separated values (multiple chromosomes)
if "," in chrom_str:
parts = [p.strip() for p in chrom_str.split(",")]
for part in parts:
chromosomes.extend(parse_chromosome_spec(part))
return chromosomes
# Check for range (e.g., "1-5" or "chr1-chr5")
if "-" in chrom_str:
parts = chrom_str.split("-", 1)
if len(parts) != 2:
return []
start_str = parts[0].strip()
end_str = parts[1].strip()
# Try to parse as numeric range
try:
start = int(start_str)
end = int(end_str)
# Validate range
if start < 1 or end > 22 or start > end:
return []
# Generate range
return [str(i) for i in range(start, end + 1)]
except ValueError:
# Not a numeric range, treat as invalid
return []
# Single chromosome - validate and return
if chrom_str in valid_chroms:
return [chrom_str]
return []
def simulate(
vcf_name: str
) -> bool:
"""Run stdpopsim simulation."""
try:
# Select a random integer between 10 and 20
chromosome = random.randint(10, 20)
chrom_value = parse_chromosome_spec(chromosome)[0]
chromosome_chr = f"chr{chrom_value}"
species_name = "HomSap"
# Get species with caching and specific error handling
species = stdpopsim.get_species(species_name)
model = species.demographic_models[random.randint(0, len(species.demographic_models) - 1)]
population_model = model.id
genome_model = species.genetic_maps[random.randint(0, len(species.genetic_maps) - 1)].id
print(chromosome, population_model, genome_model)
# Get contig with genetic map with caching and specific error handling
contig = species.get_contig(
chromosome_chr,
genetic_map=genome_model,
mutation_rate=model.mutation_rate,
)
# Get engine with caching and specific error handling
engine = stdpopsim.get_engine("msprime")
# Try sampling from available populations until simulation succeeds.
population_names = [p.name for p in model.populations]
random.shuffle(population_names)
last_exc = None
ts = None
for cand in population_names:
samples = {cand: 1}
try:
ts = engine.simulate(model, contig, samples)
break
except Exception as e:
last_exc = e
err = str(e).lower()
# If error indicates non-sampling population, try another candidate
if "non-sampling population" in err or "non sampling population" in err or "non-sampling" in err:
continue
# Otherwise re-raise as it's an unexpected simulation error
raise Exception(f"Simulation failed: {e}")
if ts is None:
raise Exception(f"Simulation failed for all populations: {last_exc}")
# Write to VCF with correct chromosome name
try:
with open(vcf_name, "w") as out:
ts.write_vcf(out, contig_id=chromosome_chr)
except IOError as e:
raise Exception(f"Failed to write VCF file to {vcf_name}: {e}")
except Exception as e:
raise Exception(f"Error writing VCF file: {e}")
# Verify file was created and has content
if not os.path.exists(vcf_name) or os.path.getsize(vcf_name) == 0:
raise Exception(f"VCF file was not created or is empty: {vcf_name}")
return True
except Exception as e:
raise Exception(f"Unexpected simulation error: {e}")
def main():
vcf_path = os.path.abspath("data/out.vcf")
simulate(vcf_path)
if __name__ == '__main__':
main()
I'm not sure why this issue occurs.
Any help or suggestions would be greatly appreciated.
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels