diff --git a/README.md b/README.md index 31a339a..46002f3 100644 --- a/README.md +++ b/README.md @@ -56,12 +56,12 @@ No compilation is required for ntRoot (only the dependencies), so simply add the ## Usage ``` -usage: ntroot [-h] [-r REFERENCE] [--reads READS] [--genome GENOME [GENOME ...]] -l L [-k K] [--tile TILE] [--lai] [-t T] [-z Z] [-j J] [-Y Y] [--custom_vcf CUSTOM_VCF] - [--strip_info] [-v] [-V] [-n] [-f] +usage: ntroot [-h] [-r REFERENCE] [--reads READS] [--genome GENOME [GENOME ...]] -l L [--exome] [-k K] [--tile TILE] [--lai] [-t T] [-z Z] [-j J] [--cutoff CUTOFF] [-Y Y] + [--custom_vcf CUSTOM_VCF] [--masked] [--exome_bed EXOME_BED] [--strip_info] [-v] [-V] [-n] [-f] ntRoot: Ancestry inference from genomic data -optional arguments: +options: -h, --help show this help message and exit -r REFERENCE, --reference REFERENCE Reference genome (FASTA, Multi-FASTA, and/or gzipped compatible) @@ -69,15 +69,20 @@ optional arguments: --genome GENOME [GENOME ...] Genome assembly file(s) for detecting SNVs compared to --reference -l L input IVC VCF file with annotated variants (e.g., 1000GP_integrated_snv_v2a_27022019.GRCh38.phased_gt1.vcf.gz, clinvar.vcf, etc.) + --exome Input reads for detecting SNVs are from whole exome sequencing. If provided, must also specify either --exome_bed or --masked. --cutoff 2 is implied unless otherwise specified. -k K k-mer size --tile TILE Tile size for ancestry fraction inference (bp) [default=5000000] --lai Output ancestry predictons per tile in a separate output file -t T Number of threads [default=4] -z Z Minimum contig length [default=100] -j J controls size of k-mer subset. When checking subset of k-mers, check every jth k-mer [default=3] + --cutoff CUTOFF Minimum coverage of k-mers in ntEdit Bloom filter. Solid k-mers are used if set to 0 [0] -Y Y Ratio of number of k-mers in the k subset that should be present to accept an edit (higher=stringent) [default=0.55] --custom_vcf CUSTOM_VCF Input VCF for computing ancestry. When specified, ntRoot will skip the ntEdit step, and predict ancestry from the provided VCF. + --masked Exome Mode (--exome) only: Indicates that the reference genome provided with --reference has all NON-targeted exonic regions hard-masked. + --exome_bed EXOME_BED + Exome Mode (--exome) only: BED file of exome targeted regions. --strip_info When using --custom_vcf, strip the existing INFO field from the input VCF. -v, --verbose Verbose mode [default=False] -V, --version show program's version number and exit @@ -117,6 +122,7 @@ GRCh38.fa.gz readme +**Running ntRoot with whole genome sequencing reads or genome assemblies** Users will specify:
@@ -128,6 +134,19 @@ Example command: ntroot -k 55 --reference GRCh38.fa.gz --reads ERR3242308_ -t 48 -Y 0.55 -l 1000GP_integrated_snv_v2a_27022019.GRCh38.phased_gt1.vcf.gz+**Running ntRoot with whole exome sequencing** + +If your input reads are from whole exome sequencing, the regions of your reference genome that are NOT targeted exonic regions should be hard-masked (converted to Ns): +
+ntroot -k 55 --exome --masked --reference masked_GRCh38.fa.gz --reads test_exome -t 48 -Y 0.55 -l 1000GP_integrated_snv_v2a_27022019.GRCh38.phased_gt1.vcf.gz ++ntRoot can perform the masking automatically if you do not already have a masked reference file. In that case, provide a BED file with all the targeted regions, and ntRoot will use bedtools to mask the reference regions that are NOT targeted regions: +
+ntroot -k 55 --exome --exome_bed exome_seq_regions.bed --reference GRCh38.fa.gz --reads test_exome -t 48 -Y 0.55 -l 1000GP_integrated_snv_v2a_27022019.GRCh38.phased_gt1.vcf.gz ++ +**Running ntRoot using a pre-existing VCF file:** + If you would like to infer ancestry from a pre-existing VCF file:
ntroot -r GRCh38.fa.gz --custom_vcf third_party.vcf -l 1000GP_integrated_snv_v2a_27022019.GRCh38.phased_gt1.vcf.gz
diff --git a/demo/chr20-21.fa.gz b/demo/chr20-21.fa.gz
new file mode 100644
index 0000000..d288e21
Binary files /dev/null and b/demo/chr20-21.fa.gz differ
diff --git a/demo/pop-spec-snp_chr20-21.vcf.gz b/demo/pop-spec-snp_chr20-21.vcf.gz
new file mode 100644
index 0000000..bfcedb4
Binary files /dev/null and b/demo/pop-spec-snp_chr20-21.vcf.gz differ
diff --git a/demo/run_ntroot_demo.sh b/demo/run_ntroot_demo.sh
index 75884c1..e8a234d 100755
--- a/demo/run_ntroot_demo.sh
+++ b/demo/run_ntroot_demo.sh
@@ -6,8 +6,12 @@ echo "Please ensure that ntRoot is on your PATH"
set -eux -o pipefail
echo "Running ntRoot reads demo..."
-wget https://www.bcgsc.ca/downloads/btl/ntroot/reads_demo/ERR3239334.chr21_1.fq.gz
-wget https://www.bcgsc.ca/downloads/btl/ntroot/reads_demo/ERR3239334.chr21_2.fq.gz
+if [ ! -f ERR3239334.chr21_1.fq.gz ]; then
+ wget https://www.bcgsc.ca/downloads/btl/ntroot/reads_demo/ERR3239334.chr21_1.fq.gz
+fi
+if [ ! -f ERR3239334.chr21_2.fq.gz ]; then
+ wget https://www.bcgsc.ca/downloads/btl/ntroot/reads_demo/ERR3239334.chr21_2.fq.gz
+fi
ntroot --reference chr21.fa --reads ERR3239334.chr21_ -k 55 -l pop-spec-snp_chr21.vcf.gz
@@ -42,6 +46,34 @@ else
echo "ntRoot input VCF test failed.. Please check your installation."
exit 1
fi
-
+
+echo "Running ntRoot --exome reads demo with --exome_bed..."
+
+if [ ! -f HG00864_ERR050736.chr20-21.fastq.gz ]; then
+ wget https://www.bcgsc.ca/downloads/btl/ntroot/reads_demo/HG00864_ERR050736.chr20-21.fastq.gz
+fi
+if [ ! -f HG00864_ERR050737.chr20-21.fastq.gz ]; then
+ wget https://www.bcgsc.ca/downloads/btl/ntroot/reads_demo/HG00864_ERR050737.chr20-21.fastq.gz
+fi
+
+ntroot --reference chr20-21.fa.gz --reads HG00864 -k 55 -l pop-spec-snp_chr20-21.vcf.gz --exome --exome_bed exome_targets.bed
+prediction=$(cat HG00864_ntedit_k55_exome_variants.vcf_ancestry-predictions_tile5000000.tsv | awk '{print $1}' |head -n 2 |tail -n 1)
+if [ ${prediction} == "EAS" ]; then
+ echo "ntRoot --exome reads test successful!"
+else
+ echo "ntRoot --exome reads test failed.. Please check your installation."
+ exit 1
+fi
+
+echo "Running ntRoot --exome demo with --masked (input reference already masked based on exon target coordinates)"
+ntroot --exome --reference masked_chr20-21.fa.gz --reads HG00864 -k 55 -l pop-spec-snp_chr20-21.vcf.gz --masked --force
+prediction=$(cat HG00864_ntedit_k55_exome_variants.vcf_ancestry-predictions_tile5000000.tsv | awk '{print $1}' |head -n 2 |tail -n 1)
+if [ ${prediction} == "EAS" ]; then
+ echo "ntRoot --exome masked test successful!"
+else
+ echo "ntRoot --exome masked test failed.. Please check your installation."
+ exit 1
+fi
+
echo "Done ntRoot tests!"
diff --git a/ntroot b/ntroot
index b9b205b..87aa3bc 100755
--- a/ntroot
+++ b/ntroot
@@ -34,6 +34,9 @@ def set_up_parser():
parser.add_argument("-l",
help="input IVC VCF file with annotated variants (e.g., 1000GP_integrated_snv_v2a_27022019.GRCh38.phased_gt1.vcf.gz, clinvar.vcf, etc.)",
type=str, required=True)
+ parser.add_argument("--exome", help="Input reads for detecting SNVs are from whole exome sequencing. "
+ "If provided, must also specify either --exome_bed or --masked. --cutoff 2 is implied unless otherwise specified.",
+ action="store_true")
parser.add_argument("-k",
help="k-mer size",
@@ -50,6 +53,8 @@ def set_up_parser():
help="controls size of k-mer subset. When checking subset of k-mers, check every jth k-mer "
"[default=3]",
default=3, type=int)
+ parser.add_argument("--cutoff", help="Minimum coverage of k-mers in ntEdit Bloom filter. "
+ "Solid k-mers are used if set to 0 [0]", default=0, type=int)
parser.add_argument("-Y",
help="Ratio of number of k-mers in the k subset that should be present to accept "
"an edit (higher=stringent) "
@@ -58,7 +63,14 @@ def set_up_parser():
"When specified, ntRoot will skip the ntEdit step, and "
"predict ancestry from the provided VCF.",
type=str)
- parser.add_argument("--strip_info", help="When using --custom_vcf, strip the existing INFO field from the input VCF.",
+ parser.add_argument("--masked", help="Exome Mode (--exome) only: "
+ "Indicates that the reference genome provided with --reference "
+ "has all NON-targeted exonic regions hard-masked. ",
+ action="store_true")
+ parser.add_argument("--exome_bed", help="Exome Mode (--exome) only: BED file of exome targeted regions. ",
+ type=str)
+ parser.add_argument("--strip_info", help="When using --custom_vcf, "
+ "strip the existing INFO field from the input VCF.",
action="store_true")
parser.add_argument("-v", "--verbose",
help="Verbose mode [default=False]", action="store_true", default=False)
@@ -80,6 +92,25 @@ def main():
if ((args.reads and args.genome) or (not args.reads and not args.genome)) and not args.custom_vcf:
raise argparse.ArgumentTypeError("Please specify --reads OR --genome")
+ if args.exome and not (args.masked or args.exome_bed):
+ raise argparse.ArgumentTypeError("In exome mode, please specify either --masked or --exome_bed")
+
+ if args.exome and (args.masked and args.exome_bed):
+ raise argparse.ArgumentTypeError("In exome mode, please specify either --masked OR --exome_bed")
+
+ if not args.exome and (args.masked or args.exome_bed):
+ raise argparse.ArgumentTypeError("Parameters --masked and --exome_bed are only used only in exome mode")
+
+ if args.exome and args.masked:
+ print("--exome and --masked specified. Assuming reference genome provided with --reference has all"
+ " NON-targeted exonic regions hard-masked.", flush=True)
+
+ if args.exome and args.cutoff < 2:
+ args.cutoff = 2
+ print("In exome mode, minimum k-mer coverage cutoff is set to 2 by default "
+ "unless explicitly set higher. Setting cutoff to 2.",
+ flush=True)
+
if not args.draft and not args.reference:
raise argparse.ArgumentTypeError("Please specify the reference genome with --reference")
if args.draft:
@@ -95,7 +126,11 @@ def main():
"Parameter settings:"]
if args.reads:
- if args.lai:
+ if args.exome and args.lai:
+ smk_rule = "ntroot_reads_exome_lai"
+ elif args.exome:
+ smk_rule = "ntroot_reads_exome"
+ elif args.lai:
smk_rule = "ntroot_reads_lai"
else:
smk_rule = "ntroot_reads"
@@ -127,6 +162,19 @@ def main():
intro_string.append(f"\t--reads {args.reads}")
command += f"reads={args.reads}"
+ intro_string.append(f"\t--cutoff {args.cutoff}")
+ command += f" cutoff={args.cutoff}"
+
+ if args.exome:
+ intro_string.append("\t--exome")
+ command += " exome=True"
+ if args.masked:
+ intro_string.append("\t--masked")
+ command += " masked=True"
+ elif args.exome_bed:
+ intro_string.append(f"\t--exome_bed {args.exome_bed}")
+ command += f" exome_bed={args.exome_bed}"
+
intro_string.extend([f"\t--reference {args.reference}",
f"\t-t {args.t}"])
diff --git a/ntroot_run_pipeline.smk b/ntroot_run_pipeline.smk
index 7cd9b5d..85ed11a 100644
--- a/ntroot_run_pipeline.smk
+++ b/ntroot_run_pipeline.smk
@@ -7,6 +7,8 @@ import shutil
onsuccess:
shutil.rmtree(".snakemake", ignore_errors=True)
+ruleorder: exome_mask_fasta > exome_masked_provided
+
# Read parameters from config or set default values
reference=config["reference"]
draft_base = os.path.basename(os.path.realpath(reference))
@@ -28,6 +30,7 @@ v = config["verbose"] if "verbose" in config else 0
j = config["j_param"] if "j_param" in config else 3
Y = config["Y_param"] if "Y_param" in config else 0.55
l = config["l_vcf"] if "l_vcf" in config else ""
+cutoff = config["cutoff"] if "cutoff" in config else 0
# Ancestry inference parameters
tile_size = config["tile_size"] if "tile_size" in config else 5000000
@@ -37,6 +40,12 @@ input_vcf = config["input_vcf"] if "input_vcf" in config else None
input_vcf_basename = os.path.basename(os.path.realpath(input_vcf)) if input_vcf else "None"
strip_info = config["strip_info"] if "strip_info" in config else None
+# Exome parameters
+exome = config["exome"] if "exome" in config else False
+masked = config["masked"] if "masked" in config else False
+exome_bed = config["exome_bed"] if "exome_bed" in config else ""
+exome_bed_base = os.path.basename(os.path.realpath(exome_bed)) if exome_bed else "None"
+
# time command
mac_time_command = "command time -l -o"
linux_time_command = "command time -v -o"
@@ -51,12 +60,18 @@ rule ntroot_genome:
rule ntroot_reads:
input: f"{reads_prefix}_ntedit_k{k}_variants.vcf_ancestry-predictions_tile{tile_size}.tsv"
+rule ntroot_reads_exome:
+ input: f"{reads_prefix}_ntedit_k{k}_exome_variants.vcf_ancestry-predictions_tile{tile_size}.tsv"
+
rule ntroot_genome_lai:
input: f"{genome_prefix}_ntedit_k{k}_variants.vcf_ancestry-predictions-tile-resolution_tile{tile_size}.tsv"
rule ntroot_reads_lai:
input: f"{reads_prefix}_ntedit_k{k}_variants.vcf_ancestry-predictions-tile-resolution_tile{tile_size}.tsv"
+rule ntroot_reads_exome_lai:
+ input: f"{reads_prefix}_ntedit_k{k}_exome_variants.vcf_ancestry-predictions-tile-resolution_tile{tile_size}.tsv"
+
rule ntroot_input_vcf:
input: f"{input_vcf_basename}.cross-ref.vcf_ancestry-predictions_tile{tile_size}.tsv"
@@ -73,28 +88,32 @@ rule ntedit_reads:
out_bf = temp(f"{reads_prefix}_k{k}.bf")
params:
benchmark = f"{time_command} ntedit_snv_k{k}.time",
- params = f"-k {k} -t {t} -z {z} -j {j} -Y {Y} --solid ",
+ params = f"-k {k} -t {t} -z {z} -j {j} -Y {Y}",
+ cutoff = f"--cutoff {cutoff}" if cutoff > 0 else "--solid",
vcf_input = f"-l {l}" if l else ""
shell:
- "{params.benchmark} run-ntedit snv --reference {reference} --reads {reads_prefix_full} {params.params} "
- "{params.vcf_input}"
+ "{params.benchmark} run-ntedit snv --reference {input.reference} --reads {reads_prefix_full} {params.params} "
+ "{params.vcf_input} {params.cutoff}"
-rule ntedit_genome:
+rule ntedit_exome_reads:
input:
- reference = reference,
- genomes = genomes
+ reference = f"masked_{draft_base}"
output:
- out_vcf = f"{genome_prefix}_ntedit_k{k}_variants.vcf",
- out_fa = temp(f"{genome_prefix}_ntedit_k{k}_edited.fa"),
- out_changes = temp(f"{genome_prefix}_ntedit_k{k}_changes.tsv"),
- out_bf = temp(f"{genome_prefix}_k{k}.bf")
+ out_vcf = f"{reads_prefix}_ntedit_k{k}_exome_variants.vcf",
+ out_fa = temp(f"{reads_prefix}_ntedit_k{k}_edited.fa"),
+ out_changes = temp(f"{reads_prefix}_ntedit_k{k}_changes.tsv"),
+ out_bf = temp(f"{reads_prefix}_k{k}.bf")
params:
benchmark = f"{time_command} ntedit_snv_k{k}.time",
params = f"-k {k} -t {t} -z {z} -j {j} -Y {Y}",
- vcf_input = f"-l {l}" if l else ""
+ cutoff = f"--cutoff {cutoff}" if cutoff > 0 else "--solid",
+ vcf_input = f"-l {l}" if l else "",
+ out_vcf_tmp = f"{reads_prefix}_ntedit_k{k}_variants.vcf"
shell:
- "{params.benchmark} run-ntedit snv --reference {reference} --genome {input.genomes} {params.params} "
- " {params.vcf_input}"
+ """
+ {params.benchmark} run-ntedit snv --reference {input.reference} --reads {reads_prefix_full} {params.params} {params.vcf_input} {params.cutoff}
+ mv {params.out_vcf_tmp} {output.out_vcf}
+ """
rule samtools_faidx:
input: reference = reference
@@ -107,6 +126,64 @@ rule samtools_faidx:
else:
shell("{params.benchmark} samtools faidx -o {output.out_fai} {input.reference}")
+rule exome_masked_provided:
+ input:
+ reference = reference
+ output:
+ masked_reference = f"masked_{draft_base}"
+ shell:
+ "ln -sf {input.reference} {output.masked_reference}"
+
+rule exome_sort_bed:
+ input:
+ bed = exome_bed,
+ ref_fai = rules.samtools_faidx.output
+ output:
+ sorted_bed = temp(f"{exome_bed_base}.sorted.bed")
+ shell:
+ "bedtools sort -i {input.bed} -faidx {input.ref_fai} > {output.sorted_bed}"
+
+rule exome_complement_bed:
+ input:
+ sorted_bed = f"{exome_bed_base}.sorted.bed",
+ ref_fai = rules.samtools_faidx.output
+ output:
+ complement_bed = temp(f"{exome_bed_base}.sorted.bed.complement.bed")
+ shell:
+ "bedtools complement -i {input.sorted_bed} -g {input.ref_fai} > {output.complement_bed}"
+
+rule exome_mask_fasta:
+ input:
+ reference = reference,
+ complement_bed = rules.exome_complement_bed.output
+ output:
+ masked_reference = f"masked_{draft_base}"
+ run:
+ if input.reference.endswith(".gz"):
+ shell("bedtools maskfasta -fi <(gunzip -c {input.reference}) -bed {input.complement_bed} -fo tmp_{output.masked_reference}")
+ shell("gzip -c tmp_{output.masked_reference} > {output.masked_reference}")
+ shell("rm tmp_{output.masked_reference}")
+ else:
+ shell("bedtools maskfasta -fi {input.reference} -bed {input.complement_bed} -fo {output.masked_reference}")
+
+rule ntedit_genome:
+ input:
+ reference = reference,
+ genomes = genomes
+ output:
+ out_vcf = f"{genome_prefix}_ntedit_k{k}_variants.vcf",
+ out_fa = temp(f"{genome_prefix}_ntedit_k{k}_edited.fa"),
+ out_changes = temp(f"{genome_prefix}_ntedit_k{k}_changes.tsv"),
+ out_bf = temp(f"{genome_prefix}_k{k}.bf")
+ params:
+ benchmark = f"{time_command} ntedit_snv_k{k}.time",
+ params = f"-k {k} -t {t} -z {z} -j {j} -Y {Y}",
+ vcf_input = f"-l {l}" if l else ""
+ shell:
+ "{params.benchmark} run-ntedit snv --reference {reference} --genome {input.genomes} {params.params} "
+ " {params.vcf_input}"
+
+
rule ancestry_prediction:
input:
vcf = "{vcf}"