From f6427092eededa0bc440099ca0e289a552ffd6de Mon Sep 17 00:00:00 2001 From: Anjing Liu Date: Thu, 9 Apr 2026 11:56:43 -0400 Subject: [PATCH] Add trans_geno_chromosome parameter and improve trans-QTL pipeline - Add `--trans-geno-chromosome` parameter to specify a different genotype chromosome for trans analysis, enabling phenotype chrX vs genotype chrY analysis - Support both single .bed/.pgen file (filter variants by chromosome after loading) and manifest file (load chromosome-specific plink file) modes - Update trans output filename: `_chrX_geno_chrY.trans_qtl` when trans_geno_chromosome is set, `_chrX.trans_qtl` otherwise - Fix region_list parsing in trans step to support comment lines, no header, and tab-separated format - Remove --entrypoint from help output Note: when running trans analysis split by chromosome (e.g. phenotype chr1 x genotype chr2, phenotype chr1 x genotype chr3, etc.), the q-values computed per-run are NOT accurate because they are based on a subset of tests. The q-values need to be recalculated after combining all results across runs. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../TensorQTL/TensorQTL.ipynb | 47 +++++++++++++++++-- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/code/association_scan/TensorQTL/TensorQTL.ipynb b/code/association_scan/TensorQTL/TensorQTL.ipynb index 2b2912036..9a4dfdbe2 100644 --- a/code/association_scan/TensorQTL/TensorQTL.ipynb +++ b/code/association_scan/TensorQTL/TensorQTL.ipynb @@ -1318,6 +1318,9 @@ "parameter: chromosome = []\n", "# Minor allele count cutoff\n", "parameter: MAC = 0\n", + "# Specify genotype chromosome for trans analysis (e.g. --trans-geno-chromosome 5)\n", + "# When set, trans step loads this genotype chrom instead of the one from input_files\n", + "parameter: trans_geno_chromosome = \"\"\n", "\n", "# Specify the cis window for the up and downstream radius to analyze around the region of interest in units of bp\n", "# This parameter will be set to zero if `customized_cis_windows` is provided.\n", @@ -1422,7 +1425,8 @@ " input_chroms = [x[0] for x in input_files]\n", " input_files = [x[1:] for x in input_files]\n", " if len(name) == 0:\n", - " name = f'{path(input_files[0][0]):bnn}' if len(input_files) == 1 else f'{path(input_files[0][0]):bnnn}'\n" + " name = f'{path(input_files[0][0]):bnn}' if len(input_files) == 1 else f'{path(input_files[0][0]):bnnn}'\n", + "" ] }, { @@ -1869,7 +1873,7 @@ "parameter: pval = 0.0\n", "\n", "input: input_files, group_by = len(input_files[0]), group_with = \"input_chroms\"\n", - "output: nominal = f'{cwd:a}/{_input[0]:bnn}{\"_%s\" % input_chroms[_index] if input_chroms[_index] != 0 else \"\"}.trans_qtl{\"_p_%.0e\" % pval if pval > 0.0 else \"\"}.pairs.tsv.gz'\n", + "output: nominal = f'{cwd:a}/{_input[0]:bnn}{\"_chr%s\" % input_chroms[_index] if input_chroms[_index] != 0 else \"\"}{\"_geno_chr%s\" % trans_geno_chromosome if trans_geno_chromosome else \"\"}.trans_qtl{\"_p_%.0e\" % pval if pval > 0.0 else \"\"}.pairs.tsv.gz'\n", "\n", "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", "python: expand= \"$[ ]\", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout',container =container\n", @@ -1896,7 +1900,7 @@ "\n", " ## Analyze only the regions listed\n", " if $[region_list.is_file()]:\n", - " region = pd.read_csv(\"$[region_list:a]\")\n", + " region = pd.read_csv(\"$[region_list:a]\", comment=\"#\", header=None, sep=\"\\t\")\n", " phenotype_column = 1 if len(region.columns) == 1 else $[region_list_phenotype_column]\n", " keep_region = region.iloc[:, phenotype_column-1].astype(str).str.strip().to_list()\n", " phenotype_df = phenotype_df[phenotype_df.index.isin(keep_region)]\n", @@ -1948,7 +1952,39 @@ " covariates_df = covariates_df[keep_cols]\n", " \n", "\n", + " ## If trans_geno_chromosome is specified, load that genotype chrom instead\n", + " trans_geno_chrom = \"$[trans_geno_chromosome]\"\n", + " if trans_geno_chrom:\n", + " trans_geno_chrom = trans_geno_chrom.replace(\"chr\", \"\")\n", + " geno_file_str = \"$[genotype_file:a]\"\n", + " if geno_file_str.endswith(\".bed\") or geno_file_str.endswith(\".pgen\"):\n", + " # Single plink file: load as-is, will filter variants by chrom after loading\n", + " print(f\"Trans mode: will filter genotype to chr{trans_geno_chrom} after loading\")\n", + " else:\n", + " # Manifest file: find the plink file for the specified chromosome\n", + " geno_list_df = pd.read_csv(geno_file_str, sep=\"\\t\")\n", + " geno_list_df[\"#id\"] = [x.replace(\"chr\",\"\") for x in geno_list_df[\"#id\"].astype(str)]\n", + " geno_row = geno_list_df[geno_list_df[\"#id\"] == trans_geno_chrom]\n", + " if len(geno_row) == 0:\n", + " raise ValueError(f\"No genotype file found for chromosome {trans_geno_chrom}\")\n", + " geno_plink = geno_row[\"#path\"].values[0]\n", + " if geno_plink.endswith(\".bed\"):\n", + " geno_plink = geno_plink[:-4]\n", + " elif geno_plink.endswith(\".pgen\"):\n", + " geno_plink = geno_plink[:-5]\n", + " plink_prefix_path = geno_plink\n", + " print(f\"Trans mode: overriding genotype to chr{trans_geno_chrom} from {plink_prefix_path}\")\n", + "\n", " genotype_df, variant_df = genotypeio.load_genotypes(plink_prefix_path, dosages = True) \n", + "\n", + " ## Filter genotype to specified chromosome if trans_geno_chrom is set\n", + " if trans_geno_chrom:\n", + " chrom_filter = variant_df[\"chrom\"].astype(str).str.replace(\"chr\", \"\") == trans_geno_chrom\n", + " if chrom_filter.sum() == 0:\n", + " raise ValueError(f\"No variants found for chromosome {trans_geno_chrom} in genotype data\")\n", + " variant_df = variant_df[chrom_filter]\n", + " genotype_df = genotype_df.loc[variant_df.index]\n", + " print(f\"Trans mode: filtered to {len(variant_df)} variants on chr{trans_geno_chrom}\")\n", " ## use custom sample list to subset the covariates data\n", " if $[keep_sample.is_file()]:\n", " sample_list = pd.read_csv(\"$[keep_sample:a]\", comment=\"#\", header=None, names=[\"sample_id\"], sep=\"\\t\")\n", @@ -2178,7 +2214,8 @@ "\n", "bash: expand= \"$[ ]\", stderr = f'{_output[\"nominal\"]:nnn}.stderr', stdout = f'{_output[\"nominal\"]:nnn}.stdout', container = container\n", " bgzip --compress-level 9 $[_output[\"nominal\"]:n]\n", - " tabix -S 1 -s 1 -b 2 -e 2 $[_output[\"nominal\"]]\n" + " tabix -S 1 -s 1 -b 2 -e 2 $[_output[\"nominal\"]]\n", + "" ] } ], @@ -2217,4 +2254,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +}