From f6427092eededa0bc440099ca0e289a552ffd6de Mon Sep 17 00:00:00 2001
From: Anjing Liu <aliu1@alpha1.cm.cluster>
Date: Thu, 9 Apr 2026 11:56:43 -0400
Subject: [PATCH] Add trans_geno_chromosome parameter and improve trans-QTL
 pipeline

- Add `--trans-geno-chromosome` parameter to specify a different genotype
  chromosome for trans analysis, enabling phenotype chrX vs genotype chrY
  analysis
- Support both single .bed/.pgen file (filter variants by chromosome after
  loading) and manifest file (load chromosome-specific plink file) modes
- Update trans output filename: `_chrX_geno_chrY.trans_qtl` when
  trans_geno_chromosome is set, `_chrX.trans_qtl` otherwise
- Fix region_list parsing in trans step to support comment lines, no header,
  and tab-separated format
- Remove --entrypoint from help output

Note: when running trans analysis split by chromosome (e.g. phenotype chr1 x
genotype chr2, phenotype chr1 x genotype chr3, etc.), the q-values computed
per-run are NOT accurate because they are based on a subset of tests. The
q-values need to be recalculated after combining all results across runs.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../TensorQTL/TensorQTL.ipynb                 | 47 +++++++++++++++++--
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/code/association_scan/TensorQTL/TensorQTL.ipynb b/code/association_scan/TensorQTL/TensorQTL.ipynb
index 2b2912036..9a4dfdbe2 100644
--- a/code/association_scan/TensorQTL/TensorQTL.ipynb
+++ b/code/association_scan/TensorQTL/TensorQTL.ipynb
@@ -1318,6 +1318,9 @@
     "parameter: chromosome = []\n",
     "# Minor allele count cutoff\n",
     "parameter: MAC = 0\n",
+    "# Specify genotype chromosome for trans analysis (e.g. --trans-geno-chromosome 5)\n",
+    "# When set, trans step loads this genotype chrom instead of the one from input_files\n",
+    "parameter: trans_geno_chromosome = \"\"\n",
     "\n",
     "# Specify the cis window for the up and downstream radius to analyze around the region of interest in units of bp\n",
     "# This parameter will be set to zero if `customized_cis_windows` is provided.\n",
@@ -1422,7 +1425,8 @@
     "    input_chroms = [x[0] for x in input_files]\n",
     "    input_files = [x[1:] for x in input_files]\n",
     "    if len(name) == 0:\n",
-    "        name = f'{path(input_files[0][0]):bnn}' if len(input_files) == 1 else f'{path(input_files[0][0]):bnnn}'\n"
+    "        name = f'{path(input_files[0][0]):bnn}' if len(input_files) == 1 else f'{path(input_files[0][0]):bnnn}'\n",
+    ""
    ]
   },
   {
@@ -1869,7 +1873,7 @@
     "parameter: pval = 0.0\n",
     "\n",
     "input: input_files, group_by = len(input_files[0]), group_with = \"input_chroms\"\n",
-    "output: nominal = f'{cwd:a}/{_input[0]:bnn}{\"_%s\" % input_chroms[_index] if input_chroms[_index] != 0 else \"\"}.trans_qtl{\"_p_%.0e\" % pval if pval > 0.0 else \"\"}.pairs.tsv.gz'\n",
+    "output: nominal = f'{cwd:a}/{_input[0]:bnn}{\"_chr%s\" % input_chroms[_index] if input_chroms[_index] != 0 else \"\"}{\"_geno_chr%s\" % trans_geno_chromosome if trans_geno_chromosome else \"\"}.trans_qtl{\"_p_%.0e\" % pval if pval > 0.0 else \"\"}.pairs.tsv.gz'\n",
     "\n",
     "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n",
     "python: expand= \"$[ ]\", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout',container =container\n",
@@ -1896,7 +1900,7 @@
     "\n",
     "    ## Analyze only the regions listed\n",
     "    if $[region_list.is_file()]:\n",
-    "        region = pd.read_csv(\"$[region_list:a]\")\n",
+    "        region = pd.read_csv(\"$[region_list:a]\", comment=\"#\", header=None, sep=\"\\t\")\n",
     "        phenotype_column = 1 if len(region.columns) == 1 else $[region_list_phenotype_column]\n",
     "        keep_region = region.iloc[:, phenotype_column-1].astype(str).str.strip().to_list()\n",
     "        phenotype_df = phenotype_df[phenotype_df.index.isin(keep_region)]\n",
@@ -1948,7 +1952,39 @@
     "            covariates_df = covariates_df[keep_cols]\n",
     "       \n",
     "\n",
+    "    ## If trans_geno_chromosome is specified, load that genotype chrom instead\n",
+    "    trans_geno_chrom = \"$[trans_geno_chromosome]\"\n",
+    "    if trans_geno_chrom:\n",
+    "        trans_geno_chrom = trans_geno_chrom.replace(\"chr\", \"\")\n",
+    "        geno_file_str = \"$[genotype_file:a]\"\n",
+    "        if geno_file_str.endswith(\".bed\") or geno_file_str.endswith(\".pgen\"):\n",
+    "            # Single plink file: load as-is, will filter variants by chrom after loading\n",
+    "            print(f\"Trans mode: will filter genotype to chr{trans_geno_chrom} after loading\")\n",
+    "        else:\n",
+    "            # Manifest file: find the plink file for the specified chromosome\n",
+    "            geno_list_df = pd.read_csv(geno_file_str, sep=\"\\t\")\n",
+    "            geno_list_df[\"#id\"] = [x.replace(\"chr\",\"\") for x in geno_list_df[\"#id\"].astype(str)]\n",
+    "            geno_row = geno_list_df[geno_list_df[\"#id\"] == trans_geno_chrom]\n",
+    "            if len(geno_row) == 0:\n",
+    "                raise ValueError(f\"No genotype file found for chromosome {trans_geno_chrom}\")\n",
+    "            geno_plink = geno_row[\"#path\"].values[0]\n",
+    "            if geno_plink.endswith(\".bed\"):\n",
+    "                geno_plink = geno_plink[:-4]\n",
+    "            elif geno_plink.endswith(\".pgen\"):\n",
+    "                geno_plink = geno_plink[:-5]\n",
+    "            plink_prefix_path = geno_plink\n",
+    "            print(f\"Trans mode: overriding genotype to chr{trans_geno_chrom} from {plink_prefix_path}\")\n",
+    "\n",
     "    genotype_df, variant_df = genotypeio.load_genotypes(plink_prefix_path, dosages = True) \n",
+    "\n",
+    "    ## Filter genotype to specified chromosome if trans_geno_chrom is set\n",
+    "    if trans_geno_chrom:\n",
+    "        chrom_filter = variant_df[\"chrom\"].astype(str).str.replace(\"chr\", \"\") == trans_geno_chrom\n",
+    "        if chrom_filter.sum() == 0:\n",
+    "            raise ValueError(f\"No variants found for chromosome {trans_geno_chrom} in genotype data\")\n",
+    "        variant_df = variant_df[chrom_filter]\n",
+    "        genotype_df = genotype_df.loc[variant_df.index]\n",
+    "        print(f\"Trans mode: filtered to {len(variant_df)} variants on chr{trans_geno_chrom}\")\n",
     "    ## use custom sample list to subset the covariates data\n",
     "    if $[keep_sample.is_file()]:\n",
     "        sample_list = pd.read_csv(\"$[keep_sample:a]\", comment=\"#\", header=None, names=[\"sample_id\"], sep=\"\\t\")\n",
@@ -2178,7 +2214,8 @@
     "\n",
     "bash: expand= \"$[ ]\", stderr = f'{_output[\"nominal\"]:nnn}.stderr', stdout = f'{_output[\"nominal\"]:nnn}.stdout', container = container\n",
     "        bgzip --compress-level 9 $[_output[\"nominal\"]:n]\n",
-    "        tabix -S 1 -s 1 -b 2 -e 2 $[_output[\"nominal\"]]\n"
+    "        tabix -S 1 -s 1 -b 2 -e 2 $[_output[\"nominal\"]]\n",
+    ""
    ]
   }
  ],
@@ -2217,4 +2254,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
\ No newline at end of file
+}