StatFunGen · gaow · Apr 24, 2026 · Apr 23, 2026
diff --git a/code/data_preprocessing/covariate/covariate_hidden_factor.ipynb b/code/data_preprocessing/covariate/covariate_hidden_factor.ipynb
@@ -741,7 +741,7 @@
     "    }\n",
     "    if (MPPCNum == 0) stop(\"Invalid choice of methods to choose K for PCA: ${choose_k_method}\")\n",
     "\n",
-    "    MPPCsDF <- as.data.frame(residExpPC$rotated[, 1:MPPCNum])\n",
+    "    MPPCsDF <- as.data.frame(residExpPC$rotated[, 1:MPPCNum, drop = FALSE])\n",
     "    MPColMatrix <-  matrix(c(rep('Hidden_Factor_PC', times=MPPCNum), seq(1, MPPCNum)), ncol=2, nrow=MPPCNum)\n",
     "    colnames(MPPCsDF) <- apply(MPColMatrix, 1, function(X){return(paste0(X[1], X[2]))})\n",
     "    # Add #id Column\n",

diff --git a/code/data_preprocessing/genotype/GWAS_QC.ipynb b/code/data_preprocessing/genotype/GWAS_QC.ipynb
@@ -1352,13 +1352,21 @@
     "    phenoFile <- read_delim(${_input[1]:ar}, col_names=TRUE)\n",
     "    if (${\"TRUE\" if sample_participant_lookup.is_file() else \"FALSE\"}) {\n",
     "        sample_lookup <- fread(${sample_participant_lookup:ar}, header=TRUE)\n",
-    "        colnames(sample_lookup) <- c(\"sample_id\", \"genotype_id\") # FIXME: This is for the old lookup table with columns c(\"sample_id\", \"participant_id\")\n",
-    "        # rename phenotype file according to lookup file\n",
-    "        colnames(phenoFile)[-c(1:4)] <- phenoFile %>%\n",
-    "          colnames() %>%\n",
-    "          .[-c(1:4)] %>%\n",
-    "          match(sample_lookup$sample_id) %>%\n",
-    "          sample_lookup$genotype_id[.]\n",
+    "        if (all(c(\"sample_id\", \"genotype_id\") %in% colnames(sample_lookup))) {\n",
+    "            sample_lookup <- sample_lookup[, .(sample_id, genotype_id)]\n",
+    "        } else if (all(c(\"sample_id\", \"participant_id\") %in% colnames(sample_lookup))) {\n",
+    "            sample_lookup <- sample_lookup[, .(sample_id, genotype_id = participant_id)]\n",
+    "        } else {\n",
+    "            sample_lookup <- sample_lookup[, 1:2]\n",
+    "            colnames(sample_lookup) <- c(\"sample_id\", \"genotype_id\")\n",
+    "        }\n",
+    "        sample_lookup <- unique(sample_lookup)\n",
+    "        pheno_sample_ids <- colnames(phenoFile)[-c(1:4)]\n",
+    "        renamed_genotype_ids <- sample_lookup$genotype_id[match(pheno_sample_ids, sample_lookup$sample_id)]\n",
+    "        if (any(is.na(renamed_genotype_ids))) {\n",
+    "            stop(\"Some phenotype samples could not be mapped to genotype IDs using sample_participant_lookup\")\n",
+    "        }\n",
+    "        colnames(phenoFile)[-c(1:4)] <- renamed_genotype_ids\n",
     "      output_file <- paste0(${phenoFile:nnr}, '.rename_sample.bed')\n",
     "      # rename phenotype file\n",
     "      phenoFile$start <- as.integer(phenoFile$start) \n",
@@ -1374,9 +1382,11 @@
     "        colnames(sample_lookup) <- c(\"genotype_id\", \"sample_id\")\n",
     "    }\n",
     "    sample_lookup <- sample_lookup %>%\n",
+    "    transmute(genotype_id = genotype_id, sample_id = sample_id) %>%\n",
+    "    distinct() %>%\n",
     "    filter(\n",
     "        genotype_id %in% genoFam$V2,\n",
-    "        sample_id %in% colnames(phenoFile)\n",
+    "        genotype_id %in% colnames(phenoFile)\n",
     "    )\n",
     "    \n",
     "    genoFam %>%\n",

diff --git a/code/molecular_phenotypes/calling/RNA_calling.ipynb b/code/molecular_phenotypes/calling/RNA_calling.ipynb
@@ -681,6 +681,13 @@
     "parameter: uncompressed = False\n",
     "import os\n",
     "import pandas as pd\n",
+    "def fastqc_output_base(target):\n",
+    "    target = target[0] if hasattr(target, '__getitem__') and not isinstance(target, (str, bytes, os.PathLike)) else target\n",
+    "    name = os.path.basename(str(target))\n",
+    "    for suffix in (\".fastq.gz\", \".fq.gz\", \".fastq\", \".fq\", \".bam\", \".sam\", \".cram\"):\n",
+    "        if name.endswith(suffix):\n",
+    "            return name[:-len(suffix)]\n",
+    "    return os.path.splitext(name)[0]\n",
     "## FIX: The way to get sample needs to be revamped to 1. Accomodate rf/fr as column 2. accomodate single end read (Only 2 fq/samples)\n",
     "sample_inv = pd.read_csv(sample_list,sep = \"\\t\")\n",
     "\n",
@@ -704,14 +711,21 @@
     "    sample_inv = sample_inv.drop(\"read_length\" , axis = 1)\n",
     "    \n",
     "\n",
+    "## Read columns follow the documented fq1/fq2 contract; fall back only for legacy one/two-read-column sample sheets\n",
+    "file_columns = [x for x in [\"fq1\", \"fq2\"] if x in sample_inv.columns]\n",
+    "if len(file_columns) == 0:\n",
+    "    legacy_file_columns = [x for x in sample_inv.columns if x != \"ID\"]\n",
+    "    stop_if(len(legacy_file_columns) == 0 or len(legacy_file_columns) > 2,\n",
+    "        msg = \"Sample list must contain fq1 and optional fq2 columns, or a legacy ID + one/two read-column layout after removing strand/read_length\")\n",
+    "    file_columns = legacy_file_columns\n",
+    "\n",
     "## Extract sample_id\n",
-    "sample_inv_list = sample_inv.values.tolist()\n",
+    "sample_inv_list = sample_inv[[\"ID\"] + file_columns].values.tolist()\n",
     "sample_id = [x[0] for x in sample_inv_list]\n",
     "\n",
-    "    \n",
     "## Get the file name for single/paired end data\n",
     "file_inv = [x[1:] for x in sample_inv_list]\n",
-    "file_inv = [item for sublist in file_inv for item in sublist]\n",
+    "file_inv = [item for sublist in file_inv for item in sublist if pd.notna(item) and str(item).strip() != \"\"]\n",
     "\n",
     "raw_reads = [f'{data_dir}/{x}' for x in file_inv]\n",
     "\n",
@@ -793,7 +807,7 @@
    "source": [
     "[fastqc]\n",
     "input: raw_reads, group_by =  1\n",
-    "output: f'{cwd}/{_input:bn}_fastqc.html',f'{cwd}/{_input:bn}_fastqc.zip' \n",
+    "output: f'{cwd}/{fastqc_output_base(_input)}_fastqc.html',f'{cwd}/{fastqc_output_base(_input)}_fastqc.zip' \n",
     "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads\n",
     "bash: expand= \"${ }\", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout', container=container\n",
     "    fastqc ${_input} -o ${_output[0]:d}"
@@ -2094,4 +2108,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
+}