diff --git a/code/data_preprocessing/covariate/covariate_hidden_factor.ipynb b/code/data_preprocessing/covariate/covariate_hidden_factor.ipynb index 47500525c..2bfcda48a 100644 --- a/code/data_preprocessing/covariate/covariate_hidden_factor.ipynb +++ b/code/data_preprocessing/covariate/covariate_hidden_factor.ipynb @@ -741,7 +741,7 @@ " }\n", " if (MPPCNum == 0) stop(\"Invalid choice of methods to choose K for PCA: ${choose_k_method}\")\n", "\n", - " MPPCsDF <- as.data.frame(residExpPC$rotated[, 1:MPPCNum])\n", + " MPPCsDF <- as.data.frame(residExpPC$rotated[, 1:MPPCNum, drop = FALSE])\n", " MPColMatrix <- matrix(c(rep('Hidden_Factor_PC', times=MPPCNum), seq(1, MPPCNum)), ncol=2, nrow=MPPCNum)\n", " colnames(MPPCsDF) <- apply(MPColMatrix, 1, function(X){return(paste0(X[1], X[2]))})\n", " # Add #id Column\n", diff --git a/code/data_preprocessing/genotype/GWAS_QC.ipynb b/code/data_preprocessing/genotype/GWAS_QC.ipynb index 788f8650f..9345ec8c0 100644 --- a/code/data_preprocessing/genotype/GWAS_QC.ipynb +++ b/code/data_preprocessing/genotype/GWAS_QC.ipynb @@ -1352,13 +1352,21 @@ " phenoFile <- read_delim(${_input[1]:ar}, col_names=TRUE)\n", " if (${\"TRUE\" if sample_participant_lookup.is_file() else \"FALSE\"}) {\n", " sample_lookup <- fread(${sample_participant_lookup:ar}, header=TRUE)\n", - " colnames(sample_lookup) <- c(\"sample_id\", \"genotype_id\") # FIXME: This is for the old lookup table with columns c(\"sample_id\", \"participant_id\")\n", - " # rename phenotype file according to lookup file\n", - " colnames(phenoFile)[-c(1:4)] <- phenoFile %>%\n", - " colnames() %>%\n", - " .[-c(1:4)] %>%\n", - " match(sample_lookup$sample_id) %>%\n", - " sample_lookup$genotype_id[.]\n", + " if (all(c(\"sample_id\", \"genotype_id\") %in% colnames(sample_lookup))) {\n", + " sample_lookup <- sample_lookup[, .(sample_id, genotype_id)]\n", + " } else if (all(c(\"sample_id\", \"participant_id\") %in% colnames(sample_lookup))) {\n", + " sample_lookup <- sample_lookup[, .(sample_id, genotype_id = participant_id)]\n", + " } else {\n", + " sample_lookup <- sample_lookup[, 1:2]\n", + " colnames(sample_lookup) <- c(\"sample_id\", \"genotype_id\")\n", + " }\n", + " sample_lookup <- unique(sample_lookup)\n", + " pheno_sample_ids <- colnames(phenoFile)[-c(1:4)]\n", + " renamed_genotype_ids <- sample_lookup$genotype_id[match(pheno_sample_ids, sample_lookup$sample_id)]\n", + " if (any(is.na(renamed_genotype_ids))) {\n", + " stop(\"Some phenotype samples could not be mapped to genotype IDs using sample_participant_lookup\")\n", + " }\n", + " colnames(phenoFile)[-c(1:4)] <- renamed_genotype_ids\n", " output_file <- paste0(${phenoFile:nnr}, '.rename_sample.bed')\n", " # rename phenotype file\n", " phenoFile$start <- as.integer(phenoFile$start) \n", @@ -1374,9 +1382,11 @@ " colnames(sample_lookup) <- c(\"genotype_id\", \"sample_id\")\n", " }\n", " sample_lookup <- sample_lookup %>%\n", + " transmute(genotype_id = genotype_id, sample_id = sample_id) %>%\n", + " distinct() %>%\n", " filter(\n", " genotype_id %in% genoFam$V2,\n", - " sample_id %in% colnames(phenoFile)\n", + " genotype_id %in% colnames(phenoFile)\n", " )\n", " \n", " genoFam %>%\n", diff --git a/code/molecular_phenotypes/calling/RNA_calling.ipynb b/code/molecular_phenotypes/calling/RNA_calling.ipynb index 5306913ab..c1d46c733 100644 --- a/code/molecular_phenotypes/calling/RNA_calling.ipynb +++ b/code/molecular_phenotypes/calling/RNA_calling.ipynb @@ -681,6 +681,13 @@ "parameter: uncompressed = False\n", "import os\n", "import pandas as pd\n", + "def fastqc_output_base(target):\n", + " target = target[0] if hasattr(target, '__getitem__') and not isinstance(target, (str, bytes, os.PathLike)) else target\n", + " name = os.path.basename(str(target))\n", + " for suffix in (\".fastq.gz\", \".fq.gz\", \".fastq\", \".fq\", \".bam\", \".sam\", \".cram\"):\n", + " if name.endswith(suffix):\n", + " return name[:-len(suffix)]\n", + " return os.path.splitext(name)[0]\n", "## FIX: The way to get sample needs to be revamped to 1. Accomodate rf/fr as column 2. accomodate single end read (Only 2 fq/samples)\n", "sample_inv = pd.read_csv(sample_list,sep = \"\\t\")\n", "\n", @@ -704,14 +711,21 @@ " sample_inv = sample_inv.drop(\"read_length\" , axis = 1)\n", " \n", "\n", + "## Read columns follow the documented fq1/fq2 contract; fall back only for legacy one/two-read-column sample sheets\n", + "file_columns = [x for x in [\"fq1\", \"fq2\"] if x in sample_inv.columns]\n", + "if len(file_columns) == 0:\n", + " legacy_file_columns = [x for x in sample_inv.columns if x != \"ID\"]\n", + " stop_if(len(legacy_file_columns) == 0 or len(legacy_file_columns) > 2,\n", + " msg = \"Sample list must contain fq1 and optional fq2 columns, or a legacy ID + one/two read-column layout after removing strand/read_length\")\n", + " file_columns = legacy_file_columns\n", + "\n", "## Extract sample_id\n", - "sample_inv_list = sample_inv.values.tolist()\n", + "sample_inv_list = sample_inv[[\"ID\"] + file_columns].values.tolist()\n", "sample_id = [x[0] for x in sample_inv_list]\n", "\n", - " \n", "## Get the file name for single/paired end data\n", "file_inv = [x[1:] for x in sample_inv_list]\n", - "file_inv = [item for sublist in file_inv for item in sublist]\n", + "file_inv = [item for sublist in file_inv for item in sublist if pd.notna(item) and str(item).strip() != \"\"]\n", "\n", "raw_reads = [f'{data_dir}/{x}' for x in file_inv]\n", "\n", @@ -793,7 +807,7 @@ "source": [ "[fastqc]\n", "input: raw_reads, group_by = 1\n", - "output: f'{cwd}/{_input:bn}_fastqc.html',f'{cwd}/{_input:bn}_fastqc.zip' \n", + "output: f'{cwd}/{fastqc_output_base(_input)}_fastqc.html',f'{cwd}/{fastqc_output_base(_input)}_fastqc.zip' \n", "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads\n", "bash: expand= \"${ }\", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout', container=container\n", " fastqc ${_input} -o ${_output[0]:d}" @@ -2094,4 +2108,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +}