Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -741,7 +741,7 @@
" }\n",
" if (MPPCNum == 0) stop(\"Invalid choice of methods to choose K for PCA: ${choose_k_method}\")\n",
"\n",
" MPPCsDF <- as.data.frame(residExpPC$rotated[, 1:MPPCNum])\n",
" MPPCsDF <- as.data.frame(residExpPC$rotated[, 1:MPPCNum, drop = FALSE])\n",
" MPColMatrix <- matrix(c(rep('Hidden_Factor_PC', times=MPPCNum), seq(1, MPPCNum)), ncol=2, nrow=MPPCNum)\n",
" colnames(MPPCsDF) <- apply(MPColMatrix, 1, function(X){return(paste0(X[1], X[2]))})\n",
" # Add #id Column\n",
Expand Down
26 changes: 18 additions & 8 deletions code/data_preprocessing/genotype/GWAS_QC.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1352,13 +1352,21 @@
" phenoFile <- read_delim(${_input[1]:ar}, col_names=TRUE)\n",
" if (${\"TRUE\" if sample_participant_lookup.is_file() else \"FALSE\"}) {\n",
" sample_lookup <- fread(${sample_participant_lookup:ar}, header=TRUE)\n",
" colnames(sample_lookup) <- c(\"sample_id\", \"genotype_id\") # FIXME: This is for the old lookup table with columns c(\"sample_id\", \"participant_id\")\n",
" # rename phenotype file according to lookup file\n",
" colnames(phenoFile)[-c(1:4)] <- phenoFile %>%\n",
" colnames() %>%\n",
" .[-c(1:4)] %>%\n",
" match(sample_lookup$sample_id) %>%\n",
" sample_lookup$genotype_id[.]\n",
" if (all(c(\"sample_id\", \"genotype_id\") %in% colnames(sample_lookup))) {\n",
" sample_lookup <- sample_lookup[, .(sample_id, genotype_id)]\n",
" } else if (all(c(\"sample_id\", \"participant_id\") %in% colnames(sample_lookup))) {\n",
" sample_lookup <- sample_lookup[, .(sample_id, genotype_id = participant_id)]\n",
" } else {\n",
" sample_lookup <- sample_lookup[, 1:2]\n",
" colnames(sample_lookup) <- c(\"sample_id\", \"genotype_id\")\n",
" }\n",
" sample_lookup <- unique(sample_lookup)\n",
" pheno_sample_ids <- colnames(phenoFile)[-c(1:4)]\n",
" renamed_genotype_ids <- sample_lookup$genotype_id[match(pheno_sample_ids, sample_lookup$sample_id)]\n",
" if (any(is.na(renamed_genotype_ids))) {\n",
" stop(\"Some phenotype samples could not be mapped to genotype IDs using sample_participant_lookup\")\n",
" }\n",
" colnames(phenoFile)[-c(1:4)] <- renamed_genotype_ids\n",
" output_file <- paste0(${phenoFile:nnr}, '.rename_sample.bed')\n",
" # rename phenotype file\n",
" phenoFile$start <- as.integer(phenoFile$start) \n",
Expand All @@ -1374,9 +1382,11 @@
" colnames(sample_lookup) <- c(\"genotype_id\", \"sample_id\")\n",
" }\n",
" sample_lookup <- sample_lookup %>%\n",
" transmute(genotype_id = genotype_id, sample_id = sample_id) %>%\n",
" distinct() %>%\n",
" filter(\n",
" genotype_id %in% genoFam$V2,\n",
" sample_id %in% colnames(phenoFile)\n",
" genotype_id %in% colnames(phenoFile)\n",
" )\n",
" \n",
" genoFam %>%\n",
Expand Down
24 changes: 19 additions & 5 deletions code/molecular_phenotypes/calling/RNA_calling.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -681,6 +681,13 @@
"parameter: uncompressed = False\n",
"import os\n",
"import pandas as pd\n",
"def fastqc_output_base(target):\n",
" target = target[0] if hasattr(target, '__getitem__') and not isinstance(target, (str, bytes, os.PathLike)) else target\n",
" name = os.path.basename(str(target))\n",
" for suffix in (\".fastq.gz\", \".fq.gz\", \".fastq\", \".fq\", \".bam\", \".sam\", \".cram\"):\n",
" if name.endswith(suffix):\n",
" return name[:-len(suffix)]\n",
" return os.path.splitext(name)[0]\n",
"## FIX: The way to get sample needs to be revamped to 1. Accomodate rf/fr as column 2. accomodate single end read (Only 2 fq/samples)\n",
"sample_inv = pd.read_csv(sample_list,sep = \"\\t\")\n",
"\n",
Expand All @@ -704,14 +711,21 @@
" sample_inv = sample_inv.drop(\"read_length\" , axis = 1)\n",
" \n",
"\n",
"## Read columns follow the documented fq1/fq2 contract; fall back only for legacy one/two-read-column sample sheets\n",
"file_columns = [x for x in [\"fq1\", \"fq2\"] if x in sample_inv.columns]\n",
"if len(file_columns) == 0:\n",
" legacy_file_columns = [x for x in sample_inv.columns if x != \"ID\"]\n",
" stop_if(len(legacy_file_columns) == 0 or len(legacy_file_columns) > 2,\n",
" msg = \"Sample list must contain fq1 and optional fq2 columns, or a legacy ID + one/two read-column layout after removing strand/read_length\")\n",
" file_columns = legacy_file_columns\n",
"\n",
"## Extract sample_id\n",
"sample_inv_list = sample_inv.values.tolist()\n",
"sample_inv_list = sample_inv[[\"ID\"] + file_columns].values.tolist()\n",
"sample_id = [x[0] for x in sample_inv_list]\n",
"\n",
" \n",
"## Get the file name for single/paired end data\n",
"file_inv = [x[1:] for x in sample_inv_list]\n",
"file_inv = [item for sublist in file_inv for item in sublist]\n",
"file_inv = [item for sublist in file_inv for item in sublist if pd.notna(item) and str(item).strip() != \"\"]\n",
"\n",
"raw_reads = [f'{data_dir}/{x}' for x in file_inv]\n",
"\n",
Expand Down Expand Up @@ -793,7 +807,7 @@
"source": [
"[fastqc]\n",
"input: raw_reads, group_by = 1\n",
"output: f'{cwd}/{_input:bn}_fastqc.html',f'{cwd}/{_input:bn}_fastqc.zip' \n",
"output: f'{cwd}/{fastqc_output_base(_input)}_fastqc.html',f'{cwd}/{fastqc_output_base(_input)}_fastqc.zip' \n",
"task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads\n",
"bash: expand= \"${ }\", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout', container=container\n",
" fastqc ${_input} -o ${_output[0]:d}"
Expand Down Expand Up @@ -2094,4 +2108,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}