From 04e4b3981293089f3f67058cd82bf866027daddd Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Tue, 17 Mar 2015 10:13:42 -0700 Subject: [PATCH] Optimizations to R ETL process (esp. loading). - Directly reads from .txt file instead of saving out to .Rdata first then reading back again. Prototyped for Regression. - Even if the .Rdata step is desired, using fread() has much better performance. --- code/R_benchmark/generate_Rdata.R | 15 +++++--- code/R_benchmark/vanilla_R_benchmark.R | 49 ++++++++++++++------------ 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/code/R_benchmark/generate_Rdata.R b/code/R_benchmark/generate_Rdata.R index e7f0e58..a300f3d 100644 --- a/code/R_benchmark/generate_Rdata.R +++ b/code/R_benchmark/generate_Rdata.R @@ -1,3 +1,6 @@ +library(data.table) +suppressPackageStartupMessages(library(bit64)) + args <- commandArgs(trailingOnly = TRUE) PATH <- args[1] NGENES <- args[2] @@ -9,14 +12,18 @@ PATIENTS <- paste(PATH, '/PatientMetaData-', NGENES, '-', NPATIENTS, sep="") TXT <- '.txt' RDATA <- '.Rdata' -geo <- read.csv(paste(GEO, TXT, sep="")) +geo <- fread(paste(GEO, TXT, sep="")) save(geo, file=paste(GEO, RDATA, sep="")) +geo <- NULL -go <- read.csv(paste(GO, TXT, sep="")) +go <- fread(paste(GO, TXT, sep="")) save(go, file=paste(GO, RDATA, sep="")) +go <- NULL -genes <- read.csv(paste(GENES, TXT, sep="")) +genes <- fread(paste(GENES, TXT, sep="")) save(genes, file=paste(GENES, RDATA, sep="")) +genes <- NULL -patients <- read.csv(paste(PATIENTS, TXT, sep="")) +patients <- fread(paste(PATIENTS, TXT, sep="")) save(patients, file=paste(PATIENTS, RDATA, sep="")) +patients <- NULL diff --git a/code/R_benchmark/vanilla_R_benchmark.R b/code/R_benchmark/vanilla_R_benchmark.R index e2916e0..aa61db9 100644 --- a/code/R_benchmark/vanilla_R_benchmark.R +++ b/code/R_benchmark/vanilla_R_benchmark.R @@ -3,38 +3,43 @@ args <- commandArgs(trailingOnly = TRUE) PATH <- args[1] NGENES <- args[2] NPATIENTS <- args[3] -GEO <- paste(PATH, '/GEO-', NGENES, '-', NPATIENTS, '.Rdata', sep="") -GO <- paste(PATH, '/GO-', NGENES, '-', NPATIENTS, '.Rdata', sep="") -GENES <- paste(PATH, '/GeneMetaData-', NGENES, '-', NPATIENTS, '.Rdata', sep="") -PATIENTS <- paste(PATH, '/PatientMetaData-', NGENES, '-', NPATIENTS, '.Rdata', sep="") +GEO <- paste(PATH, '/GEO-', NGENES, '-', NPATIENTS, '.txt', sep="") +GO <- paste(PATH, '/GO-', NGENES, '-', NPATIENTS, '.txt', sep="") +GENES <- paste(PATH, '/GeneMetaData-', NGENES, '-', NPATIENTS, '.txt', sep="") +PATIENTS <- paste(PATH, '/PatientMetaData-', NGENES, '-', NPATIENTS, '.txt', sep="") regression <- function() { library(Matrix) library(data.table) + suppressPackageStartupMessages(library(bit64)) ptm = proc.time() ### Data Management ops start ### - load(GEO) - load(GENES) - load(PATIENTS) + geo <- as.data.frame(fread(GEO)) + genes <- as.data.frame(fread(GENES)) + patients <- as.data.frame(fread(PATIENTS)) - sub_gmd = genes[genes$func < 250,] + colnames(genes) <- c("id", "target", "position", "length", "function") + colnames(geo) <- c("geneid", "patientid", "expression value") + colnames(patients) <- c("id", "age", "gender", "zipcode", "disease", "drug response") + + sub_gmd = genes[genes$`function` < 250,] # convert to data tables colnames(sub_gmd)[1] = "geneid" sub_gmd_dt = data.table(sub_gmd, key="geneid") geo_dt = data.table(geo, key="geneid") - + # join - A = merge(geo_dt, sub_gmd_dt)[,c("patientid", "geneid", "expression.value"), with=F] - + A = merge(geo_dt, sub_gmd_dt)[,c("patientid", "geneid", "expression value"), with=F] + # store as matrix library(reshape2) - A = acast(A, list(names(A)[1], names(A)[2])); - response = patients[,"drug.response"] + A = acast(A, list(colnames(A)[1], colnames(A)[2])); + response = patients[,"drug response"] ### Data management ops end ### cat(sprintf('Regression data management: %f\n', (proc.time() - ptm)['elapsed'])) @@ -68,12 +73,12 @@ covariance <- function() # join A = merge(geo_dt, sub_pmd_dt)[,c("patientid", "geneid", "expression.value"), with=F] - + # store as matrix library(reshape2) A = acast(A, list(names(A)[1], names(A)[2])); midtm = (proc.time() - ptm)['elapsed'] - ptm = proc.time() + ptm = proc.time() # calculate covariance covar = cov(A) @@ -82,8 +87,8 @@ covariance <- function() covar <- which(covar>0.01*(max(covar))return, arr.ind=T) res = merge(covar, gmd_dt, by.x='row', by.y='id') - res = merge(res, gmd_dt, by.x='col', by.y='id') - + res = merge(res, gmd_dt, by.x='col', by.y='id') + ### Data management ops end ### cat(sprintf('Regression data management: %f\n', (proc.time() - ptm)['elapsed'] + midtm)) } @@ -99,7 +104,7 @@ biclustering<-function() load(GEO) load(PATIENTS) - + sub_pmd = patients[patients$gender==1 & patients$age<=40,] # convert to data tables @@ -117,13 +122,13 @@ biclustering<-function() ### Data management ops end ### cat(sprintf('Regression data management: %f\n', (proc.time() - ptm)['elapsed'])) ptm = proc.time() - + # run biclustering library(biclust) library("s4vd") biclust(A, method=BCssvd, K=1) cat(sprintf('Biclust analytics: %f\n', (proc.time() - ptm)['elapsed'])) -} +} svd_irlba <- function() { @@ -187,7 +192,7 @@ stats <- function() library(reshape2) A = acast(geo, list(names(geo)[1], names(geo)[2])); go = sparseMatrix(go[,1], go[,2], x=go[,3]) - + ### Data management ops end ### cat(sprintf('Stats data management: %f\n', (proc.time() - ptm)['elapsed'])) ptm = proc.time() @@ -209,4 +214,4 @@ print(paste('Regression: ', system.time(regression(), gcFirst=T)['elapsed'], sep print(paste('SVD: ', system.time(svd_irlba(), gcFirst=T)['elapsed'], sep='')); print(paste('Covariance: ', system.time(covariance(), gcFirst=T)['elapsed'], sep='')); print(paste('Biclustering: ', system.time(biclustering(), gcFirst=T)['elapsed'], sep='')); -print(paste('Stats: ', system.time(stats(), gcFirst=T)['elapsed'], sep='')); +print(paste('Stats: ', system.time(stats(), gcFirst=T)['elapsed'], sep=''));