diff --git a/CVDSBA_UMAP.Rmd b/CVDSBA_UMAP.Rmd new file mode 100644 index 0000000..f5860ef --- /dev/null +++ b/CVDSBA_UMAP.Rmd @@ -0,0 +1,180 @@ + +```{r metabo} + + +rm(list=ls()) +library(umap) +source("common.R") +source("datamining_library_ge20200306.R") +library(readr) +library(plyr) +library(readxl) +library(stringr) +library(magrittr) +library(Mfuzz) + +df<-read.csv("meta_matrix_log2delectNA08anddrug36_NAmin20200329(1).csv" ) + +row.names(df)<-c(as.matrix(df$X)) + +df2<-data.frame(t(df[-1])) + +df3<-data.frame(row.names(df2),df2) + +df3$row.names.df2.<-gsub("\\d+","",df3$row.names.df2.,perl = T) + +df1<-df3 + +names(df1)<-c("label",c(as.matrix(names(df1[,-c(1)])))) + +set.seed(2020) + +color<-c("firebrick1","forestgreen","dodgerblue","navyblue") + +pdf("UMAP_2020_allprotein_meta_847feature.pdf") + +drawUMAP(df1[,],ptColors = color,rowNormalization =F, colNormalization = T) + + +``` + + + + +```{r proteome} +rm(list=ls()) +library(umap) +source("common.R") +source("datamining_library_ge20200306.R") +library(readr) +library(plyr) +library(readxl) +library(stringr) +library(magrittr) +library(Mfuzz) + +info <- read_xlsx("sampleinfo2.xlsx") + +info$TMT <- gsub("^b","F",info$TMT) + + ################d +df2<-read.table("proteomic_matrix_ratio20200328.txt",header = T,sep = "\t",row.names = 1) + +df3<-data.frame(names(df2),t(df2)) + +names(df3)<-c("TMT",as.matrix(names(df3[,-1]))) + + +df4<-merge(info,df3,by.x = "TMT") + + +df1<-df4 +names(df1)<-c('patient',"label",c(as.matrix(names(df1[,-c(1:2)])))) + +df2<-data.frame(df1[-c(which(df1$label==c("NA"))),]) + +row.names(df2)<-c(as.matrix(df2$patient)) + +set.seed(2020) + +color<-c("firebrick1","forestgreen","dodgerblue","navyblue") + +pdf("UMAP_2020_allprotein.pdf") + +drawUMAP(df2[,-1],ptColors = color,rowNormalization =F, colNormalization = T) + +``` + +```{r proteome_severe_nonsevere_label} +rm(list=ls()) +library(umap) +source("common_change_label.R") +source("datamining_library_ge20200306.R") +library(readr) +library(plyr) +library(readxl) +library(stringr) +protein<-read.csv("proteomic_training_severe_nonsevere.csv",row.names = 1) +color=c("#DA942C","#8D2926") +set.seed(2020) +pdf("proteomic_training_severe_nonsevereABCD.pdf") +drawUMAP(protein,ptColors = color,rowNormalization =F, colNormalization = T) + +rm(list=ls()) +library(umap) +source("common_change_label.R") +source("datamining_library_ge20200306.R") +library(readr) +library(plyr) +library(readxl) +library(stringr) +protein<-read.csv("proteomic_training_severe_nonsevere.csv",row.names = 1) +protein<-protein[1:18,] +color=c("#DA942C") +set.seed(2020) +pdf("proteomic_training_nonsevere.pdf") +drawUMAP(protein,ptColors = color,rowNormalization =F, colNormalization = T) +rm(list=ls()) +# library(umap) +source("common_change_label.R") +source("datamining_library_ge20200306.R") +library(readr) +library(plyr) +library(readxl) +library(stringr) +protein<-read.csv("proteomic_training_severe_nonsevere.csv",row.names = 1) +protein<-protein[19:31,] +color=c("#8D2926","black") +set.seed(2020) +pdf("proteomic_training_severe.pdf") +drawUMAP(protein,ptColors = color,rowNormalization =F, colNormalization = T) + +``` + +```{r metabo_severe_nonsevere_label} +rm(list=ls()) +library(umap) +source("common_change_label.R") +source("datamining_library_ge20200306.R") +library(readr) +library(plyr) +library(readxl) +library(stringr) +protein<-read.csv("meta_training_severe_nonsevere.csv",row.names = 1) +color=c("#DA942C","#8D2926") +set.seed(2020) +pdf("meta_training_severe_nonsevere_ABCD.pdf") +drawUMAP(protein,ptColors = color,rowNormalization =F, colNormalization = T) +rm(list=ls()) +library(umap) +source("common_change_label.R") +source("datamining_library_ge20200306.R") +library(readr) +library(plyr) +library(readxl) +library(stringr) +protein<-read.csv("meta_training_severe_nonsevere.csv",row.names = 1) +protein<-protein[1:18,] +color=c("#DA942C") +set.seed(2020) +pdf("meta_training_nonsevere.pdf") +drawUMAP(protein,ptColors = color,rowNormalization =F, colNormalization = T) +rm(list=ls()) +# library(umap) +source("common_change_label.R") +source("datamining_library_ge20200306.R") +library(readr) +library(plyr) +library(readxl) +library(stringr) +protein<-read.csv("meta_training_severe_nonsevere.csv",row.names = 1) +protein<-protein[19:31,] +color=c("#8D2926","black") +set.seed(2020) +pdf("meta_training_severe.pdf") +drawUMAP(protein,ptColors = color,rowNormalization =F, colNormalization = T) + +``` + + + diff --git a/CVDSBA_heatmap.Rmd b/CVDSBA_heatmap.Rmd new file mode 100644 index 0000000..24e35d9 --- /dev/null +++ b/CVDSBA_heatmap.Rmd @@ -0,0 +1,323 @@ + + + +```{r heatmap_pathway_proteomics} + +rm(list = ls()) +library(readr) +library(plyr) +library(readxl) +library(stringr) +library(magrittr) +source("datamining_library_ge20200306.R") +library(RColorBrewer) +library(pheatmap) + + +matrix<-read.table("proteomic_matrix_delect5andNA15_ratio20200329.txt",header = T,sep = "\t",row.names = 1) + +protein<-read_excel("Fig3_heatmap_prots.xlsx",sheet=1) + + +Fig3prot<-c(as.matrix(protein$UniprotID)) + + +info<-read_excel("sTable1_20200329-1900.xlsx",sheet = 2) + +info2<-info[,c(3,4,14,15)] + +info2.1<-info[,c(3,5,14,15)] + + + +names(info2)<-c("type","patient","sex","age") + +names(info2.1)<-c("type","patient","sex","age") + + +matrix2<-data.frame(names(matrix),t(matrix)) + + + + +names(matrix2)<-c('patient',c(as.matrix(names(matrix2[,-c(1)])))) + + + +matrix3<-merge(info2,matrix2,by.x = "patient") + +matrix4<-merge(info2.1,matrix2,by.x = "patient") + + +matrix5<-rbind(matrix4,matrix3) + + +matrix5$type<-gsub("\\d+","",perl = T,matrix5$type) + + +matrix5$sex<-gsub("1","M",perl = T,matrix5$sex) + +matrix5$sex<-gsub("0","F",perl = T,matrix5$sex) + +# matrix5.1<-matrix5[-c(which(matrix5$type==c("jbdz"))),] + + + + +write.csv(matrix5,"heatmapfig2.csv",row.names = F) + + + +matrix5.1<-matrix5[] + + + +label<-matrix5.1[,c(1:4)] + +matrix6<-matrix5.1[,-c(1,3:4)] + +row.names(matrix6)<-c(as.matrix(label[,1])) + + +matrix7<-data.frame(matrix6[,c(1,which(names(matrix6)%in%Fig3prot))]) + + + +matrix7$type<-factor(matrix7$type,levels = c("jkdz","jbdz","PT","ZX")) + + + +matrix7<-matrix7[order(matrix7[,1]),] + + + + + + + +annotation_col<- data.frame(type = factor(label$type,levels = c("jkdz","jbdz","PT","ZX")), + + sex=label$sex, + age=label$age, + + + row.names = label$patient) + +type_color <- c("#85B22E","#5F80B4","#E29827","#922927") +names(type_color) <- c("jkdz","jbdz","PT","ZX") + +sex_color <- c("red","#016D06") + +names(sex_color) <- c("F","M") + +ann_colors <- list(type=type_color,sex=sex_color) +# +# colors = c( brewer.pal(11,"RdYlGn")[9:2]) + +colors = colorRampPalette(c("blue", "white","red" ))(1000) +matrix8<-data.frame(t(matrix7[,-1])) + + + +matrix9<-data.frame(row.names(matrix8),matrix8) + + +matrix9$row.names.matrix8.<-factor(matrix9$row.names.matrix8.,levels=c(as.matrix(Fig3prot))) + +matrix9<-matrix9[order(matrix9[,1]),] + + +gene<-protein[,c(5,6)] + + +for (i in 1:nrow(gene)) { + + + gene[i,3]<-paste0(gene[i,1],"_",gene[i,2]) + +} + + +row.names(matrix9)<-c(as.matrix(gene[,3])) + + + + +matrix10<-matrix9[,-1] + + + + + + + +matrix11<-data.frame(scale(matrix10,center = T)) + + + + + +pheatmap(matrix11,scale="row",color = colors, annotation_col = annotation_col, + annotation_colors = ann_colors, fontsize_col = 10, cluster_rows = T, cluster_cols = F,show_rownames =T, show_colnames = F,fontsize = 10,cellwidth=10,cellheight=10,filename = paste0("Heatmap for cluster ","Fig3_UN_fourtype",".pdf"),main = paste0("Heatmap for clusterFig3","")) + + + + +``` + +```{heatmapp_metabo} + + +rm(list = ls()) +library(readr) +library(plyr) +library(readxl) +library(stringr) +library(magrittr) +source("datamining_library_ge20200306.R") +library(RColorBrewer) +library(pheatmap) + + +all<-read.csv("metabo/meta_matrix_log2delectNA08anddrug36_NAmin20200329(1).csv") + +row.names(all)<-c(as.matrix(all[,1])) +protein<-read_excel("metabo/heatmap_0415.xlsx") +protein70<-c(as.matrix(protein[,1])) + +row.names(protein)<-c(as.matrix(protein[,1])) + +protein70order<- c(as.matrix(names(data.frame(t(protein))))) + +matrix<-data.frame(all[which(all$X%in%protein70),]) + +info<-read_excel("sTable1_20200329-1900.xlsx",sheet = 2) + + +info2<-info[,c(8,14,15)] + + + + +names(info2)<-c("type","sex","age") + + +matrix2<-data.frame(names(matrix[,-1]),t(matrix[,-1])) + + + +names(matrix2)<-c("type",c(as.matrix(matrix[,1]))) + + + + + + +matrix3<-merge(info2,matrix2,by.x = "type") + +matrix5<-data.frame(c(as.matrix(matrix3[,1])),matrix3) + + + + + +matrix5$type<-gsub("\\d+","",perl = T,matrix5$type) + + +matrix5$sex<-gsub("1","M",perl = T,matrix5$sex) + +matrix5$sex<-gsub("0","F",perl = T,matrix5$sex) + +# matrix5.1<-matrix5[-c(which(matrix5$type==c("jbdz"))),] +matrix5.1<-matrix5[,] + +label<-matrix5.1[,c(1:4)] + + + + +matrix6<-matrix5.1[,-c(1,3:4)] + +row.names(matrix6)<-c(as.matrix(label[,1])) + + +# matrix7<-data.frame(matrix6[,c(1,which(names(matrix6)%in%protein70))]) + + + +matrix6$type<-factor(matrix6$type,levels = c("jkdz","jbdz","PT","ZX")) + + + +matrix6<-matrix6[order(matrix6[,1]),] + + + + + + + +annotation_col<- data.frame(type = factor(label$type,levels = c("jkdz","jbdz","PT","ZX")), + + sex=label$sex, + age=label$age, + + + row.names = label$c.as.matrix.matrix3...1...) + +type_color <- c("#85B22E","#5F80B4","#E29827","#922927") +names(type_color) <- c("jkdz","jbdz","PT","ZX") + +sex_color <- c("red","#016D06") + +names(sex_color) <- c("F","M") + +ann_colors <- list(type=type_color,sex=sex_color) +# +# colors = c( brewer.pal(11,"RdYlGn")[9:2]) + +colors = colorRampPalette(c("blue", "white","red" ))(1000) + +matrix7<-data.frame(t(matrix6[,-1])) + + + +matrix8<-data.frame(row.names(matrix7),matrix7) + + + +matrix8$row.names.matrix7.<-factor(matrix8$row.names.matrix7.,levels=protein70order) + + + +matrix8<-matrix8[order(matrix8[,1]),] + + +row.names(matrix8)<-protein70 + +matrix9<-matrix8[,-1] + + +# matrix9<-data.frame(row.names(matrix8),matrix8) + + +# matrix9$row.names.matrix8.<-factor(matrix9$row.names.matrix8.,levels=c(as.matrix(Fig3prot))) + +# matrix9<-matrix9[order(matrix9[,1]),] +# +# +# +# matrix10<-matrix9[,-1] +# +write.csv(matrix9,"metabo/heatmap_meta.csv",row.names = T) +matrix9[is.na(matrix9)]<-0 + + +pheatmap(matrix9,scale = "row",color = colors, annotation_col = annotation_col, + annotation_colors = ann_colors, fontsize_col = 10, cluster_rows = F, cluster_cols = F,show_rownames =T, show_colnames = F,fontsize = 10,cellwidth=10,cellheight=10,filename = paste0("metabo/Heatmap ","Fig3_meta_uncluster_80_four_type_final20200415",".pdf"),main = paste0("Heatmap for clusterFig3_meta_PATHWAY_final","")) + +``` + + + + diff --git a/CVDSBA_mfuzz.Rmd b/CVDSBA_mfuzz.Rmd new file mode 100644 index 0000000..3795e62 --- /dev/null +++ b/CVDSBA_mfuzz.Rmd @@ -0,0 +1,363 @@ +--- +title: "covid19_mfuzz" +author: "liuwe" +date: "2020年3月26日" +output: html_document +--- +```{r mfuzz-proteomics} +rm(list = ls()) +library(readr) +library(plyr) +library(readxl) +library(stringr) +library(magrittr) +library(Mfuzz) +source("datamining_library_ge20200306.R") + +info <- read_xlsx("sampleinfo2.xlsx") +info$TMT <- gsub("^b","F",info$TMT) + + ################d + +df2<-read.table("proteomic_matrix_ratio20200328.txt",header = T,sep = "\t",row.names = 1) + +df3<-data.frame(names(df2),t(df2)) + +names(df3)<-c("TMT",as.matrix(names(df3[,-1]))) + + +df4<-merge(info,df3,by.x = "TMT") + + + +df5<-df4[-c(which(df4$Type=="NA")),-1] + + + + +df6<-aggregate(df5[,colnames(df5)[2:ncol(df5)]],by=list(df5$Type),mean,na.rm= TRUE) + +df6<-df6[c(1,2,3,4),] + +row.names(df6)<-c(as.matrix((df6[,1]))) + + +df7<-data.frame(t(df6[,-1])) + +df7<-data.frame(df7[,c(1,2,3,4)]) + +################## + +set.seed(2021) + +a<-ge.mfuzz.cselection(df7,range = seq(3,10,1)) + +b<-ge.mfuzz.getresult(a,8,"4_protein_mfuzz_ge2021type_8") + + + + +``` + +```{r anova-proteomics} +rm(list = ls()) +library(readr) +library(plyr) +library(readxl) +library(stringr) +library(magrittr) +source("datamining_library_ge20200306.R") + +info <- read_xlsx("sampleinfo2.xlsx") +info$TMT <- gsub("^b","F",info$TMT) + +df2<-read.table("proteomic_matrix_ratio20200328.txt",header = T,sep = "\t",row.names = 1) +df3<-data.frame(names(df2),t(df2)) + +names(df3)<-c("TMT",as.matrix(names(df3[,-1]))) + + +df4<-merge(info,df3,by.x = "TMT") + + + +df5<-df4[-c(which(df4$Type=="NA")),-1] + + +############################################ + +Matrix<-df5 + +for (K in 1:8) { + + + +mfuzz_prot<-read.csv(paste0("4_protein_mfuzz_ge2021type_8/mfuzz_",K,".csv")) + +prot<-as.matrix(mfuzz_prot$X) + +Matrix2<-data.frame(Matrix[,1],Matrix[,c(which(names(Matrix)%in% prot))]) + + + + +col<-ncol(Matrix2) +names(Matrix2)<-c("label",as.matrix(names(Matrix2[,-1]))) +anova<-data.frame(names(Matrix2[,-1])) + + +for (i in 2:col) { + aov<-(summary(aov(Matrix2[,i] ~ Matrix2[,1],Matrix2))[[1]])$`Pr(>F)`[1] + aov<-as.numeric(aov) + # aov<-as.numeric(aov,digits = 4, scientific = F) + anova[c(i-1),2]<-aov +} + +names(anova)<-c('protein',"pvalue") + +anova2<-data.frame(anova[c(which(anova$pvalue<=0.05)),]) + + +write.csv(anova2,paste0("4_protein_mfuzz_ge2021type_8/",K,"_mfuzz_anova_0.05.csv"),row.names = F) + + + +} + +``` + + + +```{r mfuzz-metabo} + +rm(list = ls()) +library(readr) +library(plyr) +library(readxl) +library(stringr) +library(magrittr) +source("datamining_library_ge20200306.R") +library(RColorBrewer) +library(pheatmap) +library(Mfuzz) + +matrix<-read.csv("meta/meta_matrix_log2delectNA08anddrug36_NAmin20200329(1).csv") + +row.names(matrix)<-c(as.matrix(matrix[,1])) + + + + +info<-read_excel("meta/sTable1_20200329-1900.xlsx",sheet = 2) + + +info2<-info[,c(8,14,15)] + + + + +names(info2)<-c("type","sex","age") + + +matrix2<-data.frame(names(matrix[,-1]),t(matrix[,-1])) + + + +names(matrix2)<-c("type",c(as.matrix(matrix[,1]))) + + +matrix3<-merge(info2,matrix2,by.x = "type") + +matrix5<-data.frame(c(as.matrix(matrix3[,1])),matrix3) + + + + + +matrix5$type<-gsub("\\d+","",perl = T,matrix5$type) + + +matrix5$sex<-gsub("1","M",perl = T,matrix5$sex) + +matrix5$sex<-gsub("0","F",perl = T,matrix5$sex) + + + + + +df5<-data.frame(matrix5[,-c(1,3,4)]) + + + +df6<-aggregate(df5[,colnames(df5)[2:ncol(df5)]],by=list(df5$type),mean,na.rm= TRUE) + + + +row.names(df6)<-c(as.matrix(df6$Group.1)) + + +df7.1<-data.frame(t(df6[,-1])) + +df7<-data.frame(df7.1[,c(1,3,4)]) + + +row.names(df7)<-row.names(matrix) + +################## + +set.seed(2021) + +a<-ge.mfuzz.cselection(df7,range = seq(3,10,1)) + +b<-ge.mfuzz.getresult(a,8,"meta/mfuzz_ge2021type_8_meta_new") + + + + +``` + +```{r anova-metabo} +rm(list = ls()) +library(readr) +library(plyr) +library(readxl) +library(stringr) +library(magrittr) +library(Mfuzz) +source("datamining_library_ge20200306.R") +rm(list = ls()) +library(readr) +library(plyr) +library(readxl) +library(stringr) +library(magrittr) +source("datamining_library_ge20200306.R") +library(RColorBrewer) +library(pheatmap) + + +matrix<-read.csv("meta/meta_matrix_log2delectNA08anddrug36_NAmin20200329(1).csv") + +row.names(matrix)<-c(as.matrix(matrix[,1])) + + + + +info<-read_excel("meta/sTable1_20200329-1900.xlsx",sheet = 2) + + +info2<-info[,c(8,14,15)] + + + + +names(info2)<-c("type","sex","age") + + +matrix2<-data.frame(names(matrix[,-1]),t(matrix[,-1])) + + + +names(matrix2)<-c("type",c(as.matrix(matrix[,1]))) + + + + + + +matrix3<-merge(info2,matrix2,by.x = "type") + +matrix5<-data.frame(c(as.matrix(matrix3[,1])),matrix3) + + + + + +matrix5$type<-gsub("\\d+","",perl = T,matrix5$type) + + +matrix5$sex<-gsub("1","M",perl = T,matrix5$sex) + +matrix5$sex<-gsub("0","F",perl = T,matrix5$sex) + + + + + +df5<-data.frame(matrix5[,-c(1,3,4)]) + + + + + + + +############################################ + +df6<-data.frame(df5[-c(which(df5$type=="jbdz")),]) + +Matrix<-df6 + + + + +for (K in 1:8) { + +mfuzz_prot<-read.csv(paste0("meta/mfuzz_ge2021type_8_meta_new/mfuzz_",K,".csv")) + + +row.names(mfuzz_prot)<-c(as.matrix(mfuzz_prot[,1])) + + +mfuzz_prot2<-data.frame(t(mfuzz_prot)) + +prot<-as.matrix(names(mfuzz_prot2)) + + + + + + + +Matrix2<-data.frame(Matrix[,1],Matrix[,c(which(names(Matrix)%in% prot))]) + + +col<-ncol(Matrix2) +names(Matrix2)<-c("label",as.matrix(names(Matrix2[,-1]))) +anova<-data.frame(names(Matrix2[,-1])) + + +for (i in 2:col) { + aov<-(summary(aov(Matrix2[,i] ~ Matrix2[,1],Matrix2))[[1]])$`Pr(>F)`[1] + aov<-as.numeric(aov) + # aov<-as.numeric(aov,digits = 4, scientific = F) + anova[c(i-1),2]<-aov +} + +names(anova)<-c('protein',"pvalue") + +anova2<-data.frame(anova[c(which(anova$pvalue<=0.05)),]) + + +write.csv(anova2,paste0("meta/mfuzz_ge2021type_8_meta_new/",K,"_mfuzz_anova_0.05.csv"),row.names = F) + + +} + + + +``` + + + + + + + + + + + + + + diff --git a/README.md b/README.md index 581ef15..9ad034a 100644 --- a/README.md +++ b/README.md @@ -15,3 +15,12 @@ Code for processing and analyzing metabonomics data ## datamining_library_ge20200306.R Common self-built function library + +## CVDSBA_UMAP.Rmd +Umap analysis and visualization + +## CVDSBA_heatmap.Rmd +Heatmap analysis and visualization + +## CVDSBA_mfuzz.Rmd +Mfuzz analysis and visualization diff --git a/common.R b/common.R new file mode 100644 index 0000000..e1c858e --- /dev/null +++ b/common.R @@ -0,0 +1,408 @@ +library("FactoMineR") +library("factoextra") +library("grid") +library("data.table") +library("ggplot2") +library(pheatmap); +library("RColorBrewer"); +library("tsne"); + + +###tSNE plot DE: dataframe with rows as sample and columns as features,a column with name 'label' is required, represents the label of samples +# lblColors: a vector containing colors for each label, like lblColors=c(M = "forestgreen", N = "gray0", P="firebrick", C= "red",A="blue") +drawTSNE <- function(DF,ptColors,rowNormalization=F,colNormalization=F,perplexity=10,strTitle='tSNE'){ + M <- DF[,colnames(DF)!='label'] + if(rowNormalization){M <- data.frame(t(apply(M,1,function(v){(v-mean(v,na.rm=T))/sd(v,na.rm=T)})))} + if(colNormalization){M <- apply(M,2,function(v){(v-mean(v,na.rm=T))/sd(v,na.rm=T)})} + M[is.na(M)] <- 0 + indx <- match('label',colnames(M)) + clnames <- colnames(DF)[colnames(DF)!='label'] + tsn = tsne(M,perplexity =perplexity) + cnames <- colnames(M) + tsn <- data.frame(tsn,DF$label) + colnames(tsn) <- c("X","Y","label") + rownames(tsn) <- rownames(M) + #tsn <- tsn[-c(which(tsn$X==min(tsn$X)),which(tsn$Y==min(tsn$Y))),] + #tsn <- tsn[-c(which(tsn$X==max(tsn$X)),which(tsn$Y==max(tsn$Y))),] + #lblColors <- c(N='#537e35',M='#e17832',A='#f5b819',C='#5992c6',P='#282f89',W='mediumorchid3') + + #lblColors <- c(A='#537e35',M='#e17832',N='#f5b819',B='#5992c6',C='#282f89',W='mediumorchid3') + p <- ggplot(tsn, aes(x=X, y=Y, colour=label)) + geom_point(size=4) + p <- p + theme( panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.border = element_blank(), + plot.title = element_text(size=15), + axis.line.x = element_line(color="black", size = 0.5), + axis.line.y = element_line(color="black", size = 0.5), + panel.background = element_blank()) + p <- p + labs(title=strTitle) + p <- p + scale_colour_manual(values=ptColors) + p +} + +drawPCA<- function(DF,ptColors,rowNormalization=F,colNormalization=F,strTitle=NULL){ +## M is a matrix or dataframe, rows are samples and columns are features, rownames are sample names + M <- DF[,colnames(DF)!='label'] + if(rowNormalization){ + M <- data.frame(t(apply(M,1,function(v){(v-mean(v,na.rm=T))/sd(v,na.rm=T)}))) + #print('Normalization by row is done!') + } + if(colNormalization){ + M <- apply(M,2,function(v){(v-mean(v,na.rm=T))/sd(v,na.rm=T)}) + } + clnames <- colnames(DF)[colnames(DF)!='label'] + M[is.na(M)] <- 0 + m1 <- prcomp(M,colNormalization); + Y <- scale(M, m1$center, m1$scale) %*% m1$rotation + Y <- Y[,c(1,2)] + + Y <- data.frame(Y,DF$label); + colnames(Y) <- c("PC1","PC2","label") + if(is.null(strTitle)){ + strTitle <- sprintf("PCA:%d features",length(clnames)) + } + eigs <- m1$sdev^2 + percentages <- eigs[1:2] / sum(eigs) + # lblColors <- c(N='#537e35',M='#e17832',A='#f5b819',C='#5992c6',P='#282f89',W='mediumorchid3') + #lblColors <- c(training='#537e35',validation='#e17832',A='#f5b819',C='#5992c6',P='#282f89',W='mediumorchid3') + p <- ggplot(Y, aes(x=PC1, y=PC2, colour=label)) + geom_point(size=4) + p <- p + theme( panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.border = element_blank(), + axis.line.x = element_line(color="black", size = 0.25), + axis.line.y = element_line(color="black", size = 0.25), + plot.title = element_text(size=16), + panel.background = element_blank()) + + strLabx <- sprintf("PC1(%4.2f%%)",percentages[1]*100) + p <- p + labs(x =strLabx,y = sprintf("PC2(%4.2f%%)",percentages[2]*100), + title =strTitle) + p <- p + scale_colour_manual(values=ptColors) + + p +} + +# parse gene symbol from uniprot website +# acc: protein access Id like ' "O00264" "O00468" "O14773" "O14979" "O15230" "O43175" "O43707" +# return: the gene symbol for the acc +geneName <- function(acc){ + kk <- read.table(sprintf('https://www.uniprot.org/uniprot/%s.fasta',acc),sep="\n",stringsAsFactors=F,header=F) + a <- strsplit(kk[1,]," ")[[1]] + b <- a[grepl("GN=",a)] + strsplit(b,"=")[[1]][2] +} +getSubset <- function(Labels,M0,missRate,imputation=F){ + lbls <- Labels + lbls <- if(length(Labels)==1){lbls <- unlist(strsplit(lbls,""))}else{lbls <- Labels} + tmp <- M0[M0$label %in% lbls,] + R0 <- apply(tmp,2,function(v){sum(is.na(v))/length(v)*100}) + tmp <- tmp[,R0<=missRate] + + #if(!imputation){tmp[is.na(tmp)] <- 0} + if(imputation){ #### kNN imputation + K=3 + t1 <- sapply(1:dim(tmp)[1],function(i) { + lbl <- tmp[i, 'label'] + v0 <- tmp[i, colnames(tmp) != 'label'] + feature0 <- names(v0)[is.na(v0)] + if(length(feature0)>0){ + k0 <- tmp[-i,] + k0 <- k0[k0$label==lbl,-1] + k0 <- rbind(v0,k0) + k0 <- apply(k0,2,function(v){(v-mean(v,na.rm=T))/sd(v,na.rm=T)}) + k0[is.na(k0)] <- 0 + + D0 <- as.matrix(dist(k0)) + d0 <- order(D0[1,])[2:(K+1)] + if(length(feature0)==1){ + v0[feature0] <- mean(tmp[d0,feature0],na.rm=T) + }else{ + v0[feature0] <- apply(tmp[d0,feature0],2,function(v){mean(v,na.rm=T)}) + } + } + v0 + }) + + t1 <- apply(t(t1),2,unlist) + rownames(t1) <- rownames(tmp) + t1 <- data.frame(t1) + clnames <- colnames(t1) + t1$label <- tmp$label + t1 <- t1[,c('label',clnames)] + #print(sprintf("get subset for %s with %d rows and %d features",paste0(lbls,collapse=","),dim(t1)[1],dim(t1)[2])) + tmp <- t1 + } + return(tmp) +} + + + RFScore <- function(feature,M1=t1,nFolds=10,it=10){ + #tmp <- data.frame(t(apply(M1[,feature],1,function(v0){v <- as.numeric(v0); (v-mean(v))/sd(v)}))) + tmp <- M1[,c('label',feature)] + clsses <- unique(tmp$label) + #rownames(tmp) <- sapply(1:dim(tmp)[1],function(v){paste0('R',v)}) + sampleNumber <- sapply(clsses,function(v){sum(tmp$label==v)}) + names(sampleNumber) <- clsses + maxLabel <- names(sampleNumber)[max(sampleNumber)==sampleNumber] + minLabel <- names(sampleNumber)[min(sampleNumber)==sampleNumber] + size <- min(sampleNumber) + + avgAcc <- 0; + probs <- rep(0,max(sampleNumber)) + names(probs) <- rownames(tmp)[tmp$label==maxLabel] + its <- 0 + indx2 <- rownames(tmp)[tmp$label==minLabel] + + prediction <- matrix(0,nrow=dim(M1)[1],ncol=2) + rownames(prediction) <- rownames(M1) + + while(sum(probs==0)>0){ + indx1 <- names(sort(probs))[1:size] + probs[indx1] <- probs[indx1]+1 + tmp1 <- tmp[c(indx1,indx2),] + for(fd in 1:nFolds){ + folds <- createFolds(tmp1$label,nFolds) + for(fold in folds){ + valids <- tmp1[fold,] + rownames(valids) <- rownames(tmp1)[fold] + + trains <- tmp1[setdiff(1:dim(tmp1)[1],fold),] + trains$label <- as.factor(trains$label) + + tmpRF <- randomForest(label ~ . ,data=trains,importance=T,ntree=1000,nodesize=5) + predicted <- predict(tmpRF,valids,type='prob') + prediction[rownames(predicted),] <- predicted+prediction[rownames(predicted),] + + } + } + }#while + colnames(prediction) <- colnames(predicted) + prediction <- data.frame(t(apply(prediction,1,function(v){v/sum(v)}))) + prediction$predicted <- as.character(apply(prediction,1,function(v){names(v)[v==max(v)]})) + prediction$observed <- M1$label + rtv <-sprintf("%s %4.3f",paste0(feature,collapse=","),sum(prediction$observed==prediction$predicted)/dim(M1)[1]*100) + #prediction + rtv + } +############################################################################################# +RFPredict <- function(trainM,testM){ + + clsses <- unique(trainM$label) + #rownames(tmp) <- sapply(1:dim(tmp)[1],function(v){paste0('R',v)}) + sampleNumber <- sapply(clsses,function(v){sum(tmp$label==v)}) + names(sampleNumber) <- clsses + maxLabel <- names(sampleNumber)[max(sampleNumber)==sampleNumber] + minLabel <- names(sampleNumber)[min(sampleNumber)==sampleNumber] + size <- min(sampleNumber) + + avgAcc <- 0; + probs <- rep(0,max(sampleNumber)) + names(probs) <- rownames(tmp)[tmp$label==maxLabel] + its <- 0 + indx2 <- rownames(tmp)[tmp$label==minLabel] + + prediction <- matrix(0,nrow=dim(M1)[1],ncol=2) + rownames(prediction) <- rownames(M1) + + while(sum(probs==0)>0){ + indx1 <- names(sort(probs))[1:size] + probs[indx1] <- probs[indx1]+1 + tmp1 <- tmp[c(indx1,indx2),] + for(fd in 1:nFolds){ + folds <- createFolds(tmp1$label,nFolds) + for(fold in folds){ + valids <- tmp1[fold,] + rownames(valids) <- rownames(tmp1)[fold] + + trains <- tmp1[setdiff(1:dim(tmp1)[1],fold),] + trains$label <- as.factor(trains$label) + + tmpRF <- randomForest(label ~ . ,data=trains,importance=T,ntree=1000,nodesize=5) + predicted <- predict(tmpRF,valids,type='prob') + prediction[rownames(predicted),] <- predicted+prediction[rownames(predicted),] + + } + } + }#while + colnames(prediction) <- colnames(predicted) + prediction <- data.frame(t(apply(prediction,1,function(v){v/sum(v)}))) + prediction$predicted <- as.character(apply(prediction,1,function(v){names(v)[v==max(v)]})) + prediction$observed <- M1$label + rtv <-sprintf("%s %4.3f",paste0(feature,collapse=","),sum(prediction$observed==prediction$predicted)/dim(M1)[1]*100) + #prediction + rtv + } + + +############################################################################################################### +drawVolcano <- function(df1,pdfPath,strTitle="volcano plot",outFile=NULL){ + label <- df1$label + lbls = unique(df1$label) + table(df1$label) # A143,C75 + fc <- apply(2^df1[,colnames(df1)!='label'],2,function(x){ + log2( mean(na.omit(x[label==lbls[1]])) /mean(na.omit(x[label==lbls[2]]))) + }) + + df1[is.na(df1)] <- 0 + pValue <- apply(df1[,colnames(df1)!='label'], 2, function(v) { + p1 <- t.test(v[label == lbls[1]], v[label == lbls[2]], paired = F, var.equal = F)$p.value + p1 <- p.adjust(p1,method="BH") + p1 + }) + + pdf(pdfPath) + plot(fc, -log10(pValue), col = '#00000033', pch = 19,xlab = 'log2(FC)', ylab = '-log10(p-value)', main = strTitle) + abline(h = 1.3, v = c(-log2(1.5),log2(1.5)), lty = 2, lwd = 1) + + up <- fc >= log2(1.5) & pValue <= 0.05 + points(fc[up], -log10(pValue[up]), col = 1,bg = brewer.pal(9,"YlOrRd")[6], pch = 21, cex = 2) + + down <- fc <= -log2(1.5) & pValue <= 0.05 + points(fc[down], -log10(pValue[down]), col = 1,bg = brewer.pal(11,"RdBu")[9], pch = 21, cex = 2) + dev.off() + + name = list(up = data.frame(prot = colnames(df1[,colnames(df1)!='label'])[up],fc = fc[up], p_value = pValue[up], + type = rep("Upregulation",sum(up)),stringsAsFactors=F), + down = data.frame(prot = colnames(df1[,colnames(df1)!='label'])[down],fc = fc[down], p_value = pValue[down], + type = rep("Downregulation",sum(down))),stringsAsFactors=F) + name1 = rbind(name[[1]],name[[2]]) + rownames(name1) <- 1:dim(name1)[1] + + if( !is.null(outFile) ){ + write.table(name1,file=outFile,sep="\t",col.names=T,row.names=F,quote=F) + } + #write.xlsx(name1, "Table_1_TPD_volcano_prot_AC10_fc1.5_190131.xlsx",showNA = T,row.names = F) + sum(up) + sum(down) + return(name1) +} + +drawUMAP <- function(M1,ptColors, strTitle="UMAP",rowNormalization=T,colNormalization=F){ + + if(!'label' %in% colnames(M1)){ + print('A column with named label must existed in data frame') + return(NULL) + } + + tmp <- M1[,colnames(M1)!='label'] + if(rowNormalization){ + tmp <- data.frame(t(apply(tmp,1,function(v){(v-mean(v,na.rm=T))/sd(v,na.rm=T)})),stringsAsFactors=F) + rownames(tmp) <- rownames(M1) + } + if(colNormalization){ tmp <- apply(tmp,2,function(v){(v-mean(v,na.rm=T))/sd(v,na.rm=T)}) } + tmp[is.na(tmp)] <- 0 + + obj = umap(d=tmp,method='naive') + clnames <- colnames(tmp) + df1 <- data.frame(obj$layout) + df1$label <- M1$label + colnames(df1) <- c('X','Y','label') + + p <- ggplot(df1, aes(x=X, y=Y, colour=label)) + geom_point(size=4) + p <- p + theme( panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.border = element_blank(), + plot.title = element_text(size=16), + axis.line.x = element_line(color="black", size = 0.5), + axis.line.y = element_line(color="black", size = 0.5), + + panel.background = element_blank()) + + #strLabx <- sprintf("PC1(%4.2f%%)",percentages[1]*100) + p <- p + labs(title =strTitle) + p <- p + scale_colour_manual(values=ptColors) + p + } + +Volcano <- function(df1,pdfPath,outFile=NULL,thresholdFC=1.5,thresholdPValue=0.05,strTitle="volcano plot"){ + label <- df1$label + lbls = unique(df1$label) + table(df1$label) # A143,C75 + fc <- apply(2^df1[,colnames(df1)!='label'],2,function(x){ + log2( mean(na.omit(x[label==lbls[1]])) /mean(na.omit(x[label==lbls[2]]))) + }) + + df1[is.na(df1)] <- 0 + pValue <- apply(df1[,colnames(df1)!='label'], 2, function(v) { + p1 <- t.test(v[label == lbls[1]], v[label == lbls[2]], paired = F, var.equal = F)$p.value + p1 <- p.adjust(p1,method="BH") + p1 + }) + if(!is.null(pdfPath)){ + pdf(pdfPath) + plot(fc, -log10(pValue), col = '#00000033', pch = 19,xlab = 'log2(FoldChange)', ylab = '-log10(P-value)', main = strTitle) + abline(h = -log10(thresholdPValue), v = c(-log2(thresholdFC),log2(thresholdFC)), lty = 2, lwd = 1) + + up <- fc >= log2(thresholdFC) & pValue <= thresholdPValue + points(fc[up], -log10(pValue[up]), col = 1,bg = brewer.pal(9,"YlOrRd")[6], pch = 21, cex = 2) + + down <- fc <= -log2(thresholdFC) & pValue <= thresholdPValue + points(fc[down], -log10(pValue[down]), col = 1,bg = brewer.pal(11,"RdBu")[9], pch = 21, cex = 2) + dev.off() + } + + name = list(up = data.frame(prot = colnames(df1[,colnames(df1)!='label'])[up],fc = fc[up], p_value = pValue[up], + type = rep("Upregulation",sum(up)),stringsAsFactors=F), + down = data.frame(prot = colnames(df1[,colnames(df1)!='label'])[down],fc = fc[down], p_value = pValue[down], + type = rep("Downregulation",sum(down))),stringsAsFactors=F) + name1 = rbind(name[[1]],name[[2]],stringsAsFactors=F) + rownames(name1) <- 1:dim(name1)[1] + name1 <- name1[order(abs(name1$fc),decreasing=T),] + if( !is.null(outFile) ){ + write.table(name1,file=outFile,sep="\t",col.names=T,row.names=F,quote=F) + } + #write.xlsx(name1, "Table_1_TPD_volcano_prot_AC10_fc1.5_190131.xlsx",showNA = T,row.names = F) + sum(up) + sum(down) + return(name1) + } + +RFScore1 <- function(M1,nFolds=10,itors=10){ + if(! 'label' %in% colnames(M1)){ + print('A column with name "label" is required for labeling the class of each sample') + return(NULL) + } + clsses <- unique(M1$label) + #rownames(tmp) <- sapply(1:dim(tmp)[1],function(v){paste0('R',v)}) + sampleNumber <- sapply(clsses,function(v){sum(M1$label==v)}) + names(sampleNumber) <- clsses + maxLabel <- names(sampleNumber)[max(sampleNumber)==sampleNumber] + minLabel <- names(sampleNumber)[min(sampleNumber)==sampleNumber] + size <- min(sampleNumber) + + probs <- rep(0,max(sampleNumber)) + names(probs) <- rownames(M1)[M1$label==maxLabel] + + indx2 <- rownames(M1)[M1$label==minLabel] + + prediction <- matrix(0,nrow=dim(M1)[1],ncol=2) + rownames(prediction) <- rownames(M1) + + while(sum(probs==0)>0){ + indx1 <- names(sort(probs))[1:size] + probs[indx1] <- probs[indx1]+1 + + tmp1 <- M1[c(indx1,indx2),] + for(i in 1:itors){ + folds <- createFolds(tmp1$label,nFolds) + for(fold in folds){ + valids <- tmp1[fold,] + rownames(valids) <- rownames(tmp1)[fold] + trains <- tmp1[setdiff(1:dim(tmp1)[1],fold),] + trains$label <- as.factor(trains$label) + tmpRF <- randomForest(label ~ . ,data=trains,importance=T,ntree=1000,nodesize=5) + predicted <- predict(tmpRF,valids,type='prob') + prediction[rownames(predicted),] <- predicted+prediction[rownames(predicted),] + } + } + }#while + colnames(prediction) <- colnames(predicted) + prediction <- data.frame(t(apply(prediction,1,function(v){v/sum(v)}))) + prediction$predicted <- as.character(apply(prediction,1,function(v){names(v)[v==max(v)]})) + prediction$observed <- M1$label + feature <- colnames(M1)[colnames(M1) !='label'] + rtv <-sprintf("%s %4.3f",paste0(feature,collapse=","),sum(prediction$observed==prediction$predicted)/dim(M1)[1]*100) + print(rtv) + #prediction + rtv +} diff --git a/common_change_label.R b/common_change_label.R new file mode 100644 index 0000000..b51f5f7 --- /dev/null +++ b/common_change_label.R @@ -0,0 +1,231 @@ +# This file includes PCA UMAP TSNE and Vocano plot +# copyright: wuzhicheng@fudan.edu.cn +# +library("FactoMineR") +library("factoextra") +library("grid") +library("data.table") +library("ggplot2") +library(pheatmap); +library("RColorBrewer"); +library("tsne"); + + +# draw PCA plot + +# +#DF is a dataframe, rows are samples and columns are proteins, rownames are sample names and DF must have a column +#named label to indicate which class the corresponding row below to. +#ptColors?? point colors example as : c('B'='red','M'='black'). the number of colors equals with the number of distinct classes +# strTitle: title text showed on the plot. +# + +drawPCA<- function(DF,ptColors,rowNormalization=T,colNormalization=T,strTitle=NULL){ + M <- DF[,colnames(DF)!='label'] + if(rowNormalization){ + M <- data.frame(t(apply(M,1,function(v){(v-mean(v,na.rm=T))/sd(v,na.rm=T)}))) + #print('Normalization by row is done!') + } + if(colNormalization){ + M <- apply(M,2,function(v){(v-mean(v,na.rm=T))/sd(v,na.rm=T)}) + } + clnames <- colnames(DF)[colnames(DF)!='label'] + M[is.na(M)] <- 0 + m1 <- prcomp(M,colNormalization); + Y <- scale(M, m1$center, m1$scale) %*% m1$rotation + Y <- Y[,c(1,2)] + + Y <- data.frame(Y,DF$label); + colnames(Y) <- c("PC1","PC2","label") + if(is.null(strTitle)){ + strTitle <- sprintf("PCA:%d features",length(clnames)) + } + eigs <- m1$sdev^2 + percentages <- eigs[1:2] / sum(eigs) + # lblColors <- c(N='#537e35',M='#e17832',A='#f5b819',C='#5992c6',P='#282f89',W='mediumorchid3') + #lblColors <- c(training='#537e35',validation='#e17832',A='#f5b819',C='#5992c6',P='#282f89',W='mediumorchid3') + p <- ggplot(Y, aes(x=PC1, y=PC2, colour=label)) + geom_point(size=4) + p <- p + theme( panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.border = element_blank(), + axis.line.x = element_line(color="black", size = 0.25), + axis.line.y = element_line(color="black", size = 0.25), + plot.title = element_text(size=16), + panel.background = element_blank()) + + strLabx <- sprintf("PC1(%4.2f%%)",percentages[1]*100) + p <- p + labs(x =strLabx,y = sprintf("PC2(%4.2f%%)",percentages[2]*100), + title =strTitle) + p <- p + scale_colour_manual(values=ptColors) + + p +} + +###tSNE plot DE: dataframe with rows as sample and columns as features,a column with name 'label' is required, represents the label of samples +# lblColors: a vector containing colors for each label, like lblColors=c(M = "forestgreen", N = "gray0", P="firebrick", C= "red",A="blue") +drawTSNE <- function(DF,ptColors,rowNormalization=F,colNormalization=F,perplexity=10,strTitle='tSNE'){ + M <- DF[,colnames(DF)!='label'] + if(rowNormalization){M <- data.frame(t(apply(M,1,function(v){(v-mean(v,na.rm=T))/sd(v,na.rm=T)})))} + if(colNormalization){M <- apply(M,2,function(v){(v-mean(v,na.rm=T))/sd(v,na.rm=T)})} + M[is.na(M)] <- 0 + indx <- match('label',colnames(M)) + clnames <- colnames(DF)[colnames(DF)!='label'] + tsn = tsne(M,perplexity =perplexity) + cnames <- colnames(M) + tsn <- data.frame(tsn,DF$label) + colnames(tsn) <- c("X","Y","label") + rownames(tsn) <- rownames(M) + #tsn <- tsn[-c(which(tsn$X==min(tsn$X)),which(tsn$Y==min(tsn$Y))),] + #tsn <- tsn[-c(which(tsn$X==max(tsn$X)),which(tsn$Y==max(tsn$Y))),] + #lblColors <- c(N='#537e35',M='#e17832',A='#f5b819',C='#5992c6',P='#282f89',W='mediumorchid3') + + #lblColors <- c(A='#537e35',M='#e17832',N='#f5b819',B='#5992c6',C='#282f89',W='mediumorchid3') + p <- ggplot(tsn, aes(x=X, y=Y, colour=label)) + geom_point(size=4) + p <- p + theme( panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.border = element_blank(), + plot.title = element_text(size=15), + axis.line.x = element_line(color="black", size = 0.5), + axis.line.y = element_line(color="black", size = 0.5), + panel.background = element_blank()) + p <- p + labs(title=strTitle) + p <- p + scale_colour_manual(values=ptColors) + p +} + +# parse gene symbol from uniprot website +# acc: protein access Id like ' "O00264" "O00468" "O14773" "O14979" "O15230" "O43175" "O43707" +# return: the gene symbol for the acc +geneName <- function(acc){ + kk <- read.table(sprintf('https://www.uniprot.org/uniprot/%s.fasta',acc),sep="\n",stringsAsFactors=F,header=F) + a <- strsplit(kk[1,]," ")[[1]] + b <- a[grepl("GN=",a)] + strsplit(b,"=")[[1]][2] +} +############################################################################################################### +drawVolcano <- function(df1,pdfPath,strTitle="volcano plot",outFile=NULL){ + label <- df1$label + lbls = unique(df1$label) + table(df1$label) # A143,C75 + fc <- apply(2^df1[,colnames(df1)!='label'],2,function(x){ + log2( mean(na.omit(x[label==lbls[1]])) /mean(na.omit(x[label==lbls[2]]))) + }) + + df1[is.na(df1)] <- 0 + pValue <- apply(df1[,colnames(df1)!='label'], 2, function(v) { + p1 <- t.test(v[label == lbls[1]], v[label == lbls[2]], paired = F, var.equal = F)$p.value + p1 <- p.adjust(p1,method="BH") + p1 + }) + + pdf(pdfPath) + plot(fc, -log10(pValue), col = '#00000033', pch = 19,xlab = 'log2(FC)', ylab = '-log10(p-value)', main = strTitle) + abline(h = 1.3, v = c(-log2(1.5),log2(1.5)), lty = 2, lwd = 1) + + up <- fc >= log2(1.5) & pValue <= 0.05 + points(fc[up], -log10(pValue[up]), col = 1,bg = brewer.pal(9,"YlOrRd")[6], pch = 21, cex = 2) + + down <- fc <= -log2(1.5) & pValue <= 0.05 + points(fc[down], -log10(pValue[down]), col = 1,bg = brewer.pal(11,"RdBu")[9], pch = 21, cex = 2) + dev.off() + + name = list(up = data.frame(prot = colnames(df1[,colnames(df1)!='label'])[up],fc = fc[up], p_value = pValue[up], + type = rep("Upregulation",sum(up)),stringsAsFactors=F), + down = data.frame(prot = colnames(df1[,colnames(df1)!='label'])[down],fc = fc[down], p_value = pValue[down], + type = rep("Downregulation",sum(down))),stringsAsFactors=F) + name1 = rbind(name[[1]],name[[2]]) + rownames(name1) <- 1:dim(name1)[1] + + if( !is.null(outFile) ){ + write.table(name1,file=outFile,sep="\t",col.names=T,row.names=F,quote=F) + } + #write.xlsx(name1, "Table_1_TPD_volcano_prot_AC10_fc1.5_190131.xlsx",showNA = T,row.names = F) + sum(up) + sum(down) + return(name1) +} +# see drawPCA +drawUMAP <- function(M1,ptColors, strTitle="UMAP",rowNormalization=T,colNormalization=F){ + + if(!'label' %in% colnames(M1)){ + print('A column with named label must existed in data frame') + return(NULL) + } + + tmp <- M1[,colnames(M1)!='label'] + if(rowNormalization){ + tmp <- data.frame(t(apply(tmp,1,function(v){(v-mean(v,na.rm=T))/sd(v,na.rm=T)})),stringsAsFactors=F) + rownames(tmp) <- rownames(M1) + } + if(colNormalization){ tmp <- apply(tmp,2,function(v){(v-mean(v,na.rm=T))/sd(v,na.rm=T)}) } + tmp[is.na(tmp)] <- 0 + + obj = umap(d=tmp,method='naive',n_neighbors = 10) + clnames <- colnames(tmp) + df1 <- data.frame(obj$layout) + df1$label <- M1$label + colnames(df1) <- c('X','Y','label') + + p <- ggplot(df1, aes(x=X, y=Y, colour=label)) + geom_point(size=4) + p <- p + theme( panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.border = element_blank(), + plot.title = element_text(size=16), + axis.line.x = element_line(color="black", size = 0.5), + axis.line.y = element_line(color="black", size = 0.5), + + panel.background = element_blank())+ + geom_text( + label = row.names(M1), + colour = "blue", + size = 2, + nudge_y = 0.1 + ) + + #strLabx <- sprintf("PC1(%4.2f%%)",percentages[1]*100) + p <- p + labs(title =strTitle) + p <- p + scale_colour_manual(values=ptColors) + p + } +Volcano <- function(df1,pdfPath,outFile=NULL,thresholdFC=1.5,thresholdPValue=0.05,strTitle="volcano plot"){ + label <- df1$label + lbls = unique(df1$label) + table(df1$label) # A143,C75 + fc <- apply(2^df1[,colnames(df1)!='label'],2,function(x){ + log2( mean(na.omit(x[label==lbls[1]])) /mean(na.omit(x[label==lbls[2]]))) + }) + + df1[is.na(df1)] <- 0 + pValue <- apply(df1[,colnames(df1)!='label'], 2, function(v) { + p1 <- t.test(v[label == lbls[1]], v[label == lbls[2]], paired = F, var.equal = F)$p.value + p1 <- p.adjust(p1,method="BH") + p1 + }) + if(!is.null(pdfPath)){ + pdf(pdfPath) + plot(fc, -log10(pValue), col = '#00000033', pch = 19,xlab = 'log2(FoldChange)', ylab = '-log10(P-value)', main = strTitle) + abline(h = -log10(thresholdPValue), v = c(-log2(thresholdFC),log2(thresholdFC)), lty = 2, lwd = 1) + + up <- fc >= log2(thresholdFC) & pValue <= thresholdPValue + points(fc[up], -log10(pValue[up]), col = 1,bg = brewer.pal(9,"YlOrRd")[6], pch = 21, cex = 2) + + down <- fc <= -log2(thresholdFC) & pValue <= thresholdPValue + points(fc[down], -log10(pValue[down]), col = 1,bg = brewer.pal(11,"RdBu")[9], pch = 21, cex = 2) + dev.off() + } + + name = list(up = data.frame(prot = colnames(df1[,colnames(df1)!='label'])[up],fc = fc[up], p_value = pValue[up], + type = rep("Upregulation",sum(up)),stringsAsFactors=F), + down = data.frame(prot = colnames(df1[,colnames(df1)!='label'])[down],fc = fc[down], p_value = pValue[down], + type = rep("Downregulation",sum(down))),stringsAsFactors=F) + name1 = rbind(name[[1]],name[[2]],stringsAsFactors=F) + rownames(name1) <- 1:dim(name1)[1] + name1 <- name1[order(abs(name1$fc),decreasing=T),] + if( !is.null(outFile) ){ + write.table(name1,file=outFile,sep="\t",col.names=T,row.names=F,quote=F) + } + #write.xlsx(name1, "Table_1_TPD_volcano_prot_AC10_fc1.5_190131.xlsx",showNA = T,row.names = F) + sum(up) + sum(down) + return(name1) + } diff --git a/data/weigang_mata_boxplot.csv b/data/weigang_mata_boxplot.csv new file mode 100644 index 0000000..4f7aef0 --- /dev/null +++ b/data/weigang_mata_boxplot.csv @@ -0,0 +1,17 @@ +meta +1-(1-enyl-palmitoyl)-2-arachidonoyl-GPC (P-16:0/20:4)* +arachidonate (20:4n6) +"bilirubin degradation product, C16H18N2O5 (1)" +"bilirubin degradation product, C16H18N2O5 (2)" +"bilirubin degradation product, C16H18N2O5 (3)" +"bilirubin degradation product, C16H18N2O5 (4)" +"bilirubin degradation product, C17H18N2O4 (1)" +"bilirubin degradation product, C17H18N2O4 (2)" +"bilirubin degradation product, C17H18N2O4 (3)" +"bilirubin degradation product, C17H20N2O5 (1)" +"bilirubin degradation product, C17H20N2O5 (2)" +choline +kynurenine +mannose +serotonin +sphingosine diff --git a/proteomics_tmt_20200325.Rmd b/proteomics_tmt_20200325.Rmd index 9182256..899c98a 100644 --- a/proteomics_tmt_20200325.Rmd +++ b/proteomics_tmt_20200325.Rmd @@ -425,7 +425,7 @@ dev.off() boxplot ```{r} -prot2 <- c("P02743","P02776","P02775","P02765","P35542","O95445","P04004") +prot2 <- c("P01011","Q9UK55","P02741","Q86UD1","P13796","P0DJI8","P0DJI9") label <- info$Type[match(names(df2),info$TMT)] df4 <- df2[prot2,label!="NA"] label4 <- info$Type[match(names(df4),info$TMT)] @@ -435,48 +435,18 @@ for (i in 1:nrow(df.nor)) { nm <- row.names(df.nor)[i] pv <- c() data1 <- data.frame(value=as.numeric(df.nor[i,]),type=label4) - a1 <- 0 - b1 <- 0 - for (a in lav) { + for (a in lav) { for (b in lav) { - if(a>=b){ - next - }else{ + a_type <- which(label4 %in% a) b_type <- which(label4 %in% b) c <- t.test(df.nor[i,a_type],df.nor[i,b_type], paired = F, var.equal = F)$p.value - pv <- paste0(pv,"\n",a,"_",b,":",format(c,digits = 3, scientific = FALSE)) - } - b1=b1+1 + pv <- paste0(pv,paste0(" ",a,"_",b,":",format(c,digits = 3, scientific = FALSE))) + } - a1=a1+1 } - - plot.boxplot <- function(data,x,y,type,filename,title="boxplot"){ - a <- ggplot(data=data, aes(x =x, y =y ,color=type,group=type)) + - geom_jitter(alpha = 0.3,size=3) + - geom_boxplot(alpha = .5,size=1)+ - labs(x="sample",y="value",fill= "type")+ - ggtitle(title)+ - theme_bw() + - theme(panel.border = element_blank())+ - theme(axis.line = element_line(size=1, colour = "black")) + - theme(panel.grid =element_blank())+ - theme(axis.text = element_text(size = 15,colour = "black"),text = element_text(size = 15,colour = "black"))+ - theme(axis.text.x = element_text( hjust = 1,angle = 45))+ - scale_x_discrete(limit=c("jkdz","jbdz","PT","ZX") )+ - scale_color_manual(limits=c("jkdz","jbdz","PT","ZX"), values=c("#85B22E","#5F80B4","#E29827","#922927")) - ggsave(paste0(filename, ".pdf"),plot=a,width=8,height=8) -} - plot.boxplot( data1,data1$type,data1$value,data1$type,paste0("prot7_boxplot/",i, "_boxplot"),title=paste0(nm, pv)) + ge.plot.boxplot( data1,data1$type,data1$value,data1$type,paste0("prot7_boxplot/",nm, "_boxplot"),title=paste0(nm, pv)) } - - -which(grepl("choline",row.names(df2) )) - -``` - ``` - diff --git a/sTable1_20200329-2100.xlsx b/sTable1_20200329-2100.xlsx new file mode 100644 index 0000000..842e2a0 Binary files /dev/null and b/sTable1_20200329-2100.xlsx differ