-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathmakeGraphicalAbstract.R
More file actions
191 lines (162 loc) · 6.52 KB
/
makeGraphicalAbstract.R
File metadata and controls
191 lines (162 loc) · 6.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
library(cellAlign)
require(ggplot2)
library(geiger)
library(sp)
library(rgeos)
generate_data = function(slope, sd, population_size){
"
Generates a population of data that has specified slope and deviation. The Y intercept is set to 1.
It also generates a population of data to be used for showing no increase expression (a flat line).
This flat linecomes from the average expression value of the original population
Parameters:
slope: what the slope of the population should be. Mimics increaseing (or decreasing) protein expression
sd: the standard devation of the data. Measures the biological variability in data
population_size: how many data points to generate. Defaults to 1,000
Return:
A list, where time_traj is the pseudotime values,
Y is the expression values for the population with slope and sd,
and flat_Y is the expression values for the flat line
"
X = runif(population_size)
X = as.data.frame(sort(X))
X = t(X)
Ynorm <- slope * X +1+ rnorm(population_size,sd = sd)
Yflat = rep(mean(Ynorm), population_size)
Yflat = t(Yflat)
colnames(Ynorm) = paste0("t", 1:population_size)
rownames(Ynorm) = "gene"
colnames(Yflat) = paste0("t", 1:population_size)
rownames(Yflat) = "gene"
X = t(X)
rownames(X) = paste0("t", 1:population_size)
colnames(X) = NULL
data = list("time_traj" = X, "Y" = Ynorm, "flat_Y"=Yflat)
return(data)
}
subsample = function(population_data, number_of_cells){
"
Subsamples from the population generated by the generate_data() function.
It splits that data into quarters and samples rnadomly form each one, to get an even distribution across time.
Parameters:
population_data is a dataframe that is the result of generate_data() that we're going to subsample from.
number_of_cells the number of cells to subsample
Returns:
List where traj is the pseudotime trajectory for the subsample, and exp is the protein expression values for the subsample
"
traj = population_data$time_traj
exp = population_data$Y
trajdf = as.data.frame(traj)
#divide time into quarters
q = split(as.vector(trajdf), factor(sort(rank(as.vector(trajdf))%%4)))
first = q$`0`
second = q$`1`
third = q$`2`
fourth = q$`3`
#get an even number of sample from each
remainder = number_of_cells %% 4
s_size = number_of_cells %/% 4
s_size_plus1 = s_size
s_size_plus2 = s_size
s_size_plus3 = s_size
if (remainder == 1){
s_size_plus1 = s_size_plus1 + 1
}
if (remainder == 2){
s_size_plus1 = s_size_plus1 + 1
s_size_plus2 = s_size_plus2+1
}
if (remainder == 3){
s_size_plus1 = s_size_plus1 + 1
s_size_plus2 = s_size_plus2+1
s_size_plus3 = s_size_plus3 +1
}
first_index = rownames(first)
first_sample = sample(first_index, s_size_plus1)
second_index = rownames(second)
second_sample = sample(second_index, s_size_plus2)
third_index = rownames(third)
third_sample = sample(third_index, s_size_plus3)
fourth_index = rownames(fourth)
fourth_sample = sample(fourth_index, s_size)
all_samples = c(first_sample, second_sample, third_sample, fourth_sample)
#subset traj
trajdf <- cbind(index = rownames(trajdf), trajdf)
rownames(trajdf) <- 1:nrow(trajdf)
traj_subset = trajdf[which((trajdf$index %in% all_samples)==TRUE),]
rownames(traj_subset) <- traj_subset$index
traj_subset$index=NULL
traj_subset = as.matrix(traj_subset)
colnames(traj_subset) = NULL
#subset exp
exp_subset = exp[ ,which((colnames(exp) %in% all_samples)==TRUE)]
exp_subset = t(exp_subset)
rownames(exp_subset)="gene"
exp_subset = as.data.frame(exp_subset)
subsample = list("traj"=traj_subset, "exp"=exp_subset)
return(subsample)
}
interpolate = function(time_traj, expression, subsample=TRUE, number_of_points=200){
"
Interpolates the data along the trajectory.
Data is interpolated by equally spaced points along the pseudotime trajectory.
Paraeters:
time_traj: the pseudo time trajectory
expression: the protein expression values
subsample: boolean that says weather we're interpolating a line for the subsample or the entire population. Defaults to TRUE
number_of_points: the number of points to interpolate (the points will all be equally spaced). Defualts to 200
Return:
A list where interpolated is a dataframe containg traj and exp for the interpolated line,
and points is a dataframe containing traj and exp data for the points (not an interpolated line)
"
Y = expression
X = time_traj
interGlobalLPS = cellAlign::interWeights(expDataBatch = Y, trajCond =X,
winSz = 0.1, numPts = number_of_points)
require(reshape2)
whichgene = "gene"
selectedLPS<-interGlobalLPS$interpolatedVals[whichgene,]
dfLPSi = data.frame(traj = interGlobalLPS$traj, value=(selectedLPS), error=interGlobalLPS$error[whichgene,])
if(subsample==TRUE){
dfLPS = data.frame(traj = X, gene=t(Y[whichgene,]))
}else{
dfLPS = data.frame(traj = X, gene=Y[whichgene,])
}
dfLPSM = melt(dfLPS, id.vars = 'traj')
result = list("interpolated" = dfLPSi, "points"=dfLPSM)
return(result)
}
data = generate_data(slope=2, sd=1, population_size=5000)
inter = interpolate(data$time_traj, data$Y, subsample=FALSE)
#Graph full population
ggplot(inter$interpolated, aes(x=traj,y=value)) + geom_point(data=inter$points, aes(x=traj,y=value))+
# geom_point(data=inter_S$points, aes(x=traj,y=value), color="red", size=2)+
geom_line(data=inter$interpolated, color='blue', size=3)+
scale_color_discrete(name = "Protein expression trend", labels = c("interpolated"))+
theme_classic()+
xlab("Time Course Trajectory")+
ylab("Protein Expression")+
theme(axis.title.x = element_text(size=25, face="bold"),
axis.title.y = element_text(size=25, face="bold"),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y= element_blank()
)+
ylim(-2,6)
##Take subsamples of 7, 20, and 100
mysample = subsample(data, 7)
inter_S = interpolate(mysample$traj, mysample$exp, subsample=TRUE)
#Graph subsample
ggplot(inter_S$interpolated, aes(x=traj,y=value)) + geom_line(size = 2) + geom_point(data=inter_S$points, aes(x=traj,y=value))+
geom_line(data=inter$interpolated, color='blue', size=2)+
theme_classic()+
xlab("")+
ylab("")+
theme(axis.title.x = element_text(size=16, face="bold"),
axis.title.y = element_text(size=16, face="bold"),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y= element_blank()
)+
ylim(-2,6)