# Simulations
Refer back to the data generation in \@ref(syntheticdata). We will be
top-censoring the data at 0.5. Before censoring, this is what the data looks
like.
```{r load-data2, fig.width = 6, fig.height = 4, eval = FALSE}
## Load the "original" model
orig_model = readRDS(file=file.path("~/repos/flowcut/inst/output", "orig_model.RDS"))
## Generate data
set.seed(100)
isignal = 10
new_model = flowcut::make_model(orig_model, isignal)
## new_model = flowcut::make_model(orig_sim_model, isignal)
ylist = flowcut::gen_1d(new_model, nt = 100)
flowtrend::plot_1d(ylist, obj = new_model)
```
After censoring, this is what the data looks like.
```{r, fig.width = 6, fig.height = 4, eval = FALSE}
## Censor it
ylist = lapply(ylist, function(y) pmin(y, 0.5))
flowtrend::plot_1d(ylist, obj = new_model)
```
Before running the MCMC, we need to specify a few things: (1) the censoring
limits `Cbox`, and (2) the particle counts `countslist`.
```{r, eval = FALSE}
## Form the censoring "box"
Cbox = rbind(c(-Inf, 0.5))
## Counts are all equal for now
countslist = lapply(ylist, function(y){ rep(1, nrow(y)) })
## Save the metadata
datobj = list(ylist=ylist, countslist=countslist, Cbox=Cbox, X=orig_model$X)
## datobj = list(ylist=ylist, countslist=countslist, Cbox=Cbox, X=orig_sim_model$X)
saveRDS(datobj,
        file.path("~/repos/flowcut/inst/output",
                  paste0("isignal-", isignal, "-datobj.RDS")))
```
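For reference, `Cbox` appears to have one row per data dimension, with the two
columns giving the lower and upper censoring limits (here there is a single
dimension, unbounded below and top-censored at 0.5). As a purely illustrative
sketch, a hypothetical 3-dimensional `Cbox` might look like this (not used in
this analysis):
```{r, eval = FALSE}
## Hypothetical censoring box for 3-dimensional data: one row per dimension,
## columns give the (lower, upper) censoring limits; use -Inf / Inf for sides
## that are not censored.
Cbox_3d = rbind(c(-Inf, 0.5),  ## dimension 1: top-censored at 0.5
                c(-3,   Inf),  ## dimension 2: bottom-censored at -3
                c(-Inf, Inf))  ## dimension 3: not censored
```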
We also need some *prior elicitation* to prevent the cluster means from changing
too much across time. (The code below is copy-pasted as-is for now and is not
meant to be run.)
```{r, eval = FALSE}
gg <- maxdev_to_gg(datobj$X,
                   dimdat = 3,
                   maxdev = 0.5,
                   numclust = 10,
                   ggvec = (1:40)/200,
                   n.cores = detectCores(),
                   Nmc = 1e4, viz = FALSE)
## Use a previously computed value of gg
gg <- 0.01287879
## hist(ball.deviance(0.01,0.5,t(X))$msd, breaks = "FD")
## hist(ball.deviance(0.1,0.5,X)$msd, breaks = "FD")
## hist(ball.deviance(0.2,0.5,X)$msd, breaks = "FD")
## hist(ball.deviance(0.5,0.5,X)$msd, breaks = "FD")
## hist(ball.deviance(1,0.5,X)$msd, breaks = "FD")
## hist(ball.deviance(2,0.5,X)$msd, breaks = "FD")
```
Next, we run the MCMC.
```{r, eval = FALSE}
## Run the MCMC
## Full-length run settings
Nmc <- 1e3 * 5
Nburn <- 500
## Short test-run settings (these override the full-length settings above)
Nmc <- 20
Nburn <- 10
set.seed(123)
Gibbs.res <- run.Gibbs.fast(ylist = datobj$ylist,
                            countslist = datobj$countslist,
                            numclust = 2,
                            Nmc = Nmc, Nburn = Nburn,
                            gg = 0.1,
                            X = t(datobj$X),
                            Cbox = datobj$Cbox, verbose = TRUE)
## Save the results
saveRDS(Gibbs.res,
        file.path("~/repos/flowcut/inst/output",
                  paste0("isignal-", isignal, "-gibbs.RDS")))
```
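To see what the sampler returned, it can help to reload the saved results and
inspect their top-level structure (a minimal sketch; the exact components depend
on `run.Gibbs.fast`):
```{r, eval = FALSE}
## Reload the saved Gibbs results and peek at what the object contains.
Gibbs.res = readRDS(file.path("~/repos/flowcut/inst/output",
                              paste0("isignal-", isignal, "-gibbs.RDS")))
str(Gibbs.res, max.level = 1)
```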
## Simulation helpers
Here is a helper function for obtaining the membership probabilities of a new
dataset `ynew` given a GMM model `gmm_model`.
```{r}
#' Get the membership probabilities of a new dataset `ynew` given a GMM model `gmm_model`.
#'
#' @param gmm_model A fitted GMM, e.g. from mclust::Mclust() or mclust::me.weighted().
#' @param ynew New data (one-column matrix) to get predicted posterior memberships for.
#'
#' @return Matrix of estimated posterior membership probabilities, one row per
#'   point in `ynew` and one column per cluster.
get_posterior_probs <- function(gmm_model, ynew){
  stopifnot(ncol(ynew) == 1)
  numclust = length(gmm_model$parameters$mean)
  mn = gmm_model$parameters$mean
  sd = gmm_model$parameters$variance$sigmasq %>% sqrt()
  pro = gmm_model$parameters$pro
  ## Prior-weighted normal density of each new point under each cluster.
  post_prob = lapply(1:numclust, function(iclust){
    dnorm(ynew, mean = mn[iclust], sd = sd[iclust]) * pro[iclust]
  }) %>% do.call(cbind, .)
  post_prob = post_prob/rowSums(post_prob)
  ## Give up on the posterior probability calculation and assign equal
  ## probabilities when this (rarely) produces NaNs.
  if(any(is.nan(post_prob))) post_prob = matrix(1/numclust, nrow = nrow(post_prob), ncol = numclust)
  return(post_prob)
}
```
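A quick sanity check of this helper on simulated 1-dimensional data (a minimal
sketch, assuming `mclust` is installed and the pipe `%>%` used inside the helper
is available, e.g. via `magrittr`):
```{r, eval = FALSE}
## Simulate a small two-component 1-d dataset and fit a 2-cluster GMM.
set.seed(1)
y = matrix(c(rnorm(200, mean = 0), rnorm(200, mean = 3)), ncol = 1)
gmm_model = mclust::Mclust(as.numeric(y), G = 2, modelNames = "V", verbose = FALSE)
## Posterior membership probabilities for a few new points; rows should sum to 1.
ynew = matrix(c(-0.5, 1.5, 3.5), ncol = 1)
probs = get_posterior_probs(gmm_model, ynew)
rowSums(probs)
```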
Also, here is a helper to split the data into (1) censored and (2) non-censored
("inner") particles; used in `preimpute()`.
```{r}
#' A helper to split the data into (1) censored and (2) non-censored ("inner") particles.
#'
#' @param datobj List containing ylist, countslist, X, and Cbox.
#'
#' @return List containing the censored particles, the directions in which they
#'   were censored, the non-censored ("inner") particles and their counts, and
#'   the indices of the censored particles at each time point.
#'
#' @export
split_data <- function(datobj){

  ## At each time point, mark which points are censored and in which direction.
  censor.direction.full.list <- lapply(datobj$ylist, function(x){
    flowcut:::censorIndicator(x[, 1, drop = FALSE],
                              datobj$Cbox[1, , drop = FALSE])})
  censor.which.list = censor.direction.full.list %>%
    lapply(function(x) apply(x, 1, function(xx){ sum(abs(xx)) > 0 })) %>%
    lapply(which)

  ## Censored particles and their censoring directions.
  censored_ylist <- mapply(function(y, censor_which){ y[censor_which, , drop = FALSE] },
                           datobj$ylist, censor.which.list, SIMPLIFY = FALSE)
  censored_direction <- mapply(function(censor_direction, censor_which){
    censor_direction[censor_which, , drop = FALSE]
  }, censor.direction.full.list, censor.which.list, SIMPLIFY = FALSE)

  ## Non-censored ("inner") particles and their counts.
  inner_ylist <- mapply(function(y, censor_which){
    if(length(censor_which) > 0) y[-censor_which, , drop = FALSE] else y
  }, datobj$ylist, censor.which.list, SIMPLIFY = FALSE)
  inner_countslist <- mapply(function(counts, censor_which){
    if(length(censor_which) > 0) counts[-censor_which] else counts
  }, datobj$countslist, censor.which.list, SIMPLIFY = FALSE)

  return(list(censored_ylist = censored_ylist,
              censored_direction = censored_direction,
              inner_ylist = inner_ylist,
              inner_countslist = inner_countslist,
              censor.which.list = censor.which.list))
}
```
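A small check of this splitter on toy top-censored data (a sketch; the 0.5 limit
matches the simulation above):
```{r, eval = FALSE}
## Toy datobj: 3 time points of 1-d data, top-censored at 0.5.
set.seed(2)
toy_ylist = lapply(1:3, function(tt) matrix(pmin(rnorm(50), 0.5), ncol = 1))
toy_datobj = list(ylist = toy_ylist,
                  countslist = lapply(toy_ylist, function(y) rep(1, nrow(y))),
                  Cbox = rbind(c(-Inf, 0.5)))
splitres = split_data(toy_datobj)
## The censored and inner particles should add back up to the original counts.
sapply(splitres$censored_ylist, nrow) + sapply(splitres$inner_ylist, nrow)
sapply(toy_ylist, nrow)
```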
Here is `preimpute()`, which pre-imputes the censored points of a 2-cluster,
1-dimensional dataset.
```{r}
#' At each time point, we will (1) set aside the censored points, (2) estimate a
#' 2-component GMM on the remaining points, and (3) "impute" the censored points
#' according to those mixtures. (After that, we'll fit flowmix on the resulting
#' imputed data.)
#'
#' This is only designed for 1-dimensional data with two clusters.
#'
#' @param datobj List with ylist (particles), countslist, X, and Cbox.
#'
#' @return datobj with imputed ylist.
#'
#' @export
preimpute <- function(datobj){

  ## Setup
  stopifnot(ncol(datobj$ylist[[1]]) == 1)
  stopifnot("Cbox" %in% names(datobj))

  ## Split the data
  splitres = split_data(datobj)
  inner_ylist = splitres$inner_ylist
  inner_countslist = splitres$inner_countslist
  censored_ylist = splitres$censored_ylist
  censored_direction = splitres$censored_direction
  censor.which.list = splitres$censor.which.list

  ## What happens when the "inner" data is empty-ish at some times? We sample
  ## some points from the rest of the data, and fill the empty-ish cytograms in
  ## so that there are at least 5 particles in each cytogram.
  inner_ntlist = sapply(inner_ylist, nrow)
  min_num_particles = 5
  times_empty = which(inner_ntlist < min_num_particles)
  if(length(times_empty) > 0){
    ## Pool of points drawn from the non-empty cytograms.
    collapsed_ylist =
      inner_ylist[-times_empty] %>%
      lapply(function(y){ sample(y, size = ceiling(nrow(y)/3), replace = FALSE) }) %>%
      Reduce(c, .)
    for(tt_empty in times_empty){
      nt = length(inner_ylist[[tt_empty]])
      extra_points = sample(collapsed_ylist, min_num_particles - nt) %>% cbind()
      inner_ylist[[tt_empty]] <- rbind(inner_ylist[[tt_empty]], extra_points)
      inner_countslist[[tt_empty]] <- c(inner_countslist[[tt_empty]], rep(1, length(extra_points)))
    }
  }
  stopifnot(all(sapply(inner_ylist, nrow) >= min_num_particles))

  ## Fit a 2-component Gaussian mixture at each time point.
  gmm_model_list <- mapply(function(y, counts){
    ## z is just a random initialization of the memberships of the points,
    ## used only by the (currently unused) weighted fit below.
    z = mclust::unmap(sample(1:2, size = length(y), replace = TRUE))
    ## Fit a Gaussian mixture model.
    ## res = mclust::me.weighted(as.numeric(y), modelName = "V", weights = counts, z = z)
    res = mclust::Mclust(as.numeric(y), G = 2, modelNames = "V", verbose = FALSE, warn = FALSE)
    if(is.null(res)){
      ## Fall back to an equal-variance model, duplicating the single variance.
      res = mclust::Mclust(as.numeric(y), G = 2, modelNames = "E", verbose = FALSE, warn = FALSE)
      res$parameters$variance$sigmasq = rep(res$parameters$variance$sigmasq, 2)
    }
    assertthat::assert_that(!is.null(res))
    return(res)
  }, inner_ylist, inner_countslist, SIMPLIFY = FALSE)

  ## Impute the censored points
  imputed_censored_ylist = mapply(function(gmm_model, ynew, dirs){

    ## Impute based on the 2-cluster model
    if(nrow(ynew) == 0) return(ynew)
    ynew_imputed = ynew
    post_probs = get_posterior_probs(gmm_model, ynew)
    new_mems = post_probs %>% apply(1, function(ps){ sample(1:2, 1, prob = ps) })

    ## Get the Gaussian means and SDs
    means = gmm_model$parameters$mean
    sds = gmm_model$parameters$variance$sigmasq %>% sqrt()

    ## Information for imputing each point
    impute_mat = tibble::tibble(new_mems = new_mems, dir = dirs)

    ## Impute each point by drawing from its assigned cluster's Gaussian,
    ## truncated to the censored side of the boundary.
    for(irow in 1:nrow(impute_mat)){
      iclust = impute_mat$new_mems[irow]
      if(impute_mat$dir[irow] == 1){
        imputed_point = truncnorm::rtruncnorm(1, a = datobj$Cbox[1,2], mean = means[iclust], sd = sds[iclust])
      }
      if(impute_mat$dir[irow] == -1){
        imputed_point = truncnorm::rtruncnorm(1, b = datobj$Cbox[1,1], mean = means[iclust], sd = sds[iclust])
      }
      ynew_imputed[irow] = imputed_point
    }
    return(ynew_imputed)
  }, gmm_model_list, censored_ylist, censored_direction, SIMPLIFY = FALSE)
  stopifnot(all(sapply(imputed_censored_ylist, nrow) == sapply(censored_ylist, nrow)))

  ## Return the datobj with the imputed ylist.
  TT = length(datobj$ylist)
  new_ylist = datobj$ylist
  for(tt in 1:TT){
    inds = censor.which.list[[tt]]
    if(length(inds) == 0) next
    ynew = datobj$ylist[[tt]]
    ynew[inds] = imputed_censored_ylist[[tt]]
    new_ylist[[tt]] = ynew
  }
  stopifnot(all(sapply(new_ylist, nrow) == sapply(datobj$ylist, nrow)))
  new_datobj = datobj
  new_datobj$ylist = new_ylist
  return(new_datobj)
}
```
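Finally, a quick end-to-end check of `preimpute()` on the same toy object (a
sketch; imputed values of formerly top-censored points should land above 0.5):
```{r, eval = FALSE}
## Pre-impute the toy data from the previous sketch; the dimensions should be
## unchanged, and the maximum value should now exceed the 0.5 censoring limit.
set.seed(3)
toy_imputed = preimpute(toy_datobj)
all(sapply(toy_imputed$ylist, nrow) == sapply(toy_datobj$ylist, nrow))
range(unlist(toy_imputed$ylist))
```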