# Simulations
Refer back to the data generation in \@ref(syntheticdata). We will be
top-censoring the data at 0.5. Before censoring, this is what the data looks
like.
```{r load-data2, fig.width = 6, fig.height = 4, eval = FALSE}
## Load the "original" model
orig_model = readRDS(file=file.path("~/repos/flowcut/inst/output", "orig_model.RDS"))
## Generate data
set.seed(100)
isignal = 10
new_model = flowcut::make_model(orig_model, isignal)
## new_model = flowcut::make_model(orig_sim_model, isignal)
ylist = flowcut::gen_1d(new_model, nt = 100)
flowtrend::plot_1d(ylist, obj = new_model)
```
After censoring, this is what the data looks like.
```{r, fig.width = 6, fig.height = 4, eval = FALSE}
## Censor it
ylist = lapply(ylist, function(y) pmin(y, 0.5))
flowtrend::plot_1d(ylist, obj = new_model)
```
Before running the MCMC, we need to specify a few things: (1) the censoring
limits `Cbox`, and (2) the particle counts `countslist`.
```{r, eval = FALSE}
## Form the censoring "box"
Cbox = rbind(c(-Inf, 0.5))
## Counts are all equal for now
countslist = lapply(ylist, function(y){ rep(1, nrow(y)) })
## Save the metadata
datobj = list(ylist=ylist, countslist=countslist, Cbox=Cbox, X=orig_model$X)
## datobj = list(ylist=ylist, countslist=countslist, Cbox=Cbox, X=orig_sim_model$X)
saveRDS(datobj,
        file.path("~/repos/flowcut/inst/output",
                  paste0("isignal-", isignal, "-datobj.RDS")))
```
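For reference, `Cbox` appears to have one row per data dimension, with the two
columns giving the lower and upper censoring limits (here there is a single
dimension, unbounded below and top-censored at 0.5). As a purely illustrative
sketch, a hypothetical 3-dimensional `Cbox` might look like this (not used in
this analysis):
```{r, eval = FALSE}
## Hypothetical censoring box for 3-dimensional data: one row per dimension,
## columns give the (lower, upper) censoring limits; use -Inf / Inf for sides
## that are not censored.
Cbox_3d = rbind(c(-Inf, 0.5),  ## dimension 1: top-censored at 0.5
                c(-3,   Inf),  ## dimension 2: bottom-censored at -3
                c(-Inf, Inf))  ## dimension 3: not censored
```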
We also need some *prior elicitation* to prevent the cluster means from changing
too much across time. (The code below is copy-pasted as-is for now and is not
meant to be run.)
```{r, eval = FALSE}
gg <- maxdev_to_gg(datobj$X,
                   dimdat = 3,
                   maxdev = 0.5,
                   numclust = 10,
                   ggvec = (1:40)/200,
                   n.cores = detectCores(),
                   Nmc = 1e4, viz = FALSE)
## Use a previously computed value of gg
gg <- 0.01287879
## hist(ball.deviance(0.01,0.5,t(X))$msd, breaks = "FD")
## hist(ball.deviance(0.1,0.5,X)$msd, breaks = "FD")
## hist(ball.deviance(0.2,0.5,X)$msd, breaks = "FD")
## hist(ball.deviance(0.5,0.5,X)$msd, breaks = "FD")
## hist(ball.deviance(1,0.5,X)$msd, breaks = "FD")
## hist(ball.deviance(2,0.5,X)$msd, breaks = "FD")
```
Next, we run the MCMC.
```{r, eval = FALSE}
## Run the MCMC
## Full-length run settings
Nmc <- 1e3 * 5
Nburn <- 500
## Short test-run settings (these override the full-length settings above)
Nmc <- 20
Nburn <- 10
set.seed(123)
Gibbs.res <- run.Gibbs.fast(ylist = datobj$ylist,
                            countslist = datobj$countslist,
                            numclust = 2,
                            Nmc = Nmc, Nburn = Nburn,
                            gg = 0.1,
                            X = t(datobj$X),
                            Cbox = datobj$Cbox, verbose = TRUE)
## Save the results
saveRDS(Gibbs.res,
        file.path("~/repos/flowcut/inst/output",
                  paste0("isignal-", isignal, "-gibbs.RDS")))
```
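To see what the sampler returned, it can help to reload the saved results and
inspect their top-level structure (a minimal sketch; the exact components depend
on `run.Gibbs.fast`):
```{r, eval = FALSE}
## Reload the saved Gibbs results and peek at what the object contains.
Gibbs.res = readRDS(file.path("~/repos/flowcut/inst/output",
                              paste0("isignal-", isignal, "-gibbs.RDS")))
str(Gibbs.res, max.level = 1)
```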
## Simulation helpers
Here is a helper function for obtaining the membership probabilities of a new
dataset `ynew` given a GMM model `gmm_model`.
```{r}
#' Get the membership probabilities of a new dataset `ynew` given a GMM model `gmm_model`.
#'
#' @param gmm_model A fitted GMM, e.g. from mclust::Mclust() or mclust::me.weighted().
#' @param ynew New data (one-column matrix) to get predicted posterior memberships for.
#'
#' @return Matrix of estimated posterior membership probabilities, one row per
#'   point in `ynew` and one column per cluster.
get_posterior_probs <- function(gmm_model, ynew){
  stopifnot(ncol(ynew) == 1)
  numclust = length(gmm_model$parameters$mean)
  mn = gmm_model$parameters$mean
  sd = gmm_model$parameters$variance$sigmasq %>% sqrt()
  pro = gmm_model$parameters$pro
  ## Prior-weighted normal density of each new point under each cluster.
  post_prob = lapply(1:numclust, function(iclust){
    dnorm(ynew, mean = mn[iclust], sd = sd[iclust]) * pro[iclust]
  }) %>% do.call(cbind, .)
  post_prob = post_prob/rowSums(post_prob)
  ## Give up on the posterior probability calculation and assign equal
  ## probabilities when this (rarely) produces NaNs.
  if(any(is.nan(post_prob))) post_prob = matrix(1/numclust, nrow = nrow(post_prob), ncol = numclust)
  return(post_prob)
}
```
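A quick sanity check of this helper on simulated 1-dimensional data (a minimal
sketch, assuming `mclust` is installed and the pipe `%>%` used inside the helper
is available, e.g. via `magrittr`):
```{r, eval = FALSE}
## Simulate a small two-component 1-d dataset and fit a 2-cluster GMM.
set.seed(1)
y = matrix(c(rnorm(200, mean = 0), rnorm(200, mean = 3)), ncol = 1)
gmm_model = mclust::Mclust(as.numeric(y), G = 2, modelNames = "V", verbose = FALSE)
## Posterior membership probabilities for a few new points; rows should sum to 1.
ynew = matrix(c(-0.5, 1.5, 3.5), ncol = 1)
probs = get_posterior_probs(gmm_model, ynew)
rowSums(probs)
```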
Also, here is a helper to split the data into (1) censored and (2) non-censored
("inner") particles; used in `preimpute()`.
```{r}
#' A helper to split the data into (1) censored and (2) non-censored ("inner") particles.
#'
#' @param datobj List containing ylist, countslist, X, and Cbox.
#'
#' @return List containing the censored particles, the directions in which they
#'   were censored, the non-censored ("inner") particles and their counts, and
#'   the indices of the censored particles at each time point.
#'
#' @export
split_data <- function(datobj){

  ## At each time point, mark which points are censored and in which direction.
  censor.direction.full.list <- lapply(datobj$ylist, function(x){
    flowcut:::censorIndicator(x[, 1, drop = FALSE],
                              datobj$Cbox[1, , drop = FALSE])})
  censor.which.list = censor.direction.full.list %>%
    lapply(function(x) apply(x, 1, function(xx){ sum(abs(xx)) > 0 })) %>%
    lapply(which)

  ## Censored particles and their censoring directions.
  censored_ylist <- mapply(function(y, censor_which){ y[censor_which, , drop = FALSE] },
                           datobj$ylist, censor.which.list, SIMPLIFY = FALSE)
  censored_direction <- mapply(function(censor_direction, censor_which){
    censor_direction[censor_which, , drop = FALSE]
  }, censor.direction.full.list, censor.which.list, SIMPLIFY = FALSE)

  ## Non-censored ("inner") particles and their counts.
  inner_ylist <- mapply(function(y, censor_which){
    if(length(censor_which) > 0) y[-censor_which, , drop = FALSE] else y
  }, datobj$ylist, censor.which.list, SIMPLIFY = FALSE)
  inner_countslist <- mapply(function(counts, censor_which){
    if(length(censor_which) > 0) counts[-censor_which] else counts
  }, datobj$countslist, censor.which.list, SIMPLIFY = FALSE)

  return(list(censored_ylist = censored_ylist,
              censored_direction = censored_direction,
              inner_ylist = inner_ylist,
              inner_countslist = inner_countslist,
              censor.which.list = censor.which.list))
}
```
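A small check of this splitter on toy top-censored data (a sketch; the 0.5 limit
matches the simulation above):
```{r, eval = FALSE}
## Toy datobj: 3 time points of 1-d data, top-censored at 0.5.
set.seed(2)
toy_ylist = lapply(1:3, function(tt) matrix(pmin(rnorm(50), 0.5), ncol = 1))
toy_datobj = list(ylist = toy_ylist,
                  countslist = lapply(toy_ylist, function(y) rep(1, nrow(y))),
                  Cbox = rbind(c(-Inf, 0.5)))
splitres = split_data(toy_datobj)
## The censored and inner particles should add back up to the original counts.
sapply(splitres$censored_ylist, nrow) + sapply(splitres$inner_ylist, nrow)
sapply(toy_ylist, nrow)
```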
Here is `preimpute()`, which pre-imputes the censored points of a 2-cluster,
1-dimensional dataset.
```{r}
#' At each time point, we will (1) set aside the censored points, (2) estimate a
#' 2-component GMM on the remaining points, and (3) "impute" the censored points
#' according to those mixtures. (After that, we'll fit flowmix on the resulting
#' imputed data.)
#'
#' This is only designed for 1-dimensional data with two clusters.
#'
#' @param datobj List with ylist (particles), countslist, X, and Cbox.
#'
#' @return datobj with imputed ylist.
#'
#' @export
preimpute <- function(datobj){

  ## Setup
  stopifnot(ncol(datobj$ylist[[1]]) == 1)
  stopifnot("Cbox" %in% names(datobj))

  ## Split the data
  splitres = split_data(datobj)
  inner_ylist = splitres$inner_ylist
  inner_countslist = splitres$inner_countslist
  censored_ylist = splitres$censored_ylist
  censored_direction = splitres$censored_direction
  censor.which.list = splitres$censor.which.list

  ## What happens when the "inner" data is empty-ish at some times? We sample
  ## some points from the rest of the data, and fill the empty-ish cytograms in
  ## so that there are at least 5 particles in each cytogram.
  inner_ntlist = sapply(inner_ylist, nrow)
  min_num_particles = 5
  times_empty = which(inner_ntlist < min_num_particles)
  if(length(times_empty) > 0){
    ## Pool of points drawn from the non-empty cytograms.
    collapsed_ylist =
      inner_ylist[-times_empty] %>%
      lapply(function(y){ sample(y, size = ceiling(nrow(y)/3), replace = FALSE) }) %>%
      Reduce(c, .)
    for(tt_empty in times_empty){
      nt = length(inner_ylist[[tt_empty]])
      extra_points = sample(collapsed_ylist, min_num_particles - nt) %>% cbind()
      inner_ylist[[tt_empty]] <- rbind(inner_ylist[[tt_empty]], extra_points)
      inner_countslist[[tt_empty]] <- c(inner_countslist[[tt_empty]], rep(1, length(extra_points)))
    }
  }
  stopifnot(all(sapply(inner_ylist, nrow) >= min_num_particles))

  ## Fit a 2-component Gaussian mixture at each time point.
  gmm_model_list <- mapply(function(y, counts){
    ## z is just a random initialization of the memberships of the points,
    ## used only by the (currently unused) weighted fit below.
    z = mclust::unmap(sample(1:2, size = length(y), replace = TRUE))
    ## Fit a Gaussian mixture model.
    ## res = mclust::me.weighted(as.numeric(y), modelName = "V", weights = counts, z = z)
    res = mclust::Mclust(as.numeric(y), G = 2, modelNames = "V", verbose = FALSE, warn = FALSE)
    if(is.null(res)){
      ## Fall back to an equal-variance model, duplicating the single variance.
      res = mclust::Mclust(as.numeric(y), G = 2, modelNames = "E", verbose = FALSE, warn = FALSE)
      res$parameters$variance$sigmasq = rep(res$parameters$variance$sigmasq, 2)
    }
    assertthat::assert_that(!is.null(res))
    return(res)
  }, inner_ylist, inner_countslist, SIMPLIFY = FALSE)

  ## Impute the censored points
  imputed_censored_ylist = mapply(function(gmm_model, ynew, dirs){

    ## Impute based on the 2-cluster model
    if(nrow(ynew) == 0) return(ynew)
    ynew_imputed = ynew
    post_probs = get_posterior_probs(gmm_model, ynew)
    new_mems = post_probs %>% apply(1, function(ps){ sample(1:2, 1, prob = ps) })

    ## Get the Gaussian means and SDs
    means = gmm_model$parameters$mean
    sds = gmm_model$parameters$variance$sigmasq %>% sqrt()

    ## Information for imputing each point
    impute_mat = tibble::tibble(new_mems = new_mems, dir = dirs)

    ## Impute each point by drawing from its assigned cluster's Gaussian,
    ## truncated to the censored side of the boundary.
    for(irow in 1:nrow(impute_mat)){
      iclust = impute_mat$new_mems[irow]
      if(impute_mat$dir[irow] == 1){
        imputed_point = truncnorm::rtruncnorm(1, a = datobj$Cbox[1,2], mean = means[iclust], sd = sds[iclust])
      }
      if(impute_mat$dir[irow] == -1){
        imputed_point = truncnorm::rtruncnorm(1, b = datobj$Cbox[1,1], mean = means[iclust], sd = sds[iclust])
      }
      ynew_imputed[irow] = imputed_point
    }
    return(ynew_imputed)
  }, gmm_model_list, censored_ylist, censored_direction, SIMPLIFY = FALSE)
  stopifnot(all(sapply(imputed_censored_ylist, nrow) == sapply(censored_ylist, nrow)))

  ## Return the datobj with the imputed ylist.
  TT = length(datobj$ylist)
  new_ylist = datobj$ylist
  for(tt in 1:TT){
    inds = censor.which.list[[tt]]
    if(length(inds) == 0) next
    ynew = datobj$ylist[[tt]]
    ynew[inds] = imputed_censored_ylist[[tt]]
    new_ylist[[tt]] = ynew
  }
  stopifnot(all(sapply(new_ylist, nrow) == sapply(datobj$ylist, nrow)))
  new_datobj = datobj
  new_datobj$ylist = new_ylist
  return(new_datobj)
}
```
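Finally, a quick end-to-end check of `preimpute()` on the same toy object (a
sketch; imputed values of formerly top-censored points should land above 0.5):
```{r, eval = FALSE}
## Pre-impute the toy data from the previous sketch; the dimensions should be
## unchanged, and the maximum value should now exceed the 0.5 censoring limit.
set.seed(3)
toy_imputed = preimpute(toy_datobj)
all(sapply(toy_imputed$ylist, nrow) == sapply(toy_datobj$ylist, nrow))
range(unlist(toy_imputed$ylist))
```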