v1.51 - features and maintenance
- fixed some non-ASCII chars in Fieldgoals.csv
- updates to various man pages
- added some new man pages (e.g. map2stan-class, rethinking)
- removed glmm() and bglmm()
- NAMESPACE gardening continues; now importing more explicitly from dependencies
- ensemble() now understands a replace list, like link()
- some source file renaming for sanity
- fixed resample() so it uses all samples from all chains to compute the new WAIC
- map2stan(): improved handling of variable names with dots ('.') in them; now warns
- map2stan(): notifies user when imputing from NAs
- various plotting functions that call set_nice_margins() now respect mfrow/mfcol settings better
- sim() now understands a replace list, like link()
- removed various lingering require() calls to dependencies that are now in NAMESPACE
Richard McElreath committed Mar 14, 2015
1 parent 3893c91 commit ec401c7
Showing 18 changed files with 280 additions and 201 deletions.
6 changes: 3 additions & 3 deletions DESCRIPTION
@@ -1,11 +1,11 @@
Package: rethinking
Type: Package
Title: Statistical Rethinking book package
Version: 1.50
Date: 2015-03-07
Version: 1.51
Date: 2015-03-14
Author: Richard McElreath
Maintainer: Richard McElreath <[email protected]>
Imports: coda, MASS
Extends: rstan
Depends: rstan, parallel
Description: Utilities for fitting and comparing models
License: GPL (>= 3)
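For context (not shown in the diff): because rstan and parallel are listed under Depends rather than Imports, they are attached to the search path whenever rethinking itself is attached. A minimal, illustrative check in an interactive session, assuming the package and its dependencies are installed:

# Depends packages are attached along with the package itself.
library(rethinking)
"package:rstan" %in% search()      # TRUE: rstan is attached too
"package:parallel" %in% search()   # TRUE: and so is parallel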
8 changes: 7 additions & 1 deletion NAMESPACE
@@ -2,4 +2,10 @@ exportPattern("^[^x]")
importFrom(coda, HPDinterval)
importFrom(coda, as.mcmc)
importFrom(MASS, mvrnorm)
importClassesFrom(rstan)
importFrom(rstan, stan)
importFrom(rstan, extract)
importFrom(rstan, sflist2stanfit)
importFrom(rstan, summary)
importClassesFrom(rstan , stanfit)
importFrom(parallel, mclapply)
importFrom(parallel, parLapply)
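These explicit importFrom() directives are what make the runtime require() calls removed elsewhere in this commit unnecessary: package code can now call stan(), mclapply(), and the rest directly through the namespace. The user-level equivalent, shown here only as an illustration (not package code), is the :: operator:

# Explicit analogue of importFrom(parallel, mclapply); mc.cores > 1 needs a unix-alike.
res <- parallel::mclapply( 1:4 , function(i) i^2 , mc.cores=1 )
str(res)   # list of four squared values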
13 changes: 7 additions & 6 deletions R/ensemble.R
@@ -1,5 +1,5 @@
# build ensemble of samples using DIC/WAIC weights
ensemble <- function( ... , data , n=1e3 , WAIC=TRUE , refresh=0 ) {
ensemble <- function( ... , data , n=1e3 , WAIC=TRUE , refresh=0 , replace=list() ) {
# retrieve list of models
L <- list(...)
if ( is.list(L[[1]]) && length(L)==1 )
@@ -22,11 +22,11 @@ ensemble <- function( ... , data , n=1e3 , WAIC=TRUE , refresh=0 ) {
# generate simulated predictions for each model
# then compose ensemble using weights
if ( missing(data) ) {
link.list <- lapply( L , function(m) link(m , n=n , refresh=refresh ) )
sim.list <- lapply( L , function(m) sim(m , n=n , refresh=refresh ) )
link.list <- lapply( L , function(m) link(m , n=n , refresh=refresh , replace=replace ) )
sim.list <- lapply( L , function(m) sim(m , n=n , refresh=refresh , replace=replace ) )
} else {
link.list <- lapply( L , function(m) link(m , data=data , n=n , refresh=refresh ) )
sim.list <- lapply( L , function(m) sim(m , data=data , n=n , refresh=refresh ) )
link.list <- lapply( L , function(m) link(m , data=data , n=n , refresh=refresh , replace=replace ) )
sim.list <- lapply( L , function(m) sim(m , data=data , n=n , refresh=refresh , replace=replace ) )
}
#print(str(sim.list))

@@ -37,7 +37,8 @@ ensemble <- function( ... , data , n=1e3 , WAIC=TRUE , refresh=0 ) {
if ( length(L)>1 )
for ( i in 2:length(idx) ) {
idx_start[i] <- min( idx_start[i-1] + idx[i-1] , n )
idx_end[i] <- min( idx_end[i-1] + idx[i] , n )
idx_end[i] <- min( idx_start[i] + idx[i] - 1 , n )
if ( i==length(idx) ) idx_end[i] <- n
}
#print(idx)
#print(idx_start)
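A hedged usage sketch of the updated ensemble(); the models, priors, and prediction grid below are illustrative and not taken from the package:

library(rethinking)
data(cars)   # base R: stopping distance vs speed
m1 <- map(
    alist(
        dist ~ dnorm( mu , sigma ) ,
        mu <- a ,
        a ~ dnorm( 40 , 20 ) ,
        sigma ~ dunif( 0 , 50 )
    ) , data=cars )
m2 <- map(
    alist(
        dist ~ dnorm( mu , sigma ) ,
        mu <- a + b*speed ,
        a ~ dnorm( 0 , 50 ) ,
        b ~ dnorm( 0 , 10 ) ,
        sigma ~ dunif( 0 , 50 )
    ) , data=cars )
d_pred <- data.frame( speed=10:20 )
pred <- ensemble( m1 , m2 , data=d_pred , n=1000 )   # WAIC-weighted mix of link() and sim() output
str(pred)
# The new replace argument is simply forwarded to link() and sim(); a hypothetical
# call would be ensemble( m1 , m2 , data=d_pred , replace=list( a=rep(0,1000) ) ),
# overriding the posterior samples for 'a'.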
File renamed without changes.
4 changes: 2 additions & 2 deletions R/map2stan_predict.r → R/link-map2stan.r
@@ -186,7 +186,7 @@ function( fit , data , n=1000 , post , refresh=0.1 , replace=list() , flatten=TR
if ( lm_lik[k] == "ordered_logistic" ) {
# need vector of probabilities as result
cuts <- init[['cutpoints']]
v <- predict.ordlogit( r , cuts )
v <- predict_ordlogit( r , cuts )
#r <- t(v)
r <- v
}
@@ -215,7 +215,7 @@ function( fit , data , n=1000 , post , refresh=0.1 , replace=list() , flatten=TR
}
)

predict.ordlogit <- function( phi , a ) {
predict_ordlogit <- function( phi , a ) {
K <- 1:(length(a)+1)
a <- c( as.numeric(a) , Inf )
p <- sapply( K , function(k) logistic(a[k]-phi) )
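The renamed helper turns a linear-model value and a set of cutpoints into ordered-logit category probabilities. A standalone sketch of the same arithmetic, with made-up values (logistic is the inverse logit, as in the package):

logistic <- function(x) 1 / (1 + exp(-x))        # inverse logit
cutpoints <- c( -1 , 0 , 1.5 )                   # hypothetical cutpoints: K = 4 categories
phi <- 0.3                                       # hypothetical linear-model value
cum_p <- logistic( c( cutpoints , Inf ) - phi )  # cumulative Pr(y <= k); last entry is 1
p <- c( cum_p[1] , diff(cum_p) )                 # per-category probabilities
round( p , 3 ) ; sum(p)                          # the probabilities sum to 1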
6 changes: 3 additions & 3 deletions R/map2stan-class.r
@@ -150,7 +150,7 @@ resample <- function( object , iter=1e4 , warmup=1000 , chains=1 , cores=1 , DIC
fit <- stan( fit=object@stanfit , data=data , init=init , pars=object@pars , iter=iter , warmup=warmup , chains=chains , ... )
} else {
init[[1]] <- object@start
require(parallel)
#require(parallel)
sys <- .Platform$OS.type
if ( missing(rng_seed) ) rng_seed <- sample( 1:1e5 , 1 )
if ( sys=='unix' ) {
@@ -170,7 +170,7 @@ resample <- function( object , iter=1e4 , warmup=1000 , chains=1 , cores=1 , DIC
env0 <- list( fit=fit, data=data, pars=pars, rng_seed=rng_seed, iter=iter, warmup=warmup )
clusterExport(cl = CL, c("iter","warmup","data", "fit", "pars", "rng_seed"), as.environment(env0))
sflist <- parLapply(CL, 1:chains, fun = function(cid) {
require(rstan)
#require(rstan)
stan(fit = fit, data = data, pars = pars, chains = 1,
iter = iter, warmup = warmup, seed = rng_seed,
chain_id = cid)
@@ -218,7 +218,7 @@ resample <- function( object , iter=1e4 , warmup=1000 , chains=1 , cores=1 , DIC
#WAIC
if ( WAIC==TRUE ) {
attr(result,"WAIC") <- NULL
waic_calc <- try(WAIC(result))
waic_calc <- try(WAIC(result,n=0))
attr(result,"WAIC") <- waic_calc
} else {
# clear out any old WAIC calculation
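A hedged usage sketch of resample() after this change; the model is the package's chimpanzees example and is illustrative, not part of the diff. The WAIC attached to the result is now computed from the pooled samples of all chains:

library(rethinking)
data(chimpanzees)
d <- list( pulled_left = chimpanzees$pulled_left ,
           prosoc_left = chimpanzees$prosoc_left )
m <- map2stan(
    alist(
        pulled_left ~ dbinom( 1 , p ) ,
        logit(p) <- a + bp*prosoc_left ,
        a ~ dnorm( 0 , 10 ) ,
        bp ~ dnorm( 0 , 10 )
    ) ,
    data=d , iter=1000 , chains=1 )
m4 <- resample( m , chains=4 , iter=2000 , warmup=1000 )   # reuses the compiled model
WAIC(m4)   # recomputed from all samples of all 4 chains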
20 changes: 17 additions & 3 deletions R/map2stan.r
@@ -6,14 +6,16 @@
# templates map R density functions onto Stan functions

# to-do:
# (*) different chains get different random start values
# (*) support cores argument -> compile -> resample
# (*) need to do p*theta and (1-p)*theta multiplication outside likelihood in betabinomial (similar story for gammapoisson) --- or is there an operator for pairwise vector multiplication?
# (-) handle improper input more gracefully
# (-) add "as is" formula type, with quoted text on RHS to dump into Stan code

##################
# map2stan itself

map2stan <- function( flist , data , start , pars , constraints=list() , types=list() , sample=TRUE , iter=2000 , chains=1 , debug=FALSE , verbose=FALSE , WAIC=TRUE , ... ) {
map2stan <- function( flist , data , start , pars , constraints=list() , types=list() , sample=TRUE , iter=2000 , chains=1 , debug=FALSE , verbose=FALSE , WAIC=TRUE , cores=1 , ... ) {

########################################
# empty Stan code
@@ -397,6 +399,15 @@ map2stan <- function( flist , data , start , pars , constraints=list() , types=l
impute_bank <- list()
d <- as.list(data)

# undot all the variable names
for ( i in 1:length(d) ) {
raw_name <- names(d)[i]
new_name <- undot(raw_name)
if ( raw_name != new_name )
warning( concat("Renaming variable '",raw_name,"' to '",new_name,"' internally.\nYou should rename the variable to remove all dots '.'") )
names(d)[i] <- new_name
}

##################################
# check for previous fit object
# if found, build again to get data right, but use compiled model later
@@ -490,7 +501,7 @@ map2stan <- function( flist , data , start , pars , constraints=list() , types=l

# check for missing values
# if found, indicates predictor variable for imputation
if ( any(is.na(d[[lik$outcome]])) ) {
if ( any(is.na(d[[undot(lik$outcome)]])) ) {
# overwrite with top 'N' name
fp[['used_predictors']][[undot(lik$outcome)]] <- list( var=undot(lik$outcome) , N='N' , type=lik$out_type )

@@ -510,7 +521,10 @@ map2stan <- function( flist , data , start , pars , constraints=list() , types=l
# flag as used
fp[['used_predictors']][[missingness_name]] <- list( var=missingness_name , N=N_missing_name , type="int" )
# replace variable with missing values
d[[lik$outcome]] <- var_temp
d[[undot(lik$outcome)]] <- var_temp

# message to user
message(concat("Imputing ",length(var_missingness)," missing values (NA) in variable '",undot(lik$outcome),"'."))
}

# check for binomial size variable and mark used
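Two user-visible behaviors are added in this hunk: variable names containing dots are renamed internally (with a warning suggesting you rename them yourself), and a message reports how many NAs in a variable are being imputed. A small stand-in sketch of what the renaming amounts to; undot() is internal to the package, and the underscore replacement here is an assumption, since the diff does not show its body:

undot_sketch <- function(name) gsub( "." , "_" , name , fixed=TRUE )   # hypothetical stand-in
undot_sketch( "kcal.per.g" )   # "kcal_per_g" -- better to rename the variable yourself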
9 changes: 6 additions & 3 deletions R/plotting.r
@@ -1,9 +1,12 @@
# plotting

set_nice_margins <- function() {
mfrow_old <- par("mfrow")
on.exit(par(mfrow = mfrow_old))
par(mgp = c(1.5, 0.5, 0), mar = c(2.5, 2.5, 2, 1) + 0.1, tck = -0.02)
par_mf <- par("mfrow","mfcol")
if ( all(unlist(par_mf)==1) ) {
#par_old <- par(no.readonly = TRUE)
#on.exit(par(par_old))
par(mgp = c(1.5, 0.5, 0), mar = c(2.5, 2.5, 2, 1) + 0.1, tck = -0.02)
}
}

dens <- function( x , adj=0.5 , norm.comp=FALSE , main="" , show.HPDI=FALSE , show.zero=FALSE , rm.na=TRUE , add=FALSE , ...) {
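With this guard, set_nice_margins() only adjusts margins while the device is still a single-panel layout, so a layout the user has already set is left alone. A quick usage sketch with simulated data:

library(rethinking)
par( mfrow=c(1,2) )                              # user layout is now respected
dens( rnorm(1e4) , main="standard normal" )      # panel 1
dens( rexp(1e4) , adj=1 , main="exponential" )   # panel 2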
11 changes: 6 additions & 5 deletions R/sim.r
@@ -7,7 +7,7 @@ function( fit , data , n=1000 , ... ) {
)

setMethod("sim", "map",
function( fit , data , n=1000 , post , ll=FALSE , refresh=0.1 , ... ) {
function( fit , data , n=1000 , post , ll=FALSE , refresh=0.1 , replace=list() , ... ) {
# when ll=FALSE, simulates sampling, one sample for each sample in posterior
# when ll=TRUE, computes loglik of each observation, for each sample in posterior

@@ -32,7 +32,7 @@ function( fit , data , n=1000 , post , ll=FALSE , refresh=0.1 , ... ) {
# get linear model values from link
# use our posterior samples, so later parameters have right correlation structure
# don't flatten result, so we end up with a named list, even if only one element
pred <- link( fit , data=data , n=n , post=post , flatten=FALSE , refresh=refresh , ... )
pred <- link( fit , data=data , n=n , post=post , flatten=FALSE , refresh=refresh , replace=replace , ... )

# extract likelihood, assuming it is first element of formula
lik <- flist_untag(fit@formula)[[1]]
@@ -159,7 +159,7 @@ function( fit , data , n=1000 , post , ll=FALSE , refresh=0.1 , ... ) {
)

setMethod("sim", "map2stan",
function( fit , data , n=0 , post , ... ) {
function( fit , data , n , post , refresh=0.1 , replace=list() , ... ) {

########################################
# check arguments
@@ -174,12 +174,13 @@ function( fit , data , n=0 , post , ... ) {
if ( missing(post) )
post <- extract.samples(fit,n=n)

n <- dim(post[[1]])[1] # in case n=0 was passed
if ( missing(n) )
n <- dim(post[[1]])[1] # in case n=0 was passed

# get linear model values from link
# use our posterior samples, so later parameters have right correlation structure
# don't flatten result, so we end up with a named list, even if only one element
pred <- link( fit , data=data , n=n , post=post , flatten=FALSE , ... )
pred <- link( fit , data=data , n=n , post=post , flatten=FALSE , refresh=refresh , replace=replace , ... )

# extract likelihood, assuming it is first element of formula
lik <- flist_untag(fit@formula)[[1]]
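A hedged usage sketch of sim() for a map fit with the forwarded arguments; the model and prediction grid are illustrative:

library(rethinking)
data(cars)
m <- map(
    alist(
        dist ~ dnorm( mu , sigma ) ,
        mu <- a + b*speed ,
        a ~ dnorm( 0 , 50 ) ,
        b ~ dnorm( 0 , 10 ) ,
        sigma ~ dunif( 0 , 50 )
    ) , data=cars )
d_pred <- data.frame( speed=10:20 )
s <- sim( m , data=d_pred , n=1000 )   # 1000 simulated stopping distances per speed value
apply( s , 2 , mean )                  # posterior-predictive means
# As with ensemble(), replace is forwarded to link(); a hypothetical call would be
# sim( m , data=d_pred , replace=list( b=rep(0,1000) ) ) to pin the slope samples at zero.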
32 changes: 16 additions & 16 deletions R/utilities.r
@@ -59,21 +59,21 @@ progbar <- function( current , min=0 , max=100 , starttime , update.interval=100
}

# convenience interface to glmer (lme4) that always uses REML=FALSE
glmm <- function( ... , family , REML=FALSE ) {
require(lme4)
if ( missing(family) ) {
result <- lmer( ... , REML=REML )
} else {
result <- glmer( ... , family=family )
}
result
}

bglmm <- function( ... , REML=FALSE ) {
require(lme4)
require(blme)
blmer( ... , REML=REML )
}
#glmm <- function( ... , family , REML=FALSE ) {
# require(lme4)
# if ( missing(family) ) {
# result <- lmer( ... , REML=REML )
# } else {
# result <- glmer( ... , family=family )
# }
# result
#}

#bglmm <- function( ... , REML=FALSE ) {
# require(lme4)
# require(blme)
# blmer( ... , REML=REML )
#}

covmat <- function( m , digits=4 ) {
# upper diag is covariances
@@ -171,7 +171,7 @@ replicate2 <- function (n, expr, interval=0.1, simplify = "array") {

# multi-core replicate
mcreplicate <- function (n, expr, refresh = 0.1, mc.cores=2 ) {
require(parallel)
#require(parallel)
show_progress <- function(i) {
intervaln <- floor(n * refresh)
if (floor(i/intervaln) == i/intervaln) {
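The require(parallel) inside mcreplicate() is now redundant because mclapply comes in through NAMESPACE. A short usage sketch; the simulation itself is illustrative:

library(rethinking)
means <- mcreplicate( 100 , mean( rnorm(50) ) , mc.cores=2 )   # forks on unix-alikes; use mc.cores=1 on Windows
str(means)   # 100 replicated sample means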