From 2062c6ff75bcc0782f8cfa589906e240d5195b8c Mon Sep 17 00:00:00 2001
From: Michael Mayer <mayermichael79@gmail.com>
Date: Tue, 7 Nov 2023 20:14:04 +0100
Subject: [PATCH 1/2] Update news

---
 NEWS.md               |  6 ++---
 backlog/calibration.R | 57 +++++++++++++++++++++++--------------------
 2 files changed, 33 insertions(+), 30 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index e8f6cbfe..3ecfc5ae 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -2,7 +2,7 @@
 
 ## Enhancements
 
-- {hstats} now also work for factor predictions. The levels are represented by one-hot-encoded columns.
+- {hstats} now also work for factor predictions. The levels are represented by one-hot-encoded columns ([PR#101](https://github.com/mayer79/hstats/pull/101)).
 - The plot method of a two-dimensional PDP has recieved the option `d2_geom = "line"`. Instead of a heatmap of the two features, one of the features is moved to color grouping. Combined with `swap_dim = TRUE`, you can swap the role of the two `v` variables without recalculating anything. The idea was proposed by [Roel Verbelen](https://github.com/RoelVerbelen) in [issue #91](https://github.com/mayer79/hstats/issues/91), see also [issue #94](https://github.com/mayer79/hstats/issues/94).
 
 ## Bug fixes
@@ -11,8 +11,8 @@
 
 ## Other changes
 
-- Much faster one-hot-encoding, thanks to Mathias Ambühl.
-- Most functions are slightly faster.
+- Much faster one-hot-encoding, thanks to Mathias Ambühl ([PR#101](https://github.com/mayer79/hstats/pull/101)).
+- Most functions are slightly faster ([PR#101](https://github.com/mayer79/hstats/pull/101)).
 - Add unit tests to compare against {iml}.
 - Made all examples "tibble" and "data.table" friendly.
 - Revised input checks in loss functions (relevant for `perm_importance()` and `average_loss()`).
diff --git a/backlog/calibration.R b/backlog/calibration.R
index 7b1e47c9..7ad4fd68 100644
--- a/backlog/calibration.R
+++ b/backlog/calibration.R
@@ -58,9 +58,12 @@ calibration <- function(object, ...) {
 
 #' @describeIn calibration Default method.
 #' @export
-calibration.default <- function(object, v, X, y = NULL, pred_fun = stats::predict, 
+calibration.default <- function(object, v, X, y = NULL,
+                                pred_fun = stats::predict, 
                                 BY = NULL, by_size = 4L, 
-                                grid_size = 17L, 
+                                breaks = 17L, trim = c(0.01, 0.99), 
+                                include.lowest = TRUE,  
+                                right = TRUE, na.rm = TRUE,
                                 pred = NULL,
                                 n_max = 1000L, w = NULL, ...) {
   stopifnot(
@@ -70,11 +73,7 @@ calibration.default <- function(object, v, X, y = NULL, pred_fun = stats::predic
   )
   
   if (!is.null(y)) {
-    y <- prepare_y(y = y, X = X)[["y"]]
-    if (is.factor(y) || is.character(y)) {
-      y <- stats::model.matrix(~ as.factor(y) + 0)
-    }
-    y <- align_pred(y)
+    y <- prepare_y(y = y, X = X, ohe = TRUE)[["y"]]
   }
   if (!is.null(w)) {
     w <- prepare_w(w = w, X = X)[["w"]]
@@ -83,49 +82,53 @@ calibration.default <- function(object, v, X, y = NULL, pred_fun = stats::predic
     BY2 <- prepare_by(BY = BY, X = X, by_size = by_size)
     BY <- BY2[["BY"]]
   }
-  g <- v_grouped <- approx_vector(X[[v]], m = grid_size)
-  grid <- sort(unique(v_grouped), na.last = TRUE)
-  
+  h <- hist2(
+    X[[v]], 
+    breaks = breaks, 
+    trim = trim, 
+    include.lowest = include.lowest, 
+    right = right,
+    na.rm = TRUE
+  )
+
   if (!is.null(BY)) {
-    g <- paste(BY, g, sep = ":")
+    g <- paste(BY, h$x, sep = ":")
+  } else {
+    g <- h$x
   }
   
   # Average predicted
   if (is.null(pred)) {
     pred <- pred_fun(object, X, ...)
   }
-  pred <- align_pred(pred)
-  tmp <- gwColMeans(pred, g = g, w = w, mean_only = FALSE)
-  avg_pred <- tmp[["mean"]]
-  
-  # Exposure
-  exposure <- tmp[["denom"]]
-  
+  pred <- prepare_pred(pred, ohe = TRUE)
+  pr <- gwColMeans(pred, g = g, w = w)
+
   # Average observed
-  avg_obs <- if (!is.null(y)) gwColMeans(y, g = g, w = w)
+  avg_obs <- if (!is.null(y)) gwColMeans(y, g = g, w = w)$M
   
   # Partial dependence
   pd <- partial_dep(
     object = object, 
     v = v, 
     X = X, 
-    grid = grid, 
+    grid = h$grid, 
     pred_fun = pred_fun, 
     BY = BY, 
     w = w, 
     ...
-  )[["data"]]
+  )
   
   out <- list(
     v = v,
-    K = ncol(pred),
-    pred_names = colnames(pred),
-    grid = grid,
+    K = ncol(pr$M),
+    pred_names = colnames(pr$M),
+    grid = h[-1L],
     BY,
     avg_obs = avg_obs,
-    avg_pred = avg_pred,
-    pd = pd,
-    exposure = exposure
+    avg_pred = pr$M,
+    pd = pd[["data"]],
+    exposure = pr$w
   )
   return(structure(out, class = "calibration"))
 }

From 096e969e93c560ed8e6e5cb45268088b507e4c75 Mon Sep 17 00:00:00 2001
From: Michael Mayer <mayermichael79@gmail.com>
Date: Tue, 7 Nov 2023 22:04:13 +0100
Subject: [PATCH 2/2] slightly better error handling in core calculations

---
 R/utils_calculate.R             | 15 +++++----------
 backlog/colMeans_factors.R      |  8 ++++++++
 tests/testthat/test_calculate.R |  1 -
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/R/utils_calculate.R b/R/utils_calculate.R
index 5353366d..875b4b9d 100644
--- a/R/utils_calculate.R
+++ b/R/utils_calculate.R
@@ -85,7 +85,7 @@ wcolMeans <- function(x, w = NULL) {
 #' @noRd
 #' @keywords internal
 #'
-#' @param x Factor.
+#' @param x Factor-like.
 #' @returns Named vector.
 colMeans_factor <- function(x) {
   x <- as.factor(x)
@@ -133,9 +133,6 @@ wrowmean <- function(x, ngroups = 1L, w = NULL) {
   }
 
   # General version
-  if (!is.matrix(x)) {
-    x <- as.matrix(x)
-  }
   wrowmean_matrix(x, ngroups = ngroups, w = w)
 }
 
@@ -146,13 +143,11 @@ wrowmean <- function(x, ngroups = 1L, w = NULL) {
 #' @noRd
 #' @keywords internal
 #'
-#' @param x Factor.
+#' @param x Factor-like.
 #' @param ngroups Number of subsequent, equals sized groups.
 #' @returns Matrix with column names.
 rowmean_factor <- function(x, ngroups = 1L) {
-  if (!is.factor(x)) {
-    stop("x must be a factor.")
-  }
+  x <- as.factor(x)
   lev <- levels(x)
   n_bg <- length(x) %/% ngroups
   dim(x) <- c(n_bg, ngroups)
@@ -192,13 +187,13 @@ wrowmean_vector <- function(x, ngroups = 1L, w = NULL) {
 #' @noRd
 #' @keywords internal
 #'
-#' @param x Matrix.
+#' @param x Matrix-like.
 #' @param ngroups Number of subsequent, equals sized groups.
 #' @param w Optional vector of case weights of length `NROW(x) / ngroups`.
 #' @returns Matrix.
 wrowmean_matrix <- function(x, ngroups = 1L, w = NULL) {
   if (!is.matrix(x)) {
-    stop("x must be a matrix.")
+    x <- as.matrix(x)
   }
   n_bg <- nrow(x) %/% ngroups
   g <- rep_each(ngroups, each = n_bg)  # rep(seq_len(ngroups), each = n_bg)
diff --git a/backlog/colMeans_factors.R b/backlog/colMeans_factors.R
index 8b0acf48..adcc54c5 100644
--- a/backlog/colMeans_factors.R
+++ b/backlog/colMeans_factors.R
@@ -17,3 +17,11 @@ gcolMeans_factor <- function(x, g = NULL) {
   out
 }
 
+wrowmean_matrix2 <- function(x, ngroups = 1L, w = NULL) {
+  if (!is.matrix(x)) {
+    stop("x must be a matrix.")
+  }
+  dim(x) <- c(nrow(x)/ngroups, ngroups, ncol(x))
+  out <- colMeans(aperm(x, c(1, 3, 2)))
+  t.default(out)
+}
diff --git a/tests/testthat/test_calculate.R b/tests/testthat/test_calculate.R
index 8463bcba..7c039c29 100644
--- a/tests/testthat/test_calculate.R
+++ b/tests/testthat/test_calculate.R
@@ -112,7 +112,6 @@ test_that("rowmean_factor() works for factor input", {
   x <- factor(c("C", "A", "C", "C", "A", "A"))
   out <- rowmean_factor(x, ngroups = 2L)
   
-  expect_error(rowmean_factor(1:3))
   expect_true(is.matrix(out))
   expect_equal(out, cbind(A = c(1/3, 2/3), C = c(2/3, 1/3)))
   expect_equal(out, wrowmean_matrix(fdummy(x), ngroups = 2L))