From b8c1a67132edeb626a07f714cba5b67235ca1d3a Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Fri, 5 Jul 2024 18:47:01 +0200 Subject: [PATCH 1/6] Remove possibility to use factor-valued predictions --- DESCRIPTION | 2 +- NEWS.md | 7 +++ R/hstats.R | 2 +- R/ice.R | 1 - R/pd_raw.R | 5 +- R/utils_input.R | 20 +++--- packaging.R | 2 +- tests/testthat/test_calculate.R | 100 ------------------------------ tests/testthat/test_hstats.R | 6 -- tests/testthat/test_ice.R | 9 --- tests/testthat/test_input.R | 5 -- tests/testthat/test_partial_dep.R | 8 --- 12 files changed, 23 insertions(+), 144 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index f161602c..b20a6113 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: hstats Title: Interaction Statistics -Version: 1.1.2 +Version: 1.2.0 Authors@R: person("Michael", "Mayer", , "mayermichael79@gmail.com", role = c("aut", "cre")) Description: Fast, model-agnostic implementation of different H-statistics diff --git a/NEWS.md b/NEWS.md index 4cf55340..9009049c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# hstats 1.2.0 + +# Major changes + +- Factor-valued predictions are no longer possible. +- Consequently, also removed "classification_error" loss. + # hstats 1.1.2 ## ICE plots diff --git a/R/hstats.R b/R/hstats.R index 3a4b5fcd..5cbb7562 100644 --- a/R/hstats.R +++ b/R/hstats.R @@ -188,7 +188,7 @@ hstats.default <- function(object, X, v = NULL, } # Predictions ("F" in Friedman and Popescu) always calculated (cheap) - f <- wcenter(prepare_pred(pred_fun(object, X, ...), ohe = TRUE), w = w) + f <- wcenter(prepare_pred(pred_fun(object, X, ...)), w = w) mean_f2 <- wcolMeans(f^2, w = w) # A vector # Initialize first progress bar diff --git a/R/ice.R b/R/ice.R index 12df435a..8b36dbac 100644 --- a/R/ice.R +++ b/R/ice.R @@ -117,7 +117,6 @@ ice.default <- function(object, v, X, pred_fun = stats::predict, grid = grid, pred_fun = pred_fun, pred_only = FALSE, - ohe = TRUE, ... ) pred <- ice_out[["pred"]] diff --git a/R/pd_raw.R b/R/pd_raw.R index 3bff4d5c..2549e878 100644 --- a/R/pd_raw.R +++ b/R/pd_raw.R @@ -54,11 +54,10 @@ pd_raw <- function(object, v, X, grid, pred_fun = stats::predict, #' predictions. Otherwise, a list with two elements: `pred` (predictions) #' and `grid_pred` (the corresponding grid values in the same mode as the input, #' but replicated over `X`). -#' @param ohe Should factor output be one-hot encoded? Default is `FALSE`. #' @returns #' Either a vector/matrix of predictions or a list with predictions and grid. ice_raw <- function(object, v, X, grid, pred_fun = stats::predict, - pred_only = TRUE, ohe = FALSE, ...) { + pred_only = TRUE, ...) { D1 <- length(v) == 1L n <- nrow(X) n_grid <- NROW(grid) @@ -79,7 +78,7 @@ ice_raw <- function(object, v, X, grid, pred_fun = stats::predict, } # Calculate matrix/vector of predictions - pred <- prepare_pred(pred_fun(object, X_pred, ...), ohe = ohe) + pred <- prepare_pred(pred_fun(object, X_pred, ...)) if (pred_only) { return(pred) diff --git a/R/utils_input.R b/R/utils_input.R index 8f2251bf..1f5505aa 100644 --- a/R/utils_input.R +++ b/R/utils_input.R @@ -1,21 +1,23 @@ #' Prepares Predictions #' -#' Converts predictions to vector, matrix or factor. +#' Converts predictions to vector or matrix. #' #' @noRd #' @keywords internal #' #' @param x Object representing model predictions. -#' @param ohe If `x` is a factor: should it be one-hot encoded? Default is `FALSE`. -#' @returns Like `x`, but converted to matrix, vector, or factor. 
-prepare_pred <- function(x, ohe = FALSE) { +#' @returns Like `x`, but converted to matrix or vector. +prepare_pred <- function(x) { if (is.data.frame(x) && ncol(x) == 1L) { x <- x[[1L]] } - if (ohe && is.factor(x)) { - return(fdummy(x)) + if (!is.vector(x) || !is.matrix(x)) { + x <- as.matrix(x) } - if (is.vector(x) || is.matrix(x) || is.factor(x)) x else as.matrix(x) + if (!is.numeric(x)) { + stop("Predictions must be numeric!") + } + return(x) } #' Prepares Group BY Variable @@ -99,7 +101,7 @@ prepare_w <- function(w, X) { #' #' @returns A list with "y" (vector, matrix, or factor) and "y_names" (if `y` #' was passed as column names). -prepare_y <- function(y, X, ohe = FALSE) { +prepare_y <- function(y, X) { if (NROW(y) < nrow(X) && all(y %in% colnames(X))) { y_names <- y if (is.data.frame(X) && length(y) == 1L) { @@ -111,6 +113,6 @@ prepare_y <- function(y, X, ohe = FALSE) { stopifnot(NROW(y) == nrow(X)) y_names <- NULL } - list(y = prepare_pred(y, ohe = ohe), y_names = y_names) + list(y = prepare_pred(y), y_names = y_names) } diff --git a/packaging.R b/packaging.R index 69c040b2..5359f72b 100644 --- a/packaging.R +++ b/packaging.R @@ -15,7 +15,7 @@ library(usethis) use_description( fields = list( Title = "Interaction Statistics", - Version = "1.1.2", + Version = "1.2.0", Description = "Fast, model-agnostic implementation of different H-statistics introduced by Jerome H. Friedman and Bogdan E. Popescu (2008) . These statistics quantify interaction strength per feature, feature pair, diff --git a/tests/testthat/test_calculate.R b/tests/testthat/test_calculate.R index b3278ee1..d78618d2 100644 --- a/tests/testthat/test_calculate.R +++ b/tests/testthat/test_calculate.R @@ -4,7 +4,6 @@ test_that("rep_each() works", { expect_true(is.integer(rep_each(100, 100))) }) -# The next two checks copied from {kernelshap} test_that("rep_rows() gives the same as usual subsetting (except rownames)", { setrn <- function(x) {rownames(x) <- 1:nrow(x); x} @@ -45,38 +44,6 @@ test_that("fdummy() respects factor level order", { expect_equal(colnames(d1), rev(colnames(d2))) }) -test_that("colMeans_factor() works", { - x <- c("A", "A", "C", "D") - expect_equal(colMeans_factor(x), colMeans(fdummy(x))) - - expect_equal(colMeans_factor(c("A", "B", "B")), c(A = 1/3, B = 2/3)) - expect_equal( - colMeans_factor(factor(c("A", "B", "B"), levels = c("B", "A", "C"))), - c(B = 2/3, A = 1/3, C = 0) - ) - expect_equal( - colMeans_factor(iris$Species), c(setosa = 1/3, versicolor = 1/3, virginica = 1/3) - ) - expect_equal( - colMeans_factor(iris$Species[101:150]), c(setosa = 0, versicolor = 0, virginica = 1) - ) -}) - -test_that("colMeans_factor() works for singletons", { - x <- c("A") - expect_equal(colMeans_factor(x), colMeans(fdummy(x))) - expect_true(is.vector(colMeans_factor(x))) -}) - -test_that("colMeans_factor() respects factor level order", { - x1 <- factor(c("A", "A", "C", "D")) - x2 <- factor(x1, levels = rev(levels(x1))) - d1 <- colMeans_factor(x1) - d2 <- colMeans_factor(x2) - expect_equal(d1, d2[names(d1)]) - expect_equal(names(d1), rev(names(d2))) -}) - test_that("wcolMeans() works", { x <- cbind(a = 1:6, b = 6:1) x_df <- as.data.frame(x) @@ -100,9 +67,6 @@ test_that("wrowmean_vector() works for vectors and 1D matrices", { out1 <- wrowmean_vector(x1, ngroups = 2L) out2 <- wrowmean_vector(x2, ngroups = 2L) - expect_error(wrowmean_vector(factor(1, 2))) - expect_error(wrowmean_vector(matrix(1:4, ncol = 2L))) - expect_true(is.matrix(out1)) expect_equal(out1, unname(out2)) expect_equal(out2, cbind(a = c(5, 2))) @@ 
-128,32 +92,6 @@ test_that("wrowmean_vector() works for vectors and 1D matrices", { expect_equal(out2, wrowmean(x2, ngroups = 2L, w = 1:3)) }) -test_that("rowmean_factor() works for factor input", { - x <- factor(c("C", "A", "C", "C", "A", "A")) - out <- rowmean_factor(x, ngroups = 2L) - - expect_true(is.matrix(out)) - expect_equal(out, cbind(A = c(1/3, 2/3), C = c(2/3, 1/3))) - expect_equal(out, wrowmean_matrix(fdummy(x), ngroups = 2L)) - expect_equal(out, wrowmean(x, ngroups = 2L)) -}) - -test_that("rowmean() works for factor input with weights", { - x <- factor(c("C", "A", "C", "C", "A", "A")) - out <- wrowmean(x, ngroups = 2L) - - # Constant weights have no effect - expect_equal(wrowmean(x, ngroups = 2L, w = c(1, 1, 1)), out) - expect_equal(wrowmean(x, ngroups = 2L, w = c(4, 4, 4)), out) - - # Non-constant weights - A1 <- weighted.mean(x[1:3] == "A", 1:3) - A2 <- weighted.mean(x[4:6] == "A", 1:3) - C1 <- weighted.mean(x[1:3] == "C", 1:3) - C2 <- weighted.mean(x[4:6] == "C", 1:3) - expect_equal(wrowmean(x, ngroups = 2L, w = 1:3), cbind(A = c(A1, A2), C = c(C1, C2))) -}) - test_that("wrowmean_matrix() works for matrices with > 1 columns", { x <- cbind(x = 6:1, z = 1:6) out <- wrowmean_matrix(x, ngroups = 2L) @@ -181,10 +119,6 @@ test_that("wrowmean_matrix() works for matrices with > 1 columns", { expect_equal(out, wrowmean(x, ngroups = 2L, w = 1:3)) }) -test_that("wrowmean() checks for correct weight lengths", { - expect_error(wrowmean(1:10, ngroups = 2L, w = 1:10)) -}) - test_that("wrowmean() equals wColMeans() for ngroups = 1", { x <- 1:10 expect_equal(wrowmean(x), rbind(wcolMeans(x))) @@ -197,10 +131,6 @@ test_that("wrowmean() equals wColMeans() for ngroups = 1", { X <- cbind(a = 1:10, b = 10:1) expect_equal(wrowmean(X), rbind(wcolMeans(X))) expect_equal(wrowmean(X, w = 1:10), rbind(wcolMeans(X, w = 1:10))) - - x <- factor(c("B", "B", "B", "C")) - expect_equal(wrowmean(x), rbind(wcolMeans(x))) - expect_equal(wrowmean(x, w = 1:4), rbind(wcolMeans(x, w = 1:4))) }) test_that("gwcolMeans() works", { @@ -233,36 +163,6 @@ test_that("gwcolMeans() works", { expect_equal(r$w, c(sum(3:6), sum(1:2))) }) -test_that("gwcolMeans() works for factors", { - x <- factor(c("A", "B", "A", "B", "A", "A"), levels = c("B", "A")) - g <- c(2, 2, 1, 1, 1, 1) - w1 <- rep(2, times = 6) - w2 <- 1:6 - - # Ungrouped - r <- gwColMeans(x) - expect_equal(r$M, rbind(wcolMeans(x))) - expect_equal(r$w, length(x)) - - r <- gwColMeans(x, w = w1) - expect_equal(r$M, rbind(wcolMeans(x, w = w1))) - expect_equal(r$w, sum(w1)) - - r <- gwColMeans(x, w = w2) - expect_equal(r$M, rbind(wcolMeans(x, w = w2))) - expect_equal(r$w, sum(w2)) - - # Grouped - r <- gwColMeans(x, g = g) - expect_equal(r$M[2L, ], wcolMeans(x[g == 2])) - expect_equal(r$w, c(4, 2)) - - # Grouped and weighted - r <- gwColMeans(x, g = g, w = w2) - expect_equal(r$M[2L, ], wcolMeans(x[g == 2], w = w2[g == 2])) - expect_equal(r$w, c(sum(3:6), sum(1:2))) -}) - test_that("wcenter() works for matrices with > 1 columns", { x <- cbind(a = 1:6, b = 6:1) expect_equal(wcenter(x), cbind(a = 1:6 - mean(1:6), b = 6:1 - mean(6:1))) diff --git a/tests/testthat/test_hstats.R b/tests/testthat/test_hstats.R index 5df71942..e9d6e7eb 100644 --- a/tests/testthat/test_hstats.R +++ b/tests/testthat/test_hstats.R @@ -356,12 +356,6 @@ test_that("hstats() matches {iml} 0.11.1 in a specific case", { ) }) -test_that("hstats() works for factor predictions", { - pf <- function(m, X) factor(X[, "v1"], levels = 0:1, labels = c("zero", "one")) - out <- hstats(1, X = cbind(v1 = 0:1, v2 = 0), 
pred_fun = pf, verbose = FALSE) - expect_equal(out$h2$num, cbind(zero = 0, one = 0)) -}) - test_that("hstats() gives all zero statistics if prediction is constant", { pf <- function(m, X) numeric(nrow(X)) out <- hstats(1, X = iris[1:4], pred_fun = pf, verbose = FALSE) diff --git a/tests/testthat/test_ice.R b/tests/testthat/test_ice.R index 449d2eaa..8111e2a0 100644 --- a/tests/testthat/test_ice.R +++ b/tests/testthat/test_ice.R @@ -236,12 +236,3 @@ test_that("ice() works with missing value in BY", { expect_true(anyNA(r$data$x3)) expect_s3_class(plot(r), "ggplot") }) - -test_that("ice() works for factor predictions", { - pf <- function(m, X) factor(X[, "v1"], levels = 0:1, labels = c("zero", "one")) - out <- ice(1, v = "v1", X = cbind(v1 = 0:1), pred_fun = pf) - out <- out$data[out$data$obs_ == 1L, c("zero", "one")] - out <- as.matrix(out) - row.names(out) <- NULL - expect_equal(out, cbind(zero = 1:0, one = 0:1)) -}) diff --git a/tests/testthat/test_input.R b/tests/testthat/test_input.R index 5d700a93..274e4b55 100644 --- a/tests/testthat/test_input.R +++ b/tests/testthat/test_input.R @@ -3,8 +3,6 @@ test_that("prepare_pred() works", { expect_equal(prepare_pred(iris["Species"]), iris$Species) expect_equal(prepare_pred(iris$Sepal.Width), iris$Sepal.Width) expect_equal(prepare_pred(iris["Sepal.Width"]), iris$Sepal.Width) - expect_equal(prepare_pred(iris$Species, ohe = TRUE), fdummy(iris$Species)) - expect_equal(prepare_pred(1:3, ohe = TRUE), 1:3) }) test_that("prepare_w() works", { @@ -43,7 +41,4 @@ test_that("prepare_y() works", { out <- prepare_y("Sepal.Width", X = iris) expect_equal(out$y, iris$Sepal.Width) expect_equal(out$y_names, "Sepal.Width") - - # OHE - expect_equal(prepare_y(iris$Species, X = iris, ohe = TRUE)$y, fdummy(iris$Species)) }) diff --git a/tests/testthat/test_partial_dep.R b/tests/testthat/test_partial_dep.R index a23a3d3c..78147833 100644 --- a/tests/testthat/test_partial_dep.R +++ b/tests/testthat/test_partial_dep.R @@ -532,11 +532,3 @@ test_that(".compress_grid() leaves grid unchanged if unique", { expect_equal(out$grid, g) expect_equal(out$reindex, NULL) }) - -test_that("partial_dep() works for factor predictions", { - pf <- function(m, X) factor(X[, "v1"], levels = 0:1, labels = c("zero", "one")) - out <- partial_dep(1, v = "v1", X = cbind(v1 = 0:1), pred_fun = pf) - out <- as.matrix(out$data[c("zero", "one")]) - row.names(out) <- NULL - expect_equal(out, cbind(zero = 1:0, one = 0:1)) -}) From fbcfb8af220833d7f7754e016b2a61f381c5975c Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Fri, 5 Jul 2024 18:55:15 +0200 Subject: [PATCH 2/6] Simplified computes --- R/utils_calculate.R | 108 ++++---------------------------------------- R/utils_input.R | 1 - 2 files changed, 9 insertions(+), 100 deletions(-) diff --git a/R/utils_calculate.R b/R/utils_calculate.R index 49c50f16..a1f5620c 100644 --- a/R/utils_calculate.R +++ b/R/utils_calculate.R @@ -16,14 +16,6 @@ rep_each <- function(m, each) { dim(out) <- NULL out } -# -# # Same as rep.int(seq_len(m), times) -# rep_times <- function(m, times) { -# out <- .row(dim = c(m, times)) -# dim(out) <- NULL -# out -# } - #' Fast OHE #' @@ -54,49 +46,14 @@ fdummy <- function(x) { #' @noRd #' @keywords internal #' -#' @param x A vector, factor, or matrix-like. +#' @param x A vector or matrix-like. #' @param w Optional case weights. #' @returns A vector of column means. 
wcolMeans <- function(x, w = NULL) { - if (is.factor(x)) { - if (is.null(w)) { - return(colMeans_factor(x)) - } - x <- fdummy(x) - } else if (!is.matrix(x)) { + if (!is.matrix(x)) { x <- as.matrix(x) } - if (is.null(w)) { - return(colMeans(x)) - } - if (length(w) != nrow(x)) { - stop("w must be of length nrow(x).") - } - if (!is.double(w)) { - w <- as.double(w) - } - colSums(x * w) / sum(w) -} - -#' colMeans() for Factors -#' -#' Internal function used to calculate proportions of factor levels. -#' It is less memory-hungry than `colMeans(fdummy(x))`, and much faster. -#' A weighted version via `rowsum(w, x)` is not consistently faster than -#' `wcolMeans(fdummy(x))`, so we currently focus on the non-weighted case. -#' Furthermore, `rowsum()` drops empty factor levels. -#' -#' @noRd -#' @keywords internal -#' -#' @param x Factor-like. -#' @returns Named vector. -colMeans_factor <- function(x) { - x <- as.factor(x) - lev <- levels(x) - out <- tabulate(x, nbins = length(lev)) / length(x) - names(out) <- lev - out + if (is.null(w)) colMeans(x) else colSums(x * w) / sum(w) } #' Grouped Column Means @@ -106,7 +63,7 @@ colMeans_factor <- function(x) { #' @noRd #' @keywords internal #' -#' @param x Vector, factor or matrix-like. +#' @param x Vector or matrix-like. #' @param ngroups Number of groups (`x` was stacked that many times). #' @param w Optional vector with case weights of length `NROW(x) / ngroups`. #' @returns A (g x K) matrix, where g is the number of groups, and K = NCOL(x). @@ -115,51 +72,13 @@ wrowmean <- function(x, ngroups = 1L, w = NULL) { return(t.default(wcolMeans(x, w = w))) } - # Prep w - if (!is.null(w)) { - if (length(w) != NROW(x) %/% ngroups) { - stop("w must be of length NROW(x) / ngroups.") - } - if (!is.double(w)) { - w <- as.double(w) - } - } - - # Very fast case for factors w/o weights and vectors/1d-matrices - if (is.factor(x)) { - if (is.null(w)) { - return(rowmean_factor(x, ngroups = ngroups)) - } - x <- fdummy(x) - } + # Very fast case for 1d structures if (is.vector(x) || (is.matrix(x) && ncol(x) == 1L)) { return(wrowmean_vector(x, ngroups = ngroups, w = w)) } - - # General version wrowmean_matrix(x, ngroups = ngroups, w = w) } -#' (w)rowmean() for Factors (without weights) -#' -#' Grouped `colMeans_factor()` for equal sized groups. -#' -#' @noRd -#' @keywords internal -#' -#' @param x Factor-like. -#' @param ngroups Number of subsequent, equals sized groups. -#' @returns Matrix with column names. -rowmean_factor <- function(x, ngroups = 1L) { - x <- as.factor(x) - lev <- levels(x) - n_bg <- length(x) %/% ngroups - dim(x) <- c(n_bg, ngroups) - out <- t.default(apply(x, 2L, FUN = tabulate, nbins = length(lev))) - colnames(out) <- lev - out / n_bg -} - #' wrowmean() for Vectors #' #' Grouped means over fixed-length groups for vectors or 1d matrices. @@ -170,14 +89,11 @@ rowmean_factor <- function(x, ngroups = 1L) { #' @param x Vector or matrix with one column. #' @param ngroups Number of subsequent, equals sized groups. #' @param w Optional vector of case weights of length `NROW(x) / ngroups`. -#' @returns Matrix. +#' @returns Matrix with one column. 
wrowmean_vector <- function(x, ngroups = 1L, w = NULL) {
-  if (!(is.vector(x) || (is.matrix(x) && ncol(x) == 1L))) {
-    stop("x must be a vector or a single column matrix")
-  }
   nm <- if (is.matrix(x)) colnames(x)
   dim(x) <- c(length(x) %/% ngroups, ngroups)
-  out <- if (is.null(w)) colMeans(x) else colSums(x * w) / sum(w)
+  out <- wcolMeans(x, w = w)
   dim(out) <- c(ngroups, 1L)
   if (!is.null(nm)) {
     colnames(out) <- nm
@@ -202,7 +118,6 @@ wrowmean_matrix <- function(x, ngroups = 1L, w = NULL) {
   }
   n_bg <- nrow(x) %/% ngroups
   g <- rep_each(ngroups, each = n_bg)  # rep(seq_len(ngroups), each = n_bg)
-  # Even faster: cbind(collapse::fmean(x, g = g, w = w))
   if (is.null(w)) {
     out <- rowsum(x, group = g, reorder = FALSE) / n_bg
   } else {
@@ -221,7 +136,7 @@ wrowmean_matrix <- function(x, ngroups = 1L, w = NULL) {
 #' @noRd
 #' @keywords internal
 #'
-#' @param x A vector, factor or matrix-like object.
+#' @param x A vector or matrix-like object.
 #' @param g Optional grouping variable.
 #' @param w Optional case weights.
 #' @returns A list with two elements: "M" represents a matrix of grouped (column)
@@ -236,17 +151,12 @@ gwColMeans <- function(x, g = NULL, w = NULL) {
   }
 
   # Now the interesting case
-  if (is.factor(x)) {
-    x <- fdummy(x)
-  } else if (!is.vector(x) && !is.matrix(x)) {
+  if (!is.vector(x) || !is.matrix(x)) {
     x <- as.matrix(x)
   }
   if (is.null(w)) {
     w <- rep.int(1, NROW(x))
   } else {
-    if (!is.double(w)) {
-      w <- as.double(w)
-    }
     x <- x * w  # w is correctly recycled over columns
   }
   denom <- as.numeric(rowsum(w, group = g))
diff --git a/R/utils_input.R b/R/utils_input.R
index 1f5505aa..10c8eb7a 100644
--- a/R/utils_input.R
+++ b/R/utils_input.R
@@ -115,4 +115,3 @@ prepare_y <- function(y, X) {
   }
   list(y = prepare_pred(y), y_names = y_names)
 }
-

From dbca7266e707a74692eb054a8ffa80ae1f893634 Mon Sep 17 00:00:00 2001
From: Michael Mayer
Date: Fri, 5 Jul 2024 21:37:35 +0200
Subject: [PATCH 3/6] Adapt functions that use response

---
 NEWS.md                               |   2 +-
 R/average_loss.R                      |  15 ++--
 R/losses.R                            | 109 +++++++++++++-------------
 R/perm_importance.R                   |  13 +--
 R/utils_input.R                       |  22 ++++--
 man/average_loss.Rd                   |  12 +--
 man/perm_importance.Rd                |  15 ++--
 tests/testthat/test_average_loss.R    |  42 +---------
 tests/testthat/test_input.R           |   6 +-
 tests/testthat/test_losses.R          |  30 +------
 tests/testthat/test_perm_importance.R |  34 +-------
 11 files changed, 102 insertions(+), 198 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 9009049c..2ef616b1 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,6 +1,6 @@
 # hstats 1.2.0
 
-# Major changes
+## Major changes
 
 - Factor-valued predictions are no longer possible.
 - Consequently, also removed "classification_error" loss.
diff --git a/R/average_loss.R b/R/average_loss.R
index 2a08510c..a1e322a3 100644
--- a/R/average_loss.R
+++ b/R/average_loss.R
@@ -22,25 +22,19 @@
 #'   or as discrete vector with corresponding levels.
 #'   The latter case is turned into a dummy matrix by a fast version of
 #'   `model.matrix(~ as.factor(y) + 0)`.
-#' - "classification_error": Misclassification error. Both the
-#'   observed values `y` and the predictions can be character/factor. This
-#'   loss function can be used in non-probabilistic classification settings.
-#'   BUT: Probabilistic classification (with "mlogloss") is clearly preferred in most
-#'   situations.
 #' - A function with signature `f(actual, predicted)`, returning a numeric
 #'   vector or matrix of the same length as the input.
 #'
 #' @inheritParams hstats
 #' @param y Vector/matrix of the response, or the corresponding column names in `X`.
#' @param loss One of "squared_error", "logloss", "mlogloss", "poisson", -#' "gamma", "absolute_error", "classification_error". Alternatively, a loss function +#' "gamma", or "absolute_error". Alternatively, a loss function #' can be provided that turns observed and predicted values into a numeric vector or #' matrix of unit losses of the same length as `X`. #' For "mlogloss", the response `y` can either be a dummy matrix or a discrete vector. #' The latter case is handled via a fast version of `model.matrix(~ as.factor(y) + 0)`. -#' For "classification_error", both predictions and responses can be non-numeric. -#' For "squared_error", both predictions and responses can be factors with identical -#' levels. In this case, squared error is evaulated for each one-hot-encoded column. +#' For "squared_error", the response can be a factor with levels in column order of +#' the predictions. In this case, squared error is evaluated for each one-hot-encoded column. #' @param agg_cols Should multivariate losses be summed up? Default is `FALSE`. #' In combination with the squared error loss, `agg_cols = TRUE` gives #' the Brier score for (probabilistic) classification. @@ -116,7 +110,8 @@ average_loss.default <- function(object, X, y, #' @describeIn average_loss Method for "ranger" models. #' @export average_loss.ranger <- function(object, X, y, - pred_fun = function(m, X, ...) stats::predict(m, X, ...)$predictions, + pred_fun = function(m, X, ...) + stats::predict(m, X, ...)$predictions, loss = "squared_error", agg_cols = FALSE, BY = NULL, by_size = 4L, diff --git a/R/losses.R b/R/losses.R index c6b517fe..b207b5ba 100644 --- a/R/losses.R +++ b/R/losses.R @@ -1,3 +1,25 @@ +#' Input Checks for Losses +#' +#' Internal function with general input checks. +#' +#' @noRd +#' @keywords internal +#' +#' @param actual Actual values. +#' @param predicted Predictions. +#' @returns `TRUE` +check_loss <- function(actual, predicted) { + stopifnot( + is.vector(actual) || is.matrix(actual), + is.vector(predicted) || is.matrix(predicted), + is.numeric(actual), + is.numeric(predicted), + NROW(actual) == NROW(predicted), + NCOL(actual) == 1L || NCOL(actual) == NCOL(predicted) + ) + return(TRUE) +} + #' Squared Error Loss #' #' Internal function. Calculates squared error. @@ -5,14 +27,13 @@ #' @noRd #' @keywords internal #' -#' @param actual A numeric vector/matrix, or factor. -#' @param predicted A numeric vector/matrix, or factor. +#' @param actual A numeric vector or matrix. +#' @param predicted A numeric vector or matrix. #' @returns Vector or matrix of numeric losses. loss_squared_error <- function(actual, predicted) { - actual <- drop(prepare_pred(actual, ohe = TRUE)) - predicted <- prepare_pred(predicted, ohe = TRUE) - - (actual - predicted)^2 + check_loss(actual, predicted) + + return((drop(actual) - predicted)^2) } #' Absolute Error Loss @@ -26,13 +47,9 @@ loss_squared_error <- function(actual, predicted) { #' @param predicted A numeric vector/matrix. #' @returns Vector or matrix of numeric losses. loss_absolute_error <- function(actual, predicted) { - actual <- drop(prepare_pred(actual)) - predicted <- prepare_pred(predicted) - if (is.factor(actual) || is.factor(predicted)) { - stop("Absolute loss does not make sense for factors.") - } - - abs(actual - predicted) + check_loss(actual, predicted) + + return(abs(drop(actual) - predicted)) } #' Poisson Deviance Loss @@ -46,20 +63,18 @@ loss_absolute_error <- function(actual, predicted) { #' @param predicted A numeric vector/matrix with non-negative values. 
#' @returns Vector or matrix of numeric losses. loss_poisson <- function(actual, predicted) { - actual <- drop(prepare_pred(actual)) - predicted <- prepare_pred(predicted) - if (is.factor(actual) || is.factor(predicted)) { - stop("Poisson loss does not make sense for factors.") - } + check_loss(actual, predicted) stopifnot( all(predicted >= 0), all(actual >= 0) ) + actual <- drop(actual) + out <- predicted p <- actual > 0 out[p] <- (actual * log(actual / predicted) - (actual - predicted))[p] - 2 * out + return(2 * out) } #' Gamma Deviance Loss @@ -73,34 +88,15 @@ loss_poisson <- function(actual, predicted) { #' @param predicted A numeric vector/matrix with positive values. #' @returns Vector or matrix of numeric losses. loss_gamma <- function(actual, predicted) { - actual <- drop(prepare_pred(actual)) - predicted <- prepare_pred(predicted) - if (is.factor(actual) || is.factor(predicted)) { - stop("Gamma loss does not make sense for factors.") - } + check_loss(actual, predicted) stopifnot( all(predicted > 0), all(actual > 0) ) - -2 * (log(actual / predicted) - (actual - predicted) / predicted) -} - -#' Classification Error Loss -#' -#' Internal function. Calculates per-row misclassification errors. -#' -#' @noRd -#' @keywords internal -#' -#' @param actual A vector/factor/matrix with discrete values. -#' @param predicted A vector/factor/matrix with discrete values. -#' @returns Vector or matrix of numeric losses. -loss_classification_error <- function(actual, predicted) { - actual <- drop(prepare_pred(actual)) - predicted <- prepare_pred(predicted) + actual <- drop(actual) - (actual != predicted) * 1.0 + return(-2 * (log(actual / predicted) - (actual - predicted) / predicted)) } #' Log Loss @@ -115,11 +111,7 @@ loss_classification_error <- function(actual, predicted) { #' @param predicted A numeric vector/matrix with values between 0 and 1. #' @returns Vector or matrix of numeric losses. loss_logloss <- function(actual, predicted) { - actual <- drop(prepare_pred(actual)) - predicted <- prepare_pred(predicted) - if (is.factor(actual) || is.factor(predicted)) { - stop("Log loss does not make sense for factors.") - } + check_loss(actual, predicted) stopifnot( all(predicted >= 0), all(predicted <= 1), @@ -127,7 +119,9 @@ loss_logloss <- function(actual, predicted) { all(actual <= 1) ) - -xlogy(actual, predicted) - xlogy(1 - actual, 1 - predicted) + actual <- drop(actual) + + return(-xlogy(actual, predicted) - xlogy(1 - actual, 1 - predicted)) } #' Multi-Column Log Loss @@ -139,28 +133,34 @@ loss_logloss <- function(actual, predicted) { #' @keywords internal #' #' @param actual A numeric matrix with values between 0 and 1, or a -#' discrete vector that will be one-hot-encoded by a fast version of -#' `model.matrix(~ as.factor(actual) + 0)`. +#' factor, or a discrete numeric vector that will be one-hot-encoded by a +#' fast version of `model.matrix(~ as.factor(actual) + 0)`. #' The column order of `predicted` must be in line with this! #' @param predicted A numeric matrix with values between 0 and 1. -#' @returns `TRUE` (or an error message). +#' @returns A numeric vector of losses. 
loss_mlogloss <- function(actual, predicted) { - actual <- prepare_pred(actual) - predicted <- prepare_pred(predicted) if (NCOL(actual) == 1L) { # not only for factors actual <- fdummy(actual) } + stopifnot( + is.matrix(actual), is.matrix(predicted), - ncol(predicted) >= 2L, + + is.numeric(actual), + is.numeric(predicted), + + nrow(actual) == nrow(predicted), ncol(actual) == ncol(predicted), + ncol(predicted) >= 2L, + all(predicted >= 0), all(predicted <= 1), all(actual >= 0), all(actual <= 1) ) - unname(-rowSums(xlogy(actual, predicted))) + return(unname(-rowSums(xlogy(actual, predicted)))) } #' Calculates x*log(y) @@ -198,7 +198,6 @@ get_loss_fun <- function(loss) { poisson = loss_poisson, gamma = loss_gamma, absolute_error = loss_absolute_error, - classification_error = loss_classification_error, stop("Unknown loss function.") ) } diff --git a/R/perm_importance.R b/R/perm_importance.R index 3c4d5d99..5df1b02e 100644 --- a/R/perm_importance.R +++ b/R/perm_importance.R @@ -2,7 +2,7 @@ #' #' Calculates permutation importance for a set of features or a set of feature groups. #' By default, importance is calculated for all columns in `X` (except column names -#' used as response `y` or case weight `w`). +#' used as response `y` or as case weight `w`). #' #' The permutation importance of a feature is defined as the increase in the average #' loss when shuffling the corresponding feature values before calculating predictions. @@ -31,7 +31,7 @@ #' # MODEL 1: Linear regression #' fit <- lm(Sepal.Length ~ ., data = iris) #' s <- perm_importance(fit, X = iris, y = "Sepal.Length") - +#' #' s #' s$M #' s$SE # Standard errors are available thanks to repeated shuffling @@ -103,7 +103,7 @@ perm_importance.default <- function(object, X, y, v = NULL, if (nrow(X) > n_max) { ix <- sample(nrow(X), n_max) X <- X[ix, , drop = FALSE] - if (is.vector(y) || is.factor(y)) { + if (is.vector(y)) { y <- y[ix] } else { y <- y[ix, , drop = FALSE] @@ -126,10 +126,10 @@ perm_importance.default <- function(object, X, y, v = NULL, if (m_rep > 1L) { ind <- rep.int(seq_len(n), m_rep) X <- rep_rows(X, ind) - if (is.vector(y) || is.factor(y)) { + if (is.vector(y)) { y <- y[ind] } else { - y <- rep_rows(y, ind) + y <- y[ind, , drop = FALSE] } } @@ -206,7 +206,8 @@ perm_importance.default <- function(object, X, y, v = NULL, #' @describeIn perm_importance Method for "ranger" models. #' @export perm_importance.ranger <- function(object, X, y, v = NULL, - pred_fun = function(m, X, ...) stats::predict(m, X, ...)$predictions, + pred_fun = function(m, X, ...) + stats::predict(m, X, ...)$predictions, loss = "squared_error", m_rep = 4L, agg_cols = FALSE, normalize = FALSE, n_max = 10000L, diff --git a/R/utils_input.R b/R/utils_input.R index 10c8eb7a..d3d162f0 100644 --- a/R/utils_input.R +++ b/R/utils_input.R @@ -11,7 +11,7 @@ prepare_pred <- function(x) { if (is.data.frame(x) && ncol(x) == 1L) { x <- x[[1L]] } - if (!is.vector(x) || !is.matrix(x)) { + if (!is.vector(x) && !is.matrix(x)) { x <- as.matrix(x) } if (!is.numeric(x)) { @@ -95,11 +95,11 @@ prepare_w <- function(w, X) { #' #' @noRd #' @keywords internal -#' @param y Vector/matrix-like of the same length as `X`, or column names in `X`. +#' @param y Vector, factor, or matrix-like of the same length as `X`, +#' or column names in `X`. #' @param X Matrix-like. -#' @param ohe If y is a factor: should it be one-hot encoded? Default is `FALSE`. 
#' -#' @returns A list with "y" (vector, matrix, or factor) and "y_names" (if `y` +#' @returns A list with "y" (vector, matrix) and "y_names" (if `y` #' was passed as column names). prepare_y <- function(y, X) { if (NROW(y) < nrow(X) && all(y %in% colnames(X))) { @@ -110,8 +110,20 @@ prepare_y <- function(y, X) { y <- X[, y] } } else { + if (is.data.frame(y) && ncol(y) == 1L) { + y <- y[[1L]] + } stopifnot(NROW(y) == nrow(X)) y_names <- NULL } - list(y = prepare_pred(y), y_names = y_names) + if (is.factor(y)) { + y <- fdummy(y) + } + if (!is.vector(y) && !is.matrix(y)) { + y <- as.matrix(y) + } + if (!is.numeric(y)) { + stop("Response must be numeric (or factor.)") + } + list(y = y, y_names = y_names) } diff --git a/man/average_loss.Rd b/man/average_loss.Rd index abac4d7a..01af2a54 100644 --- a/man/average_loss.Rd +++ b/man/average_loss.Rd @@ -67,14 +67,13 @@ model) can be passed via \code{...}. The default, \code{\link[stats:predict]{sta most cases.} \item{loss}{One of "squared_error", "logloss", "mlogloss", "poisson", -"gamma", "absolute_error", "classification_error". Alternatively, a loss function +"gamma", or "absolute_error". Alternatively, a loss function can be provided that turns observed and predicted values into a numeric vector or matrix of unit losses of the same length as \code{X}. For "mlogloss", the response \code{y} can either be a dummy matrix or a discrete vector. The latter case is handled via a fast version of \code{model.matrix(~ as.factor(y) + 0)}. -For "classification_error", both predictions and responses can be non-numeric. -For "squared_error", both predictions and responses can be factors with identical -levels. In this case, squared error is evaulated for each one-hot-encoded column.} +For "squared_error", the response can be a factor with levels in column order of +the predictions. In this case, squared error is evaluated for each one-hot-encoded column.} \item{agg_cols}{Should multivariate losses be summed up? Default is \code{FALSE}. In combination with the squared error loss, \code{agg_cols = TRUE} gives @@ -136,11 +135,6 @@ The observed values \code{y} are either passed as (n x K) dummy matrix, or as discrete vector with corresponding levels. The latter case is turned into a dummy matrix by a fast version of \code{model.matrix(~ as.factor(y) + 0)}. -\item "classification_error": Misclassification error. Both the -observed values \code{y} and the predictions can be character/factor. This -loss function can be used in non-probabilistic classification settings. -BUT: Probabilistic classification (with "mlogloss") is clearly preferred in most -situations. \item A function with signature \code{f(actual, predicted)}, returning a numeric vector or matrix of the same length as the input. } diff --git a/man/perm_importance.Rd b/man/perm_importance.Rd index d47df3f9..af27a2a9 100644 --- a/man/perm_importance.Rd +++ b/man/perm_importance.Rd @@ -80,14 +80,13 @@ model) can be passed via \code{...}. The default, \code{\link[stats:predict]{sta most cases.} \item{loss}{One of "squared_error", "logloss", "mlogloss", "poisson", -"gamma", "absolute_error", "classification_error". Alternatively, a loss function +"gamma", or "absolute_error". Alternatively, a loss function can be provided that turns observed and predicted values into a numeric vector or matrix of unit losses of the same length as \code{X}. For "mlogloss", the response \code{y} can either be a dummy matrix or a discrete vector. 
The latter case is handled via a fast version of \code{model.matrix(~ as.factor(y) + 0)}. -For "classification_error", both predictions and responses can be non-numeric. -For "squared_error", both predictions and responses can be factors with identical -levels. In this case, squared error is evaulated for each one-hot-encoded column.} +For "squared_error", the response can be a factor with levels in column order of +the predictions. In this case, squared error is evaluated for each one-hot-encoded column.} \item{m_rep}{Number of permutations (default 4).} @@ -122,7 +121,7 @@ Currently, supported only for \code{\link[=perm_importance]{perm_importance()}}. \description{ Calculates permutation importance for a set of features or a set of feature groups. By default, importance is calculated for all columns in \code{X} (except column names -used as response \code{y} or case weight \code{w}). +used as response \code{y} or as case weight \code{w}). } \details{ The permutation importance of a feature is defined as the increase in the average @@ -161,11 +160,6 @@ The observed values \code{y} are either passed as (n x K) dummy matrix, or as discrete vector with corresponding levels. The latter case is turned into a dummy matrix by a fast version of \code{model.matrix(~ as.factor(y) + 0)}. -\item "classification_error": Misclassification error. Both the -observed values \code{y} and the predictions can be character/factor. This -loss function can be used in non-probabilistic classification settings. -BUT: Probabilistic classification (with "mlogloss") is clearly preferred in most -situations. \item A function with signature \code{f(actual, predicted)}, returning a numeric vector or matrix of the same length as the input. } @@ -175,6 +169,7 @@ vector or matrix of the same length as the input. 
# MODEL 1: Linear regression fit <- lm(Sepal.Length ~ ., data = iris) s <- perm_importance(fit, X = iris, y = "Sepal.Length") + s s$M s$SE # Standard errors are available thanks to repeated shuffling diff --git a/tests/testthat/test_average_loss.R b/tests/testthat/test_average_loss.R index b7d58998..a6a7786d 100644 --- a/tests/testthat/test_average_loss.R +++ b/tests/testthat/test_average_loss.R @@ -219,10 +219,10 @@ test_that("average_loss() works with weights and grouped (multi)", { expect_equal(s3, s4) }) -test_that("mlogloss works with either matrix y or vector y", { +test_that("mlogloss works with either matrix y or factor y", { pf <- function(m, X) cbind(1 - (0.1 + 0.7 * (X == 1)), 0.1 + 0.7 * (X == 1)) X <- cbind(z = c(1, 0, 0, 1, 1)) - y <- c("B", "A", "B", "B", "B") + y <- factor(c("B", "A", "B", "B", "B")) Y <- model.matrix(~ y + 0) s1 <- average_loss(NULL, X = X, y = Y, loss = "mlogloss", pred_fun = pf) s2 <- average_loss(NULL, X = X, y = y, loss = "mlogloss", pred_fun = pf) @@ -230,7 +230,7 @@ test_that("mlogloss works with either matrix y or vector y", { }) test_that("loss_mlogloss() is in line with loss_logloss() in binary case", { - y <- iris$Species == "setosa" + y <- (iris$Species == "setosa") * 1 Y <- cbind(no = 1 - y, yes = y) fit <- glm(y ~ Sepal.Length, data = iris, family = binomial()) pf <- function(m, X, multi = FALSE) { @@ -245,42 +245,6 @@ test_that("loss_mlogloss() is in line with loss_logloss() in binary case", { ) }) -test_that("classification_error works on factors", { - pf <- function(m, X) rep("setosa", times = nrow(X)) - expect_error(average_loss(1, X = iris, y = iris$Species, pred_fun = pf)) - expect_equal( - c(average_loss( - 1, X = iris, y = iris$Species, pred_fun = pf, loss = "classification_error" - )$M), - 2/3 - ) - expect_equal( - c(average_loss( - 1, - X = iris, - y = iris$Species, - pred_fun = pf, - loss = "classification_error", - BY = iris$Species - )$M), - c(0, 1, 1) - ) -}) - -test_that("factor predictions work as well (squared error)", { - expect_equal( - average_loss( - 1, - X = iris, - y = iris$Species, - pred_fun = function(m, X) - factor(rep("setosa", times = nrow(X)), levels = levels(iris$Species)), - agg_cols = TRUE - )$M, - matrix((0 + 2 + 2) / 3) - ) -}) - test_that("average_loss() works with missing values", { X <- data.frame(x1 = 1:6, x2 = c(NA, 1, 2, 1, 1, 3), x3 = factor(c("A", NA, NA, "B", "A", "A"))) y <- 1:6 diff --git a/tests/testthat/test_input.R b/tests/testthat/test_input.R index 274e4b55..f8a19102 100644 --- a/tests/testthat/test_input.R +++ b/tests/testthat/test_input.R @@ -1,6 +1,6 @@ test_that("prepare_pred() works", { expect_equal(prepare_pred(iris[1:4]), data.matrix(iris[1:4])) - expect_equal(prepare_pred(iris["Species"]), iris$Species) + expect_error(prepare_pred(iris["Species"])) expect_equal(prepare_pred(iris$Sepal.Width), iris$Sepal.Width) expect_equal(prepare_pred(iris["Sepal.Width"]), iris$Sepal.Width) }) @@ -24,7 +24,7 @@ test_that("prepare_by() works", { test_that("prepare_y() works", { # "Vector" interface expect_equal(prepare_y(iris[1:4], X = iris)$y, data.matrix(iris[1:4])) - expect_equal(prepare_y(iris["Species"], X = iris)$y, iris$Species) + expect_equal(prepare_y(iris["Species"], X = iris)$y, fdummy(iris$Species)) expect_equal(prepare_y(iris$Sepal.Width, X = iris)$y, iris$Sepal.Width) expect_equal(prepare_y(iris["Sepal.Width"], X = iris)$y, iris$Sepal.Width) @@ -35,7 +35,7 @@ test_that("prepare_y() works", { expect_equal(out$y_names, cn) out <- prepare_y("Species", X = iris) - expect_equal(out$y, 
iris$Species) + expect_equal(out$y, fdummy(iris$Species)) expect_equal(out$y_names, "Species") out <- prepare_y("Sepal.Width", X = iris) diff --git a/tests/testthat/test_losses.R b/tests/testthat/test_losses.R index e7bf3c23..72ca51c9 100644 --- a/tests/testthat/test_losses.R +++ b/tests/testthat/test_losses.R @@ -54,7 +54,8 @@ test_that("losses are identical to stats package for specific case (multivariate expect_equal(loss_gamma(y, p), Gamma()$dev.resid(y2, p, 1)) y <- 0 - y2 <- replicate(2, y) + y2 <- cbind(y, y) + colnames(y2) <- NULL p <- rbind(replicate(2, 0.1)) expect_equal(loss_poisson(y2, p), poisson()$dev.resid(y2, p, 1)) @@ -92,26 +93,6 @@ test_that("loss_absolute_error() works for specific case (multivariate)", { expect_equal(loss_absolute_error(y, p), abs(y2 - p)) }) -test_that("loss_classification_error() works for specific case", { - y <- iris$Species - p <- rev(iris$Species) - expect_equal(loss_classification_error(y, p), 0 + (y != p)) - - # Single input - y <- y[1L] - p <- rev(iris$Species)[1L] - expect_equal(loss_classification_error(y, p), 0 + (y != p)) -}) - -test_that("loss_classification_error() works for specific case (multivariate)", { - y <- iris$Species - y2 <- replicate(2L, y) - p <- y2[nrow(y2):1, ] - colnames(p) <- c("a", "b") - expect_equal(loss_classification_error(y2, p), 0 + (y2 != p)) - expect_equal(loss_classification_error(y, p), 0 + (y2 != p)) -}) - test_that("loss_mlogloss() is in line with loss_logloss() in binary case", { y <- c(0, 0, 0, 1, 1) pred <- c(0, 0.1, 0.2, 1, 0.9) @@ -135,13 +116,6 @@ test_that("loss_mlogloss() either understands matrix responses or factors", { expect_equal(loss_mlogloss(Y, pred), loss_mlogloss(y, pred)) }) -test_that("squared error can be calculated on factors", { - expect_equal( - colSums(loss_squared_error(iris$Species, rev(iris$Species))), - c(setosa = 100, versicolor = 0, virginica = 100) - ) -}) - test_that("Some errors are thrown", { expect_error(loss_poisson(-1:1, 1:2)) expect_error(loss_poisson(0:1, -1:0)) diff --git a/tests/testthat/test_perm_importance.R b/tests/testthat/test_perm_importance.R index 6fe356bb..3cd45085 100644 --- a/tests/testthat/test_perm_importance.R +++ b/tests/testthat/test_perm_importance.R @@ -155,36 +155,6 @@ test_that("matrix case works as well", { expect_equal(s2$M["Petal.Length", ], s3$M["Petal.Length", ]) }) -test_that("non-numeric predictions can work as well (classification error)", { - expect_equal( - c(perm_importance( - 1, - v = "Sepal.Length", - X = iris, - y = iris$Species, - pred_fun = function(m, X) rep("setosa", times = nrow(X)), - loss = "classification_error", - verbose = FALSE - )$M), - 0 - ) -}) - -test_that("factor predictions can work as well (squared error)", { - expect_equal( - perm_importance( - 1, - v = "Sepal.Length", - X = iris, - y = iris$Species, - pred_fun = function(m, X) - factor(rep("setosa", times = nrow(X)), levels = levels(iris$Species)), - verbose = FALSE - )$M, - rbind(Sepal.Length = c(setosa = 0, versicolor = 0, virginica = 0)) - ) -}) - #================================================ # Multivariate model #================================================ @@ -356,7 +326,7 @@ test_that("groups of variables work as well", { test_that("mlogloss works with either matrix y or vector y", { pred_fun <- function(m, X) cbind(1 - (0.1 + 0.7 * (X == 1)), 0.1 + 0.7 * (X == 1)) X <- cbind(z = c(1, 0, 0, 1, 1)) - y <- c("B", "A", "B", "B", "B") + y <- c(2, 1, 2, 2, 2) Y <- model.matrix(~ y + 0) set.seed(1L) s1 <- perm_importance( @@ -389,7 +359,7 @@ 
test_that("Single output multiple models works without recycling y", { }) test_that("loss_mlogloss() is in line with loss_logloss() in binary case", { - y <- iris$Species == "setosa" + y <- (iris$Species == "setosa") * 1 Y <- cbind(no = 1 - y, yes = y) fit <- glm(y ~ Sepal.Length, data = iris, family = binomial()) pf <- function(m, X, multi = FALSE) { From 79601884382825e29ca591123711fee64b58fa0d Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Fri, 5 Jul 2024 21:52:01 +0200 Subject: [PATCH 4/6] Update README --- README.md | 35 ++--------------------------------- 1 file changed, 2 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 342ed982..0da5a4a7 100644 --- a/README.md +++ b/README.md @@ -369,33 +369,6 @@ plot(H, normalize = FALSE, squared = FALSE, facet_scales = "free_y", ncol = 1) ![](man/figures/xgboost.svg) -### Non-probabilistic classification - -When predictions are factor levels, {hstats} uses internal one-hot-encoding. Usually, probabilistic classification makes more sense though. - -```r -library(ranger) - -set.seed(1) - -fit <- ranger(Species ~ ., data = train) -average_loss(fit, X = valid, y = "Species", loss = "classification_error") # 0 - -perm_importance(fit, X = valid, y = "Species", loss = "classification_error") -# Petal.Width Petal.Length Sepal.Length Sepal.Width -# 0.350 0.275 0.000 0.000 - -(s <- hstats(fit, X = train[, -5])) -# H^2 (normalized) -# setosa versicolor virginica -# 0.09885417 0.46151300 0.23238800 - -plot(s, normalize = FALSE, squared = FALSE) - -partial_dep(fit, v = "Petal.Length", X = train, BY = "Petal.Width") |> - plot(show_points = FALSE) -``` - ## Meta-learning packages Here, we provide examples for {tidymodels}, {caret}, and {mlr3}. @@ -469,18 +442,14 @@ set.seed(1) task_iris <- TaskClassif$new(id = "class", backend = iris, target = "Species") fit_rf <- lrn("classif.ranger", predict_type = "prob") fit_rf$train(task_iris) -s <- hstats(fit_rf, X = iris[, -5]) +s <- hstats(fit_rf, X = iris[, -5], predict_type = "prob") plot(s) -# Permutation importance (probabilistic using multi-logloss) +# Permutation importance (wrt multi-logloss) p <- perm_importance( fit_rf, X = iris, y = "Species", loss = "mlogloss", predict_type = "prob" ) plot(p) - -# Non-probabilistic using classification error -perm_importance(fit_rf, X = iris, y = "Species", loss = "classification_error") |> - plot() ``` ## Background From f003dc3ab9593a558dab24729eaa89b3141db0a1 Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Sat, 6 Jul 2024 18:11:41 +0200 Subject: [PATCH 5/6] Delegate OHE of response to losses --- NEWS.md | 4 ++ R/losses.R | 18 +++--- R/perm_importance.R | 8 +-- R/utils_input.R | 9 +-- backlog/benchmark.R | 80 ++++++++++++++------------- backlog/colMeans_factors.R | 27 --------- tests/testthat/test_average_loss.R | 2 +- tests/testthat/test_input.R | 4 +- tests/testthat/test_perm_importance.R | 2 +- 9 files changed, 68 insertions(+), 86 deletions(-) delete mode 100644 backlog/colMeans_factors.R diff --git a/NEWS.md b/NEWS.md index 2ef616b1..7465a113 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,10 @@ - Factor-valued predictions are no longer possible. - Consequently, also removed "classification_error" loss. +## Minor changes + +- Code simplifications. 
+ # hstats 1.1.2 ## ICE plots diff --git a/R/losses.R b/R/losses.R index b207b5ba..f9b40de1 100644 --- a/R/losses.R +++ b/R/losses.R @@ -12,8 +12,8 @@ check_loss <- function(actual, predicted) { stopifnot( is.vector(actual) || is.matrix(actual), is.vector(predicted) || is.matrix(predicted), - is.numeric(actual), - is.numeric(predicted), + is.numeric(actual) || is.logical(actual), + is.numeric(predicted) || is.logical(predicted), NROW(actual) == NROW(predicted), NCOL(actual) == 1L || NCOL(actual) == NCOL(predicted) ) @@ -27,10 +27,14 @@ check_loss <- function(actual, predicted) { #' @noRd #' @keywords internal #' -#' @param actual A numeric vector or matrix. +#' @param actual A numeric vector or matrix, or a factor with levels in the same order +#' as the column names of `predicted`. #' @param predicted A numeric vector or matrix. #' @returns Vector or matrix of numeric losses. loss_squared_error <- function(actual, predicted) { + if (is.factor(actual)) { + actual <- fdummy(actual) + } check_loss(actual, predicted) return((drop(actual) - predicted)^2) @@ -147,11 +151,10 @@ loss_mlogloss <- function(actual, predicted) { is.matrix(actual), is.matrix(predicted), - is.numeric(actual), - is.numeric(predicted), + is.numeric(actual) || is.logical(actual), + is.numeric(predicted) || is.logical(predicted), - nrow(actual) == nrow(predicted), - ncol(actual) == ncol(predicted), + dim(actual) == dim(predicted), ncol(predicted) >= 2L, all(predicted >= 0), @@ -176,6 +179,7 @@ loss_mlogloss <- function(actual, predicted) { xlogy <- function(x, y) { out <- x * log(y) out[x == 0] <- 0 + return(out) } diff --git a/R/perm_importance.R b/R/perm_importance.R index 5df1b02e..20fdcfb5 100644 --- a/R/perm_importance.R +++ b/R/perm_importance.R @@ -103,9 +103,9 @@ perm_importance.default <- function(object, X, y, v = NULL, if (nrow(X) > n_max) { ix <- sample(nrow(X), n_max) X <- X[ix, , drop = FALSE] - if (is.vector(y)) { + if (is.vector(y) || is.factor(y)) { y <- y[ix] - } else { + } else { # matrix case y <- y[ix, , drop = FALSE] } if (!is.null(w)) { @@ -126,9 +126,9 @@ perm_importance.default <- function(object, X, y, v = NULL, if (m_rep > 1L) { ind <- rep.int(seq_len(n), m_rep) X <- rep_rows(X, ind) - if (is.vector(y)) { + if (is.vector(y) || is.factor(y)) { y <- y[ind] - } else { + } else { # matrix case y <- y[ind, , drop = FALSE] } } diff --git a/R/utils_input.R b/R/utils_input.R index d3d162f0..d0845e36 100644 --- a/R/utils_input.R +++ b/R/utils_input.R @@ -14,7 +14,7 @@ prepare_pred <- function(x) { if (!is.vector(x) && !is.matrix(x)) { x <- as.matrix(x) } - if (!is.numeric(x)) { + if (!is.numeric(x) && !is.logical(x)) { stop("Predictions must be numeric!") } return(x) @@ -116,13 +116,10 @@ prepare_y <- function(y, X) { stopifnot(NROW(y) == nrow(X)) y_names <- NULL } - if (is.factor(y)) { - y <- fdummy(y) - } - if (!is.vector(y) && !is.matrix(y)) { + if (!is.vector(y) && !is.matrix(y) && !is.factor(y)) { y <- as.matrix(y) } - if (!is.numeric(y)) { + if (!is.numeric(y) && !is.logical(y) && !is.factor(y)) { stop("Response must be numeric (or factor.)") } list(y = y, y_names = y_names) diff --git a/backlog/benchmark.R b/backlog/benchmark.R index c367b679..3923a1c9 100644 --- a/backlog/benchmark.R +++ b/backlog/benchmark.R @@ -115,12 +115,11 @@ bench::mark( check = FALSE, min_iterations = 3 ) - -# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time -# 1 iml 1.72s 1.75s 0.574 210.6MB 1.34 3 7 5.23s -# 2 dalex 744.82ms 760.02ms 1.31 35.2MB 0.877 3 2 2.28s -# 3 flashlight 1.29s 1.35s 0.742 
63MB 0.990 3 4 4.04s -# 4 hstats 407.26ms 412.31ms 2.43 26.5MB 0 3 0 1.23s +# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result +# 1 iml 1.76s 1.76s 0.565 211.6MB 3.39 3 18 5.31s +# 2 dalex 688.54ms 697.71ms 1.44 35.2MB 1.91 3 4 2.09s +# 3 flashlight 667.51ms 676.07ms 1.47 28.1MB 1.96 3 4 2.04s +# 4 hstats 392.15ms 414.41ms 2.39 26.6MB 0.796 3 1 1.26s # Partial dependence (cont) v <- "tot_lvg_area" @@ -132,12 +131,12 @@ bench::mark( check = FALSE, min_iterations = 3 ) -# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time -# 1 iml 1.14s 1.16s 0.861 376.7MB 3.73 3 13 3.48s -# 2 dalex 653.24ms 654.51ms 1.35 192.8MB 2.24 3 5 2.23s -# 3 flashlight 352.34ms 361.79ms 2.72 66.7MB 0.906 3 1 1.1s -# 4 hstats 239.03ms 242.79ms 4.04 14.2MB 1.35 3 1 743.43ms - +# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result +# +# 1 iml 1.2s 1.4s 0.726 376.9MB 4.12 3 17 4.13s +# 2 dalex 759.3ms 760.6ms 1.28 192.8MB 2.55 3 6 2.35s +# 3 flashlight 369.1ms 403.1ms 2.55 66.8MB 2.55 3 3 1.18s +# 4 hstats 242.1ms 243.8ms 4.03 14.2MB 0 3 0 744.25ms # # Partial dependence (discrete) v <- "structure_quality" bench::mark( @@ -148,30 +147,31 @@ bench::mark( check = FALSE, min_iterations = 3 ) -# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time -# 1 iml 100.6ms 103.6ms 9.46 13.34MB 0 5 0 529ms -# 2 dalex 172.4ms 177.9ms 5.62 20.55MB 2.81 2 1 356ms -# 3 flashlight 43.5ms 45.5ms 21.9 6.36MB 2.19 10 1 457ms -# 4 hstats 25.3ms 25.8ms 37.9 1.54MB 2.10 18 1 475ms - +# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result +# +# 1 iml 107.9ms 108ms 9.26 13.64MB 9.26 2 2 216ms +# 2 dalex 172ms 172.2ms 5.81 21.14MB 2.90 2 1 344ms +# 3 flashlight 40.3ms 41.6ms 23.8 8.61MB 2.16 11 1 462ms +# 4 hstats 24.5ms 25.9ms 35.5 1.64MB 0 18 0 507ms + # H-Stats -> we use a subset of 500 rows X_v500 <- X_valid[1:500, ] mod500 <- Predictor$new(fit, data = as.data.frame(X_v500), predict.function = predf) fl500 <- flashlight(fl, data = as.data.frame(valid[1:500, ])) -# iml # 225s total, using slow exact calculations -system.time( # 90s +# iml # 243s total, using slow exact calculations +system.time( # 110s iml_overall <- Interaction$new(mod500, grid.size = 500) ) -system.time( # 135s for all combinations of latitude +system.time( # 133s for all combinations of latitude iml_pairwise <- Interaction$new(mod500, grid.size = 500, feature = "latitude") ) -# flashlight: 13s total, doing only one pairwise calculation, otherwise would take 63s -system.time( # 11.5s +# flashlight: 14s total, doing only one pairwise calculation, otherwise would take 63s +system.time( # 11.7s fl_overall <- light_interaction(fl500, v = x, grid_size = Inf, n_max = Inf) ) -system.time( # 2.4s +system.time( # 2.3s fl_pairwise <- light_interaction( fl500, v = coord, grid_size = Inf, n_max = Inf, pairwise = TRUE ) @@ -185,34 +185,38 @@ system.time({ } ) -# Using 50 quantiles to approximate dense numerics: 0.9s +# Using 50 quantiles to approximate dense numerics: 0.8s system.time( H_approx <- hstats(fit, v = x, X = X_v500, n_max = Inf, approx = TRUE) ) # Overall statistics correspond exactly -iml_overall$results |> filter(.interaction > 1e-6) +iml_overall$results |> + filter(.interaction > 1e-6) # .feature .interaction -# 1: latitude 0.2791144 -# 2: longitude 0.2791144 +# 1: latitude 0.2458269 +# 2: longitude 0.2458269 -fl_overall$data |> subset(value > 0, select = c(variable, value)) -# variable value -# 1 latitude 0.279 -# 2 longitude 0.279 +fl_overall$data 
|> + subset(value_ > 0, select = c(variable_, value_)) +# variable_ value_ +# 3 latitude 0.2458269 +# 4 longitude 0.2458269 hstats_overall # longitude latitude -# 0.2791144 0.2791144 +# 0.2458269 0.2458269 # Pairwise results match as well -iml_pairwise$results |> filter(.interaction > 1e-6) +iml_pairwise$results |> + filter(.interaction > 1e-6) # .feature .interaction -# 1: longitude:latitude 0.4339574 +# 1: longitude:latitude 0.3942526 -fl_pairwise$data |> subset(value > 0, select = c(variable, value)) -# latitude:longitude 0.434 +fl_pairwise$data |> + subset(value_ > 0, select = c(variable_, value_)) +# latitude:longitude 0.3942526 hstats_pairwise # latitude:longitude -# 0.4339574 \ No newline at end of file +# 0.3942526 \ No newline at end of file diff --git a/backlog/colMeans_factors.R b/backlog/colMeans_factors.R deleted file mode 100644 index adcc54c5..00000000 --- a/backlog/colMeans_factors.R +++ /dev/null @@ -1,27 +0,0 @@ -#' gcolMeans() for Factors -#' -#' Grouped version of `colMeans_factor()`. -#' -#' @noRd -#' @keywords internal -#' -#' @params x Factor. -#' @returns Named vector. -gcolMeans_factor <- function(x, g = NULL) { - if (is.null(g)) { - colMeans_factor(x) - } - x <- as.factor(x) - out <- t.default(sapply(split.default(x, g), colMeans_factor)) - colnames(out) <- levels(x) - out -} - -wrowmean_matrix2 <- function(x, ngroups = 1L, w = NULL) { - if (!is.matrix(x)) { - stop("x must be a matrix.") - } - dim(x) <- c(nrow(x)/ngroups, ngroups, ncol(x)) - out <- colMeans(aperm(x, c(1, 3, 2))) - t.default(out) -} diff --git a/tests/testthat/test_average_loss.R b/tests/testthat/test_average_loss.R index a6a7786d..fd8d1c6f 100644 --- a/tests/testthat/test_average_loss.R +++ b/tests/testthat/test_average_loss.R @@ -230,7 +230,7 @@ test_that("mlogloss works with either matrix y or factor y", { }) test_that("loss_mlogloss() is in line with loss_logloss() in binary case", { - y <- (iris$Species == "setosa") * 1 + y <- (iris$Species == "setosa") Y <- cbind(no = 1 - y, yes = y) fit <- glm(y ~ Sepal.Length, data = iris, family = binomial()) pf <- function(m, X, multi = FALSE) { diff --git a/tests/testthat/test_input.R b/tests/testthat/test_input.R index f8a19102..3bb5616e 100644 --- a/tests/testthat/test_input.R +++ b/tests/testthat/test_input.R @@ -24,7 +24,7 @@ test_that("prepare_by() works", { test_that("prepare_y() works", { # "Vector" interface expect_equal(prepare_y(iris[1:4], X = iris)$y, data.matrix(iris[1:4])) - expect_equal(prepare_y(iris["Species"], X = iris)$y, fdummy(iris$Species)) + expect_equal(prepare_y(iris["Species"], X = iris)$y, iris$Species) expect_equal(prepare_y(iris$Sepal.Width, X = iris)$y, iris$Sepal.Width) expect_equal(prepare_y(iris["Sepal.Width"], X = iris)$y, iris$Sepal.Width) @@ -35,7 +35,7 @@ test_that("prepare_y() works", { expect_equal(out$y_names, cn) out <- prepare_y("Species", X = iris) - expect_equal(out$y, fdummy(iris$Species)) + expect_equal(out$y, iris$Species) expect_equal(out$y_names, "Species") out <- prepare_y("Sepal.Width", X = iris) diff --git a/tests/testthat/test_perm_importance.R b/tests/testthat/test_perm_importance.R index 3cd45085..15247a83 100644 --- a/tests/testthat/test_perm_importance.R +++ b/tests/testthat/test_perm_importance.R @@ -359,7 +359,7 @@ test_that("Single output multiple models works without recycling y", { }) test_that("loss_mlogloss() is in line with loss_logloss() in binary case", { - y <- (iris$Species == "setosa") * 1 + y <- (iris$Species == "setosa") Y <- cbind(no = 1 - y, yes = y) fit <- glm(y ~ Sepal.Length, 
data = iris, family = binomial()) pf <- function(m, X, multi = FALSE) { From 2278089e432b1e00ffd00eed8253fc0830fc5dfa Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Sun, 7 Jul 2024 10:00:46 +0200 Subject: [PATCH 6/6] Review of PR --- R/utils_calculate.R | 2 +- R/utils_input.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/utils_calculate.R b/R/utils_calculate.R index a1f5620c..8ee58827 100644 --- a/R/utils_calculate.R +++ b/R/utils_calculate.R @@ -151,7 +151,7 @@ gwColMeans <- function(x, g = NULL, w = NULL) { } # Now the interesting case - if (!is.vector(x) || !is.matrix(x)) { + if (!is.vector(x) && !is.matrix(x)) { x <- as.matrix(x) } if (is.null(w)) { diff --git a/R/utils_input.R b/R/utils_input.R index d0845e36..f46257c6 100644 --- a/R/utils_input.R +++ b/R/utils_input.R @@ -99,7 +99,7 @@ prepare_w <- function(w, X) { #' or column names in `X`. #' @param X Matrix-like. #' -#' @returns A list with "y" (vector, matrix) and "y_names" (if `y` +#' @returns A list with "y" (vector, matrix, or factor) and "y_names" (if `y` #' was passed as column names). prepare_y <- function(y, X) { if (NROW(y) < nrow(X) && all(y %in% colnames(X))) {