-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpsy-data-test.Rmd
139 lines (106 loc) · 3.86 KB
/
psy-data-test.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
---
output:
  pdf_document:
    fig_caption: true
    fig_width: 6
    fig_height: 4
params:
  date: !r Sys.Date()
  # target.label: "PERF.all"
  target.label: "PERF09"
  # features.set: "big5items"
  features.set: "big5composites"
  split.ratio: 0.80
  cv.repeats: 100
  # cv.repeats: 10
  impute.method: "noimpute"
  # impute.method: "medianImpute"
title: "Job Performance Analysis with
  target = `r params$target.label` and features set = `r params$features.set`"
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE)
# knitr::opts_knit$set(global.par = TRUE)
options(digits = 3)
# devtools::install_github("agilebean/machinelearningtools", force = TRUE)
# unloadNamespace("machinelearningtools")

# Attach every required package in one pass.
libraries <- c(
  "dplyr", "magrittr", "tidyverse",
  "sjlabelled",  # read SPSS
  "caret", "doParallel",
  "DataExplorer", "RColorBrewer",
  "machinelearningtools",
  "knitr", "pander"
)
sapply(libraries, require, character.only = TRUE)

# Choose the dataset file matching the requested imputation strategy.
dataset.label <- if (params$impute.method == "noimpute") {
  "data/dataset.rds"
} else {
  "data/dataset.NA.rds"
}
print(dataset.label)

# nominal <- FALSE # with ordinal as ORDERED factors
nominal <- TRUE # with ordinal as NOMINAL factor
seed <- 17
get_features <- function(target_label, features_set, data_set) {
  # Return the feature (column) names of data_set for a given features set,
  # excluding the target column and all outcome columns ("TO*", "PERF*").
  #
  # target_label: character scalar, name of the target column to drop.
  # features_set: "big5items" drops the Big5 composite scores,
  #               "big5composites" drops the Big5 item columns,
  #               any other value keeps all remaining columns.
  # data_set:     data.frame holding both features and outcomes.
  #
  # Returns: character vector of feature names (also printed via %T>%).
  data_set %>%
    # all_of() makes the character variable an explicit, unambiguous column
    # selection (bare external vectors in select() are deprecated tidyselect)
    select(-all_of(target_label),
           -starts_with("TO"),
           -starts_with("PERF")
    ) %>%
    {
      if (features_set == "big5items") {
        # remove composite scores - equivalent to (-nn, -ee, -oo, -aa, -cc)
        select(., -matches("(oo|cc|ee|aa|nn)$"))
      } else if (features_set == "big5composites") {
        # remove Big5 items (any column name containing an item number 1-6)
        select(., -matches(".*(1|2|3|4|5|6)"))
      } else { . }
    } %>%
    names %T>% print
}
```
```{r get model, cache=TRUE, include=TRUE}
# Echo the active report parameters.
print(params$target.label)
print(params$features.set)

# Assemble the dot-separated file name of the cached models list, e.g.
# "data/models.list.PERF09.big5composites.100repeats.noimpute.rds".
models.list.name <- paste(
  "data/models.list",
  params$target.label,
  params$features.set,
  paste0(params$cv.repeats, "repeats"),
  params$impute.method,
  "rds",
  sep = "."
) %T>% print

models.list <- readRDS(models.list.name)

# Read the resampling configuration off the first trained model.
training_configuration_number <- models.list[[1]]$control$number
training_configuration_repeats <- models.list[[1]]$control$repeats
```
Training the model on the training set with `r training_configuration_number`-fold cross-validation, repeated `r training_configuration_repeats` times, yields the following results. Linear regression as benchmark reference is denoted as "lm".
```{r train the model,fig.width=7, fig.height=5, cache=TRUE, echo=FALSE}
########################################
########################################
# get model metrics
# Setting a named element to NULL in purrr::list_modify() removes it, so
# these six models are excluded before computing the metrics.
# NOTE(review): get_model_metrics() comes from machinelearningtools —
# presumably it returns the metric tables and boxplots used below; verify.
models.metrics <- models.list %>%
list_modify(
glmnet = NULL,
kknn = NULL,
xgbTree = NULL,
xgbLinear = NULL,
svmLinear = NULL,
ranger = NULL) %>%
get_model_metrics()
# display Rsquared trainingset performance - table
models.metrics$metric2.training %>% kable(caption = "training set performance: Rsquared")
# display Rsquared trainingset performance - boxplots
models.metrics$metric2.resamples.boxplots
# display RMSE trainingset performance - table
models.metrics$metric1.training %>% kable(caption = "training set performance: RMSE")
# display RMSE trainingset performance - boxplots
models.metrics$metric1.resamples.boxplots
# models.metrics$RMSE.training %>% filter(str_detect(model, "^lm$"))
# Row index of the "lm" benchmark within the RMSE training table.
rank.lm.training <- which(models.metrics$metric1.training$model == "lm")
```
```{r testing set benchmark all}
# Compare training vs. testing set RMSE across all models.
kable(models.metrics$benchmark.all,
      caption = "training vs. testing set performance: RMSE")
```
```{r}
# RMSE resampling boxplots rendered again as a standalone figure.
models.metrics[["metric1.resamples.boxplots"]]
```