-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path.Rhistory
197 lines (197 loc) · 6.14 KB
/
.Rhistory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
g <- matrix(
c(2, 1, 2, 1, 4, 3, 2, 4, 2)
nrows = 3,
ncol = 2)
g <- matrix(
c(2, 1, 2, 1, 4, 3, 2, 4, 2),
nrows = 3,
ncol = 2)
g <- matrix(
c(2, 1, 2, 1, 4, 3, 2, 4, 2),
nrow = 3,
ncol = 3)
h <- as.matrix(g)
h <- matrix(
c(3, -1, 2, -1, 4, 3, 2, 3, -1),
nrow = 3,
ncol = 3)
i <- matrix(
c(-3, 1, 0, 0, 4, 1, 2, 1, 2),
nrow = 3,
ncol = 3
)
j <- T(g)
j <- t(g)
j <- t(g)
j <- t(g)
j
print(g)
print(j)
g <- matrix(
c(2, 1, 2, 1, 4, 3, 2, 4, 2),
nrow = 3)
print(g)
g <- matrix(
c(2, 1, 2, 1, 4, 4, 2, 3, 2),
nrow = 3,
ncol = 3)
print(g)
h <- matrix(
c(3, -1, 2, -1, 4, 3, 2, 3, -1),
nrow = 3,
ncol = 3)
i <- matrix(
c(-3, 0, 2, 1, 4, 1, 0, 1, 2),
nrow = 3,
ncol = 3)
r <- matrix(
c(3,1,2,5)
nrow = 2,
ncol = 2)
r <- matrix(
c(3,1,2,5),
nrow = 2,
ncol = 2)
s <- inv(r)
library(matlib)
solve()
solve()?
s <- solve(r)
r <- matrix(
c(3,1,2,5),
nrow = 2,
ncol = 2)
r <- matrix(
c(3,1,2,5),
nrow = 2,
ncol = 2)
s <- solve(r)
s
det(s)
s <- solve(r)
r <- matrix(
c(3,1,2,5),
nrow = 2,
ncol = 2)
s <- solve(r)
det(s)
tuesdata <- tidytuesdayR::tt_load('2020-02-11')
hotels <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-11/hotels.csv')
tuesdata <- tidytuesdayR::tt_load('2020-02-11')
install.packages(c("klaR", "lattice", "qvcalc", "stringi", "vctrs"))
install.packages(c("backports", "boot", "broom", "callr", "caret", "class", "classInt", "cli", "crosstalk", "dbplyr", "digest", "dplyr", "ellipsis", "fable", "fabletools", "forcats", "foreach", "forecast", "fs", "gender", "ggfortify", "ggplot2", "ggrepel", "glue", "gplots", "gtools", "haven", "igraph", "janitor", "labelled", "lattice", "lava", "lifecycle", "lme4", "lsa", "lubridate", "manipulateWidget", "maptools", "MASS", "matrixStats", "ModelMetrics", "modelr", "modeltools", "nloptr", "openxlsx", "pillar", "pkgbuild", "plyr", "pROC", "proxy", "ps", "purrr", "qdapTools", "quantmod", "quantreg", "Rcpp", "RcppArmadillo", "RcppParallel", "RCurl", "recipes", "reshape2", "reticulate", "rlang", "scales", "shiny", "SnowballC", "sp", "spacyr", "SQUAREM", "survey", "survival", "TeachingDemos", "testthat", "tibble", "tidyr", "tidyselect", "tinytex", "vctrs", "withr", "xfun", "xml2", "zoo"))
# Working Directory
wd <- paste("~/Dev/DS_Projects/is_analytical")
setwd(wd)
# Load libraries
library(pastecs) # For descriptive stats
library(ggplot2) # For plotting
library(dplyr) # For data manipulation
library(caret) # For varImp, confusionMatrix
library(gridExtra) # For printing/arranging multiple plots
library(stargazer) # For saving model summaries as jpeg / html files
library(tidyr) # To reshape the data to different format
library(MASS) # For linear/quadratic discriminant analysis
library(klaR) # For simple LDA plot
library(scales) # For tick label precision
work_styles_df <- read.csv("assets/Work_Styles.csv")
# Read in total_df only after Data Engineering has been completed
total_df <- read.csv("assets/total_df.csv", row.names=NULL, header=T)
# If necessary, you can load total_df here
total_df <- read.csv("assets/total_df.csv", row.names=NULL, header=T)
summ <- stat.desc(total_df)
summ # Variables have different scales
# check_accuracy()
# Function takes either the discretized results or the probabilities as model
# outputs; in the case of the latter, it uses a cutoff of 0.50 to determine
# whether the occupation is analytical or not
check_accuracy <- function(df){
count<- 0
i <- 0
if(class(df$yhat) == 'factor'){
for(i in 1:length(df$y)){
if(df[i, "yhat"] == df[i, "y"]){
count <- count + 1
}
}
}else if(class(df$yhat) == 'numeric'){
for(i in 1:length(df$y)){
if(df[i, "yhat"] < 0.5 & df[i, "y"] == 0){
count <- count + 1
}else if(df[i, "yhat"] >= 0.5 & df[i, "y"] == 1){
count <- count + 1
}
}
}
return(count / length(df$y))
}# End check_accuracy()
# discretize_output()
# Function returns a numerical value, 0 or 1, where 0 is not analytical and 1 is
# analytical with a cutoff of 0.50; not analytical < 0.50 >= is analytical
discretize_output <- function(val, cutoff){
if(val < cutoff){
return(as.factor(0))}
return(as.factor(1))
} # End discretize_output()
# calc_f1()
# Function returns the f1 stat for a model; takes recall and precision inputs
calc_f1 <- function(precision, recall){
return(2 * (precision * recall) / (precision + recall))
} #End calc_f
# Set seed for train/test splits
set.seed(987654)
# 70 / 30 test / train split
# For L/QDA, n >= 5k
dt <- sample(nrow(total_df), nrow(total_df) * 0.7)
train <- total_df[dt,]
test <- total_df[-dt,]
nrow(train)
nrow(test)
# Split training data into 3 sets for 3 models
# Logit model
dt_log <- sample(nrow(train), nrow(train) * 0.333)
logit_train <- train[dt_log,]
train <- train[-dt_log,]
# LDA model
dt_lda <- sample(nrow(train), nrow(train) * 0.5)
lda_train <- train[dt_lda,]
# QDA model
qda_train <- train[-dt_lda,]
nrow(logit_train)
nrow(lda_train)
nrow(qda_train)
# Fit logistic regression model
logit_model <-
glm(y ~ Category.Flexibility + Deductive.Reasoning +
Flexibility.of.Closure + Fluency.of.Ideas +
Inductive.Reasoning + Information.Ordering +
Mathematical.Reasoning + Memorization +
Number.Facility + Oral.Comprehension +
Oral.Expression + Originality +
Perceptual.Speed + Problem.Sensitivity +
Selective.Attention + Spatial.Orientation +
Speed.of.Closure + Time.Sharing +
Visualization + Written.Comprehension +
Written.Expression,
family="binomial",
data=logit_train)
# Turn off scientific notation
options(scipen=999)
# Save output
logit_train$yhat <- fitted(logit_model)
# Discretize output: not analytical < 0.50 >= analytical
logit_train$class <- sapply(logit_train$yhat,
function(x) discretize_output(x, 0.5))
# Check accuracy
logit_train_score <- check_accuracy(logit_train)
logit_train_score # Prints 0.8844444
LogModel1 <- glm(ASTHMA4 ~ DRKMONTHLY + DRKWEEKLY, data=analytic, family="binomial")
summary(LogModel1)
#Confusion Matrix
# Convert y values to factor
logit_train$y <- sapply(logit_train$y, function(x) as.factor(x))
confusionMatrix(logit_train$class, logit_train$y)
# Take a look at model performance measures
logit_summary <- summary(logit_model) # Null deviance: 311.70
logit_summary # Residual deviance: 130.13
logit_summary$Pr