.Rhistory

g <- matrix(
c(2, 1, 2, 1, 4, 3, 2, 4, 2)
nrows = 3,
ncol = 2)
g <- matrix(
c(2, 1, 2, 1, 4, 3, 2, 4, 2),
nrows = 3,
ncol = 2)
g <- matrix(
c(2, 1, 2, 1, 4, 3, 2, 4, 2),
nrow = 3,
ncol = 3)
h <- as.matrix(g)
h <- matrix(
c(3, -1, 2, -1, 4, 3, 2, 3, -1),
nrow = 3,
ncol = 3)
i <- matrix(
c(-3, 1, 0, 0, 4, 1, 2, 1, 2),
nrow = 3,
ncol = 3
)
j <- T(g)
j <- t(g)
j <- t(g)
j <- t(g)
j
print(g)
print(j)
g <- matrix(
c(2, 1, 2, 1, 4, 3, 2, 4, 2),
nrow = 3)
print(g)
g <- matrix(
c(2, 1, 2, 1, 4, 4, 2, 3, 2),
nrow = 3,
ncol = 3)
print(g)
h <- matrix(
c(3, -1, 2, -1, 4, 3, 2, 3, -1),
nrow = 3,
ncol = 3)
i <- matrix(
c(-3, 0, 2, 1, 4, 1, 0, 1, 2),
nrow = 3,
ncol = 3)
r <- matrix(
c(3,1,2,5)
nrow = 2,
ncol = 2)
r <- matrix(
c(3,1,2,5),
nrow = 2,
ncol = 2)
s <- inv(r)
library(matlib)
solve()
solve()?
s <- solve(r)
r <- matrix(
c(3,1,2,5),
nrow = 2,
ncol = 2)
r <- matrix(
c(3,1,2,5),
nrow = 2,
ncol = 2)
s <- solve(r)
s
det(s)
s <- solve(r)
r <- matrix(
c(3,1,2,5),
nrow = 2,
ncol = 2)
s <- solve(r)
det(s)
tuesdata <- tidytuesdayR::tt_load('2020-02-11')
hotels <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-11/hotels.csv')
tuesdata <- tidytuesdayR::tt_load('2020-02-11')
install.packages(c("klaR", "lattice", "qvcalc", "stringi", "vctrs"))
install.packages(c("backports", "boot", "broom", "callr", "caret", "class", "classInt", "cli", "crosstalk", "dbplyr", "digest", "dplyr", "ellipsis", "fable", "fabletools", "forcats", "foreach", "forecast", "fs", "gender", "ggfortify", "ggplot2", "ggrepel", "glue", "gplots", "gtools", "haven", "igraph", "janitor", "labelled", "lattice", "lava", "lifecycle", "lme4", "lsa", "lubridate", "manipulateWidget", "maptools", "MASS", "matrixStats", "ModelMetrics", "modelr", "modeltools", "nloptr", "openxlsx", "pillar", "pkgbuild", "plyr", "pROC", "proxy", "ps", "purrr", "qdapTools", "quantmod", "quantreg", "Rcpp", "RcppArmadillo", "RcppParallel", "RCurl", "recipes", "reshape2", "reticulate", "rlang", "scales", "shiny", "SnowballC", "sp", "spacyr", "SQUAREM", "survey", "survival", "TeachingDemos", "testthat", "tibble", "tidyr", "tidyselect", "tinytex", "vctrs", "withr", "xfun", "xml2", "zoo"))
# Working Directory
wd <- paste("~/Dev/DS_Projects/is_analytical")
setwd(wd)
# Load libraries
library(pastecs)    # For descriptive stats
library(ggplot2)    # For plotting
library(dplyr)      # For data manipulation
library(caret)      # For varImp, confusionMatrix
library(gridExtra)  # For printing/arranging multiple plots
library(stargazer)  # For saving model summaries as jpeg / html files
library(tidyr)      # To reshape the data to different format
library(MASS)       # For linear/quadratic discriminant analysis
library(klaR)       # For simple LDA plot
library(scales)     # For tick label precision
work_styles_df <- read.csv("assets/Work_Styles.csv")
# Read in total_df only after Data Engineering has been completed
total_df <- read.csv("assets/total_df.csv", row.names=NULL, header=T)
# If necessary, you can load total_df here
total_df <- read.csv("assets/total_df.csv", row.names=NULL, header=T)
summ <- stat.desc(total_df)
summ                           # Variables have different scales
# check_accuracy()
# Function takes either the discretized results or the probabilities as model
# outputs; in the case of the latter, it uses a cutoff of 0.50 to determine
# whether the occupation is analytical or not
check_accuracy <- function(df){
count<- 0
i <- 0
if(class(df$yhat) == 'factor'){
for(i in 1:length(df$y)){
if(df[i, "yhat"] == df[i, "y"]){
count <- count + 1
}
}
}else if(class(df$yhat) == 'numeric'){
for(i in 1:length(df$y)){
if(df[i, "yhat"] < 0.5 & df[i, "y"] == 0){
count <- count + 1
}else if(df[i, "yhat"] >= 0.5 & df[i, "y"] == 1){
count <- count + 1
}
}
}
return(count / length(df$y))
}# End check_accuracy()
# discretize_output()
# Function returns a numerical value, 0 or 1, where 0 is not analytical and 1 is
# analytical with a cutoff of 0.50; not analytical < 0.50 >= is analytical
discretize_output <- function(val, cutoff){
if(val < cutoff){
return(as.factor(0))}
return(as.factor(1))
} # End discretize_output()
# calc_f1()
# Function returns the f1 stat for a model; takes recall and precision inputs
calc_f1 <- function(precision, recall){
return(2 * (precision * recall) / (precision + recall))
} #End calc_f
# Set seed for train/test splits
set.seed(987654)
# 70 / 30 test / train split
# For L/QDA, n >= 5k
dt <- sample(nrow(total_df), nrow(total_df) * 0.7)
train <- total_df[dt,]
test <- total_df[-dt,]
nrow(train)
nrow(test)
# Split training data into 3 sets for 3 models
# Logit model
dt_log <- sample(nrow(train), nrow(train) * 0.333)
logit_train <- train[dt_log,]
train <- train[-dt_log,]
# LDA model
dt_lda <- sample(nrow(train), nrow(train) * 0.5)
lda_train <- train[dt_lda,]
# QDA model
qda_train <- train[-dt_lda,]
nrow(logit_train)
nrow(lda_train)
nrow(qda_train)
# Fit logistic regression model
logit_model <-
glm(y ~ Category.Flexibility + Deductive.Reasoning +
Flexibility.of.Closure + Fluency.of.Ideas +
Inductive.Reasoning + Information.Ordering +
Mathematical.Reasoning + Memorization +
Number.Facility + Oral.Comprehension +
Oral.Expression + Originality +
Perceptual.Speed + Problem.Sensitivity +
Selective.Attention + Spatial.Orientation +
Speed.of.Closure + Time.Sharing +
Visualization + Written.Comprehension +
Written.Expression,
family="binomial",
data=logit_train)
# Turn off scientific notation
options(scipen=999)
# Save output
logit_train$yhat <- fitted(logit_model)
# Discretize output: not analytical < 0.50 >= analytical
logit_train$class <- sapply(logit_train$yhat,
function(x) discretize_output(x, 0.5))
# Check accuracy
logit_train_score <- check_accuracy(logit_train)
logit_train_score                                # Prints 0.8844444
LogModel1 <- glm(ASTHMA4 ~ DRKMONTHLY + DRKWEEKLY, data=analytic, family="binomial")
summary(LogModel1)
#Confusion Matrix
# Convert y values to factor
logit_train$y <- sapply(logit_train$y, function(x) as.factor(x))
confusionMatrix(logit_train$class, logit_train$y)
# Take a look at model performance measures
logit_summary <- summary(logit_model)            # Null deviance: 311.70
logit_summary                                    # Residual deviance: 130.13
logit_summary$Pr