Skip to content

Latest commit

 

History

History
370 lines (294 loc) · 13.7 KB

rCh05.md

File metadata and controls

370 lines (294 loc) · 13.7 KB
source('runDir.R')
runDir('../CodeExamples/c05_Choosing_and_evaluating_models',
       '../Spambase')
[1] "############################### start  57 Fri Jun 17 10:32:53 2016"
[1] "#####  running  ../CodeExamples/c05_Choosing_and_evaluating_models/00057_example_5.1_of_section_5.2.1.R"
[1] "#####   in directory ../Spambase"

> # example 5.1 of section 5.2.1 
> # (example 5.1 of section 5.2.1)  : Choosing and evaluating models : Evaluating models : Evaluating classification models 
> # Title: Building and applying a logistic regression spam model 
> 
> spamD <- read.table('spamD.tsv',header=T,sep='\t')

> spamTrain <- subset(spamD,spamD$rgroup>=10)

> spamTest <- subset(spamD,spamD$rgroup<10)

> spamVars <- setdiff(colnames(spamD),list('rgroup','spam'))

> spamFormula <- as.formula(paste('spam=="spam"',
    paste(spamVars,collapse=' + '),sep=' ~ '))

> spamModel <- glm(spamFormula,family=binomial(link='logit'),
    data=spamTrain)

Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred


> spamTrain$pred <- predict(spamModel,newdata=spamTrain,
    type='response')

> spamTest$pred <- predict(spamModel,newdata=spamTest,
    type='response')

> print(with(spamTest,table(y=spam,glmPred=pred>0.5)))
          glmPred
y          FALSE TRUE
  non-spam   264   14
  spam        22  158

> ##           glmPred
> ## y          FALSE TRUE
> ##   non-spam   264   14
> ##   spam        22  158
> 
[1] "############################### end  57 Fri Jun 17 10:32:54 2016"
[1] "############################### start  58 Fri Jun 17 10:32:54 2016"
[1] "#####  running  ../CodeExamples/c05_Choosing_and_evaluating_models/00058_example_5.2_of_section_5.2.1.R"
[1] "#####   in directory ../Spambase"

> # example 5.2 of section 5.2.1 
> # (example 5.2 of section 5.2.1)  : Choosing and evaluating models : Evaluating models : Evaluating classification models 
> # Title: Spam classifications 
> 
> sample <- spamTest[c(7,35,224,327),c('spam','pred')]

> print(sample)
         spam         pred
115      spam 0.9903246227
361      spam 0.4800498077
2300 non-spam 0.0006846551
3428 non-spam 0.0001434345

> ##          spam         pred
> ## 115      spam 0.9903246227
> ## 361      spam 0.4800498077
> ## 2300 non-spam 0.0006846551
> ## 3428 non-spam 0.0001434345
> 
[1] "############################### end  58 Fri Jun 17 10:32:54 2016"
[1] "############################### start  59 Fri Jun 17 10:32:54 2016"
[1] "#####  running  ../CodeExamples/c05_Choosing_and_evaluating_models/00059_example_5.3_of_section_5.2.1.R"
[1] "#####   in directory ../Spambase"

> # example 5.3 of section 5.2.1 
> # (example 5.3 of section 5.2.1)  : Choosing and evaluating models : Evaluating models : Evaluating classification models 
> # Title: Spam confusion matrix 
> 
> cM <- table(truth=spamTest$spam,prediction=spamTest$pred>0.5)

> print(cM)
          prediction
truth      FALSE TRUE
  non-spam   264   14
  spam        22  158

> ##          prediction
> ## truth      FALSE TRUE
> ##   non-spam   264   14
> ##   spam        22  158
> 
[1] "############################### end  59 Fri Jun 17 10:32:54 2016"
[1] "############################### start  60 Fri Jun 17 10:32:54 2016"
[1] "#####  running  ../CodeExamples/c05_Choosing_and_evaluating_models/00060_example_5.4_of_section_5.2.1.R"
[1] "#####   in directory ../Spambase"

> # example 5.4 of section 5.2.1 
> # (example 5.4 of section 5.2.1)  : Choosing and evaluating models : Evaluating models : Evaluating classification models 
> # Title: Entering data by hand 
> 
> t <- as.table(matrix(data=c(288-1,17,1,13882-17),nrow=2,ncol=2))

> rownames(t) <- rownames(cM)

> colnames(t) <- colnames(cM)

> print(t)
         FALSE  TRUE
non-spam   287     1
spam        17 13865

> ##          FALSE  TRUE
> ## non-spam   287     1
> ## spam        17 13865
> 
[1] "############################### end  60 Fri Jun 17 10:32:54 2016"
[1] "############################### start  61 Fri Jun 17 10:32:54 2016"
[1] "#####  running  ../CodeExamples/c05_Choosing_and_evaluating_models/00061_example_5.5_of_section_5.2.2.R"
[1] "#####   in directory ../Spambase"

> # example 5.5 of section 5.2.2 
> # (example 5.5 of section 5.2.2)  : Choosing and evaluating models : Evaluating models : Evaluating scoring models 
> # Title: Plotting residuals 
> 
> d <- data.frame(y=(1:10)^2,x=1:10)

> model <- lm(y~x,data=d)

> d$prediction <- predict(model,newdata=d)

> library('ggplot2')

> ggplot(data=d) + geom_point(aes(x=x,y=y)) +
     geom_line(aes(x=x,y=prediction),color='blue') +
     geom_segment(aes(x=x,y=prediction,yend=y,xend=x)) +
     scale_y_continuous('')

[1] "############################### end  61 Fri Jun 17 10:32:54 2016"
[1] "############################### start  62 Fri Jun 17 10:32:54 2016"
[1] "#####  running  ../CodeExamples/c05_Choosing_and_evaluating_models/00062_example_5.6_of_section_5.2.3.R"
[1] "#####   in directory ../Spambase"

> # example 5.6 of section 5.2.3 
> # (example 5.6 of section 5.2.3)  : Choosing and evaluating models : Evaluating models : Evaluating probability models 
> # Title: Making a double density plot 
> 
> ggplot(data=spamTest) +
    geom_density(aes(x=pred,color=spam,linetype=spam))

[1] "############################### end  62 Fri Jun 17 10:32:54 2016"
[1] "############################### start  63 Fri Jun 17 10:32:54 2016"
[1] "#####  running  ../CodeExamples/c05_Choosing_and_evaluating_models/00063_example_5.7_of_section_5.2.3.R"
[1] "#####   in directory ../Spambase"

> # example 5.7 of section 5.2.3 
> # (example 5.7 of section 5.2.3)  : Choosing and evaluating models : Evaluating models : Evaluating probability models 
> # Title: Plotting the receiver operating characteristic curve 
> 
> library('ROCR')

Loading required package: gplots


Attaching package: 'gplots'

The following object is masked from 'package:stats':

    lowess

> eval <- prediction(spamTest$pred,spamTest$spam)

> plot(performance(eval,"tpr","fpr"))

> print(attributes(performance(eval,'auc'))$y.values[[1]])
[1] 0.9660072

> ## [1] 0.9660072
> 
[1] "############################### end  63 Fri Jun 17 10:32:55 2016"
[1] "############################### start  64 Fri Jun 17 10:32:55 2016"
[1] "#####  running  ../CodeExamples/c05_Choosing_and_evaluating_models/00064_example_5.8_of_section_5.2.3.R"
[1] "#####   in directory ../Spambase"

> # example 5.8 of section 5.2.3 
> # (example 5.8 of section 5.2.3)  : Choosing and evaluating models : Evaluating models : Evaluating probability models 
> # Title: Calculating log likelihood 
> 
> sum(ifelse(spamTest$spam=='spam',
    log(spamTest$pred),
    log(1-spamTest$pred)))
[1] -134.9478

> ## [1] -134.9478
> sum(ifelse(spamTest$spam=='spam',
    log(spamTest$pred),
    log(1-spamTest$pred)))/dim(spamTest)[[1]]
[1] -0.2946458

> ## [1] -0.2946458
> 
[1] "############################### end  64 Fri Jun 17 10:32:55 2016"
[1] "############################### start  65 Fri Jun 17 10:32:55 2016"
[1] "#####  running  ../CodeExamples/c05_Choosing_and_evaluating_models/00065_example_5.9_of_section_5.2.3.R"
[1] "#####   in directory ../Spambase"

> # example 5.9 of section 5.2.3 
> # (example 5.9 of section 5.2.3)  : Choosing and evaluating models : Evaluating models : Evaluating probability models 
> # Title: Computing the null model’s log likelihood 
> 
> pNull <- sum(ifelse(spamTest$spam=='spam',1,0))/dim(spamTest)[[1]]

> sum(ifelse(spamTest$spam=='spam',1,0))*log(pNull) +
    sum(ifelse(spamTest$spam=='spam',0,1))*log(1-pNull)
[1] -306.8952

> ## [1] -306.8952
> 
[1] "############################### end  65 Fri Jun 17 10:32:55 2016"
[1] "############################### start  66 Fri Jun 17 10:32:55 2016"
[1] "#####  running  ../CodeExamples/c05_Choosing_and_evaluating_models/00066_example_5.10_of_section_5.2.3.R"
[1] "#####   in directory ../Spambase"

> # example 5.10 of section 5.2.3 
> # (example 5.10 of section 5.2.3)  : Choosing and evaluating models : Evaluating models : Evaluating probability models 
> # Title: Calculating entropy and conditional entropy 
> 
> entropy <- function(x) {  # Note: 1 
     xpos <- x[x>0]
     scaled <- xpos/sum(xpos)
     sum(-scaled*log(scaled,2))
   }

> print(entropy(table(spamTest$spam)))  # Note: 2 
[1] 0.9667165

> ## [1] 0.9667165
> 
> conditionalEntropy <- function(t) {   # Note: 3 
     (sum(t[,1])*entropy(t[,1]) + sum(t[,2])*entropy(t[,2]))/sum(t)
   }

> print(conditionalEntropy(cM))     # Note: 4 
[1] 0.3971897

> ## [1] 0.3971897
> 
> # Note 1: 
> #   Define function that computes the entropy 
> #   from list of outcome counts 
> 
> # Note 2: 
> #   Calculate entropy of spam/non-spam 
> #   distribution 
> 
> # Note 3: 
> #   Function to calculate conditional or 
> #   remaining entropy of spam distribution (rows) 
> #   given prediction (columns) 
> 
> # Note 4: 
> #   Calculate conditional or remaining entropy 
> #   of spam distribution given prediction 
> 
[1] "############################### end  66 Fri Jun 17 10:32:55 2016"
[1] "############################### start  67 Fri Jun 17 10:32:55 2016"
[1] "#####  running  ../CodeExamples/c05_Choosing_and_evaluating_models/00067_example_5.11_of_section_5.2.5.R"
[1] "#####   in directory ../Spambase"

> # example 5.11 of section 5.2.5 
> # (example 5.11 of section 5.2.5)  : Choosing and evaluating models : Evaluating models : Evaluating clustering models 
> # Title: Clustering random data in the plane 
> 
> set.seed(32297)

> d <- data.frame(x=runif(100),y=runif(100))

> clus <- kmeans(d,centers=5)

> d$cluster <- clus$cluster
[1] "############################### end  67 Fri Jun 17 10:32:55 2016"
[1] "############################### start  68 Fri Jun 17 10:32:55 2016"
[1] "#####  running  ../CodeExamples/c05_Choosing_and_evaluating_models/00068_example_5.12_of_section_5.2.5.R"
[1] "#####   in directory ../Spambase"

> # example 5.12 of section 5.2.5 
> # (example 5.12 of section 5.2.5)  : Choosing and evaluating models : Evaluating models : Evaluating clustering models 
> # Title: Plotting our clusters 
> 
> library('ggplot2'); library('grDevices')

> h <- do.call(rbind,
    lapply(unique(clus$cluster),
       function(c) { f <- subset(d,cluster==c); f[chull(f),]}))

> ggplot() +
  geom_text(data=d,aes(label=cluster,x=x,y=y,
    color=cluster),size=3)  +
  geom_polygon(data=h,aes(x=x,y=y,group=cluster,fill=as.factor(cluster)),
    alpha=0.4,linetype=0) +
  theme(legend.position = "none")

[1] "############################### end  68 Fri Jun 17 10:32:55 2016"
[1] "############################### start  69 Fri Jun 17 10:32:55 2016"
[1] "#####  running  ../CodeExamples/c05_Choosing_and_evaluating_models/00069_example_5.13_of_section_5.2.5.R"
[1] "#####   in directory ../Spambase"

> # example 5.13 of section 5.2.5 
> # (example 5.13 of section 5.2.5)  : Choosing and evaluating models : Evaluating models : Evaluating clustering models 
> # Title: Calculating the size of each cluster 
> 
> table(d$cluster)

 1  2  3  4  5 
10 27 18 17 28 

> ##  1  2  3  4  5
> ## 10 27 18 17 28
> 
[1] "############################### end  69 Fri Jun 17 10:32:55 2016"
[1] "############################### start  70 Fri Jun 17 10:32:55 2016"
[1] "#####  running  ../CodeExamples/c05_Choosing_and_evaluating_models/00070_example_5.14_of_section_5.2.5.R"
[1] "#####   in directory ../Spambase"

> # example 5.14 of section 5.2.5 
> # (example 5.14 of section 5.2.5)  : Choosing and evaluating models : Evaluating models : Evaluating clustering models 
> # Title: Calculating the typical distance between items in every pair of clusters 
> 
> library('reshape2')

> n <- dim(d)[[1]]

> pairs <- data.frame(
    ca = as.vector(outer(1:n,1:n,function(a,b) d[a,'cluster'])),
    cb = as.vector(outer(1:n,1:n,function(a,b) d[b,'cluster'])),
    dist = as.vector(outer(1:n,1:n,function(a,b)
            sqrt((d[a,'x']-d[b,'x'])^2 + (d[a,'y']-d[b,'y'])^2)))
    )

> dcast(pairs,ca~cb,value.var='dist',mean)
  ca         1         2         3         4         5
1  1 0.1478480 0.6524103 0.3780785 0.4404508 0.7544134
2  2 0.6524103 0.2794181 0.5551967 0.4990632 0.5165320
3  3 0.3780785 0.5551967 0.2031272 0.6122986 0.4656730
4  4 0.4404508 0.4990632 0.6122986 0.2048268 0.8365336
5  5 0.7544134 0.5165320 0.4656730 0.8365336 0.2221314

> ##   ca         1         2         3         4         5
> ## 1  1 0.1478480 0.6524103 0.3780785 0.4404508 0.7544134
> ## 2  2 0.6524103 0.2794181 0.5551967 0.4990632 0.5165320
> ## 3  3 0.3780785 0.5551967 0.2031272 0.6122986 0.4656730
> ## 4  4 0.4404508 0.4990632 0.6122986 0.2048268 0.8365336
> ## 5  5 0.7544134 0.5165320 0.4656730 0.8365336 0.2221314
> 
[1] "############################### end  70 Fri Jun 17 10:32:55 2016"