RunExamples/rCh09.html

<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">rm</span>(<span class="dt">list=</span><span class="kw">ls</span>())
<span class="kw">source</span>(<span class="st">&#39;runDir.R&#39;</span>)</code></pre></div>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(<span class="st">&#39;ggplot2&#39;</span>)
<span class="kw">runDir</span>(<span class="st">&#39;../CodeExamples/c09_Exploring_advanced_methods&#39;</span>,
      <span class="st">&#39;../Spambase&#39;</span>,<span class="dt">last=</span><span class="dv">179</span>)</code></pre></div>
<pre><code>[1] &quot;############################### start  165 Fri Jun 17 10:42:20 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00165_example_9.1_of_section_9.1.1.R&quot;
[1] &quot;#####   in directory ../Spambase&quot;

&gt; # example 9.1 of section 9.1.1 
&gt; # (example 9.1 of section 9.1.1)  : Exploring advanced methods : Using bagging and random forests to reduce training variance : Using bagging to improve prediction 
&gt; # Title: Preparing Spambase data and evaluating the performance of decision trees 
&gt; 
&gt; spamD &lt;- read.table(&#39;spamD.tsv&#39;,header=T,sep=&#39;\t&#39;)    # Note: 1 

&gt; spamTrain &lt;- subset(spamD,spamD$rgroup&gt;=10)

&gt; spamTest &lt;- subset(spamD,spamD$rgroup&lt;10)

&gt; spamVars &lt;- setdiff(colnames(spamD),list(&#39;rgroup&#39;,&#39;spam&#39;))

&gt; spamFormula &lt;- as.formula(paste(&#39;spam==&quot;spam&quot;&#39;,       # Note: 2 
                           paste(spamVars,collapse=&#39; + &#39;),sep=&#39; ~ &#39;))

&gt; loglikelihood &lt;- function(y, py) {    # Note: 3 
   pysmooth &lt;- ifelse(py==0, 1e-12,
                   ifelse(py==1, 1-1e-12, py))
 
   sum(y * log(pysmooth) + (1-y)*log(1 - pysmooth))
 }

&gt; accuracyMeasures &lt;- function(pred, truth, name=&quot;model&quot;) {     # Note: 4 
   dev.norm &lt;- -2*loglikelihood(as.numeric(truth), pred)/length(pred)   # Note: 5 
   ctable &lt;- table(truth=truth,
                  pred=(pred&gt;0.5))                                      # Note: 6 
   accuracy &lt;- sum(diag(ctable))/sum(ctable)
   precision &lt;- ctable[2,2]/sum(ctable[,2])
   recall &lt;- ctable[2,2]/sum(ctable[2,])
   f1 &lt;- 2*precision*recall/(precision+recall)
   data.frame(model=name, accuracy=accuracy, f1=f1, dev.norm)
 }

&gt; library(rpart)                                                    # Note: 7 

&gt; treemodel &lt;- rpart(spamFormula, spamTrain)

&gt; accuracyMeasures(predict(treemodel, newdata=spamTrain),   # Note: 8 
                  spamTrain$spam==&quot;spam&quot;,
                  name=&quot;tree, training&quot;)
           model  accuracy      f1  dev.norm
1 tree, training 0.9104514 0.88337 0.5618654

&gt; accuracyMeasures(predict(treemodel, newdata=spamTest),
                  spamTest$spam==&quot;spam&quot;,
                  name=&quot;tree, test&quot;)
       model  accuracy        f1  dev.norm
1 tree, test 0.8799127 0.8414986 0.6702857

&gt; # Note 1: 
&gt; #   Load the data and split into training (90% of data) 
&gt; #   and test (10% of data) sets. 
&gt; 
&gt; # Note 2: 
&gt; #   Use all the features and do binary classification, 
&gt; #   where TRUE corresponds to spam documents. 
&gt; 
&gt; # Note 3: 
&gt; #   A function to calculate log likelihood (for 
&gt; #   calculating deviance). 
&gt; 
&gt; # Note 4: 
&gt; #   A function to calculate and return various measures 
&gt; #   on the model: normalized deviance, prediction accuracy, and f1, which is the 
&gt; #   harmonic mean of precision and recall. 
&gt; 
&gt; # Note 5: 
&gt; #   Normalize the deviance by the number of data points 
&gt; #   so that we can compare the deviance across training and test 
&gt; #   sets. 
&gt; 
&gt; # Note 6: 
&gt; #   Convert the class probability estimator into a 
&gt; #   classifier by labeling documents that score greater than 0.5 as 
&gt; #   spam. 
&gt; 
&gt; # Note 7: 
&gt; #   Load the rpart library and fit a decision tree 
&gt; #   model. 
&gt; 
&gt; # Note 8: 
&gt; #   Evaluate the decision tree model against the 
&gt; #   training and test sets. 
&gt; 
[1] &quot;############################### end  165 Fri Jun 17 10:42:20 2016&quot;
[1] &quot;############################### start  167 Fri Jun 17 10:42:20 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00167_example_9.2_of_section_9.1.1.R&quot;
[1] &quot;#####   in directory ../Spambase&quot;

&gt; # example 9.2 of section 9.1.1 
&gt; # (example 9.2 of section 9.1.1)  : Exploring advanced methods : Using bagging and random forests to reduce training variance : Using bagging to improve prediction 
&gt; # Title: Bagging decision trees 
&gt; 
&gt; ntrain &lt;- dim(spamTrain)[1]

&gt; n &lt;- ntrain                   # Note: 1 

&gt; ntree &lt;- 100

&gt; samples &lt;- sapply(1:ntree,        # Note: 2 
                  FUN = function(iter)
                    {sample(1:ntrain, size=n, replace=T)})

&gt; treelist &lt;-lapply(1:ntree,        # Note: 3 
                   FUN=function(iter)
                   {samp &lt;- samples[,iter];
                    rpart(spamFormula, spamTrain[samp,])})

&gt; predict.bag &lt;- function(treelist, newdata) {      # Note: 4 
   preds &lt;- sapply(1:length(treelist),
                  FUN=function(iter) {
                    predict(treelist[[iter]], newdata=newdata)})
   predsums &lt;- rowSums(preds)
   predsums/length(treelist)
 }

&gt; accuracyMeasures(predict.bag(treelist, newdata=spamTrain),    # Note: 5 
                  spamTrain$spam==&quot;spam&quot;,
                  name=&quot;bagging, training&quot;)
              model  accuracy        f1  dev.norm
1 bagging, training 0.9215544 0.8973144 0.4719576

&gt; accuracyMeasures(predict.bag(treelist, newdata=spamTest),
                  spamTest$spam==&quot;spam&quot;,
                  name=&quot;bagging, test&quot;)
          model  accuracy        f1  dev.norm
1 bagging, test 0.9061135 0.8731563 0.5325537

&gt; # Note 1: 
&gt; #   Use bootstrap samples the same size as the training 
&gt; #   set, with 100 trees. 
&gt; 
&gt; # Note 2: 
&gt; #   Build the bootstrap samples by sampling the row indices of spamTrain with replacement. Each 
&gt; #   column of the matrix samples represents the row indices into spamTrain 
&gt; #   that comprise the bootstrap sample. 
&gt; 
&gt; # Note 3: 
&gt; #   Train the individual decision trees and return them 
&gt; #   in a list. Note: this step can take a few minutes. 
&gt; 
&gt; # Note 4: 
&gt; #   predict.bag assumes the underlying classifier returns decision probabilities, not 
&gt; #   decisions. 
&gt; 
&gt; # Note 5: 
&gt; #   Evaluate the bagged decision trees against the 
&gt; #   training and test sets. 
&gt; 
[1] &quot;############################### end  167 Fri Jun 17 10:43:03 2016&quot;
[1] &quot;############################### start  169 Fri Jun 17 10:43:03 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00169_example_9.3_of_section_9.1.2.R&quot;
[1] &quot;#####   in directory ../Spambase&quot;

&gt; # example 9.3 of section 9.1.2 
&gt; # (example 9.3 of section 9.1.2)  : Exploring advanced methods : Using bagging and random forests to reduce training variance : Using random forests to further improve prediction 
&gt; # Title: Using random forests 
&gt; 
&gt; library(randomForest)             # Note: 1 

randomForest 4.6-12

Type rfNews() to see new features/changes/bug fixes.


Attaching package: &#39;randomForest&#39;

The following object is masked from &#39;package:ggplot2&#39;:

    margin


&gt; set.seed(5123512)     # Note: 2 

&gt; fmodel &lt;- randomForest(x=spamTrain[,spamVars],    # Note: 3 
         y=spamTrain$spam,
         ntree=100,     # Note: 4 
         nodesize=7,    # Note: 5 
         importance=T)  # Note: 6 

&gt; accuracyMeasures(predict(fmodel,  # Note: 7 
    newdata=spamTrain[,spamVars],type=&#39;prob&#39;)[,&#39;spam&#39;],
    spamTrain$spam==&quot;spam&quot;,name=&quot;random forest, train&quot;)
                 model  accuracy        f1  dev.norm
1 random forest, train 0.9884142 0.9851943 0.1428786

&gt; ##                  model  accuracy        f1  dev.norm
&gt; ## 1 random forest, train 0.9884142 0.9706611 0.1428786
&gt; accuracyMeasures(predict(fmodel,
    newdata=spamTest[,spamVars],type=&#39;prob&#39;)[,&#39;spam&#39;],
    spamTest$spam==&quot;spam&quot;,name=&quot;random forest, test&quot;)
                model  accuracy        f1  dev.norm
1 random forest, test 0.9541485 0.9401709 0.3972416

&gt; ##                 model  accuracy        f1  dev.norm
&gt; ## 1 random forest, test 0.9541485 0.8845029 0.3972416
&gt; 
&gt; # Note 1: 
&gt; #   Load the randomForest package. 
&gt; 
&gt; # Note 2: 
&gt; #   Set the pseudo-random seed to a known value to try 
&gt; #   to make the random forest run repeatable. 
&gt; 
&gt; # Note 3: 
&gt; #   Call the randomForest() function to build the model 
&gt; #   with explanatory variables as x and the category to be predicted as 
&gt; #   y. 
&gt; 
&gt; # Note 4: 
&gt; #   Use 100 trees to be compatible with our bagging 
&gt; #   example. The default is 500 trees. 
&gt; 
&gt; # Note 5: 
&gt; #   Specify that each node of a tree must have a minimum 
&gt; #   of 7 elements, to be compatible with the default minimum node size that rpart() 
&gt; #   uses on this training set. 
&gt; 
&gt; # Note 6: 
&gt; #   Tell the algorithm to save information to be used for 
&gt; #   calculating variable importance (we’ll see this later). 
&gt; 
&gt; # Note 7: 
&gt; #   Report the model quality. 
&gt; 
[1] &quot;############################### end  169 Fri Jun 17 10:43:08 2016&quot;
[1] &quot;############################### start  171 Fri Jun 17 10:43:08 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00171_example_9.4_of_section_9.1.2.R&quot;
[1] &quot;#####   in directory ../Spambase&quot;

&gt; # example 9.4 of section 9.1.2 
&gt; # (example 9.4 of section 9.1.2)  : Exploring advanced methods : Using bagging and random forests to reduce training variance : Using random forests to further improve prediction 
&gt; # Title: randomForest variable importances 
&gt; 
&gt; varImp &lt;- importance(fmodel)                  # Note: 1 

&gt; varImp[1:10, ]                            # Note: 2 
                    non-spam       spam MeanDecreaseAccuracy
word.freq.make      2.096811  3.7304353             4.334207
word.freq.address   3.603167  3.9967031             4.977452
word.freq.all       2.799456  4.9527834             4.924958
word.freq.3d        3.000273  0.4125932             2.917972
word.freq.our       9.037946  7.9421391            10.731509
word.freq.over      5.879377  4.2402613             5.751371
word.freq.remove   16.637390 13.9331691            17.753122
word.freq.internet  7.301055  4.4458342             7.947515
word.freq.order     3.937897  4.3587883             4.866540
word.freq.mail      5.022432  3.4701224             6.103929
                   MeanDecreaseGini
word.freq.make             5.877954
word.freq.address         10.081640
word.freq.all             23.524720
word.freq.3d               1.550635
word.freq.our             52.569163
word.freq.over            11.820391
word.freq.remove         174.126926
word.freq.internet        22.578106
word.freq.order           11.809265
word.freq.mail            11.127200

&gt; ##                     non-spam       spam MeanDecreaseAccuracy
&gt; ## word.freq.make      2.096811  3.7304353             4.334207
&gt; ## word.freq.address   3.603167  3.9967031             4.977452
&gt; ## word.freq.all       2.799456  4.9527834             4.924958
&gt; ## word.freq.3d        3.000273  0.4125932             2.917972
&gt; ## word.freq.our       9.037946  7.9421391            10.731509
&gt; ## word.freq.over      5.879377  4.2402613             5.751371
&gt; ## word.freq.remove   16.637390 13.9331691            17.753122
&gt; ## word.freq.internet  7.301055  4.4458342             7.947515
&gt; ## word.freq.order     3.937897  4.3587883             4.866540
&gt; ## word.freq.mail      5.022432  3.4701224             6.103929
&gt; 
&gt; varImpPlot(fmodel, type=1)                        # Note: 3


&gt; # Note 1: 
&gt; #   Call importance() on the spam 
&gt; #   model. 
&gt; 
&gt; # Note 2: 
&gt; #   The importance() function returns a matrix of 
&gt; #   importance measures (larger values = more important). 
&gt; 
&gt; # Note 3: 
&gt; #   Plot the variable importance as measured by 
&gt; #   accuracy change. 
&gt; 
[1] &quot;############################### end  171 Fri Jun 17 10:43:08 2016&quot;
[1] &quot;############################### start  172 Fri Jun 17 10:43:08 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00172_example_9.5_of_section_9.1.2.R&quot;
[1] &quot;#####   in directory ../Spambase&quot;

&gt; # example 9.5 of section 9.1.2 
&gt; # (example 9.5 of section 9.1.2)  : Exploring advanced methods : Using bagging and random forests to reduce training variance : Using random forests to further improve prediction 
&gt; # Title: Fitting with fewer variables 
&gt; 
&gt; selVars &lt;- names(sort(varImp[,1], decreasing=T))[1:25]    # Note: 1 

&gt; fsel &lt;- randomForest(x=spamTrain[,selVars],y=spamTrain$spam,  # Note: 2 
                            ntree=100,
                            nodesize=7,
                            importance=T)

&gt; accuracyMeasures(predict(fsel,
    newdata=spamTrain[,selVars],type=&#39;prob&#39;)[,&#39;spam&#39;],
    spamTrain$spam==&quot;spam&quot;,name=&quot;RF small, train&quot;)
            model  accuracy        f1  dev.norm
1 RF small, train 0.9864832 0.9827267 0.1379438

&gt; ##             model  accuracy        f1  dev.norm
&gt; ## 1 RF small, train 0.9876901 0.9688546 0.1506817
&gt; 
&gt; accuracyMeasures(predict(fsel,
    newdata=spamTest[,selVars],type=&#39;prob&#39;)[,&#39;spam&#39;],
    spamTest$spam==&quot;spam&quot;,name=&quot;RF small, test&quot;)
           model  accuracy        f1  dev.norm
1 RF small, test 0.9497817 0.9348442 0.3985712

&gt; ##            model  accuracy        f1 dev.norm
&gt; ## 1 RF small, test 0.9497817 0.8738142 0.400825
&gt; 
&gt; # Note 1: 
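&gt; #   Sort the variables by their importance, as 
&gt; #   measured by accuracy change. Note that varImp[,1] is the first 
&gt; #   column of the importance matrix (the class-specific &quot;non-spam&quot; 
&gt; #   measure printed in example 9.4), not the overall 
&gt; #   MeanDecreaseAccuracy column. 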
&gt; 
&gt; # Note 2: 
&gt; #   Build a random forest model using only the 25 
&gt; #   most important variables. 
&gt; 
[1] &quot;############################### end  172 Fri Jun 17 10:43:11 2016&quot;
[1] &quot;############################### start  175 Fri Jun 17 10:43:11 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00175_example_9.6_of_section_9.2.2.R&quot;
[1] &quot;#####   in directory ../Spambase&quot;

&gt; # example 9.6 of section 9.2.2 
&gt; # (example 9.6 of section 9.2.2)  : Exploring advanced methods : Using generalized additive models (GAMs) to learn non-monotone relationships : A one-dimensional regression example 
&gt; # Title: Preparing an artificial problem 
&gt; 
&gt; set.seed(602957)

&gt; x &lt;- rnorm(1000)

&gt; noise &lt;- rnorm(1000, sd=1.5)

&gt; y &lt;- 3*sin(2*x) + cos(0.75*x) - 1.5*(x^2 ) + noise

&gt; select &lt;- runif(1000)

&gt; frame &lt;- data.frame(y=y, x = x)

&gt; train &lt;- frame[select &gt; 0.1,]

&gt; test &lt;-frame[select &lt;= 0.1,]
[1] &quot;############################### end  175 Fri Jun 17 10:43:11 2016&quot;
[1] &quot;############################### start  176 Fri Jun 17 10:43:11 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00176_example_9.7_of_section_9.2.2.R&quot;
[1] &quot;#####   in directory ../Spambase&quot;

&gt; # example 9.7 of section 9.2.2 
&gt; # (example 9.7 of section 9.2.2)  : Exploring advanced methods : Using generalized additive models (GAMs) to learn non-monotone relationships : A one-dimensional regression example 
&gt; # Title: Linear regression applied to our artificial example 
&gt; 
&gt; lin.model &lt;- lm(y ~ x, data=train)

&gt; summary(lin.model)

Call:
lm(formula = y ~ x, data = train)

Residuals:
    Min      1Q  Median      3Q     Max 
-17.698  -1.774   0.193   2.499   7.529 

Coefficients:
            Estimate Std. Error t value Pr(&gt;|t|)    
(Intercept)  -0.8330     0.1161  -7.175 1.51e-12 ***
x             0.7395     0.1197   6.180 9.74e-10 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Residual standard error: 3.485 on 899 degrees of freedom
Multiple R-squared:  0.04075,   Adjusted R-squared:  0.03968 
F-statistic: 38.19 on 1 and 899 DF,  p-value: 9.737e-10


&gt; ## Call:
&gt; ## lm(formula = y ~ x, data = train)
&gt; ##
&gt; ## Residuals:
&gt; ##     Min      1Q  Median      3Q     Max
&gt; ## -17.698  -1.774   0.193   2.499   7.529
&gt; ##
&gt; ## Coefficients:
&gt; ##             Estimate Std. Error t value Pr(&gt;|t|)
&gt; ## (Intercept)  -0.8330     0.1161  -7.175 1.51e-12 ***
&gt; ## x             0.7395     0.1197   6.180 9.74e-10 ***
&gt; ## ---
&gt; ## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
&gt; ##
&gt; ## Residual standard error: 3.485 on 899 degrees of freedom
&gt; ## Multiple R-squared:  0.04075,   Adjusted R-squared:  0.03968
&gt; ## F-statistic: 38.19 on 1 and 899 DF,  p-value: 9.737e-10
&gt; 
&gt; #
&gt; # calculate the root mean squared error (rmse)
&gt; #
&gt; resid.lin &lt;- train$y-predict(lin.model)

&gt; sqrt(mean(resid.lin^2))
[1] 3.481091

&gt; ## [1] 3.481091
&gt; 
[1] &quot;############################### end  176 Fri Jun 17 10:43:11 2016&quot;
[1] &quot;############################### start  177 Fri Jun 17 10:43:11 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00177_example_9.8_of_section_9.2.2.R&quot;
[1] &quot;#####   in directory ../Spambase&quot;

&gt; # example 9.8 of section 9.2.2 
&gt; # (example 9.8 of section 9.2.2)  : Exploring advanced methods : Using generalized additive models (GAMs) to learn non-monotone relationships : A one-dimensional regression example 
&gt; # Title: GAM applied to our artificial example 
&gt; 
&gt; library(mgcv)                                 # Note: 1 

Loading required package: nlme

This is mgcv 1.8-12. For overview type &#39;help(&quot;mgcv-package&quot;)&#39;.</code></pre>
<div class="figure">
<img src="rCh09_files/figure-markdown_github/ch9ex1-1.png" alt="" />

</div>
<pre><code>&gt; glin.model &lt;- gam(y~s(x), data=train)     # Note: 2 

&gt; glin.model$converged                          # Note: 3 
[1] TRUE

&gt; ## [1] TRUE
&gt; 
&gt; summary(glin.model)

Family: gaussian 
Link function: identity 

Formula:
y ~ s(x)

Parametric coefficients:
            Estimate Std. Error t value Pr(&gt;|t|)    
(Intercept) -0.83467    0.04852   -17.2   &lt;2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Approximate significance of smooth terms:
       edf Ref.df     F p-value    
s(x) 8.685  8.972 497.4  &lt;2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

R-sq.(adj) =  0.832   Deviance explained = 83.4%
GCV =  2.144  Scale est. = 2.121     n = 901

&gt; ## Family: gaussian                                   # Note: 4 
&gt; ## Link function: identity
&gt; ##
&gt; ## Formula:
&gt; ## y ~ s(x)
&gt; ##
&gt; ## Parametric coefficients:                           # Note: 5 
&gt; ##             Estimate Std. Error t value Pr(&gt;|t|)
&gt; ## (Intercept) -0.83467    0.04852   -17.2   &lt;2e-16 ***
&gt; ## ---
&gt; ## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
&gt; ##
&gt; ## Approximate significance of smooth terms:          # Note: 6 
&gt; ##        edf Ref.df     F p-value
&gt; ## s(x) 8.685  8.972 497.8  &lt;2e-16 ***
&gt; ## ---
&gt; ## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
&gt; ##
&gt; ## R-sq.(adj) =  0.832   Deviance explained = 83.4%           # Note: 7 
&gt; ## GCV score =  2.144  Scale est. = 2.121     n = 901
&gt; 
&gt; #
&gt; # calculate the root mean squared error (rmse)
&gt; #
&gt; resid.glin &lt;- train$y-predict(glin.model)

&gt; sqrt(mean(resid.glin^2))
[1] 1.448514

&gt; ## [1] 1.448514
&gt; 
&gt; # Note 1: 
&gt; #   Load the mgcv package. 
&gt; 
&gt; # Note 2: 
&gt; #   Build the model, specifying that x should be 
&gt; #   treated as a nonlinear variable. 
&gt; 
&gt; # Note 3: 
&gt; #   The converged parameter tells you if the algorithm 
&gt; #   converged. You should only trust the output if this is TRUE. 
&gt; 
&gt; # Note 4: 
&gt; #   The family=gaussian and link=identity lines tell you that the model was fit with the same 
&gt; #   distributional assumptions as a standard linear regression. 
&gt; 
&gt; # Note 5: 
&gt; #   The parametric coefficients are the linear terms (in this example, only the constant term). 
&gt; #   This section of the summary tells you which linear terms were 
&gt; #   significantly different from 0. 
&gt; 
&gt; # Note 6: 
&gt; #   The smooth terms are the nonlinear terms. This section of the summary tells you which 
&gt; #   nonlinear terms were significantly different from 0. It also tells you 
&gt; #   the effective degrees of freedom (edf) used up to build each smooth 
&gt; #   term. An edf near 1 indicates that the variable has an approximately 
&gt; #   linear relationship to the output. 
&gt; 
&gt; # Note 7: 
&gt; #   “R-sq.(adj)” is the adjusted R-squared. “Deviance 
&gt; #   explained” is the raw R-squared (0.834). 
&gt; 
[1] &quot;############################### end  177 Fri Jun 17 10:43:12 2016&quot;
[1] &quot;############################### start  178 Fri Jun 17 10:43:12 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00178_example_9.9_of_section_9.2.2.R&quot;
[1] &quot;#####   in directory ../Spambase&quot;

&gt; # example 9.9 of section 9.2.2 
&gt; # (example 9.9 of section 9.2.2)  : Exploring advanced methods : Using generalized additive models (GAMs) to learn non-monotone relationships : A one-dimensional regression example 
&gt; # Title: Comparing linear regression and GAM performance 
&gt; 
&gt; actual &lt;- test$y

&gt; pred.lin &lt;- predict(lin.model, newdata=test)      # Note: 1 

&gt; pred.glin &lt;- predict(glin.model, newdata=test)

&gt; resid.lin &lt;- actual-pred.lin

&gt; resid.glin &lt;- actual-pred.glin

&gt; sqrt(mean(resid.lin^2))       # Note: 2 
[1] 2.792653

&gt; ## [1] 2.792653
&gt; sqrt(mean(resid.glin^2))
[1] 1.401399

&gt; ## [1] 1.401399
&gt; 
&gt; cor(actual, pred.lin)^2       # Note: 3 
[1] 0.1543172

&gt; ## [1] 0.1543172
&gt; cor(actual, pred.glin)^2
[1] 0.7828869

&gt; ## [1] 0.7828869
&gt; 
&gt; # Note 1: 
&gt; #   Call both models on the test 
&gt; #   data. 
&gt; 
&gt; # Note 2: 
&gt; #   Compare the RMSE of the linear model and the GAM 
&gt; #   on the test data. 
&gt; 
&gt; # Note 3: 
&gt; #   Compare the R-squared of the linear model and the 
&gt; #   GAM on test data. 
&gt; 
[1] &quot;############################### end  178 Fri Jun 17 10:43:12 2016&quot;
[1] &quot;############################### start  179 Fri Jun 17 10:43:12 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00179_example_9.10_of_section_9.2.3.R&quot;
[1] &quot;#####   in directory ../Spambase&quot;

&gt; # example 9.10 of section 9.2.3 
&gt; # (example 9.10 of section 9.2.3)  : Exploring advanced methods : Using generalized additive models (GAMs) to learn non-monotone relationships : Extracting the nonlinear relationships 
&gt; # Title: Extracting a learned spline from a GAM 
&gt; 
&gt; sx &lt;- predict(glin.model, type=&quot;terms&quot;)

&gt; summary(sx)
      s(x)           
 Min.   :-17.527035  
 1st Qu.: -2.378636  
 Median :  0.009427  
 Mean   :  0.000000  
 3rd Qu.:  2.869166  
 Max.   :  4.084999  

&gt; ##       s(x)
&gt; ##  Min.   :-17.527035
&gt; ##  1st Qu.: -2.378636
&gt; ##  Median :  0.009427
&gt; ##  Mean   :  0.000000
&gt; ##  3rd Qu.:  2.869166
&gt; ##  Max.   :  4.084999
&gt; 
&gt; xframe &lt;- cbind(train, sx=sx[,1])

&gt; ggplot(xframe, aes(x=x)) + geom_point(aes(y=y), alpha=0.4) +
                              geom_line(aes(y=sx))</code></pre>
<div class="figure">
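<img src="rCh09_files/figure-markdown_github/ch9ex1-2.png" alt="Scatter plot of y versus x for the training data, with the spline s(x) extracted from the GAM drawn as a line" />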

</div>
<pre><code>[1] &quot;############################### end  179 Fri Jun 17 10:43:12 2016&quot;</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">rm</span>(<span class="dt">list=</span><span class="kw">ls</span>())
<span class="kw">source</span>(<span class="st">&#39;runDir.R&#39;</span>)</code></pre></div>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(<span class="st">&#39;ggplot2&#39;</span>)
<span class="kw">runDir</span>(<span class="st">&#39;../CodeExamples/c09_Exploring_advanced_methods&#39;</span>,
      <span class="st">&#39;../CDC&#39;</span>,<span class="dt">first=</span><span class="dv">180</span>,<span class="dt">last=</span><span class="dv">184</span>)</code></pre></div>
<pre><code>[1] &quot;############################### start  180 Fri Jun 17 10:43:12 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00180_example_9.11_of_section_9.2.4.R&quot;
[1] &quot;#####   in directory ../CDC&quot;

&gt; # example 9.11 of section 9.2.4 
&gt; # (example 9.11 of section 9.2.4)  : Exploring advanced methods : Using generalized additive models (GAMs) to learn non-monotone relationships : Using GAM on actual data 
&gt; # Title: Applying linear regression (with and without GAM) to health data 
&gt; 
&gt; library(mgcv)

&gt; library(ggplot2)

&gt; load(&quot;NatalBirthData.rData&quot;)

&gt; train &lt;- sdata[sdata$ORIGRANDGROUP&lt;=5,]

&gt; test &lt;- sdata[sdata$ORIGRANDGROUP&gt;5,]

&gt; form.lin &lt;- as.formula(&quot;DBWT ~ PWGT + WTGAIN + MAGER + UPREVIS&quot;)

&gt; linmodel &lt;- lm(form.lin, data=train)      # Note: 1 

&gt; summary(linmodel)

Call:
lm(formula = form.lin, data = train)

Residuals:
     Min       1Q   Median       3Q      Max 
-3155.43  -272.09    45.04   349.81  2870.55 

Coefficients:
             Estimate Std. Error t value Pr(&gt;|t|)    
(Intercept) 2419.7090    31.9291  75.784  &lt; 2e-16 ***
PWGT           2.1713     0.1241  17.494  &lt; 2e-16 ***
WTGAIN         7.5773     0.3178  23.840  &lt; 2e-16 ***
MAGER          5.3213     0.7787   6.834  8.6e-12 ***
UPREVIS       12.8753     1.1786  10.924  &lt; 2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Residual standard error: 562.7 on 14381 degrees of freedom
Multiple R-squared:  0.06596,   Adjusted R-squared:  0.0657 
F-statistic: 253.9 on 4 and 14381 DF,  p-value: &lt; 2.2e-16


&gt; ## Call:
&gt; ## lm(formula = form.lin, data = train)
&gt; ##
&gt; ## Residuals:
&gt; ##      Min       1Q   Median       3Q      Max 
&gt; ## -3155.43  -272.09    45.04   349.81  2870.55 
&gt; ##
&gt; ## Coefficients:
&gt; ##              Estimate Std. Error t value Pr(&gt;|t|)    
&gt; ## (Intercept) 2419.7090    31.9291  75.784  &lt; 2e-16 ***
&gt; ## PWGT           2.1713     0.1241  17.494  &lt; 2e-16 ***
&gt; ## WTGAIN         7.5773     0.3178  23.840  &lt; 2e-16 ***
&gt; ## MAGER          5.3213     0.7787   6.834  8.6e-12 ***
&gt; ## UPREVIS       12.8753     1.1786  10.924  &lt; 2e-16 ***
&gt; ## ---
&gt; ## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
&gt; ##
&gt; ## Residual standard error: 562.7 on 14381 degrees of freedom
&gt; ## Multiple R-squared:  0.06596, Adjusted R-squared:  0.0657  # Note: 2 
&gt; ## F-statistic: 253.9 on 4 and 14381 DF,  p-value: &lt; 2.2e-16
&gt; 
&gt; form.glin &lt;- as.formula(&quot;DBWT ~ s(PWGT) + s(WTGAIN) +
                         s(MAGER) + s(UPREVIS)&quot;)

&gt; glinmodel &lt;- gam(form.glin, data=train)                       # Note: 3 

&gt; glinmodel$converged                                           # Note: 4 
[1] TRUE

&gt; ## [1] TRUE
&gt; summary(glinmodel)

Family: gaussian 
Link function: identity 

Formula:
DBWT ~ s(PWGT) + s(WTGAIN) + s(MAGER) + s(UPREVIS)

Parametric coefficients:
            Estimate Std. Error t value Pr(&gt;|t|)    
(Intercept) 3276.948      4.623   708.8   &lt;2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Approximate significance of smooth terms:
             edf Ref.df       F  p-value    
s(PWGT)    5.374  6.443  69.010  &lt; 2e-16 ***
s(WTGAIN)  4.719  5.743 102.313  &lt; 2e-16 ***
s(MAGER)   7.742  8.428   7.145 1.37e-09 ***
s(UPREVIS) 5.491  6.425  48.423  &lt; 2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

R-sq.(adj) =  0.0927   Deviance explained = 9.42%
GCV = 3.0804e+05  Scale est. = 3.0752e+05  n = 14386

&gt; ## Family: gaussian 
&gt; ## Link function: identity 
&gt; ##
&gt; ## Formula:
&gt; ## DBWT ~ s(PWGT) + s(WTGAIN) + s(MAGER) + s(UPREVIS)
&gt; ##
&gt; ## Parametric coefficients:
&gt; ##             Estimate Std. Error t value Pr(&gt;|t|)    
&gt; ## (Intercept) 3276.948      4.623   708.8   &lt;2e-16 ***
&gt; ## ---
&gt; ## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
&gt; ##
&gt; ## Approximate significance of smooth terms:
&gt; ##              edf Ref.df       F  p-value    
&gt; ## s(PWGT)    5.374  6.443  68.981  &lt; 2e-16 ***
&gt; ## s(WTGAIN)  4.719  5.743 102.313  &lt; 2e-16 ***
&gt; ## s(MAGER)   7.742  8.428   6.959 1.82e-09 ***
&gt; ## s(UPREVIS) 5.491  6.425  48.423  &lt; 2e-16 ***
&gt; ## ---
&gt; ## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
&gt; ##
&gt; ## R-sq.(adj) =  0.0927   Deviance explained = 9.42%  # Note: 5 
&gt; ## GCV score = 3.0804e+05  Scale est. = 3.0752e+05  n = 14386
&gt; 
&gt; # Note 1: 
&gt; #   Build a linear model with four 
&gt; #   variables. 
&gt; 
&gt; # Note 2: 
&gt; #   The model explains about 7% of the variance; all 
&gt; #   coefficients are significantly different from 0. 
&gt; 
&gt; # Note 3: 
&gt; #   Build a GAM with the same 
&gt; #   variables. 
&gt; 
&gt; # Note 4: 
&gt; #   Verify that the model has 
&gt; #   converged. 
&gt; 
&gt; # Note 5: 
&gt; #   The model explains just under 10% of the variance; 
&gt; #   all variables have a nonlinear effect significantly different from 
&gt; #   0. 
&gt; 
[1] &quot;############################### end  180 Fri Jun 17 10:43:13 2016&quot;
[1] &quot;############################### start  181 Fri Jun 17 10:43:13 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00181_example_9.12_of_section_9.2.4.R&quot;
[1] &quot;#####   in directory ../CDC&quot;

&gt; # example 9.12 of section 9.2.4 
&gt; # (example 9.12 of section 9.2.4)  : Exploring advanced methods : Using generalized additive models (GAMs) to learn non-monotone relationships : Using GAM on actual data 
&gt; # Title: Plotting GAM results 
&gt; 
&gt; terms &lt;- predict(glinmodel, type=&quot;terms&quot;)         # Note: 1 

&gt; tframe &lt;- cbind(DBWT = train$DBWT, as.data.frame(terms))      # Note: 2 

&gt; colnames(tframe) &lt;- gsub(&#39;[()]&#39;, &#39;&#39;, colnames(tframe))        # Note: 3 

&gt; pframe &lt;- cbind(tframe, train[,c(&quot;PWGT&quot;, &quot;WTGAIN&quot;,
                                        &quot;MAGER&quot;, &quot;UPREVIS&quot;)])           # Note: 4 

&gt; p1 &lt;- ggplot(pframe, aes(x=PWGT)) +
    geom_point(aes(y=scale(sPWGT, scale=F))) +      # Note: 5 
    geom_smooth(aes(y=scale(DBWT, scale=F))) # +    # Note: 6 

&gt; # [...]   # Note: 7
&gt; 
&gt; # Note 1: 
&gt; #   Get the matrix of s() 
&gt; #   functions. 
&gt; 
&gt; # Note 2: 
&gt; #   Bind in birth weight; convert to data 
&gt; #   frame. 
&gt; 
&gt; # Note 3: 
&gt; #   Make the column names reference-friendly 
&gt; #   (“s(PWGT)” is converted to “sPWGT”, etc.). 
&gt; 
&gt; # Note 4: 
&gt; #   Bind in the input variables. 
&gt; 
&gt; # Note 5: 
&gt; #   Plot s(PWGT) shifted to be zero mean versus PWGT (mother’s weight) as points. 
&gt; 
&gt; # Note 6: 
&gt; #   Plot the smoothing curve of DBWT (birth weight) shifted to be zero mean versus PWGT (mother’s 
&gt; #   weight). 
&gt; 
&gt; # Note 7: 
&gt; #   Repeat for remaining variables (omitted for 
&gt; #   brevity). 
&gt; 
[1] &quot;############################### end  181 Fri Jun 17 10:43:13 2016&quot;
[1] &quot;############################### start  182 Fri Jun 17 10:43:13 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00182_example_9.13_of_section_9.2.4.R&quot;
[1] &quot;#####   in directory ../CDC&quot;

&gt; # example 9.13 of section 9.2.4 
&gt; # (example 9.13 of section 9.2.4)  : Exploring advanced methods : Using generalized additive models (GAMs) to learn non-monotone relationships : Using GAM on actual data 
&gt; # Title: Checking GAM model performance on hold-out data 
&gt; 
&gt; pred.lin &lt;- predict(linmodel, newdata=test)   # Note: 1 

&gt; pred.glin &lt;- predict(glinmodel, newdata=test)

&gt; cor(pred.lin, test$DBWT)^2            # Note: 2 
[1] 0.0616812

&gt; # [1] 0.0616812
&gt; cor(pred.glin, test$DBWT)^2
[1] 0.08857426

&gt; # [1] 0.08857426
&gt; 
&gt; # Note 1: 
&gt; #   Run both the linear model and the GAM on the test 
&gt; #   data. 
&gt; 
&gt; # Note 2: 
&gt; #   Calculate R-squared for both 
&gt; #   models. 
&gt; 
[1] &quot;############################### end  182 Fri Jun 17 10:43:13 2016&quot;
[1] &quot;############################### start  183 Fri Jun 17 10:43:13 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00183_example_9.14_of_section_9.2.5.R&quot;
[1] &quot;#####   in directory ../CDC&quot;

&gt; # example 9.14 of section 9.2.5 
&gt; # (example 9.14 of section 9.2.5)  : Exploring advanced methods : Using generalized additive models (GAMs) to learn non-monotone relationships : Using GAM for logistic regression 
&gt; # Title: GLM logistic regression 
&gt; 
&gt; form &lt;- as.formula(&quot;DBWT &lt; 2000 ~ PWGT + WTGAIN + MAGER + UPREVIS&quot;)

&gt; logmod &lt;- glm(form, data=train, family=binomial(link=&quot;logit&quot;))
[1] &quot;############################### end  183 Fri Jun 17 10:43:14 2016&quot;
[1] &quot;############################### start  184 Fri Jun 17 10:43:14 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00184_example_9.15_of_section_9.2.5.R&quot;
[1] &quot;#####   in directory ../CDC&quot;

&gt; # example 9.15 of section 9.2.5 
&gt; # (example 9.15 of section 9.2.5)  : Exploring advanced methods : Using generalized additive models (GAMs) to learn non-monotone relationships : Using GAM for logistic regression 
&gt; # Title: GAM logistic regression 
&gt; 
&gt; form2 &lt;- as.formula(&quot;DBWT&lt;2000~s(PWGT)+s(WTGAIN)+
                                               s(MAGER)+s(UPREVIS)&quot;)

&gt; glogmod &lt;- gam(form2, data=train, family=binomial(link=&quot;logit&quot;))

&gt; glogmod$converged
[1] TRUE

&gt; ## [1] TRUE
&gt; 
&gt; summary(glogmod)

Family: binomial 
Link function: logit 

Formula:
DBWT &lt; 2000 ~ s(PWGT) + s(WTGAIN) + s(MAGER) + s(UPREVIS)

Parametric coefficients:
            Estimate Std. Error z value Pr(&gt;|z|)    
(Intercept) -3.94085    0.06794     -58   &lt;2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Approximate significance of smooth terms:
             edf Ref.df  Chi.sq  p-value    
s(PWGT)    1.905  2.420   2.463  0.39023    
s(WTGAIN)  3.674  4.543  64.211 1.81e-12 ***
s(MAGER)   1.003  1.005   8.347  0.00393 ** 
s(UPREVIS) 6.802  7.216 217.631  &lt; 2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

R-sq.(adj) =  0.0331   Deviance explained = 9.14%
UBRE = -0.76987  Scale est. = 1         n = 14386

&gt; ## Family: binomial 
&gt; ## Link function: logit 
&gt; ##
&gt; ## Formula:
&gt; ## DBWT &lt; 2000 ~ s(PWGT) + s(WTGAIN) + s(MAGER) + s(UPREVIS)
&gt; ##
&gt; ## Parametric coefficients:
&gt; ##             Estimate Std. Error z value Pr(&gt;|z|)    
&gt; ## (Intercept) -3.94085    0.06794     -58   &lt;2e-16 ***
&gt; ## ---
&gt; ## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
&gt; ##
&gt; ## Approximate significance of smooth terms: 
&gt; ##              edf Ref.df  Chi.sq  p-value    
&gt; ## s(PWGT)    1.905  2.420   2.463  0.36412       # Note: 1 
&gt; ## s(WTGAIN)  3.674  4.543  64.426 1.72e-12 ***
&gt; ## s(MAGER)   1.003  1.005   8.335  0.00394 ** 
&gt; ## s(UPREVIS) 6.802  7.216 217.631  &lt; 2e-16 ***
&gt; ## ---
&gt; ## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
&gt; ##
&gt; ## R-sq.(adj) =  0.0331   Deviance explained = 9.14%  # Note: 2 
&gt; ## UBRE score = -0.76987  Scale est. = 1         n = 14386
&gt; 
&gt; # Note 1: 
&gt; #   Note that there’s no statistically significant evidence that the mother’s weight (PWGT) 
&gt; #   has an effect on outcome. 
&gt; 
&gt; # Note 2: 
&gt; #   “Deviance explained” is the pseudo R-squared: 1 - 
&gt; #   (deviance/null.deviance). 
&gt; 
[1] &quot;############################### end  184 Fri Jun 17 10:43:17 2016&quot;</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">rm</span>(<span class="dt">list=</span><span class="kw">ls</span>())
<span class="kw">source</span>(<span class="st">&#39;runDir.R&#39;</span>)</code></pre></div>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(<span class="st">&#39;ggplot2&#39;</span>)
<span class="kw">load</span>(<span class="st">&#39;../PUMS/psub.RData&#39;</span>)
<span class="kw">runDir</span>(<span class="st">&#39;../CodeExamples/c09_Exploring_advanced_methods&#39;</span>,
      <span class="st">&#39;../PUMS&#39;</span>,<span class="dt">first=</span><span class="dv">185</span>,<span class="dt">last=</span><span class="dv">195</span>)</code></pre></div>
<pre><code>[1] &quot;############################### start  185 Fri Jun 17 10:43:17 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00185_example_9.16_of_section_9.3.1.R&quot;
[1] &quot;#####   in directory ../PUMS&quot;

&gt; # example 9.16 of section 9.3.1 
&gt; # (example 9.16 of section 9.3.1)  : Exploring advanced methods : Using kernel methods to increase data separation : Understanding kernel functions 
&gt; # Title: An artificial kernel example 
&gt; 
&gt; u &lt;- c(1,2)

&gt; v &lt;- c(3,4)

&gt; k &lt;- function(u,v) {  # Note: 1 
      u[1]*v[1] + u[2]*v[2] +
         u[1]*u[1]*v[1]*v[1] + u[2]*u[2]*v[2]*v[2] +
         u[1]*u[2]*v[1]*v[2]
   }

&gt; phi &lt;- function(x) {  # Note: 2 
      x &lt;- as.numeric(x)
      c(x,x*x,combn(x,2,FUN=prod))
   }

&gt; print(k(u,v))     # Note: 3 
[1] 108

&gt; ## [1] 108
&gt; print(phi(u))
[1] 1 2 1 4 2

&gt; ## [1] 1 2 1 4 2
&gt; print(phi(v))
[1]  3  4  9 16 12

&gt; ## [1]  3  4  9 16 12
&gt; print(as.numeric(phi(u) %*% phi(v)))  # Note: 4 
[1] 108

&gt; ## [1] 108
&gt; 
&gt; # Note 1: 
&gt; #   Define a function of two vector variables 
&gt; #   (both two dimensional) as the sum of various products of terms. 
&gt; 
&gt; # Note 2: 
&gt; #   Define a function of a single vector variable 
&gt; #   that returns a vector containing the original entries plus all products of 
&gt; #   entries. 
&gt; 
&gt; # Note 3: 
&gt; #   Example evaluation of k(,). 
&gt; 
&gt; # Note 4: 
&gt; #   Confirm phi() agrees with k(,). phi() is the certificate that shows k(,) is in fact a 
&gt; #   kernel. 
&gt; 
[1] &quot;############################### end  185 Fri Jun 17 10:43:17 2016&quot;
[1] &quot;############################### start  186 Fri Jun 17 10:43:17 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00186_example_9.17_of_section_9.3.2.R&quot;
[1] &quot;#####   in directory ../PUMS&quot;

&gt; # example 9.17 of section 9.3.2 
&gt; # (example 9.17 of section 9.3.2)  : Exploring advanced methods : Using kernel methods to increase data separation : Using an explicit kernel on a problem 
&gt; # Title: Applying stepwise linear regression to PUMS data 
&gt; 
&gt; dtrain &lt;- subset(psub,ORIGRANDGROUP &gt;= 500)

&gt; dtest &lt;- subset(psub,ORIGRANDGROUP &lt; 500)     # Note: 1 

&gt; m1 &lt;- step(   # Note: 2 
    lm(log(PINCP,base=10) ~ AGEP + SEX + COW + SCHL,
       data=dtrain),    # Note: 3 
    direction=&#39;both&#39;)
Start:  AIC=-1545.52
log(PINCP, base = 10) ~ AGEP + SEX + COW + SCHL

       Df Sum of Sq    RSS     AIC
&lt;none&gt;              41.842 -1545.5
- COW   6    1.1304 42.973 -1541.7
- SEX   1    1.1462 42.989 -1531.4
- AGEP  1    5.4363 47.279 -1474.8
- SCHL  8   14.0776 55.920 -1389.0

&gt; rmse &lt;- function(y, f) { sqrt(mean( (y-f)^2 )) }  # Note: 4 

&gt; print(rmse(log(dtest$PINCP,base=10),
    predict(m1,newdata=dtest)))     # Note: 5 
[1] 0.2752171

&gt; # [1] 0.2752171
&gt; 
&gt; # Note 1: 
&gt; #   Split data into test and training. 
&gt; 
&gt; # Note 2: 
&gt; #   Ask that the linear regression model we’re building be 
&gt; #   stepwise improved, which is a powerful automated procedure for 
&gt; #   removing variables that don’t seem to have significant impacts 
&gt; #   (can improve generalization performance). 
&gt; 
&gt; # Note 3: 
&gt; #   Build the basic linear regression model. 
&gt; 
&gt; # Note 4: 
&gt; #   Define the RMSE function. 
&gt; 
&gt; # Note 5: 
&gt; #   Calculate the RMSE between the prediction and the 
&gt; #   actuals. 
&gt; 
[1] &quot;############################### end  186 Fri Jun 17 10:43:17 2016&quot;
[1] &quot;############################### start  187 Fri Jun 17 10:43:17 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00187_example_9.18_of_section_9.3.2.R&quot;
[1] &quot;#####   in directory ../PUMS&quot;

&gt; # example 9.18 of section 9.3.2 
&gt; # (example 9.18 of section 9.3.2)  : Exploring advanced methods : Using kernel methods to increase data separation : Using an explicit kernel on a problem 
&gt; # Title: Applying an example explicit kernel transform 
&gt; 
&gt; phi &lt;- function(x) {  # Note: 1 
      x &lt;- as.numeric(x)
      c(x,x*x,combn(x,2,FUN=prod))
   }

&gt; phiNames &lt;- function(n) {     # Note: 2 
      c(n,paste(n,n,sep=&#39;:&#39;),
         combn(n,2,FUN=function(x) {paste(x,collapse=&#39;:&#39;)}))
   }

&gt; modelMatrix &lt;- model.matrix(~ 0 + AGEP + SEX + COW + SCHL,psub)   # Note: 3 

&gt; colnames(modelMatrix) &lt;- gsub(&#39;[^a-zA-Z0-9]+&#39;,&#39;_&#39;,
    colnames(modelMatrix))  # Note: 4 

&gt; pM &lt;- t(apply(modelMatrix,1,phi))     # Note: 5 

&gt; vars &lt;- phiNames(colnames(modelMatrix))

&gt; vars &lt;- gsub(&#39;[^a-zA-Z0-9]+&#39;,&#39;_&#39;,vars)

&gt; colnames(pM) &lt;- vars  # Note: 6 

&gt; pM &lt;- as.data.frame(pM)

&gt; pM$PINCP &lt;- psub$PINCP

&gt; pM$ORIGRANDGROUP &lt;- psub$ORIGRANDGROUP

&gt; pMtrain &lt;- subset(pM,ORIGRANDGROUP &gt;= 500)

&gt; pMtest &lt;- subset(pM,ORIGRANDGROUP &lt; 500)  # Note: 7

&gt; # Note 1: 
&gt; #   Define our primal kernel function: map a 
&gt; #   vector to a copy of itself plus all square terms and cross-multiplied 
&gt; #   terms. 
&gt; 
&gt; # Note 2: 
&gt; #   Define a function similar to our primal 
&gt; #   kernel, but working on variable names instead of values. 
&gt; 
&gt; # Note 3: 
&gt; #   Convert data to a matrix where all 
&gt; #   categorical variables are encoded as multiple numeric indicators. 
&gt; 
&gt; # Note 4: 
&gt; #   Remove problematic characters from matrix 
&gt; #   column names. 
&gt; 
&gt; # Note 5: 
&gt; #   Apply the primal kernel function to every 
&gt; #   row of the matrix and transpose results so they’re written as rows (apply() 
&gt; #   returns each row’s result as a column of its output matrix, not as a row). 
&gt; 
&gt; # Note 6: 
&gt; #   Extend names from original matrix to 
&gt; #   names for compound variables in new matrix. 
&gt; 
&gt; # Note 7: 
&gt; #   Add in outcomes, test/train split 
&gt; #   columns, and prepare new data for modeling. 
&gt; 
[1] &quot;############################### end  187 Fri Jun 17 10:43:17 2016&quot;
[1] &quot;############################### start  188 Fri Jun 17 10:43:17 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00188_example_9.19_of_section_9.3.2.R&quot;
[1] &quot;#####   in directory ../PUMS&quot;

&gt; # example 9.19 of section 9.3.2 
&gt; # (example 9.19 of section 9.3.2)  : Exploring advanced methods : Using kernel methods to increase data separation : Using an explicit kernel on a problem 
&gt; # Title: Modeling using the explicit kernel transform 
&gt; 
&gt; formulaStr2 &lt;- paste(&#39;log(PINCP,base=10)&#39;,
    paste(vars,collapse=&#39; + &#39;),
    sep=&#39; ~ &#39;)

&gt; m2 &lt;- lm(as.formula(formulaStr2),data=pMtrain)

&gt; coef2 &lt;- summary(m2)$coefficients

&gt; interestingVars &lt;- setdiff(rownames(coef2)[coef2[,&#39;Pr(&gt;|t|)&#39;]&lt;0.01],
                               &#39;(Intercept)&#39;)

&gt; interestingVars &lt;- union(colnames(modelMatrix),interestingVars)   # Note: 1 

&gt; formulaStr3 &lt;- paste(&#39;log(PINCP,base=10)&#39;,
                     paste(interestingVars,collapse=&#39; + &#39;),
                     sep=&#39; ~ &#39;)

&gt; m3 &lt;- step(lm(as.formula(formulaStr3),data=pMtrain),direction=&#39;both&#39;)     # Note: 2 
Start:  AIC=-1563.28
log(PINCP, base = 10) ~ AGEP + SEXM + SEXF + COWFederal_government_employee + 
    COWLocal_government_employee + COWPrivate_not_for_profit_employee + 
    COWSelf_employed_incorporated + COWSelf_employed_not_incorporated + 
    COWState_government_employee + SCHLAssociate_s_degree + SCHLBachelor_s_degree + 
    SCHLDoctorate_degree + SCHLGED_or_alternative_credential + 
    SCHLMaster_s_degree + SCHLProfessional_degree + SCHLRegular_high_school_diploma + 
    SCHLsome_college_credit_no_degree + AGEP_AGEP


Step:  AIC=-1563.28
log(PINCP, base = 10) ~ AGEP + SEXM + COWFederal_government_employee + 
    COWLocal_government_employee + COWPrivate_not_for_profit_employee + 
    COWSelf_employed_incorporated + COWSelf_employed_not_incorporated + 
    COWState_government_employee + SCHLAssociate_s_degree + SCHLBachelor_s_degree + 
    SCHLDoctorate_degree + SCHLGED_or_alternative_credential + 
    SCHLMaster_s_degree + SCHLProfessional_degree + SCHLRegular_high_school_diploma + 
    SCHLsome_college_credit_no_degree + AGEP_AGEP

                                     Df Sum of Sq    RSS     AIC
- COWFederal_government_employee      1    0.0529 40.528 -1564.5
- COWLocal_government_employee        1    0.0556 40.531 -1564.5
- COWSelf_employed_not_incorporated   1    0.0580 40.533 -1564.4
- COWSelf_employed_incorporated       1    0.0739 40.549 -1564.2
&lt;none&gt;                                            40.475 -1563.3
- COWState_government_employee        1    0.1454 40.621 -1563.2
- SCHLGED_or_alternative_credential   1    0.2038 40.679 -1562.3
- SCHLDoctorate_degree                1    0.2976 40.773 -1560.9
- SCHLRegular_high_school_diploma     1    0.4088 40.884 -1559.3
- COWPrivate_not_for_profit_employee  1    0.7791 41.255 -1553.9
- SEXM                                1    1.0356 41.511 -1550.2
- SCHLsome_college_credit_no_degree   1    1.0598 41.535 -1549.9
- AGEP_AGEP                           1    1.3669 41.842 -1545.5
- SCHLAssociate_s_degree              1    1.4120 41.887 -1544.9
- AGEP                                1    1.9901 42.466 -1536.7
- SCHLProfessional_degree             1    4.1591 44.635 -1507.1
- SCHLBachelor_s_degree               1    5.8785 46.354 -1484.6
- SCHLMaster_s_degree                 1    6.0262 46.502 -1482.7

Step:  AIC=-1564.5
log(PINCP, base = 10) ~ AGEP + SEXM + COWLocal_government_employee + 
    COWPrivate_not_for_profit_employee + COWSelf_employed_incorporated + 
    COWSelf_employed_not_incorporated + COWState_government_employee + 
    SCHLAssociate_s_degree + SCHLBachelor_s_degree + SCHLDoctorate_degree + 
    SCHLGED_or_alternative_credential + SCHLMaster_s_degree + 
    SCHLProfessional_degree + SCHLRegular_high_school_diploma + 
    SCHLsome_college_credit_no_degree + AGEP_AGEP

                                     Df Sum of Sq    RSS     AIC
- COWSelf_employed_not_incorporated   1    0.0621 40.590 -1565.6
- COWLocal_government_employee        1    0.0628 40.591 -1565.6
- COWSelf_employed_incorporated       1    0.0810 40.609 -1565.3
&lt;none&gt;                                            40.528 -1564.5
- COWState_government_employee        1    0.1566 40.685 -1564.2
- SCHLGED_or_alternative_credential   1    0.2157 40.744 -1563.3
+ COWFederal_government_employee      1    0.0529 40.475 -1563.3
- SCHLDoctorate_degree                1    0.2989 40.827 -1562.1
- SCHLRegular_high_school_diploma     1    0.4193 40.948 -1560.4
- COWPrivate_not_for_profit_employee  1    0.8172 41.345 -1554.6
- SEXM                                1    1.0873 41.616 -1550.8
- SCHLsome_college_credit_no_degree   1    1.0992 41.628 -1550.6
- AGEP_AGEP                           1    1.3837 41.912 -1546.5
- SCHLAssociate_s_degree              1    1.4720 42.000 -1545.3
- AGEP                                1    2.0110 42.539 -1537.7
- SCHLProfessional_degree             1    4.2486 44.777 -1507.2
- SCHLBachelor_s_degree               1    6.0593 46.588 -1483.6
- SCHLMaster_s_degree                 1    6.1231 46.651 -1482.8

Step:  AIC=-1565.59
log(PINCP, base = 10) ~ AGEP + SEXM + COWLocal_government_employee + 
    COWPrivate_not_for_profit_employee + COWSelf_employed_incorporated + 
    COWState_government_employee + SCHLAssociate_s_degree + SCHLBachelor_s_degree + 
    SCHLDoctorate_degree + SCHLGED_or_alternative_credential + 
    SCHLMaster_s_degree + SCHLProfessional_degree + SCHLRegular_high_school_diploma + 
    SCHLsome_college_credit_no_degree + AGEP_AGEP

                                     Df Sum of Sq    RSS     AIC
- COWLocal_government_employee        1    0.0580 40.648 -1566.7
- COWSelf_employed_incorporated       1    0.0767 40.667 -1566.5
&lt;none&gt;                                            40.590 -1565.6
- COWState_government_employee        1    0.1487 40.739 -1565.4
+ COWSelf_employed_not_incorporated   1    0.0621 40.528 -1564.5
+ COWFederal_government_employee      1    0.0570 40.533 -1564.4
- SCHLGED_or_alternative_credential   1    0.2345 40.825 -1564.2
- SCHLDoctorate_degree                1    0.3092 40.900 -1563.1
- SCHLRegular_high_school_diploma     1    0.4556 41.046 -1561.0
- COWPrivate_not_for_profit_employee  1    0.7940 41.384 -1556.1
- SEXM                                1    1.0830 41.673 -1551.9
- SCHLsome_college_credit_no_degree   1    1.1667 41.757 -1550.7
- AGEP_AGEP                           1    1.3816 41.972 -1547.7
- SCHLAssociate_s_degree              1    1.5088 42.099 -1545.9
- AGEP                                1    2.0030 42.593 -1538.9
- SCHLProfessional_degree             1    4.3220 44.912 -1507.4
- SCHLBachelor_s_degree               1    6.1755 46.766 -1483.3
- SCHLMaster_s_degree                 1    6.3180 46.908 -1481.5

Step:  AIC=-1566.74
log(PINCP, base = 10) ~ AGEP + SEXM + COWPrivate_not_for_profit_employee + 
    COWSelf_employed_incorporated + COWState_government_employee + 
    SCHLAssociate_s_degree + SCHLBachelor_s_degree + SCHLDoctorate_degree + 
    SCHLGED_or_alternative_credential + SCHLMaster_s_degree + 
    SCHLProfessional_degree + SCHLRegular_high_school_diploma + 
    SCHLsome_college_credit_no_degree + AGEP_AGEP

                                     Df Sum of Sq    RSS     AIC
- COWSelf_employed_incorporated       1    0.0705 40.719 -1567.7
- COWState_government_employee        1    0.1297 40.778 -1566.8
&lt;none&gt;                                            40.648 -1566.7
+ COWFederal_government_employee      1    0.0640 40.584 -1565.7
+ COWLocal_government_employee        1    0.0580 40.590 -1565.6
+ COWSelf_employed_not_incorporated   1    0.0573 40.591 -1565.6
- SCHLGED_or_alternative_credential   1    0.2366 40.885 -1565.3
- SCHLDoctorate_degree                1    0.2875 40.936 -1564.5
- SCHLRegular_high_school_diploma     1    0.4549 41.103 -1562.1
- COWPrivate_not_for_profit_employee  1    0.7562 41.405 -1557.8
- SCHLsome_college_credit_no_degree   1    1.1621 41.811 -1552.0
- SEXM                                1    1.2186 41.867 -1551.2
- AGEP_AGEP                           1    1.3635 42.012 -1549.1
- SCHLAssociate_s_degree              1    1.5080 42.156 -1547.1
- AGEP                                1    1.9797 42.628 -1540.5
- SCHLProfessional_degree             1    4.2647 44.913 -1509.4
- SCHLBachelor_s_degree               1    6.1175 46.766 -1485.3
- SCHLMaster_s_degree                 1    6.3301 46.979 -1482.6

Step:  AIC=-1567.71
log(PINCP, base = 10) ~ AGEP + SEXM + COWPrivate_not_for_profit_employee + 
    COWState_government_employee + SCHLAssociate_s_degree + SCHLBachelor_s_degree + 
    SCHLDoctorate_degree + SCHLGED_or_alternative_credential + 
    SCHLMaster_s_degree + SCHLProfessional_degree + SCHLRegular_high_school_diploma + 
    SCHLsome_college_credit_no_degree + AGEP_AGEP

                                     Df Sum of Sq    RSS     AIC
- COWState_government_employee        1    0.1237 40.843 -1567.9
&lt;none&gt;                                            40.719 -1567.7
+ COWFederal_government_employee      1    0.0707 40.648 -1566.8
+ COWSelf_employed_incorporated       1    0.0705 40.648 -1566.7
+ COWSelf_employed_not_incorporated   1    0.0536 40.665 -1566.5
+ COWLocal_government_employee        1    0.0518 40.667 -1566.5
- SCHLGED_or_alternative_credential   1    0.2360 40.955 -1566.3
- SCHLDoctorate_degree                1    0.2886 41.008 -1565.5
- SCHLRegular_high_school_diploma     1    0.4276 41.146 -1563.5
- COWPrivate_not_for_profit_employee  1    0.7274 41.446 -1559.2
- SCHLsome_college_credit_no_degree   1    1.1306 41.850 -1553.4
- SEXM                                1    1.1717 41.891 -1552.8
- AGEP_AGEP                           1    1.3736 42.093 -1550.0
- SCHLAssociate_s_degree              1    1.4994 42.218 -1548.2
- AGEP                                1    1.9842 42.703 -1541.4
- SCHLProfessional_degree             1    4.1942 44.913 -1511.4
- SCHLBachelor_s_degree               1    6.0696 46.789 -1487.0
- SCHLMaster_s_degree                 1    6.2770 46.996 -1484.4

Step:  AIC=-1567.91
log(PINCP, base = 10) ~ AGEP + SEXM + COWPrivate_not_for_profit_employee + 
    SCHLAssociate_s_degree + SCHLBachelor_s_degree + SCHLDoctorate_degree + 
    SCHLGED_or_alternative_credential + SCHLMaster_s_degree + 
    SCHLProfessional_degree + SCHLRegular_high_school_diploma + 
    SCHLsome_college_credit_no_degree + AGEP_AGEP

                                     Df Sum of Sq    RSS     AIC
&lt;none&gt;                                            40.843 -1567.9
+ COWState_government_employee        1    0.1237 40.719 -1567.7
+ COWFederal_government_employee      1    0.0801 40.763 -1567.1
+ COWSelf_employed_incorporated       1    0.0644 40.778 -1566.8
+ COWSelf_employed_not_incorporated   1    0.0477 40.795 -1566.6
- SCHLGED_or_alternative_credential   1    0.2357 41.078 -1566.5
+ COWLocal_government_employee        1    0.0347 40.808 -1566.4
- SCHLDoctorate_degree                1    0.2901 41.133 -1565.7
- SCHLRegular_high_school_diploma     1    0.4194 41.262 -1563.8
- COWPrivate_not_for_profit_employee  1    0.6860 41.529 -1560.0
- SCHLsome_college_credit_no_degree   1    1.1101 41.953 -1554.0
- SEXM                                1    1.2198 42.062 -1552.4
- AGEP_AGEP                           1    1.3760 42.219 -1550.2
- SCHLAssociate_s_degree              1    1.4515 42.294 -1549.1
- AGEP                                1    1.9805 42.823 -1541.7
- SCHLProfessional_degree             1    4.1851 45.028 -1511.9
- SCHLBachelor_s_degree               1    5.9520 46.795 -1489.0
- SCHLMaster_s_degree                 1    6.1704 47.013 -1486.2

&gt; print(rmse(log(pMtest$PINCP,base=10),predict(m3,newdata=pMtest)))     # Note: 3 
[1] 0.2735955

&gt; # [1] 0.2735955
&gt; 
&gt; # Note 1: 
&gt; #   Select a set of interesting variables by building an initial model using all of the new 
&gt; #   variables and retaining an interesting subset. This is an ad hoc 
&gt; #   move to speed up the stepwise regression by trying to quickly 
&gt; #   dispose of many useless derived variables. By introducing many new 
&gt; #   variables, the primal kernel method also introduces many new degrees 
&gt; #   of freedom, which can invite overfitting. 
&gt; 
&gt; # Note 2: 
&gt; #   Stepwise regress on subset of variables to 
&gt; #   get new model. 
&gt; 
&gt; # Note 3: 
&gt; #   Calculate the RMSE between the prediction and the actuals. 
&gt; 
[1] &quot;############################### end  188 Fri Jun 17 10:43:17 2016&quot;
[1] &quot;############################### start  189 Fri Jun 17 10:43:17 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00189_example_9.20_of_section_9.3.2.R&quot;
[1] &quot;#####   in directory ../PUMS&quot;

&gt; # example 9.20 of section 9.3.2 
&gt; # (example 9.20 of section 9.3.2)  : Exploring advanced methods : Using kernel methods to increase data separation : Using an explicit kernel on a problem 
&gt; # Title: Inspecting the results of the explicit kernel model 
&gt; 
&gt; print(summary(m3))

Call:
lm(formula = log(PINCP, base = 10) ~ AGEP + SEXM + COWPrivate_not_for_profit_employee + 
    SCHLAssociate_s_degree + SCHLBachelor_s_degree + SCHLDoctorate_degree + 
    SCHLGED_or_alternative_credential + SCHLMaster_s_degree + 
    SCHLProfessional_degree + SCHLRegular_high_school_diploma + 
    SCHLsome_college_credit_no_degree + AGEP_AGEP, data = pMtrain)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.29264 -0.14925  0.01343  0.17021  0.61968 

Coefficients:
                                     Estimate Std. Error t value Pr(&gt;|t|)
(Intercept)                         2.9400460  0.2219310  13.248  &lt; 2e-16
AGEP                                0.0663537  0.0124905   5.312 1.54e-07
SEXM                                0.0934876  0.0224236   4.169 3.52e-05
COWPrivate_not_for_profit_employee -0.1187914  0.0379944  -3.127  0.00186
SCHLAssociate_s_degree              0.2317211  0.0509509   4.548 6.60e-06
SCHLBachelor_s_degree               0.3844459  0.0417445   9.210  &lt; 2e-16
SCHLDoctorate_degree                0.3190572  0.1569356   2.033  0.04250
SCHLGED_or_alternative_credential   0.1405157  0.0766743   1.833  0.06737
SCHLMaster_s_degree                 0.4553550  0.0485609   9.377  &lt; 2e-16
SCHLProfessional_degree             0.6525921  0.0845052   7.723 5.01e-14
SCHLRegular_high_school_diploma     0.1016590  0.0415834   2.445  0.01479
SCHLsome_college_credit_no_degree   0.1655906  0.0416345   3.977 7.85e-05
AGEP_AGEP                          -0.0007547  0.0001704  -4.428 1.14e-05
                                      
(Intercept)                        ***
AGEP                               ***
SEXM                               ***
COWPrivate_not_for_profit_employee ** 
SCHLAssociate_s_degree             ***
SCHLBachelor_s_degree              ***
SCHLDoctorate_degree               *  
SCHLGED_or_alternative_credential  .  
SCHLMaster_s_degree                ***
SCHLProfessional_degree            ***
SCHLRegular_high_school_diploma    *  
SCHLsome_college_credit_no_degree  ***
AGEP_AGEP                          ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Residual standard error: 0.2649 on 582 degrees of freedom
Multiple R-squared:  0.3541,    Adjusted R-squared:  0.3408 
F-statistic: 26.59 on 12 and 582 DF,  p-value: &lt; 2.2e-16


&gt; ## Call:
&gt; ## lm(formula = log(PINCP, base = 10) ~ AGEP + SEXM +
&gt; ##     COWPrivate_not_for_profit_employee +
&gt; ##     SCHLAssociate_s_degree + SCHLBachelor_s_degree +
&gt; ##     SCHLDoctorate_degree +
&gt; ##     SCHLGED_or_alternative_credential + SCHLMaster_s_degree +
&gt; ##     SCHLProfessional_degree + SCHLRegular_high_school_diploma +
&gt; ##     SCHLsome_college_credit_no_degree + AGEP_AGEP, data = pMtrain)
&gt; ##
&gt; ## Residuals:
&gt; ##      Min       1Q   Median       3Q      Max
&gt; ## -1.29264 -0.14925  0.01343  0.17021  0.61968
&gt; ##
&gt; ## Coefficients:
&gt; ##                                 Estimate Std. Error t value Pr(&gt;|t|)
&gt; ## (Intercept)                   2.9400460  0.2219310  13.248  &lt; 2e-16 ***
&gt; ## AGEP                          0.0663537  0.0124905   5.312 1.54e-07 ***
&gt; ## SEXM                          0.0934876  0.0224236   4.169 3.52e-05 ***
&gt; ## COWPrivate_not_for_profit_em -0.1187914  0.0379944  -3.127  0.00186 **
&gt; ## SCHLAssociate_s_degree        0.2317211  0.0509509   4.548 6.60e-06 ***
&gt; ## SCHLBachelor_s_degree         0.3844459  0.0417445   9.210  &lt; 2e-16 ***
&gt; ## SCHLDoctorate_degree          0.3190572  0.1569356   2.033  0.04250 *
&gt; ## SCHLGED_or_alternative_creden 0.1405157  0.0766743   1.833  0.06737 .
&gt; ## SCHLMaster_s_degree           0.4553550  0.0485609   9.377  &lt; 2e-16 ***
&gt; ## SCHLProfessional_degree       0.6525921  0.0845052   7.723 5.01e-14 ***
&gt; ## SCHLRegular_high_school_diplo 0.1016590  0.0415834   2.445  0.01479 *
&gt; ## SCHLsome_college_credit_no_de 0.1655906  0.0416345   3.977 7.85e-05 ***
&gt; ## AGEP_AGEP                    -0.0007547  0.0001704  -4.428 1.14e-05 ***
&gt; ## ---
&gt; ## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
&gt; ##
&gt; ## Residual standard error: 0.2649 on 582 degrees of freedom
&gt; ## Multiple R-squared:  0.3541,    Adjusted R-squared:  0.3408
&gt; ## F-statistic: 26.59 on 12 and 582 DF,  p-value: &lt; 2.2e-16
&gt; 
[1] &quot;############################### end  189 Fri Jun 17 10:43:17 2016&quot;</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">rm</span>(<span class="dt">list=</span><span class="kw">ls</span>())
<span class="kw">source</span>(<span class="st">&#39;runDir.R&#39;</span>)</code></pre></div>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(<span class="st">&#39;ggplot2&#39;</span>)
<span class="kw">runDir</span>(<span class="st">&#39;../CodeExamples/c09_Exploring_advanced_methods&#39;</span>,
      <span class="st">&#39;../Spambase&#39;</span>,<span class="dt">first=</span><span class="dv">196</span>)</code></pre></div>
<pre><code>[1] &quot;############################### start  196 Fri Jun 17 10:43:17 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00196_example_9.21_of_section_9.4.2.R&quot;
[1] &quot;#####   in directory ../Spambase&quot;

&gt; # example 9.21 of section 9.4.2 
&gt; # (example 9.21 of section 9.4.2)  : Exploring advanced methods : Using SVMs to model complicated decision boundaries : Trying an SVM on artificial example data 
&gt; # Title: Setting up the spirals data as an example classification problem 
&gt; 
&gt; library(&#39;kernlab&#39;)


Attaching package: &#39;kernlab&#39;

The following object is masked from &#39;package:ggplot2&#39;:

    alpha


&gt; data(&#39;spirals&#39;)   # Note: 1 

&gt; sc &lt;- specc(spirals, centers = 2)     # Note: 2 

&gt; s &lt;- data.frame(x=spirals[,1],y=spirals[,2],
    class=as.factor(sc))    # Note: 3 

&gt; library(&#39;ggplot2&#39;)

&gt; ggplot(data=s) +
    geom_text(aes(x=x,y=y,
       label=class,color=class)) +
    coord_fixed() + 
    theme_bw() + theme(legend.position=&#39;none&#39;)  # Note: 4</code></pre>
<div class="figure">
<img src="rCh09_files/figure-markdown_github/ch9ex4-1.png" alt="" />

</div>
<pre><code>&gt; # Note 1: 
&gt; #   Load the kernlab kernel and support vector 
&gt; #   machine package and then ask that the included example &quot;spirals&quot; be made 
&gt; #   available. 
&gt; 
&gt; # Note 2: 
&gt; #   Use kernlab’s spectral clustering routine 
&gt; #   to identify the two different spirals in the example dataset. 
&gt; 
&gt; # Note 3: 
&gt; #   Combine the spiral coordinates and the 
&gt; #   spiral label into a data frame. 
&gt; 
&gt; # Note 4: 
&gt; #   Plot the spirals with class labels. 
&gt; 
[1] &quot;############################### end  196 Fri Jun 17 10:43:19 2016&quot;
[1] &quot;############################### start  197 Fri Jun 17 10:43:19 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00197_example_9.22_of_section_9.4.2.R&quot;
[1] &quot;#####   in directory ../Spambase&quot;

&gt; # example 9.22 of section 9.4.2 
&gt; # (example 9.22 of section 9.4.2)  : Exploring advanced methods : Using SVMs to model complicated decision boundaries : Trying an SVM on artificial example data 
&gt; # Title: SVM with a poor choice of kernel 
&gt; 
&gt; set.seed(2335246L)

&gt; s$group &lt;- sample.int(100,size=dim(s)[[1]],replace=T)

&gt; sTrain &lt;- subset(s,group&gt;10)

&gt; sTest &lt;- subset(s,group&lt;=10)  # Note: 1 

&gt; # mSVMV &lt;- ksvm(class~x+y,data=sTrain,kernel=&#39;vanilladot&#39;) 
&gt; # had been using ksvm, but it seems to keep bad state in some cases
&gt; library(&#39;e1071&#39;)

&gt; mSVMV &lt;- svm(class~x+y,data=sTrain,kernel=&#39;linear&#39;,type=&#39;nu-classification&#39;)  # Note: 2 

&gt; sTest$predSVMV &lt;- predict(mSVMV,newdata=sTest,type=&#39;response&#39;)    # Note: 3 

&gt; ggplot() +
    geom_text(data=sTest,aes(x=x,y=y,
       label=predSVMV),size=12) +
    geom_text(data=s,aes(x=x,y=y,
       label=class,color=class),alpha=0.7) +
    coord_fixed() + 
    theme_bw() + theme(legend.position=&#39;none&#39;)  # Note: 4</code></pre>
<div class="figure">
<img src="rCh09_files/figure-markdown_github/ch9ex4-2.png" alt="" />

</div>
<pre><code>&gt; # Note 1: 
&gt; #   Prepare to try to learn spiral class label 
&gt; #   from coordinates using a support vector machine. 
&gt; 
&gt; # Note 2: 
&gt; #   Build the support vector model using a 
&gt; #   linear kernel (called &quot;vanilladot&quot; in kernlab; not a very good kernel here). 
&gt; 
&gt; # Note 3: 
&gt; #   Use the model to predict class on held-out 
&gt; #   data. 
&gt; 
&gt; # Note 4: 
&gt; #   Plot the held-out predictions as large labels, overlaid with a 
&gt; #   semi-transparent, class-colored copy of all the data so we can see if 
&gt; #   predictions agree with the original markings. 
&gt; 
[1] &quot;############################### end  197 Fri Jun 17 10:43:20 2016&quot;
[1] &quot;############################### start  198 Fri Jun 17 10:43:20 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00198_example_9.23_of_section_9.4.2.R&quot;
[1] &quot;#####   in directory ../Spambase&quot;

&gt; # example 9.23 of section 9.4.2 
&gt; # (example 9.23 of section 9.4.2)  : Exploring advanced methods : Using SVMs to model complicated decision boundaries : Trying an SVM on artificial example data 
&gt; # Title: SVM with a good choice of kernel 
&gt; 
&gt; # mSVMG &lt;- ksvm(class~x+y,data=sTrain,kernel=&#39;rbfdot&#39;)
&gt; # had been using ksvm, but it seems to be keeping bad state in some cases
&gt; mSVMG &lt;- svm(class~x+y,data=sTrain,kernel=&#39;radial&#39;,type=&#39;nu-classification&#39;)  # Note: 1 

&gt; sTest$predSVMG &lt;- predict(mSVMG,newdata=sTest,type=&#39;response&#39;)

&gt; ggplot() +
    geom_text(data=sTest,aes(x=x,y=y,
       label=predSVMG),size=12) +
    geom_text(data=s,aes(x=x,y=y,
       label=class,color=class),alpha=0.7) +
    coord_fixed() + 
    theme_bw() + theme(legend.position=&#39;none&#39;)


&gt; # Note 1: 
&gt; #   This time use the &quot;radial&quot; or 
&gt; #   Gaussian kernel, which is a nice geometric similarity measure. 
&gt; 
[1] &quot;############################### end  198 Fri Jun 17 10:43:20 2016&quot;
[1] &quot;############################### start  199 Fri Jun 17 10:43:20 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00199_example_9.24_of_section_9.4.3.R&quot;
[1] &quot;#####   in directory ../Spambase&quot;

&gt; # example 9.24 of section 9.4.3 
&gt; # (example 9.24 of section 9.4.3)  : Exploring advanced methods : Using SVMs to model complicated decision boundaries : Using SVMs on real data 
&gt; # Title: Revisiting the Spambase example with GLM 
&gt; 
&gt; spamD &lt;- read.table(&#39;spamD.tsv&#39;,header=T,sep=&#39;\t&#39;)

&gt; spamTrain &lt;- subset(spamD,spamD$rgroup&gt;=10)

&gt; spamTest &lt;- subset(spamD,spamD$rgroup&lt;10)

&gt; spamVars &lt;- setdiff(colnames(spamD),list(&#39;rgroup&#39;,&#39;spam&#39;))

&gt; spamFormula &lt;- as.formula(paste(&#39;spam==&quot;spam&quot;&#39;,
    paste(spamVars,collapse=&#39; + &#39;),sep=&#39; ~ &#39;))

&gt; spamModel &lt;- glm(spamFormula,family=binomial(link=&#39;logit&#39;),
    data=spamTrain)

Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred</code></pre>
<div class="figure">
<img src="rCh09_files/figure-markdown_github/ch9ex4-3.png" alt="" />

</div>
<pre><code>&gt; spamTest$pred &lt;- predict(spamModel,newdata=spamTest,
    type=&#39;response&#39;)

&gt; print(with(spamTest,table(y=spam,glPred=pred&gt;=0.5)))
          glPred
y          FALSE TRUE
  non-spam   264   14
  spam        22  158

&gt; ##           glPred
&gt; ## y          FALSE TRUE
&gt; ##   non-spam   264   14
&gt; ##   spam        22  158
&gt; 
[1] &quot;############################### end  199 Fri Jun 17 10:43:20 2016&quot;
[1] &quot;############################### start  200 Fri Jun 17 10:43:20 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00200_example_9.25_of_section_9.4.3.R&quot;
[1] &quot;#####   in directory ../Spambase&quot;

&gt; # example 9.25 of section 9.4.3 
&gt; # (example 9.25 of section 9.4.3)  : Exploring advanced methods : Using SVMs to model complicated decision boundaries : Using SVMs on real data 
&gt; # Title: Applying an SVM to the Spambase example 
&gt; 
&gt; library(&#39;kernlab&#39;)

&gt; spamFormulaV &lt;- as.formula(paste(&#39;spam&#39;,
    paste(spamVars,collapse=&#39; + &#39;),sep=&#39; ~ &#39;))

&gt; # may want to switch to library(&#39;e1071&#39;) svm(), as ksvm had some state-holding problems in some examples
&gt; svmM &lt;- ksvm(spamFormulaV,data=spamTrain,     # Note: 1 
         kernel=&#39;rbfdot&#39;,   # Note: 2 
         C=10,  # Note: 3 
         prob.model=T,cross=5,  # Note: 4 
         class.weights=c(&#39;spam&#39;=1,&#39;non-spam&#39;=10)    # Note: 5 
         )

&gt; spamTest$svmPred &lt;- predict(svmM,newdata=spamTest,type=&#39;response&#39;)

&gt; print(with(spamTest,table(y=spam,svmPred=svmPred)))
          svmPred
y          non-spam spam
  non-spam      269    9
  spam           29  151

&gt; ##           svmPred
&gt; ## y          non-spam spam
&gt; ##   non-spam      269    9
&gt; ##   spam           27  153
&gt; 
&gt; # Note 1: 
&gt; #   Build a support vector model for the Spambase 
&gt; #   problem. 
&gt; 
&gt; # Note 2: 
&gt; #   Ask for the radial dot or Gaussian kernel (in 
&gt; #   fact the default kernel). 
&gt; 
&gt; # Note 3: 
&gt; #   Set the “soft margin penalty” high; prefer not moving training examples over getting a wider 
&gt; #   margin. Prefer a complex model that applies weakly to all the data 
&gt; #   over a simpler model that applies strongly on a subset of the 
&gt; #   data. 
&gt; 
&gt; # Note 4: 
&gt; #   Ask that, in addition to a predictive model, a model estimating class 
&gt; #   probabilities also be built. Not all SVM libraries support this 
&gt; #   operation, and the probabilities are essentially built after the 
&gt; #   model (through a cross-validation procedure) and may not be as high-quality 
&gt; #   as the model itself. 
&gt; 
&gt; # Note 5: 
&gt; #   Explicitly control the trade-off between 
&gt; #   false positive and false negative errors. In this case, we say non-spam 
&gt; #   classified as spam (a false positive) should be considered an expensive 
&gt; #   mistake. 
&gt; 
[1] &quot;############################### end  200 Fri Jun 17 10:43:27 2016&quot;
[1] &quot;############################### start  201 Fri Jun 17 10:43:27 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00201_example_9.26_of_section_9.4.3.R&quot;
[1] &quot;#####   in directory ../Spambase&quot;

&gt; # example 9.26 of section 9.4.3 
&gt; # (example 9.26 of section 9.4.3)  : Exploring advanced methods : Using SVMs to model complicated decision boundaries : Using SVMs on real data 
&gt; # Title: Printing the SVM results summary 
&gt; 
&gt; print(svmM)
Support Vector Machine object of class &quot;ksvm&quot; 

SV type: C-svc  (classification) 
 parameter : cost C = 10 

Gaussian Radial Basis kernel function. 
 Hyperparameter : sigma =  0.0296673304748671 

Number of Support Vectors : 1115 

Objective Function Value : -4674.85 
Training error : 0.028965 
Cross validation error : 0.076026 
Probability model included. 

&gt; ## Support Vector Machine object of class &quot;ksvm&quot; 
&gt; ##
&gt; ## SV type: C-svc  (classification) 
&gt; ##  parameter : cost C = 10 
&gt; ##
&gt; ## Gaussian Radial Basis kernel function. 
&gt; ##  Hyperparameter : sigma =  0.0299836801848002 
&gt; ##
&gt; ## Number of Support Vectors : 1118 
&gt; ##
&gt; ## Objective Function Value : -4642.236 
&gt; ## Training error : 0.028482 
&gt; ## Cross validation error : 0.076998 
&gt; ## Probability model included.
&gt; 
[1] &quot;############################### end  201 Fri Jun 17 10:43:27 2016&quot;
[1] &quot;############################### start  202 Fri Jun 17 10:43:27 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c09_Exploring_advanced_methods/00202_example_9.27_of_section_9.4.3.R&quot;
[1] &quot;#####   in directory ../Spambase&quot;

&gt; # example 9.27 of section 9.4.3 
&gt; # (example 9.27 of section 9.4.3)  : Exploring advanced methods : Using SVMs to model complicated decision boundaries : Using SVMs on real data 
&gt; # Title: Shifting decision point to perform an apples-to-apples comparison 
&gt; 
&gt; sameCut &lt;- sort(spamTest$pred)[length(spamTest$pred)-162]     # Note: 1 

&gt; print(with(spamTest,table(y=spam,glPred=pred&gt;sameCut)))   # Note: 2 
          glPred
y          FALSE TRUE
  non-spam   267   11
  spam        29  151

&gt; ##           glPred
&gt; ## y          FALSE TRUE
&gt; ##   non-spam   267   11
&gt; ##   spam        29  151
&gt; 
&gt; # Note 1: 
&gt; #   Find out what GLM score threshold has 162 
&gt; #   examples above it. 
&gt; 
&gt; # Note 2: 
&gt; #   Ask the GLM model for its predictions that 
&gt; #   are above the threshold. We’re essentially asking the model for its 162 best 
&gt; #   candidate spam prediction results. 
&gt; 
[1] &quot;############################### end  202 Fri Jun 17 10:43:27 2016&quot;</code></pre>