RunExamples/rCh04.html

<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">source</span>(<span class="st">&#39;runDir.R&#39;</span>)</code></pre></div>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">custdata &lt;-<span class="st"> </span><span class="kw">read.table</span>(<span class="st">&#39;../Custdata/custdata.tsv&#39;</span>,
                       <span class="dt">header=</span><span class="ot">TRUE</span>,<span class="dt">sep=</span><span class="st">&#39;</span><span class="ch">\t</span><span class="st">&#39;</span>)
<span class="kw">runDir</span>(<span class="st">&#39;../CodeExamples/c04_Managing_data&#39;</span>,
       <span class="st">&#39;../Custdata&#39;</span>,<span class="dt">last=</span><span class="dv">55</span>)</code></pre></div>
<pre><code>[1] &quot;############################### start  42 Fri Jun 17 10:32:48 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c04_Managing_data/00042_example_4.1_of_section_4.1.1.R&quot;
[1] &quot;#####   in directory ../Custdata&quot;

&gt; # example 4.1 of section 4.1.1 
&gt; # (example 4.1 of section 4.1.1)  : Managing data : Cleaning data : Treating missing values (NAs) 
&gt; # Title: Checking locations of missing data 
&gt; 
&gt; custdata &lt;- read.table(&#39;custdata.tsv&#39;,
    header=TRUE,sep=&#39;\t&#39;)

&gt; summary(custdata[is.na(custdata$housing.type),    # Note: 1 
                    c(&quot;recent.move&quot;,&quot;num.vehicles&quot;)])   # Note: 2 
 recent.move     num.vehicles
 Mode:logical   Min.   : NA  
 NA&#39;s:56        1st Qu.: NA  
                Median : NA  
                Mean   :NaN  
                3rd Qu.: NA  
                Max.   : NA  
                NA&#39;s   :56   

&gt; ##  recent.move     num.vehicles      # Note: 3 
&gt; ##  Mode:logical   Min.   : NA
&gt; ##  NA&#39;s:56        1st Qu.: NA
&gt; ##                 Median : NA
&gt; ##                 Mean   :NaN
&gt; ##                 3rd Qu.: NA
&gt; ##                 Max.   : NA
&gt; ##                 NA&#39;s   :56
&gt; 
&gt; # Note 1: 
&gt; #   Restrict to the rows where housing.type is 
&gt; #   NA. 
&gt; 
&gt; # Note 2: 
&gt; #   Look only at the columns recent.move and 
&gt; #   num.vehicles. 
&gt; 
&gt; # Note 3: 
&gt; #   The output: all NAs. All the missing data 
&gt; #   comes from the same rows. 
&gt; 
[1] &quot;############################### end  42 Fri Jun 17 10:32:48 2016&quot;
[1] &quot;############################### start  43 Fri Jun 17 10:32:48 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c04_Managing_data/00043_example_4.2_of_section_4.1.1.R&quot;
[1] &quot;#####   in directory ../Custdata&quot;

&gt; # example 4.2 of section 4.1.1 
&gt; # (example 4.2 of section 4.1.1)  : Managing data : Cleaning data : Treating missing values (NAs) 
&gt; # Title: Remapping NA to a level 
&gt; 
&gt; custdata$is.employed.fix &lt;- ifelse(is.na(custdata$is.employed),   # Note: 1 
                                    &quot;missing&quot;,                      # Note: 2 
                                    ifelse(custdata$is.employed==T,     # Note: 3 
                                           &quot;employed&quot;,
                                           &quot;not employed&quot;))     # Note: 4 

&gt; summary(as.factor(custdata$is.employed.fix))  # Note: 5 
    employed      missing not employed 
         599          328           73 

&gt; ##     employed      missing not employed
&gt; ##          599          328           73
&gt; 
&gt; # Note 1: 
&gt; #   If is.employed value is missing... 
&gt; 
&gt; # Note 2: 
&gt; #   ...assign the value &quot;missing&quot;. 
&gt; #   Otherwise... 
&gt; 
&gt; # Note 3: 
&gt; #   ...if is.employed==TRUE, assign the value 
&gt; #   &quot;employed&quot;... 
&gt; 
&gt; # Note 4: 
&gt; #   ...or the value &quot;not employed&quot;. 
&gt; 
&gt; # Note 5: 
&gt; #   The transformation has turned the variable 
&gt; #   type from factor to string. You can change it back 
&gt; #   with the as.factor() function. 
&gt; 
[1] &quot;############################### end  43 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;############################### start  44 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c04_Managing_data/00044_informalexample_4.1_of_section_4.1.1.R&quot;
[1] &quot;#####   in directory ../Custdata&quot;

&gt; # informalexample 4.1 of section 4.1.1 
&gt; # (informalexample 4.1 of section 4.1.1)  : Managing data : Cleaning data : Treating missing values (NAs) 
&gt; 
&gt; custdata$is.employed.fix &lt;- ifelse(is.na(custdata$is.employed),
                   &quot;not in active workforce&quot;,
                    ifelse(custdata$is.employed==T,
                                    &quot;employed&quot;,
                                     &quot;not employed&quot;))
[1] &quot;############################### end  44 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;############################### start  45 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c04_Managing_data/00045_informalexample_4.2_of_section_4.1.1.R&quot;
[1] &quot;#####   in directory ../Custdata&quot;

&gt; # informalexample 4.2 of section 4.1.1 
&gt; # (informalexample 4.2 of section 4.1.1)  : Managing data : Cleaning data : Treating missing values (NAs) 
&gt; 
&gt; summary(custdata$Income)
Length  Class   Mode 
     0   NULL   NULL 

&gt; ##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA&#39;s
&gt; ##       0   25000   45000   66200   82000  615000     328
&gt; 
[1] &quot;############################### end  45 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;############################### start  46 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c04_Managing_data/00046_informalexample_4.3_of_section_4.1.1.R&quot;
[1] &quot;#####   in directory ../Custdata&quot;

&gt; # informalexample 4.3 of section 4.1.1 
&gt; # (informalexample 4.3 of section 4.1.1)  : Managing data : Cleaning data : Treating missing values (NAs) 
&gt; 
&gt; meanIncome &lt;- mean(custdata$Income, na.rm=T)  # Note: 1 

Warning in mean.default(custdata$Income, na.rm = T): argument is not
numeric or logical: returning NA


&gt; Income.fix &lt;- ifelse(is.na(custdata$Income),
                        meanIncome,
                        custdata$Income)

Warning in is.na(custdata$Income): is.na() applied to non-(list or vector)
of type &#39;NULL&#39;


&gt; summary(Income.fix)
   Mode    NA&#39;s 
logical       0 

&gt; ##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
&gt; ##       0   35000   66200   66200   66200  615000
&gt; 
&gt; # Note 1: 
&gt; #   Don’t forget the argument &quot;na.rm=T&quot;! 
&gt; #   Otherwise, the mean() function will include the 
&gt; #   NAs by default, and meanIncome will be NA. 
&gt; 
[1] &quot;############################### end  46 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;############################### start  47 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c04_Managing_data/00047_example_4.3_of_section_4.1.1.R&quot;
[1] &quot;#####   in directory ../Custdata&quot;

&gt; # example 4.3 of section 4.1.1 
&gt; # (example 4.3 of section 4.1.1)  : Managing data : Cleaning data : Treating missing values (NAs) 
&gt; # Title: Converting missing numeric data to a level 
&gt; 
&gt; breaks &lt;-c(0, 10000, 50000, 100000, 250000, 1000000)              # Note: 1 

&gt; Income.groups &lt;- cut(custdata$income,
                       breaks=breaks, include.lowest=T)     # Note: 2 

&gt; summary(Income.groups)                                            # Note: 3 
      [0,1e+04]   (1e+04,5e+04]   (5e+04,1e+05] (1e+05,2.5e+05] 
            184             469             215             105 
(2.5e+05,1e+06]            NA&#39;s 
             26               1 

&gt; ##  [0,1e+04] (1e+04,5e+04] (5e+04,1e+05] (1e+05,2.5e+05] (2.5e+05,1e+06]
&gt; ##         63           312           178              98              21
&gt; ##       NA&#39;s
&gt; ##        328
&gt; 
&gt; Income.groups &lt;- as.character(Income.groups)                      # Note: 4 

&gt; Income.groups &lt;- ifelse(is.na(Income.groups),                     # Note: 5 
                       &quot;no income&quot;, Income.groups)

&gt; summary(as.factor(Income.groups))
  (1e+04,5e+04] (1e+05,2.5e+05] (2.5e+05,1e+06]   (5e+04,1e+05] 
            469             105              26             215 
      [0,1e+04]       no income 
            184               1 

&gt; ##  (1e+04,5e+04] (1e+05,2.5e+05] (2.5e+05,1e+06]  (5e+04,1e+05]  [0,1e+04]
&gt; ##            312              98              21            178         63
&gt; ##      no income
&gt; ##            328
&gt; 
&gt; # Note 1: 
&gt; #   Select some income ranges of interest. To 
&gt; #   use the cut() function, the upper and lower bounds 
&gt; #   should encompass the full income range of the 
&gt; #   data. 
&gt; 
&gt; # Note 2: 
&gt; #   Cut the data into income ranges. The 
&gt; #   include.lowest=T argument makes sure that zero 
&gt; #   income data is included in the lowest income range 
&gt; #   category. By default it would be excluded. 
&gt; 
&gt; # Note 3: 
&gt; #   The cut() function produces factor 
&gt; #   variables. Note the NAs are preserved. 
&gt; 
&gt; # Note 4: 
&gt; #   To preserve the category names before adding 
&gt; #   a new category, convert the variables to strings. 
&gt; 
&gt; # Note 5: 
&gt; #   Add the &quot;no income&quot; category to replace the 
&gt; #   NAs. 
&gt; 
[1] &quot;############################### end  47 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;############################### start  48 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c04_Managing_data/00048_example_4.4_of_section_4.1.1.R&quot;
[1] &quot;#####   in directory ../Custdata&quot;

&gt; # example 4.4 of section 4.1.1 
&gt; # (example 4.4 of section 4.1.1)  : Managing data : Cleaning data : Treating missing values (NAs) 
&gt; # Title: Tracking original NAs with an extra categorical variable 
&gt; 
&gt; missingIncome &lt;- is.na(custdata$Income)   # Note: 1 

Warning in is.na(custdata$Income): is.na() applied to non-(list or vector)
of type &#39;NULL&#39;


&gt; Income.fix &lt;- ifelse(is.na(custdata$Income), 0, custdata$Income)  # Note: 2

Warning in is.na(custdata$Income): is.na() applied to non-(list or vector)
of type &#39;NULL&#39;


&gt; # Note 1: 
&gt; #   The missingIncome variable lets you 
&gt; #   differentiate the two kinds of zeros in the data: 
&gt; #   the ones that you are about to add, and the ones 
&gt; #   that were already there. 
&gt; 
&gt; # Note 2: 
&gt; #   Replace the NAs with zeros. 
&gt; 
[1] &quot;############################### end  48 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;############################### start  49 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c04_Managing_data/00049_example_4.5_of_section_4.1.2.R&quot;
[1] &quot;#####   in directory ../Custdata&quot;

&gt; # example 4.5 of section 4.1.2 
&gt; # (example 4.5 of section 4.1.2)  : Managing data : Cleaning data : Data transformations 
&gt; # Title: Normalizing income by state 
&gt; 
&gt; medianincome &lt;- aggregate(income~state.of.res,custdata,FUN=median)

&gt; colnames(medianincome) &lt;- c(&#39;State&#39;,&#39;Median.Income&#39;)

&gt; summary(medianincome)     # Note: 1 
        State    Median.Income  
 Alabama   : 1   Min.   :    0  
 Alaska    : 1   1st Qu.:27575  
 Arizona   : 1   Median :35685  
 Arkansas  : 1   Mean   :35362  
 California: 1   3rd Qu.:40375  
 Colorado  : 1   Max.   :94700  
 (Other)   :44                  

&gt; ##         State    Median.Income
&gt; ##            : 1   Min.   :37427
&gt; ##  Alabama   : 1   1st Qu.:47483
&gt; ##  Alaska    : 1   Median :52274
&gt; ##  Arizona   : 1   Mean   :52655
&gt; ##  Arkansas  : 1   3rd Qu.:57195
&gt; ##  California: 1   Max.   :68187
&gt; ##  (Other)   :46
&gt; 
&gt;         
&gt; custdata &lt;- merge(custdata, medianincome,
                    by.x=&quot;state.of.res&quot;, by.y=&quot;State&quot;)      # Note: 2 

&gt; summary(custdata[,c(&quot;state.of.res&quot;, &quot;income&quot;, &quot;Median.Income&quot;)])  # Note: 3 
       state.of.res     income       Median.Income  
 California  :100   Min.   : -8700   Min.   :    0  
 New York    : 71   1st Qu.: 14600   1st Qu.:31600  
 Pennsylvania: 70   Median : 35000   Median :35780  
 Texas       : 56   Mean   : 53505   Mean   :34853  
 Michigan    : 52   3rd Qu.: 67000   3rd Qu.:38500  
 Ohio        : 51   Max.   :615000   Max.   :94700  
 (Other)     :600                                   

&gt; ##        state.of.res     income       Median.Income
&gt; ##  California  :100   Min.   : -8700   Min.   :37427
&gt; ##  New York    : 71   1st Qu.: 14600   1st Qu.:44819
&gt; ##  Pennsylvania: 70   Median : 35000   Median :50977
&gt; ##  Texas       : 56   Mean   : 53505   Mean   :51161
&gt; ##  Michigan    : 52   3rd Qu.: 67000   3rd Qu.:55559
&gt; ##  Ohio        : 51   Max.   :615000   Max.   :68187
&gt; ##  (Other)     :600
&gt; 
&gt; custdata$income.norm &lt;- with(custdata, income/Median.Income)  # Note: 4 

&gt; summary(custdata$income.norm)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA&#39;s 
-0.1933  0.4479  1.0000  1.5740  1.8990 17.0800       1 

&gt; ##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
&gt; ## -0.1791  0.2729  0.6992  1.0820  1.3120 11.6600
&gt; 
&gt; # Note 1: 
&gt; #   medianincome is a data frame of median 
&gt; #   income by state. 
&gt; 
&gt; # Note 2: 
&gt; #   Merge median income information into the 
&gt; #   custdata data frame by matching the column 
&gt; #   custdata$state.of.res to the column 
&gt; #   medianincome$State. 
&gt; 
&gt; # Note 3: 
&gt; #   Median.Income is now part of custdata. 
&gt; 
&gt; # Note 4: 
&gt; #   Normalize income by Median.Income. 
&gt; 
[1] &quot;############################### end  49 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;############################### start  50 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c04_Managing_data/00050_informalexample_4.4_of_section_4.1.2.R&quot;
[1] &quot;#####   in directory ../Custdata&quot;

&gt; # informalexample 4.4 of section 4.1.2 
&gt; # (informalexample 4.4 of section 4.1.2)  : Managing data : Cleaning data : Data transformations 
&gt; 
&gt; custdata$income.lt.20K &lt;- custdata$income &lt; 20000

&gt; summary(custdata$income.lt.20K)
   Mode   FALSE    TRUE    NA&#39;s 
logical     678     322       0 

&gt; ##    Mode   FALSE    TRUE    NA&#39;s
&gt; ## logical     678     322       0
&gt; 
[1] &quot;############################### end  50 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;############################### start  51 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c04_Managing_data/00051_example_4.6_of_section_4.1.2.R&quot;
[1] &quot;#####   in directory ../Custdata&quot;

&gt; # example 4.6 of section 4.1.2 
&gt; # (example 4.6 of section 4.1.2)  : Managing data : Cleaning data : Data transformations 
&gt; # Title: Converting age into ranges 
&gt; 
&gt; brks &lt;- c(0, 25, 65, Inf)     # Note: 1 

&gt; custdata$age.range &lt;- cut(custdata$age,
     breaks=brks, include.lowest=T)     # Note: 2 

&gt; summary(custdata$age.range)   # Note: 3 
  [0,25]  (25,65] (65,Inf] 
      56      732      212 

&gt; ##   [0,25]  (25,65] (65,Inf]
&gt; ##       56      732      212
&gt; 
&gt; # Note 1: 
&gt; #   Select the age ranges of interest. The upper 
&gt; #   and lower bounds should encompass the full range 
&gt; #   of the data. 
&gt; 
&gt; # Note 2: 
&gt; #   Cut the data into age ranges. The 
&gt; #   include.lowest=T argument makes sure that zero age 
&gt; #   data is included in the lowest age range category. 
&gt; #   By default it would be excluded. 
&gt; 
&gt; # Note 3: 
&gt; #   The output of cut() is a factor variable. 
&gt; 
[1] &quot;############################### end  51 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;############################### start  52 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c04_Managing_data/00052_example_4.7_of_section_4.1.2.R&quot;
[1] &quot;#####   in directory ../Custdata&quot;

&gt; # example 4.7 of section 4.1.2 
&gt; # (example 4.7 of section 4.1.2)  : Managing data : Cleaning data : Data transformations 
&gt; # Title: Centering on mean age 
&gt; 
&gt; summary(custdata$age)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    0.0    38.0    50.0    51.7    64.0   146.7 

&gt; ##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
&gt; ##     0.0    38.0    50.0    51.7    64.0   146.7
&gt; meanage &lt;- mean(custdata$age)

&gt; custdata$age.normalized &lt;- custdata$age/meanage

&gt; summary(custdata$age.normalized)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.7350  0.9671  1.0000  1.2380  2.8370 

&gt; ##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
&gt; ##  0.0000  0.7350  0.9671  1.0000  1.2380  2.8370
&gt; 
[1] &quot;############################### end  52 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;############################### start  53 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c04_Managing_data/00053_example_4.8_of_section_4.1.2.R&quot;
[1] &quot;#####   in directory ../Custdata&quot;

&gt; # example 4.8 of section 4.1.2 
&gt; # (example 4.8 of section 4.1.2)  : Managing data : Cleaning data : Data transformations 
&gt; # Title: Summarizing age 
&gt; 
&gt; summary(custdata$age)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    0.0    38.0    50.0    51.7    64.0   146.7 

&gt; ##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
&gt; ##     0.0    38.0    50.0    51.7    64.0   146.7
&gt; meanage &lt;- mean(custdata$age)     # Note: 1 

&gt; stdage &lt;- sd(custdata$age)        # Note: 2 

&gt; meanage
[1] 51.69981

&gt; ## [1] 51.69981
&gt; stdage
[1] 18.86343

&gt; ## [1] 18.86343
&gt; custdata$age.normalized &lt;- (custdata$age-meanage)/stdage  # Note: 3 

&gt; summary(custdata$age.normalized)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-2.74100 -0.72630 -0.09011  0.00000  0.65210  5.03500 

&gt; ##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
&gt; ## -2.74100 -0.72630 -0.09011  0.00000  0.65210  5.03500
&gt; 
&gt; # Note 1: 
&gt; #   Take the mean. 
&gt; 
&gt; # Note 2: 
&gt; #   Take the standard deviation. 
&gt; 
&gt; # Note 3: 
&gt; #   Use the mean value as the origin (or 
&gt; #   reference point) and rescale the distance from the 
&gt; #   mean by the standard deviation. 
&gt; 
[1] &quot;############################### end  53 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;############################### start  54 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c04_Managing_data/00054_informalexample_4.5_of_section_4.1.2.R&quot;
[1] &quot;#####   in directory ../Custdata&quot;

&gt; # informalexample 4.5 of section 4.1.2 
&gt; # (informalexample 4.5 of section 4.1.2)  : Managing data : Cleaning data : Data transformations 
&gt; 
&gt; signedlog10 &lt;- function(x) {
   ifelse(abs(x) &lt;= 1, 0, sign(x)*log10(abs(x)))
 }
[1] &quot;############################### end  54 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;############################### start  55 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c04_Managing_data/00055_example_4.9_of_section_4.2.2.R&quot;
[1] &quot;#####   in directory ../Custdata&quot;

&gt; # example 4.9 of section 4.2.2 
&gt; # (example 4.9 of section 4.2.2)  : Managing data : Sampling for modeling and validation : Creating a sample group column 
&gt; # Title: Splitting into test and training using a random group mark 
&gt; 
&gt; custdata$gp &lt;- runif(dim(custdata)[1])    # Note: 1 

&gt; testSet &lt;- subset(custdata, custdata$gp &lt;= 0.1)   # Note: 2 

&gt; trainingSet &lt;- subset(custdata, custdata$gp &gt; 0.1)    # Note: 3 

&gt; dim(testSet)[1]
[1] 105

&gt; ## [1] 93
&gt; dim(trainingSet)[1]
[1] 895

&gt; ## [1] 907
&gt; 
&gt; # Note 1: 
&gt; #   dim(custdata) returns the number of rows and 
&gt; #   columns of the data frame as a vector, so 
&gt; #   dim(custdata)[1] returns the number of rows. 
&gt; 
&gt; # Note 2: 
&gt; #   Here we generate a test set of about 10% of 
&gt; #   the data (93 customers—a little over 9%, actually) 
&gt; #   and train on the remaining 90%. 
&gt; 
&gt; # Note 3: 
&gt; #   Here we generate a training set using the 
&gt; #   remaining data. 
&gt; 
[1] &quot;############################### end  55 Fri Jun 17 10:32:49 2016&quot;</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">rm</span>(<span class="dt">list=</span><span class="kw">ls</span>())
<span class="kw">source</span>(<span class="st">&#39;runDir.R&#39;</span>)
<span class="kw">load</span>(<span class="st">&#39;../NotionalData/exampleData.rData&#39;</span>)
<span class="kw">runDir</span>(<span class="st">&#39;../CodeExamples/c04_Managing_data&#39;</span>,
       <span class="st">&#39;../NotionalData&#39;</span>,<span class="dt">first=</span><span class="dv">56</span>)</code></pre></div>
<pre><code>[1] &quot;############################### start  56 Fri Jun 17 10:32:49 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c04_Managing_data/00056_example_4.10_of_section_4.2.3.R&quot;
[1] &quot;#####   in directory ../NotionalData&quot;

&gt; # example 4.10 of section 4.2.3 
&gt; # (example 4.10 of section 4.2.3)  : Managing data : Sampling for modeling and validation : Record grouping 
&gt; # Title: Ensuring test/train split doesn’t split inside a household 
&gt; 
&gt; hh &lt;- unique(hhdata$household_id)     # Note: 1 

&gt; households &lt;- data.frame(household_id = hh, gp = runif(length(hh)))   # Note: 2 

&gt; hhdata &lt;- merge(hhdata, households, by=&quot;household_id&quot;)    # Note: 3

&gt; # Note 1: 
&gt; #   Get all unique household IDs from your data 
&gt; #   frame. 
&gt; 
&gt; # Note 2: 
&gt; #   Create a temporary data frame of household IDs 
&gt; #   and a uniformly random number from 0 to 1. 
&gt; 
&gt; # Note 3: 
&gt; #   Merge the new random sample group column back 
&gt; #   into the original data frame. 
&gt; 
[1] &quot;############################### end  56 Fri Jun 17 10:32:49 2016&quot;</code></pre>