-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathwideChurnModel.Rmd
137 lines (108 loc) · 4.15 KB
/
wideChurnModel.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
```{r}
library(glmnet)
library(ggplot2)
library(WVPlots)
```
Modeling functions
```{r}
# assuming xframe is entirely numeric
ridge_predict_function = function(model) {
# to get around the 'unfullfilled promise' leak. blech.
force(model)
function(xframe) {
as.numeric(predict(model, newx=as.matrix(xframe), type="response"))
}
}
# assuming the xframe is entirely numeric
# if there are categories, we would have to use
# model_matrix, or something
# assuming family is one of c("binomial", "gaussian")
# should have a check for that
ridge_model = function(xframe, y, family="binomial") {
model = glmnet(as.matrix(xframe), y, alpha=0,
lambda=0.001, family=family)
list(coef = coef(model),
deviance = deviance(model),
predfun = ridge_predict_function(model)
)
}
```
```{r model}
# loads vars (names of vars), yVar (name of y column),
# dTrainS, dTestS
load("wideData.rData")
model = ridge_model(dTrainS[,vars], dTrainS[[yVar]])
names = rownames(model$coef)
coefs = as.vector(model$coef)
names(coefs) = names
ord = order(abs(coefs), decreasing=TRUE)
coefs[ord]
testpred = model$predfun(dTestS[,vars])
dTestS$pred = testpred
DoubleDensityPlot(dTestS, "pred", yVar,
title="Model Score Distribution")
ROCPlot(dTestS, "pred", yVar, title="ROC plot")
GainCurvePlot(dTestS, "pred", yVar, title="Gain plot")
```
Evaluate the positive examples. Do we detect churn in time?
*Note: from the implementation, we know that things in "bad" state
will leave in about 5 days. things not in bad state will leave in about 15 days.*
```{r evaluate}
dTestS$predictedToLeave = dTestS$pred>0.5
# confusion matrix
cmat = table(pred=dTestS$predictedToLeave, actual=dTestS[[yVar]])
cmat
recall = cmat[2,2]/sum(cmat[,2])
recall
precision = cmat[2,2]/sum(cmat[2,])
precision
# make daysToX finite. The idea is that the live-forevers should be rare
isinf = dTestS$daysToX==Inf
maxval = max(dTestS$daysToX[!isinf])
dTestS$daysToX = with(dTestS, ifelse(daysToX==Inf, maxval, daysToX))
# how long on average until flagged customers leave?
posmean = mean(dTestS[dTestS$predictedToLeave, "daysToX"])
posmean
# how long on average until unflagged customers leave?
negmean = mean(dTestS[!dTestS$predictedToLeave, "daysToX"])
negmean
print(negmean-posmean) # theoretically, the difference can't be above 10
```
```{r drilldown}
# xvar is integral
dist_and_mean = function(frm, xvar, title, meanlabel) {
meanval = mean(frm[[xvar]])
mode = max(table(frm[[xvar]]))
print(paste("Mean days until exit:", meanval))
DiscreteDistribution(frm, xvar, title=title) +
geom_vline(xintercept=meanval, color="blue", linetype=2)+
annotate("text", x=meanval, y=mode,
hjust=0, vjust=0,
label=meanlabel,
color="blue")
}
# how long until true positives (customers flagged as leaving who really do leave) leave?
tpfilter = dTestS$predictedToLeave & dTestS[[yVar]]
dist_and_mean(dTestS[tpfilter,], "daysToX",
"Distribution of days til exit, true positives",
"mean days til exit")
# same plot as above for all positives and all negatives
dist_and_mean(dTestS[dTestS$predictedToLeave,], "daysToX",
"Distribution of days til exit, predicted positive",
"mean days til exit") + geom_vline(xintercept=7, color="red")
dist_and_mean(dTestS[!dTestS$predictedToLeave,], "daysToX",
"Distribution of days til exit, predicted negative",
"mean days til exit") + geom_vline(xintercept=7, color="red")
# how soon do the actual positives that we missed leave?
falsenegmean = mean(dTestS[!dTestS$predictedToLeave & dTestS[[yVar]], "daysToX"])
falsenegmean
# alternative to the double density plot above;
# plots days to churn, instead of model score
ggplot(dTestS, aes_string(x="daysToX", color="predictedToLeave")) +
geom_density(adjust=0.5) +
geom_vline(xintercept=7, color="darkgray") +
geom_vline(xintercept=posmean, color="cyan4", linetype=2) +
geom_vline(xintercept=negmean, color="red", linetype=2) +
ggtitle("Days to Exit")
ScatterBoxPlot(dTestS, "predictedToLeave", "daysToX", pt_alpha=0.2,
title="Distribution of days til exit")