::p_load(dplyr,ggplot2,caTools)
pacmanrm(list=ls(all=TRUE))
load("data/tf3.rdata")
= subset(A, spl)
TR = subset(A, !spl) TS
= glm(buy ~ ., TR[,c(2:9, 11)], family=binomial())
glm1 summary(glm1)
##
## Call:
## glm(formula = buy ~ ., family = binomial(), data = TR[, c(2:9,
## 11)])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.7931 -0.8733 -0.6991 1.0384 1.8735
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.259e+00 1.261e-01 -9.985 < 2e-16 ***
## r -1.227e-02 8.951e-04 -13.708 < 2e-16 ***
## s 9.566e-03 9.101e-04 10.511 < 2e-16 ***
## f 2.905e-01 1.593e-02 18.233 < 2e-16 ***
## m -3.028e-05 2.777e-05 -1.090 0.27559
## rev 4.086e-05 1.940e-05 2.106 0.03521 *
## raw -2.306e-04 8.561e-05 -2.693 0.00708 **
## agea29 -4.194e-02 8.666e-02 -0.484 0.62838
## agea34 1.772e-02 7.992e-02 0.222 0.82456
## agea39 7.705e-02 7.921e-02 0.973 0.33074
## agea44 8.699e-02 8.132e-02 1.070 0.28476
## agea49 1.928e-02 8.457e-02 0.228 0.81962
## agea54 1.745e-02 9.323e-02 0.187 0.85155
## agea59 1.752e-01 1.094e-01 1.602 0.10926
## agea64 6.177e-02 1.175e-01 0.526 0.59904
## agea69 2.652e-01 1.047e-01 2.533 0.01131 *
## agea99 -1.419e-01 1.498e-01 -0.947 0.34347
## areaz106 -4.105e-02 1.321e-01 -0.311 0.75603
## areaz110 -2.075e-01 1.045e-01 -1.986 0.04703 *
## areaz114 3.801e-02 1.111e-01 0.342 0.73214
## areaz115 2.599e-01 9.682e-02 2.684 0.00727 **
## areaz221 1.817e-01 9.753e-02 1.863 0.06243 .
## areazOthers -4.677e-02 1.045e-01 -0.448 0.65435
## areazUnknown -1.695e-01 1.232e-01 -1.375 0.16912
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27629 on 20007 degrees of freedom
## Residual deviance: 23295 on 19984 degrees of freedom
## AIC: 23343
##
## Number of Fisher Scoring iterations: 5
= predict(glm1, TS, type="response")
pred = table(actual = TS$buy, predict = pred > 0.5); cm cm
## predict
## actual FALSE TRUE
## FALSE 3730 873
## TRUE 1700 2273
= cm %>% {sum(diag(.))/sum(.)}
acc.ts c(1-mean(TS$buy) , acc.ts) # 0.69998
## [1] 0.5367304 0.6999767
colAUC(pred, TS$buy) # 0.7556
## [,1]
## FALSE vs. TRUE 0.7556038
= subset(A, A$buy) %>% mutate_at(c("m","rev","amount"), log10)
A2 = subset(A2, spl2)
TR2 = subset(A2, !spl2) TS2
= lm(amount ~ ., TR2[,2:10])
lm1 # lm1 = lm(amount ~ ., TR2[,c(2:6,8:10)])
summary(lm1)
##
## Call:
## lm(formula = amount ~ ., data = TR2[, 2:10])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.02055 -0.22997 0.04995 0.28613 1.37046
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.337e+00 5.434e-02 24.605 < 2e-16 ***
## r 3.809e-04 3.128e-04 1.218 0.22330
## s 1.404e-04 3.155e-04 0.445 0.65639
## f 2.054e-02 1.957e-03 10.499 < 2e-16 ***
## m 4.578e-01 3.810e-02 12.017 < 2e-16 ***
## rev 1.974e-02 3.627e-02 0.544 0.58618
## raw 6.947e-05 8.574e-06 8.102 6.10e-16 ***
## agea29 4.874e-02 2.519e-02 1.935 0.05297 .
## agea34 9.043e-02 2.323e-02 3.893 9.98e-05 ***
## agea39 1.216e-01 2.284e-02 5.322 1.05e-07 ***
## agea44 1.101e-01 2.346e-02 4.691 2.76e-06 ***
## agea49 6.667e-02 2.435e-02 2.739 0.00618 **
## agea54 8.679e-02 2.653e-02 3.272 0.00107 **
## agea59 4.009e-02 3.097e-02 1.294 0.19559
## agea64 7.085e-03 3.248e-02 0.218 0.82734
## agea69 -3.604e-02 2.883e-02 -1.250 0.21136
## agea99 9.705e-02 4.067e-02 2.386 0.01704 *
## areaz106 9.740e-02 4.257e-02 2.288 0.02216 *
## areaz110 5.381e-02 3.459e-02 1.556 0.11976
## areaz114 1.547e-02 3.635e-02 0.426 0.67030
## areaz115 1.773e-02 3.169e-02 0.560 0.57577
## areaz221 3.851e-02 3.194e-02 1.206 0.22801
## areazOthers 3.598e-02 3.431e-02 1.049 0.29440
## areazUnknown 1.220e-02 3.857e-02 0.316 0.75178
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4234 on 9245 degrees of freedom
## Multiple R-squared: 0.2847, Adjusted R-squared: 0.2829
## F-statistic: 160 on 23 and 9245 DF, p-value: < 2.2e-16
= summary(lm1)$r.sq
r2.tr = sum((TS2$amount - mean(TR2$amount))^ 2)
SST = sum((predict(lm1, TS2) - TS2$amount)^2)
SSE = 1 - (SSE/SST)
r2.ts c(R2train=r2.tr, R2test=r2.ts)
## R2train R2test
## 0.2846719 0.2896076
Aggregate data 2000-12-01 ~ 2001~02-28.
load("data/tf0.rdata")
= max(X0$date) + 1
d0 = X0 %>%
B filter(date >= as.Date("2000-12-01")) %>%
mutate(days = as.integer(difftime(d0, date, units="days"))) %>%
group_by(cust) %>% summarise(
r = min(days), # recency
s = max(days), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = age[1], # age group
area = area[1], # area code
%>% data.frame # 28584
) nrow(B)
## [1] 28531
In B
, there is a record for each customer.
B$Buy
is the probability of buying in March.
$Buy = predict(glm1, B, type="response") B
💡: remember log transformation for the monetary variables
= B %>% mutate_at(c("m","rev"), log10)
B2 $Rev = 10^predict(lm1, B2) B
par(mfrow=c(1,2), cex=0.8)
hist(B$Buy)
hist(log(B$Rev,10))
save(B, file='data/tf4.rdata')
Given one’s
we can estimate his/her CLV via
\[ V_i = \sum_{t=0}^N g \times m_i \frac{r_i^t}{(1+d)^t} = g \times m_i \sum_{t=0}^N (\frac{r_i}{1+d})^t \]
= 0.3 # margin
g = 36 # period
N = 0.01 # interest rate
d $CLV = g * B$Rev * rowSums(sapply(
B0:N, function(i) (B$Buy/(1+d))^i ) )
summary(B$CLV)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 35.5 280.0 450.2 1412.0 786.0 1145741.4
ggplot(B, aes(CLV)) +
geom_histogram(bins=30, fill="green",alpha=0.4) +
scale_x_log10()
save(B, file='data/tf4.rdata')