Model Training & Testing

Fig-1: The First Model


Loading & Preparing Data

pacman::p_load(dplyr,ggplot2,caTools)
rm(list=ls(all=TRUE))
load("data/tf3.rdata")
Spliting for Classification
TR = subset(A, spl)
TS = subset(A, !spl)


Classification Model

glm1 = glm(buy ~ ., TR[,c(2:9, 11)], family=binomial()) 
summary(glm1)
## 
## Call:
## glm(formula = buy ~ ., family = binomial(), data = TR[, c(2:9, 
##     11)])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.7931  -0.8733  -0.6991   1.0384   1.8735  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.259e+00  1.261e-01  -9.985  < 2e-16 ***
## r            -1.227e-02  8.951e-04 -13.708  < 2e-16 ***
## s             9.566e-03  9.101e-04  10.511  < 2e-16 ***
## f             2.905e-01  1.593e-02  18.233  < 2e-16 ***
## m            -3.028e-05  2.777e-05  -1.090  0.27559    
## rev           4.086e-05  1.940e-05   2.106  0.03521 *  
## raw          -2.306e-04  8.561e-05  -2.693  0.00708 ** 
## agea29       -4.194e-02  8.666e-02  -0.484  0.62838    
## agea34        1.772e-02  7.992e-02   0.222  0.82456    
## agea39        7.705e-02  7.921e-02   0.973  0.33074    
## agea44        8.699e-02  8.132e-02   1.070  0.28476    
## agea49        1.928e-02  8.457e-02   0.228  0.81962    
## agea54        1.745e-02  9.323e-02   0.187  0.85155    
## agea59        1.752e-01  1.094e-01   1.602  0.10926    
## agea64        6.177e-02  1.175e-01   0.526  0.59904    
## agea69        2.652e-01  1.047e-01   2.533  0.01131 *  
## agea99       -1.419e-01  1.498e-01  -0.947  0.34347    
## areaz106     -4.105e-02  1.321e-01  -0.311  0.75603    
## areaz110     -2.075e-01  1.045e-01  -1.986  0.04703 *  
## areaz114      3.801e-02  1.111e-01   0.342  0.73214    
## areaz115      2.599e-01  9.682e-02   2.684  0.00727 ** 
## areaz221      1.817e-01  9.753e-02   1.863  0.06243 .  
## areazOthers  -4.677e-02  1.045e-01  -0.448  0.65435    
## areazUnknown -1.695e-01  1.232e-01  -1.375  0.16912    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27629  on 20007  degrees of freedom
## Residual deviance: 23295  on 19984  degrees of freedom
## AIC: 23343
## 
## Number of Fisher Scoring iterations: 5
pred =  predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm
##        predict
## actual  FALSE TRUE
##   FALSE  3730  873
##   TRUE   1700 2273
acc.ts = cm %>% {sum(diag(.))/sum(.)}
c(1-mean(TS$buy) , acc.ts)  # 0.69998
## [1] 0.5367304 0.6999767
colAUC(pred, TS$buy)        # 0.7556
##                     [,1]
## FALSE vs. TRUE 0.7556038


Regression Model

A2 = subset(A, A$buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
lm1 = lm(amount ~ ., TR2[,2:10])
# lm1 = lm(amount ~ ., TR2[,c(2:6,8:10)])
summary(lm1)
## 
## Call:
## lm(formula = amount ~ ., data = TR2[, 2:10])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.02055 -0.22997  0.04995  0.28613  1.37046 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.337e+00  5.434e-02  24.605  < 2e-16 ***
## r             3.809e-04  3.128e-04   1.218  0.22330    
## s             1.404e-04  3.155e-04   0.445  0.65639    
## f             2.054e-02  1.957e-03  10.499  < 2e-16 ***
## m             4.578e-01  3.810e-02  12.017  < 2e-16 ***
## rev           1.974e-02  3.627e-02   0.544  0.58618    
## raw           6.947e-05  8.574e-06   8.102 6.10e-16 ***
## agea29        4.874e-02  2.519e-02   1.935  0.05297 .  
## agea34        9.043e-02  2.323e-02   3.893 9.98e-05 ***
## agea39        1.216e-01  2.284e-02   5.322 1.05e-07 ***
## agea44        1.101e-01  2.346e-02   4.691 2.76e-06 ***
## agea49        6.667e-02  2.435e-02   2.739  0.00618 ** 
## agea54        8.679e-02  2.653e-02   3.272  0.00107 ** 
## agea59        4.009e-02  3.097e-02   1.294  0.19559    
## agea64        7.085e-03  3.248e-02   0.218  0.82734    
## agea69       -3.604e-02  2.883e-02  -1.250  0.21136    
## agea99        9.705e-02  4.067e-02   2.386  0.01704 *  
## areaz106      9.740e-02  4.257e-02   2.288  0.02216 *  
## areaz110      5.381e-02  3.459e-02   1.556  0.11976    
## areaz114      1.547e-02  3.635e-02   0.426  0.67030    
## areaz115      1.773e-02  3.169e-02   0.560  0.57577    
## areaz221      3.851e-02  3.194e-02   1.206  0.22801    
## areazOthers   3.598e-02  3.431e-02   1.049  0.29440    
## areazUnknown  1.220e-02  3.857e-02   0.316  0.75178    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4234 on 9245 degrees of freedom
## Multiple R-squared:  0.2847, Adjusted R-squared:  0.2829 
## F-statistic:   160 on 23 and 9245 DF,  p-value: < 2.2e-16
r2.tr = summary(lm1)$r.sq
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) -  TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(R2train=r2.tr, R2test=r2.ts)
##   R2train    R2test 
## 0.2846719 0.2896076


More Predictors for Better Prediction

Fig-2: Prediction

Prediction

Fig-3: Prediction


Aggregate data 2000-12-01 ~ 2001~02-28.

load("data/tf0.rdata")
d0 = max(X0$date) + 1
B = X0 %>% 
  filter(date >= as.Date("2000-12-01")) %>% 
  mutate(days = as.integer(difftime(d0, date, units="days"))) %>% 
  group_by(cust) %>% summarise(
    r = min(days),      # recency
    s = max(days),      # seniority
    f = n(),            # frquency
    m = mean(total),    # monetary
    rev = sum(total),   # total revenue contribution
    raw = sum(gross),   # total gross profit contribution
    age = age[1],       # age group
    area = area[1],     # area code
  ) %>% data.frame      # 28584
nrow(B)
## [1] 28531

In B, there is a record for each customer. B$Buy is the probability of buying in March.

B$Buy = predict(glm1, B, type="response")

💡: remember log transformation for the monetary variables

B2 = B %>% mutate_at(c("m","rev"), log10)
B$Rev = 10^predict(lm1, B2)
par(mfrow=c(1,2), cex=0.8)
hist(B$Buy)
hist(log(B$Rev,10))

save(B, file='data/tf4.rdata')



CLV - Customer Live Time Value

Given one’s

we can estimate his/her CLV via discounted cash flow.

customer \(i\)’s CLV

\[ V_i = \sum_{t=0}^N g \times m_i \frac{r_i^t}{(1+d)^t} = g \times m_i \sum_{t=0}^N (\frac{r_i}{1+d})^t \]

\(m_i\):customer \(i\)’s expected buying amount
\(r_i\):customer \(i\)’s expected retention probability
\(g\):company’s average operational margin rate
\(d\):company’s weighted average cost of capital
g = 0.3   # margin
N = 36    # period
d = 0.01  # interest rate
B$CLV = g * B$Rev * rowSums(sapply(
  0:N, function(i) (B$Buy/(1+d))^i ) )

summary(B$CLV)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##      35.5     280.0     450.2    1412.0     786.0 1145741.4
ggplot(B, aes(CLV)) + 
  geom_histogram(bins=30, fill="green",alpha=0.4) + 
  scale_x_log10()

save(B, file='data/tf4.rdata')