善用商業數據分析的工具和技巧,光靠一份最簡單的交易紀錄(只有顧客ID、交易日期和交易金額三個欄位),我們就可以做一系列很深入、很有價值的分析,包括:

圖一、顧客價值管理

圖一、顧客價值管理


從這一些分析我們可以看到公司主要的營收和獲利的重要來源,我們也可以看到這一些產生獲利的群組是不是有成長或者衰退的趨勢;據此我們可以設定行銷的重點,決定行銷的策略,和規劃行銷的工具。除了上述的敘述統計、集群分析、和資料視覺化之外,我們還可以利用這些簡單的交易紀錄:

利用這一些預測我們就可以進行全面客製化的:

圖二、顧客價值管理流程

圖二、顧客價值管理流程



Setup

packages = c(
  "dplyr","ggplot2","caTools","ROCR","d3heatmap",
  "googleVis","devtools","plotly")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) 
  install.packages(pkg)
if(!is.element("chorddiag", existing))
  devtools::install_github("mattflor/chorddiag")

Library

options(digits=4)
library(dplyr)
library(ggplot2)
library(caTools)
library(ROCR)
library(d3heatmap)
library(googleVis)
library(chorddiag)

1. 資料整理

1.1 交易資料 (X)

rm(list=ls(all=T))
X = read.table(
  'purchases.txt', header=FALSE, sep='\t', stringsAsFactors=F)
names(X) = c('cid','amount','date')
X$date = as.Date(X$date)
summary(X)                  # 交易次數 51243
##       cid             amount          date           
##  Min.   :    10   Min.   :   5   Min.   :2005-01-02  
##  1st Qu.: 57720   1st Qu.:  25   1st Qu.:2009-01-17  
##  Median :102440   Median :  30   Median :2011-11-23  
##  Mean   :108935   Mean   :  62   Mean   :2011-07-14  
##  3rd Qu.:160525   3rd Qu.:  60   3rd Qu.:2013-12-29  
##  Max.   :264200   Max.   :4500   Max.   :2015-12-31
n_distinct(X$cid)           # 顧客數 18417
## [1] 18417

1.2 顧客資料 (A)

A = X %>% 
  mutate(days = as.integer(as.Date("2016-01-01") - date)) %>% 
  group_by(cid) %>% summarise(
    recent = min(days),     # 最近購買距今天數
    freq = n(),             # 購買次數
    money = mean(amount),   # 平均購買金額
    senior = max(days),     # 第一次購買距今天數
    since = min(date)       # 第一次購買日期
  ) %>% data.frame

1.3 資料概要

summary(A)
##       cid             recent          freq           money          senior    
##  Min.   :    10   Min.   :   1   Min.   : 1.00   Min.   :   5   Min.   :   1  
##  1st Qu.: 81990   1st Qu.: 244   1st Qu.: 1.00   1st Qu.:  22   1st Qu.: 988  
##  Median :136430   Median :1070   Median : 2.00   Median :  30   Median :2087  
##  Mean   :137574   Mean   :1253   Mean   : 2.78   Mean   :  58   Mean   :1984  
##  3rd Qu.:195100   3rd Qu.:2130   3rd Qu.: 3.00   3rd Qu.:  50   3rd Qu.:2992  
##  Max.   :264200   Max.   :4014   Max.   :45.00   Max.   :4500   Max.   :4016  
##      since           
##  Min.   :2005-01-02  
##  1st Qu.:2007-10-23  
##  Median :2010-04-15  
##  Mean   :2010-07-26  
##  3rd Qu.:2013-04-18  
##  Max.   :2015-12-31
# 回購顧客的平均購買週期
K = as.integer(sum(A$senior[A$freq>1]) / sum(A$freq[A$freq>1]))

1.4 資料檢視

p0 = par(cex=0.8, mfrow=c(2,2))
hist(A$recent)
hist(A$freq)
hist(log(A$money,10))
hist(A$senior)

par(p0)

2. 層級式集群分析

2.1 RFM顧客分群

set.seed(111)
A$grp = kmeans(scale(A[,2:4]),10)$cluster
table(A$grp)  # 族群大小
## 
##    1    2    3    4    5    6    7    8    9   10 
## 1073 2266 1296 2237 3207 1942 1781 2392 2096  127

2.2 顧客群組屬性

group_by(A, grp) %>% summarise(
  recent=mean(recent), 
  freq=mean(freq), 
  money=mean(money), 
  size=n() ) %>% 
  mutate( revenue = size*money/1000 )  %>% 
  filter(size > 1) %>% 
  ggplot(aes(x=freq, y=money)) +
  geom_point(aes(size=revenue, col=recent),alpha=0.5) +
  scale_size(range=c(4,40)) +
  scale_color_gradient(low="green",high="red") +
  scale_x_log10() + scale_y_log10() + 
  geom_text(aes(label = size )) +
  theme_bw() + guides(size=F) +
  labs(title="Customer Segements",
       subtitle="(bubble_size:revenue_contribution; text:group_size)",
       color="Recency") +
  xlab("Frequency (log)") + ylab("Average Transaction Amount (log)")


3. 規則分群

3.1 顧客分群規則

STS = c("N1","N2","R1","R2","S1","S2","S3")
Status = function(rx,fx,mx,sx,K) {factor(
  ifelse(sx < 2*K,
         ifelse(fx*mx > 50, "N2", "N1"),
         ifelse(rx < 2*K,
                ifelse(sx/fx < 0.75*K,"R2","R1"),
                ifelse(rx < 3*K,"S1",
                       ifelse(rx < 4*K,"S2","S3")))), STS)}
圖三、顧客分群規則

圖三、顧客分群規則

3.2 滑動資料窗格

Y = list()              # 建立一個空的LIST
for(y in 2010:2015) {   # 每年年底將顧客資料彙整成一個資料框
  D = as.Date(paste0(c(y, y-1),"-12-31")) # 當期、前期的期末日期 
  Y[[paste0("Y",y)]] = X %>%        # 從交易資料做起
    filter(date <= D[1]) %>%        # 將資料切齊到期末日期
    mutate(days = 1 + as.integer(D[1] - date)) %>%   # 交易距期末天數
    group_by(cid) %>% summarise(    # 依顧客彙總 ...
      recent = min(days),           #   最後一次購買距期末天數   
      freq = n(),                   #   購買次數 (至期末為止)   
      money = mean(amount),         #   平均購買金額 (至期末為止)
      senior = max(days),           #   第一次購買距期末天數
      status = Status(recent,freq,money,senior,K),  # 期末狀態
      since = min(date),                      # 第一次購買日期
      y_freq = sum(date > D[2]),              # 當期購買次數
      y_revenue = sum(amount[date > D[2]])    # 當期購買金額
    ) %>% data.frame }

sapply(Y, nrow)
## Y2010 Y2011 Y2012 Y2013 Y2014 Y2015 
## 10407 11674 13562 15468 16905 18417
sapply(Y, function(x) table(x$status))
##    Y2010 Y2011 Y2012 Y2013 Y2014 Y2015
## N1  3330  2640  2778  3054  3106  2772
## N2  1655  1496  1704  1918  2091  1957
## R1  1298  1731  2177  2156  2102  2140
## R2  1547  1831  1923  1955  2016  2264
## S1  2203  2163  2147  2357  1963  2807
## S2   360  1502  1820  1723  2110  1718
## S3    14   311  1013  2305  3517  4759

3.3 族群大小變化趨勢

cols = c("gold","orange","blue","green",
         "pink","magenta","darkred")
sapply(Y, function(df) table(df$status)) %>% barplot(col=cols)
legend("topleft",rev(STS),fill=rev(cols))

3.4 族群屬性動態分析

CustSegments = do.call(rbind, lapply(Y, function(d) {
  group_by(d, status) %>% summarise(
    average_frequency = mean(freq),
    average_amount = mean(money),
    total_revenue = sum(y_revenue),
    total_no_orders = sum(y_freq),
    average_recency = mean(recent),
    average_seniority = mean(senior),
    group_size = n()
  )})) %>% ungroup %>% 
  mutate(year=rep(2010:2015, each=7))

plot( gvisMotionChart(
  CustSegments, "status", "year",
  options=list(width=900, height=600) ) )
## starting httpd help server ... done
圖四、族群屬性動態分析

圖四、族群屬性動態分析

3.5 群間流量分析

df = merge(Y$Y2014[,c(1,6)], Y$Y2015[,c(1,6)],
           by="cid", all.x=T)
tx = table(df$status.x, df$status.y) %>% 
  as.data.frame.matrix() %>% as.matrix()
tx                                  # 流量矩陣
##      N1   N2   R1   R2  S1   S2   S3
## N1 1705  381  144   45 831    0    0
## N2    0 1131  267  430 263    0    0
## R1    0    0 1240   43 819    0    0
## R2    0    0  199 1742  75    0    0
## S1    0    0  115    3 819 1026    0
## S2    0    0   78    1   0  692 1339
## S3    0    0   97    0   0    0 3420
tx %>% prop.table(1) %>% round(4)   # 流量矩陣(%)
##        N1     N2     R1     R2     S1     S2     S3
## N1 0.5489 0.1227 0.0464 0.0145 0.2675 0.0000 0.0000
## N2 0.0000 0.5409 0.1277 0.2056 0.1258 0.0000 0.0000
## R1 0.0000 0.0000 0.5899 0.0205 0.3896 0.0000 0.0000
## R2 0.0000 0.0000 0.0987 0.8641 0.0372 0.0000 0.0000
## S1 0.0000 0.0000 0.0586 0.0015 0.4172 0.5227 0.0000
## S2 0.0000 0.0000 0.0370 0.0005 0.0000 0.3280 0.6346
## S3 0.0000 0.0000 0.0276 0.0000 0.0000 0.0000 0.9724

3.6 互動式流量分析

chorddiag(tx, groupColors=cols)

4. 建立模型

我們的目的是:以Y2015年底的資料,預測每一位顧客:

圖五、建模、預測、估價、選擇

圖五、建模、預測、估價、選擇

4.1 準備資料

我們用Y2014年底的資料做自變數,Y2015年的資料做應變數

CX = left_join(Y$Y2014, Y$Y2015[,c(1,8,9)], by="cid")
names(CX)[8:11] = c("freq0","revenue0","Retain", "Revenue")
CX$Retain = CX$Retain > 0

table(CX$Retain) %>% prop.table()  # 平均保留機率 = 22.54%
## 
##  FALSE   TRUE 
## 0.7701 0.2299

4.2 建立類別模型、預測機率

mRet = glm(Retain ~ ., CX[,c(2:3,6,8:10)], family=binomial())
summary(mRet)
## 
## Call:
## glm(formula = Retain ~ ., family = binomial(), data = CX[, c(2:3, 
##     6, 8:10)])
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.689  -0.473  -0.298  -0.142   3.386  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.074007   0.089431  -12.01  < 2e-16 ***
## recent      -0.002067   0.000131  -15.73  < 2e-16 ***
## freq         0.095217   0.013882    6.86  6.9e-12 ***
## statusN2     0.669429   0.070234    9.53  < 2e-16 ***
## statusR1     0.488321   0.084389    5.79  7.2e-09 ***
## statusR2     1.290002   0.110841   11.64  < 2e-16 ***
## statusS1     0.670604   0.146532    4.58  4.7e-06 ***
## statusS2     1.353554   0.208210    6.50  8.0e-11 ***
## statusS3     2.573689   0.275786    9.33  < 2e-16 ***
## freq0        0.566557   0.065532    8.65  < 2e-16 ***
## revenue0    -0.000132   0.000135   -0.98     0.33    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 18228  on 16904  degrees of freedom
## Residual deviance: 11766  on 16894  degrees of freedom
## AIC: 11788
## 
## Number of Fisher Scoring iterations: 6

4.3 估計類別模型的準確性

pred = predict(mRet,type="response")
table(pred>0.5,CX$Retain)          # 混淆矩陣 (Confusion Matrix)  
##        
##         FALSE  TRUE
##   FALSE 12045  1530
##   TRUE    974  2356
table(pred>0.5,CX$Retain) %>% 
  {sum(diag(.))/sum(.)}            # 正確率(ACC): 85.19%  
## [1] 0.8519
colAUC(pred,CX$Retain)             # 辯識率(AUC): 87.92%
##                  [,1]
## FALSE vs. TRUE 0.8792
prediction(pred, CX$Retain) %>%    # ROC CURVE 
  performance("tpr", "fpr") %>% 
  plot(print.cutoffs.at=seq(0,1,0.1))

4.4 建立數量模型

dx = subset(CX, Revenue > 0)  # 只對有來購買的人做模型
mRev = lm(log(Revenue) ~ recent + freq + log(1+money) + senior +
            status + freq0 + log(1+revenue0), dx)  
summary(mRev)                 # 判定係數:R2 = 0.713
## 
## Call:
## lm(formula = log(Revenue) ~ recent + freq + log(1 + money) + 
##     senior + status + freq0 + log(1 + revenue0), data = dx)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.245 -0.209 -0.067  0.205  3.435 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        5.88e-02   4.58e-02    1.28   0.1997    
## recent             3.54e-04   5.07e-05    6.98  3.4e-12 ***
## freq               5.27e-02   4.65e-03   11.33  < 2e-16 ***
## log(1 + money)     9.32e-01   1.35e-02   68.94  < 2e-16 ***
## senior            -1.37e-04   1.82e-05   -7.52  7.0e-14 ***
## statusN2           1.28e-02   2.63e-02    0.49   0.6268    
## statusR1           1.93e-01   4.08e-02    4.73  2.3e-06 ***
## statusR2           2.98e-02   3.52e-02    0.84   0.3984    
## statusS1           8.24e-03   6.30e-02    0.13   0.8960    
## statusS2          -2.41e-01   8.66e-02   -2.78   0.0055 ** 
## statusS3          -3.67e-01   1.18e-01   -3.11   0.0019 ** 
## freq0              1.03e-02   1.73e-02    0.60   0.5501    
## log(1 + revenue0)  6.33e-02   9.40e-03    6.73  1.9e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.463 on 3873 degrees of freedom
## Multiple R-squared:  0.713,  Adjusted R-squared:  0.712 
## F-statistic:  802 on 12 and 3873 DF,  p-value: <2e-16
plot(log(dx$Revenue), predict(mRev))
abline(0,1,col='red') 


5. 估計顧客終生價值

5.1 進行預測

使用模型對Y2015年底的資料做預測,對資料中的每一位顧客,預測她們在Y2016的保留率和購買金額。

CX = Y$Y2015
names(CX)[8:9] = c("freq0","revenue0")

# 預測Y2016保留率
CX$ProbRetain = predict(mRet,CX,type='response')

# 預測Y2016購買金額
CX$PredRevenue = exp(predict(mRev,CX))
p0 = par(cex=0.8, mfrow=c(1,2))
hist(CX$ProbRetain,main="ProbRetain")
hist(log(CX$PredRevenue,10),main="log(PredRevenue)")

par(p0)

5.2 估計價值

\[ CLV = \sum_{t=0}^N m \frac{r^t}{(1+d)^t} = m \sum_{t=0}^N (\frac{r}{1+d})^t \]

N = 5     # 期數 = 5
d = 0.1   # 利率 = 10%
CX$CLV = CX$PredRevenue * rowSums(sapply(
  0:N, function(i) (CX$ProbRetain/(1+d))^i ) )
summary(CX$CLV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       7      31      48     103      90   10188
tapply(CX$CLV, CX$status, mean)
##     N1     N2     R1     R2     S1     S2     S3 
##  40.34 221.77 109.19 272.61  59.32  51.16  50.34
hist(log(CX$CLV,10))

5.3 比較各族群的價值

tapply(CX$ProbRetain, CX$status, mean)
##      N1      N2      R1      R2      S1      S2      S3 
## 0.20269 0.44075 0.34150 0.74925 0.05724 0.03475 0.02326
boxplot(log(CLV)~status, CX)

ggplot(CX, aes(x=status, y=log(CLV), fill=status)) + 
  geom_violin(draw_quantiles=c(.25,.5,.75)) + 
  theme_minimal()

group_by(CX, status) %>% summarise(
  AvgExpRevenue = mean(PredRevenue),
  AvgRetainProb = mean(ProbRetain),
  AvgCLV = mean(CLV), 
  GroupSize = n() ) %>% 
  mutate(
    GroupValue = AvgCLV * GroupSize
  ) %>%  data.frame
##   status AvgExpRevenue AvgRetainProb AvgCLV GroupSize GroupValue
## 1     N1         31.98       0.20269  40.34      2772     111821
## 2     N2        131.23       0.44075 221.77      1957     434013
## 3     R1         69.85       0.34150 109.19      2140     233670
## 4     R2         91.27       0.74925 272.61      2264     617200
## 5     S1         56.10       0.05724  59.32      2807     166521
## 6     S2         49.48       0.03475  51.16      1718      87900
## 7     S3         49.36       0.02326  50.34      4759     239554

6. 設定行銷策略、規劃行銷工具


7. 選擇行銷對象

給定某一行銷工具的成本和預期效益,選擇可以施行這項工具的對象。

7.1 對R2族群進行保留

R2族群的預測保留率和購買金額

p0 = par(cex=0.8, mfrow=c(1,2))
hist(CX$ProbRetain[CX$status=="R2"],main="ProbRetain",xlab="")
hist(log(CX$PredRevenue[CX$status=="R2"],10),main="PredRevenue",xlab="")

par(p0)

7.2 估計預期報酬

假設行銷工具的成本和預期效益為

cost = 10        # 成本
effect = 0.75    # 效益:下一期的購買機率

估計這項行銷工具對每一位R2顧客的預期報酬

Target = subset(CX, status=="R2")
Target$ExpReturn = (effect - Target$ProbRetain) * Target$PredRevenue - cost
summary(Target$ExpReturn)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -515.8   -15.4   -11.5   -10.3    -8.1   646.9

7.3 選擇行銷對象

Target = Target %>% arrange(desc(ExpReturn)) %>% 
  select(cid, ExpReturn) %>% head(20)

sum(Target$ExpReturn > 0)                 # 可實施對象:258
## [1] 20
sum(Target$ExpReturn[Target$ExpReturn > 0])   # 預期報酬:6464
## [1] 3768

7.4 如果我們全面對所有的族群實施這項工具 …

Target = CX
Target$ExpReturn = (effect - Target$ProbRetain) * Target$PredRevenue - cost
filter(Target, Target$ExpReturn > 0) %>%
  group_by(status) %>% summarise(
    No.Target = n(),
    AvgROI = mean(ExpReturn),
    TotalROI = sum(ExpReturn) ) %>% data.frame
##   status No.Target AvgROI TotalROI
## 1     N1      2211  9.973    22049
## 2     N2      1459 47.238    68920
## 3     R1      1596 25.459    40633
## 4     R2       258 25.054     6464
## 5     S1      2645 30.592    80915
## 6     S2      1609 27.134    43658
## 7     S3      4495 27.598   124052

這個結果是合理的嗎? 你想要怎麼修正這項分析的程序呢?


8. 結論

如果你只有顧客ID、交易日期、交易金額三個欄位的話,你可以做的分析包括:

一般而言,這一些分析的結果,足夠讓我們制定顧客發展和顧客保留策略;至於顧客吸收策略,我們通常還需要從CRM撈出顧客個人屬性資料才能做到。