善用商業數據分析的工具和技巧,光靠一份最簡單的交易紀錄(只有顧客ID、交易日期和交易金額三個欄位),我們就可以做一系列很深入、很有價值的分析,包括:
從這一些分析我們可以看到公司主要的營收和獲利的重要來源,我們也可以看到這一些產生獲利的群組是不是有成長或者衰退的趨勢;據此我們可以設定行銷的重點,決定行銷的策略,和規劃行銷的工具。除了上述的敘述統計、集群分析、和資料視覺化之外,我們還可以利用這些簡單的交易紀錄:
利用這一些預測我們就可以進行全面客製化的:
packages = c(
"dplyr","ggplot2","caTools","ROCR","d3heatmap",
"googleVis","devtools","plotly")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)])
install.packages(pkg)
if(!is.element("chorddiag", existing))
devtools::install_github("mattflor/chorddiag")
options(digits=4)
library(dplyr)
library(ggplot2)
library(caTools)
library(ROCR)
library(d3heatmap)
library(googleVis)
library(chorddiag)
rm(list=ls(all=T))
X = read.table(
'purchases.txt', header=FALSE, sep='\t', stringsAsFactors=F)
names(X) = c('cid','amount','date')
X$date = as.Date(X$date)
summary(X) # 交易次數 51243
## cid amount date
## Min. : 10 Min. : 5 Min. :2005-01-02
## 1st Qu.: 57720 1st Qu.: 25 1st Qu.:2009-01-17
## Median :102440 Median : 30 Median :2011-11-23
## Mean :108935 Mean : 62 Mean :2011-07-14
## 3rd Qu.:160525 3rd Qu.: 60 3rd Qu.:2013-12-29
## Max. :264200 Max. :4500 Max. :2015-12-31
n_distinct(X$cid) # 顧客數 18417
## [1] 18417
A = X %>%
mutate(days = as.integer(as.Date("2016-01-01") - date)) %>%
group_by(cid) %>% summarise(
recent = min(days), # 最近購買距今天數
freq = n(), # 購買次數
money = mean(amount), # 平均購買金額
senior = max(days), # 第一次購買距今天數
since = min(date) # 第一次購買日期
) %>% data.frame
summary(A)
## cid recent freq money senior
## Min. : 10 Min. : 1 Min. : 1.00 Min. : 5 Min. : 1
## 1st Qu.: 81990 1st Qu.: 244 1st Qu.: 1.00 1st Qu.: 22 1st Qu.: 988
## Median :136430 Median :1070 Median : 2.00 Median : 30 Median :2087
## Mean :137574 Mean :1253 Mean : 2.78 Mean : 58 Mean :1984
## 3rd Qu.:195100 3rd Qu.:2130 3rd Qu.: 3.00 3rd Qu.: 50 3rd Qu.:2992
## Max. :264200 Max. :4014 Max. :45.00 Max. :4500 Max. :4016
## since
## Min. :2005-01-02
## 1st Qu.:2007-10-23
## Median :2010-04-15
## Mean :2010-07-26
## 3rd Qu.:2013-04-18
## Max. :2015-12-31
# 回購顧客的平均購買週期
K = as.integer(sum(A$senior[A$freq>1]) / sum(A$freq[A$freq>1]))
p0 = par(cex=0.8, mfrow=c(2,2))
hist(A$recent)
hist(A$freq)
hist(log(A$money,10))
hist(A$senior)
par(p0)
set.seed(111)
A$grp = kmeans(scale(A[,2:4]),10)$cluster
table(A$grp) # 族群大小
##
## 1 2 3 4 5 6 7 8 9 10
## 1073 2266 1296 2237 3207 1942 1781 2392 2096 127
group_by(A, grp) %>% summarise(
recent=mean(recent),
freq=mean(freq),
money=mean(money),
size=n() ) %>%
mutate( revenue = size*money/1000 ) %>%
filter(size > 1) %>%
ggplot(aes(x=freq, y=money)) +
geom_point(aes(size=revenue, col=recent),alpha=0.5) +
scale_size(range=c(4,40)) +
scale_color_gradient(low="green",high="red") +
scale_x_log10() + scale_y_log10() +
geom_text(aes(label = size )) +
theme_bw() + guides(size=F) +
labs(title="Customer Segements",
subtitle="(bubble_size:revenue_contribution; text:group_size)",
color="Recency") +
xlab("Frequency (log)") + ylab("Average Transaction Amount (log)")
STS = c("N1","N2","R1","R2","S1","S2","S3")
Status = function(rx,fx,mx,sx,K) {factor(
ifelse(sx < 2*K,
ifelse(fx*mx > 50, "N2", "N1"),
ifelse(rx < 2*K,
ifelse(sx/fx < 0.75*K,"R2","R1"),
ifelse(rx < 3*K,"S1",
ifelse(rx < 4*K,"S2","S3")))), STS)}
Y = list() # 建立一個空的LIST
for(y in 2010:2015) { # 每年年底將顧客資料彙整成一個資料框
D = as.Date(paste0(c(y, y-1),"-12-31")) # 當期、前期的期末日期
Y[[paste0("Y",y)]] = X %>% # 從交易資料做起
filter(date <= D[1]) %>% # 將資料切齊到期末日期
mutate(days = 1 + as.integer(D[1] - date)) %>% # 交易距期末天數
group_by(cid) %>% summarise( # 依顧客彙總 ...
recent = min(days), # 最後一次購買距期末天數
freq = n(), # 購買次數 (至期末為止)
money = mean(amount), # 平均購買金額 (至期末為止)
senior = max(days), # 第一次購買距期末天數
status = Status(recent,freq,money,senior,K), # 期末狀態
since = min(date), # 第一次購買日期
y_freq = sum(date > D[2]), # 當期購買次數
y_revenue = sum(amount[date > D[2]]) # 當期購買金額
) %>% data.frame }
sapply(Y, nrow)
## Y2010 Y2011 Y2012 Y2013 Y2014 Y2015
## 10407 11674 13562 15468 16905 18417
sapply(Y, function(x) table(x$status))
## Y2010 Y2011 Y2012 Y2013 Y2014 Y2015
## N1 3330 2640 2778 3054 3106 2772
## N2 1655 1496 1704 1918 2091 1957
## R1 1298 1731 2177 2156 2102 2140
## R2 1547 1831 1923 1955 2016 2264
## S1 2203 2163 2147 2357 1963 2807
## S2 360 1502 1820 1723 2110 1718
## S3 14 311 1013 2305 3517 4759
cols = c("gold","orange","blue","green",
"pink","magenta","darkred")
sapply(Y, function(df) table(df$status)) %>% barplot(col=cols)
legend("topleft",rev(STS),fill=rev(cols))
CustSegments = do.call(rbind, lapply(Y, function(d) {
group_by(d, status) %>% summarise(
average_frequency = mean(freq),
average_amount = mean(money),
total_revenue = sum(y_revenue),
total_no_orders = sum(y_freq),
average_recency = mean(recent),
average_seniority = mean(senior),
group_size = n()
)})) %>% ungroup %>%
mutate(year=rep(2010:2015, each=7))
plot( gvisMotionChart(
CustSegments, "status", "year",
options=list(width=900, height=600) ) )
## starting httpd help server ... done
df = merge(Y$Y2014[,c(1,6)], Y$Y2015[,c(1,6)],
by="cid", all.x=T)
tx = table(df$status.x, df$status.y) %>%
as.data.frame.matrix() %>% as.matrix()
tx # 流量矩陣
## N1 N2 R1 R2 S1 S2 S3
## N1 1705 381 144 45 831 0 0
## N2 0 1131 267 430 263 0 0
## R1 0 0 1240 43 819 0 0
## R2 0 0 199 1742 75 0 0
## S1 0 0 115 3 819 1026 0
## S2 0 0 78 1 0 692 1339
## S3 0 0 97 0 0 0 3420
tx %>% prop.table(1) %>% round(4) # 流量矩陣(%)
## N1 N2 R1 R2 S1 S2 S3
## N1 0.5489 0.1227 0.0464 0.0145 0.2675 0.0000 0.0000
## N2 0.0000 0.5409 0.1277 0.2056 0.1258 0.0000 0.0000
## R1 0.0000 0.0000 0.5899 0.0205 0.3896 0.0000 0.0000
## R2 0.0000 0.0000 0.0987 0.8641 0.0372 0.0000 0.0000
## S1 0.0000 0.0000 0.0586 0.0015 0.4172 0.5227 0.0000
## S2 0.0000 0.0000 0.0370 0.0005 0.0000 0.3280 0.6346
## S3 0.0000 0.0000 0.0276 0.0000 0.0000 0.0000 0.9724
chorddiag(tx, groupColors=cols)
我們的目的是:以Y2015年底的資料,預測每一位顧客:
Retain
)Revenue
)我們用Y2014年底的資料做自變數,Y2015年的資料做應變數
CX = left_join(Y$Y2014, Y$Y2015[,c(1,8,9)], by="cid")
names(CX)[8:11] = c("freq0","revenue0","Retain", "Revenue")
CX$Retain = CX$Retain > 0
table(CX$Retain) %>% prop.table() # 平均保留機率 = 22.54%
##
## FALSE TRUE
## 0.7701 0.2299
mRet = glm(Retain ~ ., CX[,c(2:3,6,8:10)], family=binomial())
summary(mRet)
##
## Call:
## glm(formula = Retain ~ ., family = binomial(), data = CX[, c(2:3,
## 6, 8:10)])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.689 -0.473 -0.298 -0.142 3.386
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.074007 0.089431 -12.01 < 2e-16 ***
## recent -0.002067 0.000131 -15.73 < 2e-16 ***
## freq 0.095217 0.013882 6.86 6.9e-12 ***
## statusN2 0.669429 0.070234 9.53 < 2e-16 ***
## statusR1 0.488321 0.084389 5.79 7.2e-09 ***
## statusR2 1.290002 0.110841 11.64 < 2e-16 ***
## statusS1 0.670604 0.146532 4.58 4.7e-06 ***
## statusS2 1.353554 0.208210 6.50 8.0e-11 ***
## statusS3 2.573689 0.275786 9.33 < 2e-16 ***
## freq0 0.566557 0.065532 8.65 < 2e-16 ***
## revenue0 -0.000132 0.000135 -0.98 0.33
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 18228 on 16904 degrees of freedom
## Residual deviance: 11766 on 16894 degrees of freedom
## AIC: 11788
##
## Number of Fisher Scoring iterations: 6
pred = predict(mRet,type="response")
table(pred>0.5,CX$Retain) # 混淆矩陣 (Confusion Matrix)
##
## FALSE TRUE
## FALSE 12045 1530
## TRUE 974 2356
table(pred>0.5,CX$Retain) %>%
{sum(diag(.))/sum(.)} # 正確率(ACC): 85.19%
## [1] 0.8519
colAUC(pred,CX$Retain) # 辯識率(AUC): 87.92%
## [,1]
## FALSE vs. TRUE 0.8792
prediction(pred, CX$Retain) %>% # ROC CURVE
performance("tpr", "fpr") %>%
plot(print.cutoffs.at=seq(0,1,0.1))
dx = subset(CX, Revenue > 0) # 只對有來購買的人做模型
mRev = lm(log(Revenue) ~ recent + freq + log(1+money) + senior +
status + freq0 + log(1+revenue0), dx)
summary(mRev) # 判定係數:R2 = 0.713
##
## Call:
## lm(formula = log(Revenue) ~ recent + freq + log(1 + money) +
## senior + status + freq0 + log(1 + revenue0), data = dx)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.245 -0.209 -0.067 0.205 3.435
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.88e-02 4.58e-02 1.28 0.1997
## recent 3.54e-04 5.07e-05 6.98 3.4e-12 ***
## freq 5.27e-02 4.65e-03 11.33 < 2e-16 ***
## log(1 + money) 9.32e-01 1.35e-02 68.94 < 2e-16 ***
## senior -1.37e-04 1.82e-05 -7.52 7.0e-14 ***
## statusN2 1.28e-02 2.63e-02 0.49 0.6268
## statusR1 1.93e-01 4.08e-02 4.73 2.3e-06 ***
## statusR2 2.98e-02 3.52e-02 0.84 0.3984
## statusS1 8.24e-03 6.30e-02 0.13 0.8960
## statusS2 -2.41e-01 8.66e-02 -2.78 0.0055 **
## statusS3 -3.67e-01 1.18e-01 -3.11 0.0019 **
## freq0 1.03e-02 1.73e-02 0.60 0.5501
## log(1 + revenue0) 6.33e-02 9.40e-03 6.73 1.9e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.463 on 3873 degrees of freedom
## Multiple R-squared: 0.713, Adjusted R-squared: 0.712
## F-statistic: 802 on 12 and 3873 DF, p-value: <2e-16
plot(log(dx$Revenue), predict(mRev))
abline(0,1,col='red')
使用模型對Y2015年底的資料做預測,對資料中的每一位顧客,預測她們在Y2016的保留率和購買金額。
CX = Y$Y2015
names(CX)[8:9] = c("freq0","revenue0")
# 預測Y2016保留率
CX$ProbRetain = predict(mRet,CX,type='response')
# 預測Y2016購買金額
CX$PredRevenue = exp(predict(mRev,CX))
p0 = par(cex=0.8, mfrow=c(1,2))
hist(CX$ProbRetain,main="ProbRetain")
hist(log(CX$PredRevenue,10),main="log(PredRevenue)")
par(p0)
\[ CLV = \sum_{t=0}^N m \frac{r^t}{(1+d)^t} = m \sum_{t=0}^N (\frac{r}{1+d})^t \]
N = 5 # 期數 = 5
d = 0.1 # 利率 = 10%
CX$CLV = CX$PredRevenue * rowSums(sapply(
0:N, function(i) (CX$ProbRetain/(1+d))^i ) )
summary(CX$CLV)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7 31 48 103 90 10188
tapply(CX$CLV, CX$status, mean)
## N1 N2 R1 R2 S1 S2 S3
## 40.34 221.77 109.19 272.61 59.32 51.16 50.34
hist(log(CX$CLV,10))
tapply(CX$ProbRetain, CX$status, mean)
## N1 N2 R1 R2 S1 S2 S3
## 0.20269 0.44075 0.34150 0.74925 0.05724 0.03475 0.02326
boxplot(log(CLV)~status, CX)
ggplot(CX, aes(x=status, y=log(CLV), fill=status)) +
geom_violin(draw_quantiles=c(.25,.5,.75)) +
theme_minimal()
group_by(CX, status) %>% summarise(
AvgExpRevenue = mean(PredRevenue),
AvgRetainProb = mean(ProbRetain),
AvgCLV = mean(CLV),
GroupSize = n() ) %>%
mutate(
GroupValue = AvgCLV * GroupSize
) %>% data.frame
## status AvgExpRevenue AvgRetainProb AvgCLV GroupSize GroupValue
## 1 N1 31.98 0.20269 40.34 2772 111821
## 2 N2 131.23 0.44075 221.77 1957 434013
## 3 R1 69.85 0.34150 109.19 2140 233670
## 4 R2 91.27 0.74925 272.61 2264 617200
## 5 S1 56.10 0.05724 59.32 2807 166521
## 6 S2 49.48 0.03475 51.16 1718 87900
## 7 S3 49.36 0.02326 50.34 4759 239554
給定某一行銷工具的成本和預期效益,選擇可以施行這項工具的對象。
R2
族群進行保留R2
族群的預測保留率和購買金額
p0 = par(cex=0.8, mfrow=c(1,2))
hist(CX$ProbRetain[CX$status=="R2"],main="ProbRetain",xlab="")
hist(log(CX$PredRevenue[CX$status=="R2"],10),main="PredRevenue",xlab="")
par(p0)
假設行銷工具的成本和預期效益為
cost = 10 # 成本
effect = 0.75 # 效益:下一期的購買機率
估計這項行銷工具對每一位R2
顧客的預期報酬
Target = subset(CX, status=="R2")
Target$ExpReturn = (effect - Target$ProbRetain) * Target$PredRevenue - cost
summary(Target$ExpReturn)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -515.8 -15.4 -11.5 -10.3 -8.1 646.9
Target = Target %>% arrange(desc(ExpReturn)) %>%
select(cid, ExpReturn) %>% head(20)
sum(Target$ExpReturn > 0) # 可實施對象:258
## [1] 20
sum(Target$ExpReturn[Target$ExpReturn > 0]) # 預期報酬:6464
## [1] 3768
Target = CX
Target$ExpReturn = (effect - Target$ProbRetain) * Target$PredRevenue - cost
filter(Target, Target$ExpReturn > 0) %>%
group_by(status) %>% summarise(
No.Target = n(),
AvgROI = mean(ExpReturn),
TotalROI = sum(ExpReturn) ) %>% data.frame
## status No.Target AvgROI TotalROI
## 1 N1 2211 9.973 22049
## 2 N2 1459 47.238 68920
## 3 R1 1596 25.459 40633
## 4 R2 258 25.054 6464
## 5 S1 2645 30.592 80915
## 6 S2 1609 27.134 43658
## 7 S3 4495 27.598 124052
這個結果是合理的嗎? 你想要怎麼修正這項分析的程序呢?
如果你只有顧客ID、交易日期、交易金額三個欄位的話,你可以做的分析包括:
一般而言,這一些分析的結果,足夠讓我們制定顧客發展和顧客保留策略;至於顧客吸收策略,我們通常還需要從CRM撈出顧客個人屬性資料才能做到。