TA FENG GROCERY期末專案

pacman::p_load(latex2exp, tidyr, readr, caTools, ggplot2, dplyr, magrittr, vcd, d3heatmap, Matrix)

1. 資料前處理：`Z`

1.1 讀進資料

Z = read_csv("data/ta_feng_all_months_merged.csv") %>% 
  data.frame %>% setNames(c(
    "date","cust","age","area","cat","prod","qty","cost","price"))

Parsed with column specification:
cols(
  TRANSACTION_DT = col_character(),
  CUSTOMER_ID = col_character(),
  AGE_GROUP = col_character(),
  PIN_CODE = col_character(),
  PRODUCT_SUBCLASS = col_double(),
  PRODUCT_ID = col_character(),
  AMOUNT = col_double(),
  ASSET = col_double(),
  SALES_PRICE = col_double()
)

1.2 處理離群值

# Quantile of Variables
sapply(Z[,7:9], quantile, prob=c(.99, .999, .9995))

       qty   cost  price
99%      6  858.0 1014.0
99.9%   14 2722.0 3135.8
99.95%  24 3799.3 3999.0

# Remove Outliers
Z = subset(Z, qty<=24 & cost<=3800 & price<=4000)

1.3 每周交易次數

Z$date = as.Date(Z$date, format="%m/%d/%Y") #日期格式轉換
par(cex=0.8)
hist(Z$date,'weeks',freq=T,las=2, main="No. Transaction per Week")

1.4 年齡層級、郵遞區號

age.group = c("<25","25-29","30-34","35-39","40-44",
              "45-49","50-54","55-59","60-64",">65")
Z$age = c(paste0("a",seq(24,69,5)),"a99")[match(Z$age,age.group,11)]
Z$area = paste0("z",Z$area)

par(mfrow=c(1,2),cex=0.7)
table(Z$age, useNA='ifany') %>% barplot(main="Age Groups", las=2)
table(Z$area, useNA='ifany') %>% barplot(main="Areas", las=2)

1.5 價格分布

par(mfrow=c(1,2),cex=0.7)
hist(Z$price)
hist(log10(Z$price))

1.6 彙總訂單 Assign Transaction ID

把每一天、每一位顧客的交易項目彙總為一張訂單

Z$tid = group_indices(Z, date, cust) 
# same customer same day
#將Z資料框中，根據天數(date)跟顧客(cust)兩個欄位產生Index，叫做tid(每個組合給他一個編號)

1.7 資料總覽

# No. cust, cat, prod, tid
sapply(Z[c("cust","cat","prod","tid")], n_distinct)

  cust    cat   prod    tid 
 32256   2007  23789 119422

#分別為"顧客數", "品類", "品項", "交易數"

2. 交易紀錄：`X`

2.1 交易資料彙整

X = Z %>% group_by(tid) %>% 
  summarise(
    date = min(date),           # 交易日期  
    cust = min(cust),           # 顧客 ID
    age = min(age),             # 顧客 年齡級別
    area = min(area),           # 顧客 居住區別
    items = n(),                # 交易項目(總)數
    pieces = sum(qty),          # 產品(總)件數
    money = sum(price),         # 交易(總)金額
    costs = sum(cost),          # 交易(總)成本
    gross = sum(price - cost),  # 毛利
    gross_margin = gross/money
  ) %>% 
  #將年齡70以上、地區未知的去掉
  filter(!age %in% c("a99")) %>%  
  filter(!area %in% c("zUnknown")) %>% data.frame
nrow(X)  #111066

[1] 111066

2.2 處理離群值

# Check Quantile & Remove Outliers
sapply(X[,6:11], quantile, prob=c(.0005, .9999, .99995))

         items pieces money costs   gross gross_margin
0.05%    1.000   1.00    10     9 -284.94     -0.92308
99.99%  82.000 130.68 16267 13817 3263.31      0.53125
99.995% 91.681 147.91 20198 16332 3660.70      0.58494

# Remove Outliers
X = subset(X, items<=82 & pieces<130.7 & money<16267 & gross_margin < 0.58 & gross_margin>-1.07) # 111026

2.3 周末與周間

X$wday = format(X$date, "%u")
par(cex=0.7, mar=c(2,3,2,1))
table(X$wday) %>% barplot(main="No. Transactions in Week Days")

3. 顧客資料：`A`

3.1 顧客資料彙整

d0 = max(X$date) + 1
A = X %>% 
  mutate(
    days = as.integer(difftime(d0, date, units="days"))) %>% 
  group_by(cust) %>% 
  summarise(
    age = min(age),          # 年齡
    area = min(area),        # 地區
    recency = min(days),     # 最近一次購買，距今日的天數
    seniority = max(days),   # 第一次購買，距今日的天數
    frquency = n(),          # 購買次數
    pieces = mean(pieces),   # 平均交易購買產品件數
    monetary = mean(money),  # 平均交易金額
    revenue = sum(money),    # 交易總額
    gross_margin = mean(gross_margin) #平均毛利率
  ) %>% data.frame      
nrow(A)

[1] 30510

X %>% mutate(wdate=format(date, "%u")) %>% 
  group_by(cust) %>% summarise(
    weekday = mean(wdate <= 5)
  ) %>% right_join(A) -> A

Joining, by = "cust"

A = A %>% mutate(
  Daytype = ifelse(weekday == 0, "WEND", 
            ifelse(weekday == 1, "WDAY", "MIXED")) %>% factor)

table(A$Daytype , cut(A$frquency, c(0, 4, 8, 12, 20, 30, Inf)) ) %>% 
  prop.table(2) %>% round(3)

       
        (0,4] (4,8] (8,12] (12,20] (20,30] (30,Inf]
  MIXED 0.276 0.848  0.944   0.977   0.993    1.000
  WDAY  0.435 0.127  0.051   0.022   0.007    0.000
  WEND  0.289 0.025  0.005   0.001   0.000    0.000

3.3 顧客年齡、地區、平假日分布

par(mfrow=c(1, 3),cex=0.7)
table(A$age, useNA='ifany') %>% barplot(main="Age Groups",las=2)
table(A$area, useNA='ifany') %>% barplot(main="Areas",las=2)
table(A$Daytype, useNA='ifany') %>% barplot(main="Daytype",las=2)

3.4 來店相關變數(近期、多久沒來、平假日比率、次數)

par(mfrow=c(2,2), mar=c(3,3,4,2))
hist(A$recency,freq=T,xlab="",ylab="", cex.main=1)
hist(A$seniority,freq=T,xlab="",ylab="", cex.main=1)
hist(A$weekday,freq=T,xlab="",ylab="", cex.main=1)
hist(pmin(A$frquency,10),0:10,freq=T,xlab="",ylab="", cex.main=1)

3.5 消費相關變數(平均件數、平均消費、總消費、毛利率)

par(mfrow=c(2,2), mar=c(3,3,4,2))
hist(A$pieces,freq=T,xlab="",ylab="", cex.main=1)
hist(log(A$monetary,10),freq=T,xlab="",ylab="", cex.main=1)
hist(log(A$revenue,10),freq=T,xlab="",ylab="", cex.main=1)
hist(A$gross_margin,freq=T,xlab="",ylab="", cex.main=1)

1. 簡單泡泡圖

1.1 年齡區隔特徵

A %>% group_by(age) %>% summarise(
  Group.Size = n(),                     # 族群人數
  avg.Freq = mean(frquency),            # 平均購買次數
  avg.Revenue = sum(frquency * monetary)/sum(frquency)  # 平均客單價
  ) %>% 
  ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
  geom_point(aes(col=age, size=Group.Size), alpha=0.5) +
  geom_text(aes(label=age)) +
  scale_size(range=c(5,25)) +
  theme_bw() + theme(legend.position="none") +
  ggtitle("年齡區隔特徵 (泡泡大小:族群人數)") + 
  ylab("平均購買次數") + xlab("平均客單價")

1.2 地理區隔特徵

A %>% group_by(area) %>% summarise(
  Group.Size = n(),                     # 族群人數
  avg.Freq = mean(frquency),            # 平均購買次數
  avg.Revenue = sum(frquency * monetary)/sum(frquency)  # 平均客單價
  ) %>% 
  ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
  geom_point(aes(col=area, size=Group.Size), alpha=0.5) +
  geom_text(aes(label=area)) +
  scale_size(range=c(5,25)) +
  theme_bw() + theme(legend.position="none") +
  ggtitle("地理區隔特徵 (泡泡大小:族群人數)") + 
  ylab("平均購買次數") + xlab("平均客單價")

💡 主要發現：
※ 「平均購買次數」和「平均客單價」之間有明顯的負相關
§ 30~40歲(年長60-70歲)的人比較少(常)來買、但每一次買的比較多(少)
§ 若以Z115、Z221為中心，住的遠(近)的人比較少(常)來買、但每一次買的比較多(少)

2. 洞察變數間關聯性

使用馬賽克圖檢視列連表的關聯性(Association between Categorial Variables) + 方塊大小代表該類別組合的數量 + 紅(藍)色代表該類別組合的數量顯著小(大)於期望值 + 期望值就是邊際機率(如上方的直條圖所示)的乘積 + 卡方檢定(類別變數的關聯性檢定)的p值顯示在圖示最下方 + p-value < 2.22e-16 : age 與 area 之間有顯著的關聯性

2.1 年齡與地理區隔的關聯性

MOSA = function(formula, data) mosaic(formula, data, shade=T, 
  margins=c(0,1,0,0), labeling_args = list(rot_labels=c(90,0,0,0)),
  gp_labels=gpar(fontsize=9), legend_args=list(fontsize=9),
  gp_text=gpar(fontsize=7),labeling=labeling_residuals)

MOSA(~area+age, A)

💡 主要發現：
※ 「年齡」與「地區」之間有很高的關聯性
§ 南港(z115)：30~40歲的顧客比率比較低、50歲以上的顧客比率比較高
§ 汐止(z221)、內湖(z114)和其他(zOthers)：30_{40歲的顧客比率比較高、45}55歲的顧客比率比較低

2.2 年齡與購物日的關聯性

MOSA(~wday+age, X)

💡 主要發現：
※ 「年齡」與「購物日」之間有關聯性
§ 平日時，年齡為65-70歲的顧客較常出沒、年齡為30-40歲的顧客幾乎不會來 § 相反地，禮拜日時，年齡為30-40歲的顧客較常出沒、年齡為50歲以上的顧客幾乎不會來

2.3 地區與購物日的關聯性

MOSA(~wday+area, X)

💡 主要發現：
※ 「地區」與「購物日」之間有很高的關聯性
§ 平日時，南港(z115)生意興隆，剩下的地區較沒有生意 § 假日時，南港(z115)生意慘淡，顧客都來反倒來信義(z110)消費

3. 層級式集群分析

3.1 RFM顧客分群

#採用集群式分析k-means，將資料依照不同屬性將顧客做分群
#k-means分完群會把分群的向量放在“cluster”這個欄位裡面
set.seed(11)
gp= A$group= kmeans(scale(A[,c(2, 5:9, 11)]),4)$cluster
table(gp)  # 族群大小

gp
    1     2     3     4 
 7559 11284  3604  8063

3.2 顧客群組屬性

#做各群的泡泡圖，數值代表族群大小
#泡泡大小：貢獻毛利率；顏色：平均每次消費購買數
#X軸：是購買頻率；Y軸：最近一次購買距今的時間
group_by(A, group) %>% summarise(
  weekday = mean(weekday),
  recency = mean(recency), 
  seniority = mean(seniority),
  frquency = mean(frquency), 
  pieces = mean(pieces),
  monetary = mean(monetary), 
  revenue = mean(revenue),
  gross_margin = mean(gross_margin),
  size=n() ) %>% 
  filter(size > 1) %>% 
  
  ggplot(aes(x = frquency, y = recency)) +
  geom_point(aes(size= gross_margin, col= pieces),alpha=0.5) +
  scale_size(range=c(2,30)) +
  scale_color_gradient(low="blue",high="red") +
  scale_x_log10() + scale_y_continuous() + 
  geom_text(aes(label = size ),size=3) +
  theme_bw() + guides(size=F) +
  
  
  labs(title="Customer Segments",
       subtitle="(bubble_size:Gross_margin; text:Group_size)",
       color="pieces") +
  xlab("frquency") + ylab("recency")

3.3 顧客參數常規化

#將用得到的幾個參數獨立挑出來，放在一個table中
cc = A %>% select("seniority", "recency", "frquency","weekday", "pieces", "monetary", "gross_margin")

#將CC做常規化，把平均值調為0，標準差調為1
AN = scale(cc) %>% data.frame
sapply(AN, mean)

               seniority                  recency                 frquency 
 0.000000000000000027527 -0.000000000000000019501 -0.000000000000000023395 
                 weekday                   pieces                 monetary 
-0.000000000000000102981 -0.000000000000000054087 -0.000000000000000025898 
            gross_margin 
-0.000000000000000036200

sapply(AN, sd)

   seniority      recency     frquency      weekday       pieces     monetary 
           1            1            1            1            1            1 
gross_margin 
           1

#查看原始尺度中各數值對應各族群的大小
names(cc) = names(AN) =c("seniority", "recency", "frquency","weekday", "pieces", "monetary", "gross_margin")
# 原始尺度
sapply(split(cc,gp), colMeans) %>% round(3)

                   1       2        3       4
seniority     94.914 104.765   77.263  35.331
recency       83.418  14.280   43.362  23.538
frquency       1.582   6.932    2.163   1.619
weekday        0.558   0.647    0.443   0.629
pieces         7.952   8.372   28.325   7.957
monetary     712.222 751.530 2940.579 761.342
gross_margin   0.106   0.105    0.173   0.126

#查看常態化尺度後各數值對應各族群的大小
sapply(split(AN,gp), colMeans) %>% round(3) #常態化尺度

                  1      2      3      4
seniority     0.417  0.706 -0.102 -1.334
recency       1.375 -0.686  0.181 -0.410
frquency     -0.458  0.732 -0.328 -0.449
weekday      -0.098  0.131 -0.394  0.085
pieces       -0.282 -0.236  1.957 -0.281
monetary     -0.298 -0.257  1.983 -0.247
gross_margin -0.086 -0.093  0.367  0.047

par(cex=0.8)
split(AN,gp) %>% sapply(colMeans) %>% barplot(beside=T,col=rainbow(7))
legend('topright',legend=colnames(cc),fill=rainbow(7))

The Demarcation Date

Remove data after the demarcation date

#先將資料分割，資料集Z是二月之前的資料
feb01 = as.Date("2001-02-01")
Z0 = subset(Z, date < feb01)    # 618212

Aggregate for the Transaction Records

#將Z交易資料(11月到1月)做處理
X0 = Z0 %>% group_by(tid) %>% 
  summarise(
    date = min(date),           # 交易日期  
    cust = min(cust),           # 顧客 ID
    age = min(age),             # 顧客 年齡級別
    area = min(area),           # 顧客 居住區別
    items = n(),                # 交易項目(總)數
    pieces = sum(qty),          # 產品(總)件數
    money = sum(price),         # 交易(總)金額
    costs = sum(cost),          # 交易(總)成本
    gross = sum(price - cost),  # 毛利
    gross_margin = gross/money
  ) %>% data.frame  # 88387

Check Quantile and Remove Outlier

#檢查交易資料的分布
sapply(X0[,6:11], quantile, prob=c(.999, .9995, .9999))

        items pieces   money   costs  gross gross_margin
99.9%  56.000  84.00  9378.7  7797.4 1883.2      0.40035
99.95% 64.000  98.00 11261.8  9151.6 2317.1      0.43164
99.99% 85.646 137.65 17699.3 14108.5 3389.6      0.53125

#移除離群值，把在大於99.95%分布以外的資料移除
X0 = subset(X0, items<=64 & pieces<=98 & money<=11261 & gross_margin <= 0.43164) # 88387 -> 88250

Aggregate for Customer Records

#把X資料集顧客資料做彙總
d0 = max(X0$date) + 1
A0 = X0 %>% 
  mutate(
    days = as.integer(difftime(d0, date, units="days"))) %>% 
  group_by(cust) %>% 
  summarise(
    age = min(age),          # 年齡
    area = min(area),        # 地區
    recency = min(days),     # 最近一次購買，距今日的天數
    seniority = max(days),   # 第一次購買，距今日的天數
    frquency = n(),          # 購買次數
    pieces = mean(pieces),   # 平均交易購買產品件數
    monetary = mean(money),  # 平均交易金額
    revenue = sum(money),    # 交易總額
    gross_margin = mean(gross_margin) #平均毛利率
  ) %>% data.frame      
nrow(A0) #28581

[1] 28581

Preparing the Target Variables (Y)

Aggregate Feb’s Transaction by Customer

#將二月份的每位顧客的交易資料進行彙整
feb = filter(X, date>= feb01) %>% group_by(cust) %>% 
  summarise(amount = sum(money))  # 16049

The Target for Regression - `A$amount`

Simply a Left Joint

#將二月份資料與之前資料合併
A0 = merge(A0, feb, by="cust", all.x=T)

The Target for Classification - `A$buy`

#找出哪些是在2月之前就有消費紀錄以及在二月開始才有消費紀錄的顧客
A0$buy = !is.na(A0$amount)
table(A0$buy, !is.na(A0$amount))

       
        FALSE  TRUE
  FALSE 15999     0
  TRUE      0 12582

Train & Test Dataset

#針對四個月的資料，將資料分成TR(0.7)和TS(0.3)，來測試模型的準確度
X = subset(X, cust %in% A0$cust & date < as.Date("2001-02-01"))
Z = subset(Z, cust %in% A0$cust & date < as.Date("2001-02-01"))
set.seed(2018); spl = sample.split(A0$buy, SplitRatio=0.7)
c(nrow(A), sum(spl), sum(!spl))

[1] 30510 20006  8575

#畫出密度曲線圖
cbind(A0, spl) %>% filter(buy) %>% 
  ggplot(aes(x=log(amount))) + geom_density(aes(fill=spl), alpha=0.5)

#透過有來買的人來預測之後會來買的價錢,用A做機率模型
A2 = subset(A0, buy) %>% mutate_at(c("monetary","revenue","amount"), log10)
n = nrow(A2)
set.seed(2018); spl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(A2), sum(spl2), sum(!spl2))

[1] 12582  8807  3775

#畫出密度曲線圖
cbind(A2, spl2) %>% 
  ggplot(aes(x=amount)) + geom_density(aes(fill=spl2), alpha=0.5)

Spliting for Classification

#分割訓練資料和測試資料
TR = subset(A0, spl)
TS = subset(A0, !spl)

Classification Model

#建立分類模型，放入所有可用資料
glm1 = glm(buy ~ ., TR[,c(2:10, 12)], family=binomial())

Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(glm1)


Call:
glm(formula = buy ~ ., family = binomial(), data = TR[, c(2:10, 
    12)])

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-3.613  -0.862  -0.650   0.991   1.885  

Coefficients:
                Estimate  Std. Error z value             Pr(>|z|)    
(Intercept)   -1.1972523   0.1295743   -9.24 < 0.0000000000000002 ***
agea29        -0.0156346   0.0888963   -0.18              0.86039    
agea34         0.0693388   0.0819900    0.85              0.39772    
agea39         0.1105934   0.0812170    1.36              0.17329    
agea44         0.0512668   0.0837627    0.61              0.54051    
agea49         0.0309601   0.0871720    0.36              0.72247    
agea54        -0.0743826   0.0957263   -0.78              0.43714    
agea59         0.1346176   0.1153689    1.17              0.24327    
agea64         0.1202517   0.1212237    0.99              0.32121    
agea69         0.2046125   0.1075123    1.90              0.05702 .  
agea99       -16.6760779 106.7462650   -0.16              0.87586    
areaz106      -0.0291434   0.1351816   -0.22              0.82931    
areaz110      -0.2223265   0.1054748   -2.11              0.03504 *  
areaz114      -0.0353269   0.1130661   -0.31              0.75470    
areaz115       0.2089803   0.0983745    2.12              0.03364 *  
areaz221       0.0900806   0.0991870    0.91              0.36378    
areazOthers   -0.0678827   0.1057391   -0.64              0.52088    
areazUnknown -27.5223923 116.5778841   -0.24              0.81337    
recency       -0.0135307   0.0009220  -14.68 < 0.0000000000000002 ***
seniority      0.0105512   0.0009403   11.22 < 0.0000000000000002 ***
frquency       0.2970780   0.0163838   18.13 < 0.0000000000000002 ***
pieces         0.0096962   0.0027934    3.47              0.00052 ***
monetary      -0.0000563   0.0000345   -1.63              0.10265    
revenue       -0.0000122   0.0000125   -0.97              0.32975    
gross_margin  -0.6880819   0.1132861   -6.07         0.0000000012 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 27448  on 20005  degrees of freedom
Residual deviance: 22049  on 19981  degrees of freedom
AIC: 22099

Number of Fisher Scoring iterations: 16

#使用測試集來檢驗當在0.5的機率下，會實際會購買的人數與不會購買的數量分布
pred =  predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm

       predict
actual  FALSE TRUE
  FALSE  3923  877
  TRUE   1574 2201

#平均值代表顧客族群裡面購買比率，代表平均會回購的機率。用平均值跟預測模型的準確度比，下值發現模型的準確度比單就平均值猜的機率更大
acc.ts = cm %>% {sum(diag(.))/sum(.)}
c(1-mean(TS$buy) , acc.ts)  # 0.71417

[1] 0.55977 0.71417

colAUC(pred, TS$buy)        # 0.77885

                  [,1]
FALSE vs. TRUE 0.77885

Regression Model

#將資料集A切割成訓練資料和測試資料並做LOG處理
A3 = subset(A0, A0$buy) %>% mutate_at(c("monetary","revenue","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)

#建立模型
lm1 = lm(amount ~ ., TR2[,c(2:11)])
summary(lm1)


Call:
lm(formula = amount ~ ., data = TR2[, c(2:11)])

Residuals:
    Min      1Q  Median      3Q     Max 
-1.9841 -0.2311  0.0465  0.2809  1.5430 

Coefficients:
              Estimate Std. Error t value             Pr(>|t|)    
(Intercept)   1.405737   0.056689   24.80 < 0.0000000000000002 ***
agea29        0.044212   0.025291    1.75               0.0805 .  
agea34        0.102705   0.023243    4.42    0.000010046777673 ***
agea39        0.126071   0.022889    5.51    0.000000037318791 ***
agea44        0.103045   0.023437    4.40    0.000011122781602 ***
agea49        0.082340   0.024272    3.39               0.0007 ***
agea54        0.073336   0.026761    2.74               0.0061 ** 
agea59        0.057401   0.031235    1.84               0.0661 .  
agea64        0.045274   0.032362    1.40               0.1619    
agea69       -0.033785   0.028674   -1.18               0.2387    
areaz106      0.026646   0.041911    0.64               0.5249    
areaz110     -0.008423   0.034046   -0.25               0.8046    
areaz114     -0.031301   0.035996   -0.87               0.3846    
areaz115     -0.033412   0.031347   -1.07               0.2865    
areaz221     -0.009636   0.031560   -0.31               0.7601    
areazOthers  -0.020780   0.033897   -0.61               0.5399    
recency       0.000330   0.000318    1.04               0.2994    
seniority     0.000114   0.000322    0.35               0.7229    
frquency      0.026753   0.002104   12.71 < 0.0000000000000002 ***
pieces        0.005315   0.000775    6.86    0.000000000007288 ***
monetary      0.391573   0.043044    9.10 < 0.0000000000000002 ***
revenue       0.047772   0.038784    1.23               0.2181    
gross_margin  0.290456   0.037517    7.74    0.000000000000011 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.42 on 8784 degrees of freedom
Multiple R-squared:  0.296, Adjusted R-squared:  0.294 
F-statistic:  168 on 22 and 8784 DF,  p-value: <0.0000000000000002

#對12月到2月的交易資料進行彙總
load("data/tf0.rdata")
d0 = max(X0$date) + 1
B = X0 %>% 
  filter(date >= as.Date("2000-12-01")) %>% 
  mutate(days = as.integer(difftime(d0, date, units="days"))) %>% 
  group_by(cust) %>% 
  summarise(
    age = age[1],          # 年齡
    area = area[1],        # 地區
    recency = min(days),     # 最近一次購買，距今日的天數
    seniority = max(days),   # 第一次購買，距今日的天數
    frquency = n(),          # 購買次數
    pieces = mean(pieces),   # 平均交易購買產品件數
    monetary = mean(money),  # 平均交易金額
    revenue = sum(money),    # 交易總額
    gross_margin = mean(gross_margin) #平均毛利率
  ) %>% data.frame      
nrow(B)

[1] 27048

In B, there is a record for each customer. B$Buy is the probability of buying in March.

#預測3月會不會購買的機率
B$Buy = predict(glm1, B, type="response")

💡：預測購買金額時要記得做指數、對數轉換！

#預測三月可能購買的金額，且在轉換時要取對數
B2 = B %>% mutate_at(c("monetary","revenue"), log10)
B$Rev = 10^predict(lm1, B2)

購買機率與預期營收的分布

par(mfrow=c(1,2), cex=0.8)
hist(B$Buy)
hist(log(B$Rev,10))

B = left_join(B, A[, c(1,13)])

Joining, by = "cust"

BG3 = B %>% filter(group == "3")
BG4 = B %>% filter(group == "4")

par(mfrow=c(1,2), cex=0.8)
hist(BG3$Buy)
hist(log(BG3$Rev,10))

hist(BG4$Buy)
hist(log(BG4$Rev,10))

帶有「參數」的成本效益函數

#delta P指的是購買機率增加的%數，M是預期客單價，x是成本。
DP = function(x,m0,b0,a0) {m0*plogis((10/a0)*(x-b0))}

par(mar=c(4,4,2,1),mfrow=c(1,2),cex=0.7)
curve(DP(x,m=0.2,b=15,a=20), 0, 30, lwd=2, ylim=c(0, 0.25),
      main="F( x | m=0.2, b=15, a=20 )", ylab="delta P")
abline(h=seq(0,0.2,0.05),v=seq(0,30,5),col='lightgrey',lty=2)

m=0.1; b=150; a=200; x=200;
dp = DP(x,m,b,a)
dp = ifelse(BG4$Buy+dp>1, 1-BG4$Buy, dp)
eR = dp*BG4$Rev - x
hist(eR)

m=0.1; b=50; a=100; X = seq(10,120,1)
#行銷策略對eR>0的人滿額Y送折價券，發給N個人，而eReturn2為可預測之獲利
#dp = 機率增額，eR = expected return
sapply(X, function(x) {
  dp = DP(x,m,b,a)
  dp = ifelse(B$Buy+dp>1, 1-B$Buy, dp)
  eR = dp*B$Rev - x*(B$Buy+dp)#x乘上(購買率+購買率增額)
  c(x=x, eReturn=sum(eR), N=sum(eR > 0), eReturn2=sum(eR[eR > 0]))
  }) %>% t %>% data.frame %>% 
  gather('key','value',-x) %>% 
  ggplot(aes(x=x, y=value, col=key)) + 
  geom_hline(yintercept=0,linetype='dashed') +
  geom_line(size=1.5,alpha=0.5) + 
  facet_wrap(~key,ncol=1,scales='free_y') + theme_bw()

#針對第三群的行銷工具
#最佳滿額贈品
gm = mean(BG3$gross_margin)
dPm = 0.3 #顧客回購率上升的S曲線
dPb = 50
dPa = 30
dMm = 0.3 #顧客回購後購買金額上升的S曲線
dMb = 50
dMa = 30
 
X = seq(10, 100, 5) 
df2 = sapply(X, function(x) {
    dp = DP(x,dPm,dPb,dPa)
    dp = ifelse(BG3$Buy+dp>1, 1-BG3$Buy, dp)
    dm = DP(x,dMm,dMb,dMa)
    eR = gm * ((BG3$Buy+dp)*BG3$Rev*(1+dm) - BG3$Buy*BG3$Rev) - x
    c(x=x, eR.ALL=sum(eR), N=sum(eR>0), eR.SEL=sum(eR[eR > 0]) )
    }) %>% t %>% data.frame 


df2 %>% gather('key','value', -x) %>% 
  mutate(Instrument = paste0('Instrument')) %>% 
  ggplot(aes(x=x, y=value, col=Instrument)) + 
  geom_hline(yintercept=0, linetype='dashed', col='blue') +
  geom_line(size=1.5,alpha=0.5) + 
  xlab('工具選項(成本)') + ylab('預期報償') + 
  ggtitle('行銷工具優化','假設行銷工具的效果是其成本的函數') +
    facet_wrap(~key,ncol=1,scales='free_y') + theme_bw()

第三組最佳策略

df2 %>% top_n(1,eR.ALL)

   x eR.ALL    N eR.SEL
1 65 340291 3166 340810

#針對第四群的行銷工具
#明星商品
#我們發現
gm = mean(BG4$gross_margin)
dPm = 0.5#顧客回購率上升的S曲線
dPb = 25
dPa = 10
dMm = 0.5#顧客回購後購買金額上升的S曲線
dMb = 25
dMa = 10

X = seq(10, 100, 1) 
df = sapply(X, function(x) {
    dp = DP(x,dPm,dPb,dPa)
    dp = ifelse(BG4$Buy+dp>1, 1-BG4$Buy, dp)
    dm = DP(x,dMm,dMb,dMa)
    eR = gm * ((BG4$Buy+dp)*BG4$Rev*(1+dm) - BG4$Buy*BG4$Rev) - x
    c(x=x, eR.ALL=sum(eR), N=sum(eR>0), eR.SEL=sum(eR[eR > 0]) )
    }) %>% t %>% data.frame

df %>% gather('key','value',-x) %>% 
  mutate(Instrument = paste0('Instrument')) %>% 
  ggplot(aes(x=x, y=value, col=Instrument)) + 
  geom_hline(yintercept=0, linetype='dashed', col='blue') +
  geom_line(size=1.5,alpha=0.5) + 
  xlab('工具選項(成本)') + ylab('預期報償') + 
  ggtitle('行銷工具優化','假設行銷工具的效果是其成本的函數') +
    facet_wrap(~key,ncol=1,scales='free_y') + theme_bw()

第四組最佳策略

df %>% top_n(1,eR.ALL)

   x eR.ALL    N eR.SEL
1 30 425431 7577 429626