ZZ = read_csv("data/ta_feng_all_months_merged.csv") %>%
data.frame %>% setNames(c(
"date","cust","age","area","cat","prod","qty","cost","price"))Parsed with column specification:
cols(
TRANSACTION_DT = col_character(),
CUSTOMER_ID = col_character(),
AGE_GROUP = col_character(),
PIN_CODE = col_character(),
PRODUCT_SUBCLASS = col_double(),
PRODUCT_ID = col_character(),
AMOUNT = col_double(),
ASSET = col_double(),
SALES_PRICE = col_double()
)
qty cost price
99% 6 858.0 1014.0
99.9% 14 2722.0 3135.8
99.95% 24 3799.3 3999.0
Z$date = as.Date(Z$date, format="%m/%d/%Y") #日期格式轉換
par(cex=0.8)
hist(Z$date,'weeks',freq=T,las=2, main="No. Transaction per Week")age.group = c("<25","25-29","30-34","35-39","40-44",
"45-49","50-54","55-59","60-64",">65")
Z$age = c(paste0("a",seq(24,69,5)),"a99")[match(Z$age,age.group,11)]
Z$area = paste0("z",Z$area)par(mfrow=c(1,2),cex=0.7)
table(Z$age, useNA='ifany') %>% barplot(main="Age Groups", las=2)
table(Z$area, useNA='ifany') %>% barplot(main="Areas", las=2)把每一天、每一位顧客的交易項目彙總為一張訂單
cust cat prod tid
32256 2007 23789 119422
XX = Z %>% group_by(tid) %>%
summarise(
date = min(date), # 交易日期
cust = min(cust), # 顧客 ID
age = min(age), # 顧客 年齡級別
area = min(area), # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
money = sum(price), # 交易(總)金額
costs = sum(cost), # 交易(總)成本
gross = sum(price - cost), # 毛利
gross_margin = gross/money
) %>%
#將年齡70以上、地區未知的去掉
filter(!age %in% c("a99")) %>%
filter(!area %in% c("zUnknown")) %>% data.frame
nrow(X) #111066[1] 111066
items pieces money costs gross gross_margin
0.05% 1.000 1.00 10 9 -284.94 -0.92308
99.99% 82.000 130.68 16267 13817 3263.31 0.53125
99.995% 91.681 147.91 20198 16332 3660.70 0.58494
Ad0 = max(X$date) + 1
A = X %>%
mutate(
days = as.integer(difftime(d0, date, units="days"))) %>%
group_by(cust) %>%
summarise(
age = min(age), # 年齡
area = min(area), # 地區
recency = min(days), # 最近一次購買,距今日的天數
seniority = max(days), # 第一次購買,距今日的天數
frquency = n(), # 購買次數
pieces = mean(pieces), # 平均交易購買產品件數
monetary = mean(money), # 平均交易金額
revenue = sum(money), # 交易總額
gross_margin = mean(gross_margin) #平均毛利率
) %>% data.frame
nrow(A) [1] 30510
X %>% mutate(wdate=format(date, "%u")) %>%
group_by(cust) %>% summarise(
weekday = mean(wdate <= 5)
) %>% right_join(A) -> AJoining, by = "cust"
A = A %>% mutate(
Daytype = ifelse(weekday == 0, "WEND",
ifelse(weekday == 1, "WDAY", "MIXED")) %>% factor)
table(A$Daytype , cut(A$frquency, c(0, 4, 8, 12, 20, 30, Inf)) ) %>%
prop.table(2) %>% round(3)
(0,4] (4,8] (8,12] (12,20] (20,30] (30,Inf]
MIXED 0.276 0.848 0.944 0.977 0.993 1.000
WDAY 0.435 0.127 0.051 0.022 0.007 0.000
WEND 0.289 0.025 0.005 0.001 0.000 0.000
par(mfrow=c(1, 3),cex=0.7)
table(A$age, useNA='ifany') %>% barplot(main="Age Groups",las=2)
table(A$area, useNA='ifany') %>% barplot(main="Areas",las=2)
table(A$Daytype, useNA='ifany') %>% barplot(main="Daytype",las=2)A %>% group_by(age) %>% summarise(
Group.Size = n(), # 族群人數
avg.Freq = mean(frquency), # 平均購買次數
avg.Revenue = sum(frquency * monetary)/sum(frquency) # 平均客單價
) %>%
ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
geom_point(aes(col=age, size=Group.Size), alpha=0.5) +
geom_text(aes(label=age)) +
scale_size(range=c(5,25)) +
theme_bw() + theme(legend.position="none") +
ggtitle("年齡區隔特徵 (泡泡大小:族群人數)") +
ylab("平均購買次數") + xlab("平均客單價")A %>% group_by(area) %>% summarise(
Group.Size = n(), # 族群人數
avg.Freq = mean(frquency), # 平均購買次數
avg.Revenue = sum(frquency * monetary)/sum(frquency) # 平均客單價
) %>%
ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
geom_point(aes(col=area, size=Group.Size), alpha=0.5) +
geom_text(aes(label=area)) +
scale_size(range=c(5,25)) +
theme_bw() + theme(legend.position="none") +
ggtitle("地理區隔特徵 (泡泡大小:族群人數)") +
ylab("平均購買次數") + xlab("平均客單價")
💡 主要發現:
※ 「平均購買次數」和「平均客單價」之間有明顯的負相關
§ 30~40歲(年長60-70歲)的人比較少(常)來買、但每一次買的比較多(少)
§ 若以Z115、Z221為中心,住的遠(近)的人比較少(常)來買、但每一次買的比較多(少)
使用馬賽克圖檢視列連表的關聯性(Association between Categorial Variables) + 方塊大小代表該類別組合的數量 + 紅(藍)色代表該類別組合的數量顯著小(大)於期望值 + 期望值就是邊際機率(如上方的直條圖所示)的乘積 + 卡方檢定(類別變數的關聯性檢定)的p值顯示在圖示最下方 + p-value < 2.22e-16 : age 與 area 之間有顯著的關聯性
MOSA = function(formula, data) mosaic(formula, data, shade=T,
margins=c(0,1,0,0), labeling_args = list(rot_labels=c(90,0,0,0)),
gp_labels=gpar(fontsize=9), legend_args=list(fontsize=9),
gp_text=gpar(fontsize=7),labeling=labeling_residuals)
MOSA(~area+age, A)
💡 主要發現:
※ 「年齡」與「地區」之間有很高的關聯性
§ 南港(z115):30~40歲的顧客比率比較低、50歲以上的顧客比率比較高
§ 汐止(z221)、內湖(z114)和其他(zOthers):3040歲的顧客比率比較高、4555歲的顧客比率比較低
💡 主要發現:
※ 「年齡」與「購物日」之間有關聯性
§ 平日時,年齡為65-70歲的顧客較常出沒、年齡為30-40歲的顧客幾乎不會來 § 相反地,禮拜日時,年齡為30-40歲的顧客較常出沒、年齡為50歲以上的顧客幾乎不會來
💡 主要發現:
※ 「地區」與「購物日」之間有很高的關聯性
§ 平日時,南港(z115)生意興隆,剩下的地區較沒有生意 § 假日時,南港(z115)生意慘淡,顧客都來反倒來信義(z110)消費
#採用集群式分析k-means,將資料依照不同屬性將顧客做分群
#k-means分完群會把分群的向量放在“cluster”這個欄位裡面
set.seed(11)
gp= A$group= kmeans(scale(A[,c(2, 5:9, 11)]),4)$cluster
table(gp) # 族群大小gp
1 2 3 4
7559 11284 3604 8063
#做各群的泡泡圖,數值代表族群大小
#泡泡大小:貢獻毛利率;顏色:平均每次消費購買數
#X軸:是購買頻率;Y軸:最近一次購買距今的時間
group_by(A, group) %>% summarise(
weekday = mean(weekday),
recency = mean(recency),
seniority = mean(seniority),
frquency = mean(frquency),
pieces = mean(pieces),
monetary = mean(monetary),
revenue = mean(revenue),
gross_margin = mean(gross_margin),
size=n() ) %>%
filter(size > 1) %>%
ggplot(aes(x = frquency, y = recency)) +
geom_point(aes(size= gross_margin, col= pieces),alpha=0.5) +
scale_size(range=c(2,30)) +
scale_color_gradient(low="blue",high="red") +
scale_x_log10() + scale_y_continuous() +
geom_text(aes(label = size ),size=3) +
theme_bw() + guides(size=F) +
labs(title="Customer Segments",
subtitle="(bubble_size:Gross_margin; text:Group_size)",
color="pieces") +
xlab("frquency") + ylab("recency")#將用得到的幾個參數獨立挑出來,放在一個table中
cc = A %>% select("seniority", "recency", "frquency","weekday", "pieces", "monetary", "gross_margin")
#將CC做常規化,把平均值調為0,標準差調為1
AN = scale(cc) %>% data.frame
sapply(AN, mean) seniority recency frquency
0.000000000000000027527 -0.000000000000000019501 -0.000000000000000023395
weekday pieces monetary
-0.000000000000000102981 -0.000000000000000054087 -0.000000000000000025898
gross_margin
-0.000000000000000036200
seniority recency frquency weekday pieces monetary
1 1 1 1 1 1
gross_margin
1
#查看原始尺度中各數值對應各族群的大小
names(cc) = names(AN) =c("seniority", "recency", "frquency","weekday", "pieces", "monetary", "gross_margin")
# 原始尺度
sapply(split(cc,gp), colMeans) %>% round(3) 1 2 3 4
seniority 94.914 104.765 77.263 35.331
recency 83.418 14.280 43.362 23.538
frquency 1.582 6.932 2.163 1.619
weekday 0.558 0.647 0.443 0.629
pieces 7.952 8.372 28.325 7.957
monetary 712.222 751.530 2940.579 761.342
gross_margin 0.106 0.105 0.173 0.126
1 2 3 4
seniority 0.417 0.706 -0.102 -1.334
recency 1.375 -0.686 0.181 -0.410
frquency -0.458 0.732 -0.328 -0.449
weekday -0.098 0.131 -0.394 0.085
pieces -0.282 -0.236 1.957 -0.281
monetary -0.298 -0.257 1.983 -0.247
gross_margin -0.086 -0.093 0.367 0.047
par(cex=0.8)
split(AN,gp) %>% sapply(colMeans) %>% barplot(beside=T,col=rainbow(7))
legend('topright',legend=colnames(cc),fill=rainbow(7))Remove data after the demarcation date
#將Z交易資料(11月到1月)做處理
X0 = Z0 %>% group_by(tid) %>%
summarise(
date = min(date), # 交易日期
cust = min(cust), # 顧客 ID
age = min(age), # 顧客 年齡級別
area = min(area), # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
money = sum(price), # 交易(總)金額
costs = sum(cost), # 交易(總)成本
gross = sum(price - cost), # 毛利
gross_margin = gross/money
) %>% data.frame # 88387 items pieces money costs gross gross_margin
99.9% 56.000 84.00 9378.7 7797.4 1883.2 0.40035
99.95% 64.000 98.00 11261.8 9151.6 2317.1 0.43164
99.99% 85.646 137.65 17699.3 14108.5 3389.6 0.53125
#把X資料集顧客資料做彙總
d0 = max(X0$date) + 1
A0 = X0 %>%
mutate(
days = as.integer(difftime(d0, date, units="days"))) %>%
group_by(cust) %>%
summarise(
age = min(age), # 年齡
area = min(area), # 地區
recency = min(days), # 最近一次購買,距今日的天數
seniority = max(days), # 第一次購買,距今日的天數
frquency = n(), # 購買次數
pieces = mean(pieces), # 平均交易購買產品件數
monetary = mean(money), # 平均交易金額
revenue = sum(money), # 交易總額
gross_margin = mean(gross_margin) #平均毛利率
) %>% data.frame
nrow(A0) #28581 [1] 28581
A$amountSimply a Left Joint
A$buy
FALSE TRUE
FALSE 15999 0
TRUE 0 12582
#針對四個月的資料,將資料分成TR(0.7)和TS(0.3),來測試模型的準確度
X = subset(X, cust %in% A0$cust & date < as.Date("2001-02-01"))
Z = subset(Z, cust %in% A0$cust & date < as.Date("2001-02-01"))
set.seed(2018); spl = sample.split(A0$buy, SplitRatio=0.7)
c(nrow(A), sum(spl), sum(!spl))[1] 30510 20006 8575
#畫出密度曲線圖
cbind(A0, spl) %>% filter(buy) %>%
ggplot(aes(x=log(amount))) + geom_density(aes(fill=spl), alpha=0.5)#透過有來買的人來預測之後會來買的價錢,用A做機率模型
A2 = subset(A0, buy) %>% mutate_at(c("monetary","revenue","amount"), log10)
n = nrow(A2)
set.seed(2018); spl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(A2), sum(spl2), sum(!spl2))[1] 12582 8807 3775
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Call:
glm(formula = buy ~ ., family = binomial(), data = TR[, c(2:10,
12)])
Deviance Residuals:
Min 1Q Median 3Q Max
-3.613 -0.862 -0.650 0.991 1.885
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.1972523 0.1295743 -9.24 < 0.0000000000000002 ***
agea29 -0.0156346 0.0888963 -0.18 0.86039
agea34 0.0693388 0.0819900 0.85 0.39772
agea39 0.1105934 0.0812170 1.36 0.17329
agea44 0.0512668 0.0837627 0.61 0.54051
agea49 0.0309601 0.0871720 0.36 0.72247
agea54 -0.0743826 0.0957263 -0.78 0.43714
agea59 0.1346176 0.1153689 1.17 0.24327
agea64 0.1202517 0.1212237 0.99 0.32121
agea69 0.2046125 0.1075123 1.90 0.05702 .
agea99 -16.6760779 106.7462650 -0.16 0.87586
areaz106 -0.0291434 0.1351816 -0.22 0.82931
areaz110 -0.2223265 0.1054748 -2.11 0.03504 *
areaz114 -0.0353269 0.1130661 -0.31 0.75470
areaz115 0.2089803 0.0983745 2.12 0.03364 *
areaz221 0.0900806 0.0991870 0.91 0.36378
areazOthers -0.0678827 0.1057391 -0.64 0.52088
areazUnknown -27.5223923 116.5778841 -0.24 0.81337
recency -0.0135307 0.0009220 -14.68 < 0.0000000000000002 ***
seniority 0.0105512 0.0009403 11.22 < 0.0000000000000002 ***
frquency 0.2970780 0.0163838 18.13 < 0.0000000000000002 ***
pieces 0.0096962 0.0027934 3.47 0.00052 ***
monetary -0.0000563 0.0000345 -1.63 0.10265
revenue -0.0000122 0.0000125 -0.97 0.32975
gross_margin -0.6880819 0.1132861 -6.07 0.0000000012 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 27448 on 20005 degrees of freedom
Residual deviance: 22049 on 19981 degrees of freedom
AIC: 22099
Number of Fisher Scoring iterations: 16
#使用測試集來檢驗當在0.5的機率下,會實際會購買的人數與不會購買的數量分布
pred = predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm predict
actual FALSE TRUE
FALSE 3923 877
TRUE 1574 2201
#平均值代表顧客族群裡面購買比率,代表平均會回購的機率。用平均值跟預測模型的準確度比,下值發現模型的準確度比單就平均值猜的機率更大
acc.ts = cm %>% {sum(diag(.))/sum(.)}
c(1-mean(TS$buy) , acc.ts) # 0.71417[1] 0.55977 0.71417
[,1]
FALSE vs. TRUE 0.77885
#將資料集A切割成訓練資料和測試資料並做LOG處理
A3 = subset(A0, A0$buy) %>% mutate_at(c("monetary","revenue","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
Call:
lm(formula = amount ~ ., data = TR2[, c(2:11)])
Residuals:
Min 1Q Median 3Q Max
-1.9841 -0.2311 0.0465 0.2809 1.5430
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.405737 0.056689 24.80 < 0.0000000000000002 ***
agea29 0.044212 0.025291 1.75 0.0805 .
agea34 0.102705 0.023243 4.42 0.000010046777673 ***
agea39 0.126071 0.022889 5.51 0.000000037318791 ***
agea44 0.103045 0.023437 4.40 0.000011122781602 ***
agea49 0.082340 0.024272 3.39 0.0007 ***
agea54 0.073336 0.026761 2.74 0.0061 **
agea59 0.057401 0.031235 1.84 0.0661 .
agea64 0.045274 0.032362 1.40 0.1619
agea69 -0.033785 0.028674 -1.18 0.2387
areaz106 0.026646 0.041911 0.64 0.5249
areaz110 -0.008423 0.034046 -0.25 0.8046
areaz114 -0.031301 0.035996 -0.87 0.3846
areaz115 -0.033412 0.031347 -1.07 0.2865
areaz221 -0.009636 0.031560 -0.31 0.7601
areazOthers -0.020780 0.033897 -0.61 0.5399
recency 0.000330 0.000318 1.04 0.2994
seniority 0.000114 0.000322 0.35 0.7229
frquency 0.026753 0.002104 12.71 < 0.0000000000000002 ***
pieces 0.005315 0.000775 6.86 0.000000000007288 ***
monetary 0.391573 0.043044 9.10 < 0.0000000000000002 ***
revenue 0.047772 0.038784 1.23 0.2181
gross_margin 0.290456 0.037517 7.74 0.000000000000011 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.42 on 8784 degrees of freedom
Multiple R-squared: 0.296, Adjusted R-squared: 0.294
F-statistic: 168 on 22 and 8784 DF, p-value: <0.0000000000000002
#對12月到2月的交易資料進行彙總
load("data/tf0.rdata")
d0 = max(X0$date) + 1
B = X0 %>%
filter(date >= as.Date("2000-12-01")) %>%
mutate(days = as.integer(difftime(d0, date, units="days"))) %>%
group_by(cust) %>%
summarise(
age = age[1], # 年齡
area = area[1], # 地區
recency = min(days), # 最近一次購買,距今日的天數
seniority = max(days), # 第一次購買,距今日的天數
frquency = n(), # 購買次數
pieces = mean(pieces), # 平均交易購買產品件數
monetary = mean(money), # 平均交易金額
revenue = sum(money), # 交易總額
gross_margin = mean(gross_margin) #平均毛利率
) %>% data.frame
nrow(B)[1] 27048
In B, there is a record for each customer. B$Buy is the probability of buying in March.
💡: 預測購買金額時要記得做指數、對數轉換!
#預測三月可能購買的金額,且在轉換時要取對數
B2 = B %>% mutate_at(c("monetary","revenue"), log10)
B$Rev = 10^predict(lm1, B2)Joining, by = "cust"
#delta P指的是購買機率增加的%數,M是預期客單價,x是成本。
DP = function(x,m0,b0,a0) {m0*plogis((10/a0)*(x-b0))}
par(mar=c(4,4,2,1),mfrow=c(1,2),cex=0.7)
curve(DP(x,m=0.2,b=15,a=20), 0, 30, lwd=2, ylim=c(0, 0.25),
main="F( x | m=0.2, b=15, a=20 )", ylab="delta P")
abline(h=seq(0,0.2,0.05),v=seq(0,30,5),col='lightgrey',lty=2)
m=0.1; b=150; a=200; x=200;
dp = DP(x,m,b,a)
dp = ifelse(BG4$Buy+dp>1, 1-BG4$Buy, dp)
eR = dp*BG4$Rev - x
hist(eR)m=0.1; b=50; a=100; X = seq(10,120,1)
#行銷策略對eR>0的人滿額Y送折價券,發給N個人,而eReturn2為可預測之獲利
#dp = 機率增額,eR = expected return
sapply(X, function(x) {
dp = DP(x,m,b,a)
dp = ifelse(B$Buy+dp>1, 1-B$Buy, dp)
eR = dp*B$Rev - x*(B$Buy+dp)#x乘上(購買率+購買率增額)
c(x=x, eReturn=sum(eR), N=sum(eR > 0), eReturn2=sum(eR[eR > 0]))
}) %>% t %>% data.frame %>%
gather('key','value',-x) %>%
ggplot(aes(x=x, y=value, col=key)) +
geom_hline(yintercept=0,linetype='dashed') +
geom_line(size=1.5,alpha=0.5) +
facet_wrap(~key,ncol=1,scales='free_y') + theme_bw()#針對第三群的行銷工具
#最佳滿額贈品
gm = mean(BG3$gross_margin)
dPm = 0.3 #顧客回購率上升的S曲線
dPb = 50
dPa = 30
dMm = 0.3 #顧客回購後購買金額上升的S曲線
dMb = 50
dMa = 30
X = seq(10, 100, 5)
df2 = sapply(X, function(x) {
dp = DP(x,dPm,dPb,dPa)
dp = ifelse(BG3$Buy+dp>1, 1-BG3$Buy, dp)
dm = DP(x,dMm,dMb,dMa)
eR = gm * ((BG3$Buy+dp)*BG3$Rev*(1+dm) - BG3$Buy*BG3$Rev) - x
c(x=x, eR.ALL=sum(eR), N=sum(eR>0), eR.SEL=sum(eR[eR > 0]) )
}) %>% t %>% data.frame
df2 %>% gather('key','value', -x) %>%
mutate(Instrument = paste0('Instrument')) %>%
ggplot(aes(x=x, y=value, col=Instrument)) +
geom_hline(yintercept=0, linetype='dashed', col='blue') +
geom_line(size=1.5,alpha=0.5) +
xlab('工具選項(成本)') + ylab('預期報償') +
ggtitle('行銷工具優化','假設行銷工具的效果是其成本的函數') +
facet_wrap(~key,ncol=1,scales='free_y') + theme_bw()第三組最佳策略
x eR.ALL N eR.SEL
1 65 340291 3166 340810
#針對第四群的行銷工具
#明星商品
#我們發現
gm = mean(BG4$gross_margin)
dPm = 0.5#顧客回購率上升的S曲線
dPb = 25
dPa = 10
dMm = 0.5#顧客回購後購買金額上升的S曲線
dMb = 25
dMa = 10
X = seq(10, 100, 1)
df = sapply(X, function(x) {
dp = DP(x,dPm,dPb,dPa)
dp = ifelse(BG4$Buy+dp>1, 1-BG4$Buy, dp)
dm = DP(x,dMm,dMb,dMa)
eR = gm * ((BG4$Buy+dp)*BG4$Rev*(1+dm) - BG4$Buy*BG4$Rev) - x
c(x=x, eR.ALL=sum(eR), N=sum(eR>0), eR.SEL=sum(eR[eR > 0]) )
}) %>% t %>% data.frame
df %>% gather('key','value',-x) %>%
mutate(Instrument = paste0('Instrument')) %>%
ggplot(aes(x=x, y=value, col=Instrument)) +
geom_hline(yintercept=0, linetype='dashed', col='blue') +
geom_line(size=1.5,alpha=0.5) +
xlab('工具選項(成本)') + ylab('預期報償') +
ggtitle('行銷工具優化','假設行銷工具的效果是其成本的函數') +
facet_wrap(~key,ncol=1,scales='free_y') + theme_bw()第四組最佳策略
x eR.ALL N eR.SEL
1 30 425431 7577 429626