pacman::p_load(caTools, ggplot2, dplyr, d3heatmap,vcd,latex2exp,Martix,tidyr,arules,arulesViz)
load("data/tf0.rdata")
load("data/tf4.rdata")
load("data/tf3.rdata")
par(mfrow=c(1,2),cex=0.7)
table(A0$age) %>% barplot(las=2,main="Age Groups")
table(A0$area) %>% barplot(las=2,main="Areas")
Fig-2: Zip Codes
## [1] 21
## [1] 2
## [1] 691.4
Status = function(r,f,m) {
ifelse(r>= 21,
ifelse( f >= 2, ifelse( m > 691.4, "D", "C"), ifelse( m >= 691.4, "B", "A")),
ifelse( f >= 2, ifelse( m > 691.4, "H", "G"), ifelse( m >= 691.4, "F", "E"))
)
}
# f剛好等於50%值為2,但f為2的涵蓋範圍過大(可能40~60%都是2)
# 這裡是將>1者都列為>50%,f=1(只來過一次)列為<50% )
new_B = B %>% group_by(cust) %>%
mutate( status = Status(r,f,m) )
table(new_B$status) # A至H為依據白板上圖片分群,由左至右排序
##
## A B C D E F G H
## 4226 4581 2621 2917 1567 1497 5852 5270
MOSA = function(formula, data) mosaic(formula, data, shade=T,
margins=c(0,1,0,0), labeling_args = list(rot_labels=c(90,0,0,0)),
gp_labels=gpar(fontsize=9), legend_args=list(fontsize=9),
gp_text=gpar(fontsize=7),labeling=labeling_residuals)
MOSA(~status+age, new_B) #馬賽克圖
new_B %>% filter(status %in% c("B", "D", "F") ) %>% group_by(age, status) %>%
summarise(total_raw = sum(raw, na.rm=T ) ) # 得出3大族群中的不同年紀範圍,購買產生毛利之總額
## # A tibble: 33 x 3
## # Groups: age [11]
## age status total_raw
## <chr> <chr> <int>
## 1 a24 B 53837
## 2 a24 D 60151
## 3 a24 F 12311
## 4 a29 B 129656
## 5 a29 D 159485
## 6 a29 F 35364
## 7 a34 B 280311
## 8 a34 D 405058
## 9 a34 F 89243
## 10 a39 B 323603
## # … with 23 more rows
B1 = new_B %>% filter(status %in% c("B") ) %>% merge(Z0, by = "cust" ) %>% group_by(prod) %>% summarise(total_raw = sum(raw, na.rm=T) ) # 分群B(卜學亮族群)中,不同商品的毛利總額
D1 = new_B %>% filter(status %in% c("D") ) %>% merge(Z0, by = "cust" ) %>% group_by(prod) %>% summarise(total_raw = sum(raw, na.rm=T) ) # 分群D(周杰倫族群)中,不同商品的毛利總額
F1 = new_B %>% filter(status %in% c("F") ) %>% merge(Z0, by = "cust" ) %>% group_by(prod) %>% summarise(total_raw = sum(raw, na.rm=T) ) # 分群B(高爾宣族群)中,不同商品的毛利總額
#商品關聯性分析
tapply((Z0$price-Z0$cost), Z0$prod, sum) %>% sort(dec=T) %>% names -> TOP
tr = as(split(Z0[,"prod"], Z0[,"tid"]),"transactions")
R <- apriori(tr, parameter=list(supp=0.0001, conf=0.25))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.25 0.1 1 none FALSE TRUE 5 1e-04 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 11
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[23789 item(s), 119422 transaction(s)] done [0.50s].
## sorting and recoding items ... [10166 item(s)] done [0.02s].
## creating transaction tree ... done [0.09s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 done [0.90s].
## writing ... [9795 rule(s)] done [0.27s].
## creating S4 object ... done [0.07s].
## lhs rhs support confidence lift count
## [1] {4716114000312} => {4716114000329} 0.0012727973 0.5527273 231.60630 152
## [2] {4716114000329} => {4716114000312} 0.0012727973 0.5333333 231.60630 152
## [3] {4710154015138} => {4710154015206} 0.0009964663 0.3742138 52.08551 119
## [4] {4713754987614} => {4713754987607} 0.0011388186 0.3035714 78.46993 136
## [5] {4713754987607} => {4713754987614} 0.0011388186 0.2943723 78.46993 136
## [6] {4710011402026} => {4710011402019} 0.0028219256 0.6740000 90.23591 337
## [7] {4710088414328} => {4710088414311} 0.0017919646 0.4662309 86.18921 214
## [8] {4710011401142} => {4710011406123} 0.0015323810 0.4130926 50.33912 183
## [9] {4710085172702} => {4710085172696} 0.0024283633 0.5400372 62.01185 290
## [10] {4710254049323} => {4710254049521} 0.0020096800 0.4308797 55.56859 240
## [11] {4710011409056} => {4710011406123} 0.0026293313 0.4137022 50.41342 314
## [12] {4710011409056} => {4710011401128} 0.0044464169 0.6996047 51.03738 531
## [13] {4710085120093} => {4710085172696} 0.0037430289 0.4977728 57.15868 447
## [14] {4710011401135} => {4710011401128} 0.0058615665 0.7526882 54.90991 700
## [15] {4710011401142,
## 4710011409056} => {4710011401128} 0.0011974343 0.7447917 54.33385 143
## [16] {4710011401135,
## 4710011401142} => {4710011406123} 0.0008876086 0.4840183 58.98207 106
## [17] {4710011401135,
## 4710011401142} => {4710011401128} 0.0013900286 0.7579909 55.29675 166
## [18] {4710011401142,
## 4710011405133} => {4710011401128} 0.0012476763 0.6866359 50.09129 149
## [19] {4710011401128,
## 4710011401142} => {4710011406123} 0.0010132136 0.4566038 55.64136 121
## [20] {4710085120093,
## 4710085172702} => {4710085172696} 0.0013481603 0.6544715 75.15221 161
## [21] {4710085120093,
## 4710085172702} => {4710085120628} 0.0012811710 0.6219512 54.73446 153
## [22] {4710085172696,
## 4710085172702} => {4710085120628} 0.0014905126 0.6137931 54.01651 178
## [23] {4710085120628,
## 4710085172702} => {4710085172696} 0.0014905126 0.6054422 69.52223 178
## [24] {4710011401135,
## 4710011409056} => {4710011406123} 0.0015993703 0.4716049 57.46939 191
## [25] {4710011401135,
## 4710011409056} => {4710011401128} 0.0027214416 0.8024691 58.54152 325
## [26] {4710011405133,
## 4710011409056} => {4710011406123} 0.0014737653 0.4929972 60.07624 176
## [27] {4710011405133,
## 4710011409056} => {4710011401128} 0.0022776373 0.7619048 55.58228 272
## [28] {4710011406123,
## 4710011409056} => {4710011401128} 0.0019929326 0.7579618 55.29463 238
## [29] {4710011401128,
## 4710011409056} => {4710011406123} 0.0019929326 0.4482109 54.61862 238
## [30] {4710085120093,
## 4710085172696} => {4710085120628} 0.0021352850 0.5704698 50.20386 255
## [31] {4710085120093,
## 4710085120628} => {4710085172696} 0.0021352850 0.5391121 61.90561 255
## [32] {4710011401135,
## 4710011405133} => {4710011406123} 0.0016328650 0.4441913 54.12879 195
## [33] {4710011401135,
## 4710011405133} => {4710011401128} 0.0028386729 0.7722096 56.33403 339
## [34] {4710011401135,
## 4710011406123} => {4710011401128} 0.0024534843 0.8027397 58.56126 293
## [35] {4710011401128,
## 4710011401135} => {4710011406123} 0.0024534843 0.4185714 51.00677 293
## [36] {4710011405133,
## 4710011406123} => {4710011401128} 0.0022273953 0.7018470 51.20096 266
## [37] {4710011401128,
## 4710011405133} => {4710011406123} 0.0022273953 0.4290323 52.28152 266
## [38] {4710011401135,
## 4710011401142,
## 4710011409056} => {4710011401128} 0.0009462243 0.8560606 62.45111 113
## [39] {4710085120093,
## 4710085172696,
## 4710085172702} => {4710085120628} 0.0008792350 0.6521739 57.39419 105
## [40] {4710085120093,
## 4710085120628,
## 4710085172702} => {4710085172696} 0.0008792350 0.6862745 78.80411 105
## [41] {4710011401135,
## 4710011405133,
## 4710011409056} => {4710011406123} 0.0009964663 0.5265487 64.16479 119
## [42] {4710011401135,
## 4710011405133,
## 4710011409056} => {4710011401128} 0.0015742493 0.8318584 60.68552 188
## [43] {4710011401135,
## 4710011406123,
## 4710011409056} => {4710011401128} 0.0013397866 0.8376963 61.11141 160
## [44] {4710011401128,
## 4710011401135,
## 4710011409056} => {4710011406123} 0.0013397866 0.4923077 59.99221 160
## [45] {4710011405133,
## 4710011406123,
## 4710011409056} => {4710011401128} 0.0011639396 0.7897727 57.61530 139
## [46] {4710011401128,
## 4710011405133,
## 4710011409056} => {4710011406123} 0.0011639396 0.5110294 62.27363 139
## [47] {4710011401135,
## 4710011405133,
## 4710011406123} => {4710011401128} 0.0013565340 0.8307692 60.60606 162
## [48] {4710011401128,
## 4710011401135,
## 4710011405133} => {4710011406123} 0.0013565340 0.4778761 58.23359 162
## [49] {4710011401135,
## 4710011405133,
## 4710011406123,
## 4710011409056} => {4710011401128} 0.0008624876 0.8655462 63.14310 103
## [50] {4710011401128,
## 4710011401135,
## 4710011405133,
## 4710011409056} => {4710011406123} 0.0008624876 0.5478723 66.76328 103
new_Z0 = Z0 %>% merge(new_B, by = "cust") %>% filter( status %in% c('B','D','F') ) %>%
mutate(new_area = case_when( area.x == "z221" ~ "z221",
area.x == "z115" ~ "z115",
TRUE ~ " others") )
# 將Z0資料與消費者資料(new_B)合併,只取B(卜學亮族群)、D(周杰倫族群)、F(高爾宣族群)的data,再針對地區做分群,取z221及z115區,其餘的設為others
table(new_Z0$prod) %>% sort %>% tail(9) %>% names -> prod9 # 取出B,D,F中最受歡迎的9項產品
new_Z0 %>% filter(prod %in% prod9) %>%
group_by(prod, age.x, date) %>% summarise(
t.qty = sum(qty),
u.price = sum(price)/t.qty ,
status = first(status),
new_area = first(new_area)
) %>%
ggplot(aes(x=t.qty,y=u.price,col= new_area)) +
geom_smooth(method='lm',se=F) +
facet_wrap(~prod,scales="free") + theme_bw() # 畫出需求曲線(P-Q),針對9項最受歡迎的商品,在不同的地區(3種顏色),看出其需求彈性
## `geom_smooth()` using formula 'y ~ x'
BDF_prod = new_Z0 %>% filter(status %in% c("B", "D", "F") ) %>% group_by( prod) %>% summarise(count = n() )
# 得出B(卜學亮族群)、D(周杰倫族群)、F(高爾宣族群)中,每件產品的銷售量
BDF_prod = BDF_prod[order(BDF_prod$count, decreasing = T),] # 將結果由大到小排序
Z = read.csv("data/ta_feng_all_months_merged.csv") %>%
data.frame %>% setNames(c(
"date","cust","age","area","cat","prod","qty","cost","price"))
nrow(Z) #訂單項目
## [1] 817741
age.group = c("<25","25-29","30-34","35-39","40-44",
"45-49","50-54","55-59","60-64",">65")
Z$age = c(paste0("a",seq(24,69,5)),"a99")[match(Z$age,age.group,11)]
Z$area = paste0("z",Z$area)
par(mfrow=c(1,2),cex=0.7)
table(Z$age, useNA='ifany') %>% barplot(main="Age Groups", las=2)
table(Z$area,useNA='ifany') %>% barplot(main="Areas", las=2)
## qty cost price
## 99% 6 858.0 1014.00
## 99.9% 14 2722.0 3135.82
## 99.95% 24 3799.3 3999.00
## [1] 817182
## cust cat prod tid
## 32256 2007 23789 119422
X = Z %>% group_by(tid) %>% summarise(
date = min(date), # 交易日期 #一張訂單只會有一個日期
cust = min(cust), # 顧客 ID #一張訂單只會有一個顧客id
age = min(age), # 顧客 年齡級別 #min:因顧客年齡不隨時間改變
area = min(area), # 顧客 居住區別 #min:因顧客地區不隨時間改變
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame
nrow(X)
## [1] 119422
## items pieces total gross
## 99.9% 54 81.0000 9009.579 1824.737
## 99.95% 62 94.2895 10611.579 2179.817
## 99.99% 82 133.0000 16044.401 3226.548
## tid date cust age
## Min. : 1 Min. :2000-11-01 Min. : 1069 Length:119328
## 1st Qu.: 29855 1st Qu.:2000-11-29 1st Qu.: 926997 Class :character
## Median : 59704 Median :2001-01-01 Median : 1615661 Mode :character
## Mean : 59712 Mean :2000-12-31 Mean : 1401991
## 3rd Qu.: 89581 3rd Qu.:2001-02-02 3rd Qu.: 1894479
## Max. :119422 Max. :2001-02-28 Max. :20002000
## area items pieces total
## Length:119328 Min. : 1.000 Min. : 1.000 Min. : 5.0
## Class :character 1st Qu.: 2.000 1st Qu.: 3.000 1st Qu.: 227.0
## Mode :character Median : 5.000 Median : 6.000 Median : 510.0
## Mean : 6.802 Mean : 9.222 Mean : 851.6
## 3rd Qu.: 9.000 3rd Qu.:12.000 3rd Qu.: 1080.0
## Max. :62.000 Max. :94.000 Max. :15345.0
## gross
## Min. :-1645.0
## 1st Qu.: 21.0
## Median : 68.0
## Mean : 130.9
## 3rd Qu.: 168.0
## Max. : 3389.0
d0 = max(X$date) + 1
A = X %>% mutate(
days = as.integer(difftime(d0, date, units="days"))
) %>% group_by(cust) %>% summarise(
r = min(days), # recency
s = max(days), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = min(age), # age group
area = min(area), # area code
) %>% data.frame
nrow(A) # 32241 (個顧客)
## [1] 32241
par(mfrow=c(1,2),cex=0.7)
table(A$age, useNA='ifany') %>% barplot(main="Age Groups",las=2)
table(A$area, useNA='ifany') %>% barplot(main="Areas",las=2)
par(mfrow=c(3,2), mar=c(3,3,4,2))
for(x in c('r','s','f','m'))
hist(A[,x],freq=T,main=x,xlab="",ylab="",cex.main=2)
hist(pmin(A$f,10),0:10,freq=T,xlab="",ylab="",cex.main=2)
hist(log(A$m,10),freq=T,xlab="",ylab="",cex.main=2)
## date cust age area cat prod qty cost price tid
## 0 0 0 0 0 0 0 0 0 0
## tid date cust age area items pieces total gross
## 0 0 0 0 0 0 0 0 0
## cust r s f m rev raw age area
## 0 0 0 0 0 0 0 0 0
X = group_by(Z, tid) %>% summarise(
date = first(date), # 交易日期
cust = first(cust), # 顧客 ID
age = first(age), # 顧客 年齡級別
area = first(area), # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame # 88387
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
## items pieces total gross
## 99.9% 56.0000 84.0000 9378.684 1883.228
## 99.95% 64.0000 98.0000 11261.751 2317.087
## 99.99% 85.6456 137.6456 17699.325 3389.646
d0 = max(X$date) + 1
A = X %>% mutate(
days = as.integer(difftime(d0, date, units="days"))
) %>%
group_by(cust) %>% summarise(
r = min(days), # recency
s = max(days), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = age[1], # age group
area = area[1], # area code
) %>% data.frame # 28584
nrow(A)
## [1] 28584
#將feb裡的交易依照顧客分群
feb = filter(X0, date>= feb01) %>% group_by(cust) %>%
summarise(amount = sum(total)) # 16900
#回歸目標
A = merge(A, feb, by="cust", all.x=T)
#將feb與A merge,NA值代表該顧客在2月份並沒有消費
#分類目標
#新增一個buy欄位,並用邏輯判斷是否有消費(TRUE是有;FALSE是沒有)
A$buy = !is.na(A$amount)
table(A$buy, !is.na(A$amount))
##
## FALSE TRUE
## FALSE 15342 0
## TRUE 0 13242
## cust r s f
## Min. : 1069 Min. : 1.00 Min. : 1.00 Min. : 1.000
## 1st Qu.: 1060898 1st Qu.:11.00 1st Qu.:47.00 1st Qu.: 1.000
## Median : 1654100 Median :21.00 Median :68.00 Median : 2.000
## Mean : 1461070 Mean :32.12 Mean :61.27 Mean : 3.089
## 3rd Qu.: 1945003 3rd Qu.:53.00 3rd Qu.:83.00 3rd Qu.: 4.000
## Max. :20002000 Max. :92.00 Max. :92.00 Max. :60.000
##
## m rev raw age
## Min. : 8.0 Min. : 8 Min. : -742.0 Length:28584
## 1st Qu.: 359.4 1st Qu.: 638 1st Qu.: 70.0 Class :character
## Median : 709.5 Median : 1566 Median : 218.0 Mode :character
## Mean : 1012.4 Mean : 2711 Mean : 420.8
## 3rd Qu.: 1315.0 3rd Qu.: 3426 3rd Qu.: 535.0
## Max. :10634.0 Max. :99597 Max. :15565.0
##
## area amount buy
## Length:28584 Min. : 8 Mode :logical
## Class :character 1st Qu.: 454 FALSE:15342
## Mode :character Median : 993 TRUE :13242
## Mean : 1499
## 3rd Qu.: 1955
## Max. :28089
## NA's :15342
#資料整理成一致情況
X = subset(X, cust %in% A$cust & date < as.Date("2001-02-01"))
Z = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))
set.seed(2018); spl = sample.split(A$buy, SplitRatio=0.7)
#sample.split : 切train/test(0.7/0.3) #以y欄位"buy"為基準去切
#他們的buy欄位(Y的欄位)比率相同
#切出來的spl是一個跟A一樣長的邏輯向量
c(nrow(A), sum(spl), sum(!spl))
## [1] 28584 20008 8576
cbind(A, spl) %>% filter(buy) %>%
ggplot(aes(x=log(amount))) + geom_density(aes(fill=spl), alpha=0.5)
#把有交易紀錄的保留在A2(要拿來做數量模型)
A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
#subset(A, buy) :有買的人
n = nrow(A2)
set.seed(2018); spl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(A2), sum(spl2), sum(!spl2))
## [1] 13242 9269 3973
#make sure test/train dataset分布一樣
cbind(A2, spl2) %>%
ggplot(aes(x=amount)) + geom_density(aes(fill=spl2), alpha=0.5)
#模型製作
TR = subset(A, spl) #根據spl邏輯變數,將TRUE者切為Training Data
TS = subset(A, !spl) #根據spl邏輯變數,將TRUE者切為Testing Data
glm1 = glm(buy ~ ., TR[,c(2:9, 11)], family=binomial())
#做邏輯式回歸,以training data中的buy為應變數,其餘為自變數
#2:9因為1.cust不需要
summary(glm1)
##
## Call:
## glm(formula = buy ~ ., family = binomial(), data = TR[, c(2:9,
## 11)])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.7931 -0.8733 -0.6991 1.0384 1.8735
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.259e+00 1.261e-01 -9.985 < 2e-16 ***
## r -1.227e-02 8.951e-04 -13.708 < 2e-16 ***
## s 9.566e-03 9.101e-04 10.511 < 2e-16 ***
## f 2.905e-01 1.593e-02 18.233 < 2e-16 ***
## m -3.028e-05 2.777e-05 -1.090 0.27559
## rev 4.086e-05 1.940e-05 2.106 0.03521 *
## raw -2.306e-04 8.561e-05 -2.693 0.00708 **
## agea29 -4.194e-02 8.666e-02 -0.484 0.62838
## agea34 1.772e-02 7.992e-02 0.222 0.82456
## agea39 7.705e-02 7.921e-02 0.973 0.33074
## agea44 8.699e-02 8.132e-02 1.070 0.28476
## agea49 1.928e-02 8.457e-02 0.228 0.81962
## agea54 1.745e-02 9.323e-02 0.187 0.85155
## agea59 1.752e-01 1.094e-01 1.602 0.10926
## agea64 6.177e-02 1.175e-01 0.526 0.59904
## agea69 2.652e-01 1.047e-01 2.533 0.01131 *
## agea99 -1.419e-01 1.498e-01 -0.947 0.34347
## areaz106 -4.105e-02 1.321e-01 -0.311 0.75603
## areaz110 -2.075e-01 1.045e-01 -1.986 0.04703 *
## areaz114 3.801e-02 1.111e-01 0.342 0.73214
## areaz115 2.599e-01 9.682e-02 2.684 0.00727 **
## areaz221 1.817e-01 9.753e-02 1.863 0.06243 .
## areazOthers -4.677e-02 1.045e-01 -0.448 0.65435
## areazUnknown -1.695e-01 1.232e-01 -1.375 0.16912
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27629 on 20007 degrees of freedom
## Residual deviance: 23295 on 19984 degrees of freedom
## AIC: 23343
##
## Number of Fisher Scoring iterations: 5
pred = predict(glm1, TS, type="response")
# 以前述所獲得的邏輯式模型(glm1),來預測一開始切出的Training Data,丟入predict Function當中進行
cm = table(actual = TS$buy, predict = pred > 0.5); cm
## predict
## actual FALSE TRUE
## FALSE 3730 873
## TRUE 1700 2273
acc.ts = cm %>% {sum(diag(.))/sum(.)}
# diag為對角線相加,而預測成功的情形為:
# 1. 預測為F,且實際為F
# 2. 預測為T,且實際為T
# 所以將對角線(預測成功)相加後除以總數sum,得到模型預測正確率。
#acc.ts : testing accuracy
c(1-mean(TS$buy) , acc.ts) # 0.69998
## [1] 0.5367304 0.6999767
#1-mean(TS$buy) : 1-顧客族群裡有買的人的比率(平均回購率、保留率) -> null model : 不做模型去猜全部人都不買猜中的機率(0.5367304)
#跟不做模型(平均值)比,準確率從0.5367304提高到0.6999767
## [,1]
## FALSE vs. TRUE 0.7556038
#Regression Model (估數量)
A2 = subset(A, A$buy) %>% mutate_at(c("m","rev","amount"), log10)
#將A集合中buy為True者找出來建立新的集合,並將"m","rev","amount"3個欄位取對數
TR2 = subset(A2, spl2) #spl2為True者切為training data
TS2 = subset(A2, !spl2) #spl2為False者切為testing data
lm1 = lm(amount ~ ., TR2[,c(2:6,8:10)])
#對Training Data做線性回歸,以amount為應變數,"r", "rev", "age", "amount"為自變數,後將回歸模型設為lm1
summary(lm1)
##
## Call:
## lm(formula = amount ~ ., data = TR2[, c(2:6, 8:10)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.02874 -0.23292 0.05011 0.28423 1.45108
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.1651034 0.0501997 23.209 < 2e-16 ***
## r 0.0003390 0.0003138 1.080 0.27999
## s 0.0002380 0.0003164 0.752 0.45200
## f 0.0266666 0.0018112 14.723 < 2e-16 ***
## m 0.5165187 0.0375332 13.762 < 2e-16 ***
## rev 0.0240517 0.0363911 0.661 0.50868
## agea29 0.0471837 0.0252728 1.867 0.06194 .
## agea34 0.0896806 0.0233116 3.847 0.00012 ***
## agea39 0.1203331 0.0229212 5.250 1.56e-07 ***
## agea44 0.1107960 0.0235428 4.706 2.56e-06 ***
## agea49 0.0649780 0.0244296 2.660 0.00783 **
## agea54 0.0838574 0.0266168 3.151 0.00163 **
## agea59 0.0395519 0.0310826 1.272 0.20323
## agea64 0.0059463 0.0325943 0.182 0.85525
## agea69 -0.0399961 0.0289299 -1.383 0.16685
## agea99 0.0892782 0.0408041 2.188 0.02870 *
## areaz106 0.0955455 0.0427171 2.237 0.02533 *
## areaz110 0.0526326 0.0347075 1.516 0.12944
## areaz114 0.0154046 0.0364721 0.422 0.67277
## areaz115 0.0193686 0.0317954 0.609 0.54243
## areaz221 0.0350306 0.0320485 1.093 0.27440
## areazOthers 0.0385476 0.0344270 1.120 0.26288
## areazUnknown 0.0130805 0.0387052 0.338 0.73541
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4249 on 9246 degrees of freedom
## Multiple R-squared: 0.2796, Adjusted R-squared: 0.2779
## F-statistic: 163.1 on 22 and 9246 DF, p-value: < 2.2e-16
r2.tr = summary(lm1)$r.sq #線性模型的r square
SST = sum((TS2$amount - mean(TR2$amount))^ 2) #基礎模型的誤差平方總和
SSE = sum((predict(lm1, TS2) - TS2$amount)^2) #回歸模型的誤差平方總和
r2.ts = 1 - (SSE/SST) #r square公式
c(R2train=r2.tr, R2test=r2.ts) #比較training / testing的r square
## R2train R2test
## 0.2795931 0.2845795
#Aggregate data 2000-12-01 ~ 2001~02-28
d0 = max(X0$date) + 1 #最大日期的下一天(用來計算客戶來店天數間隔之差異)
B = X0 %>%
filter(date >= as.Date("2000-12-01")) %>%
mutate(days = as.integer(difftime(d0, date, units="days"))) %>%
group_by(cust) %>% summarise(
r = min(days), # recency 最近一次來消費距離d0的天數
s = max(days), # seniority 第一次來消費距離d0的天數
f = n(), # frquency 來店次數
m = mean(total), # monetary 平均消費金額
rev = sum(total), # total revenue contribution 總消費金額
raw = sum(gross), # total gross profit contribution 總毛利
age = age[1], # age group 客戶年齡間距
area = area[1], # area code 客戶地區
) %>% data.frame # 28584(筆資料)
nrow(B)
## [1] 28531
B$Buy = predict(glm1, B, type="response")
#用先前建立好的模型glm1來預測資料集B
B2 = B %>% mutate_at(c("m","rev"), log10) #將B中的m/rev做對數處理
B$Rev = 10^predict(lm1, B2) #用先前建立好的模型lm1對B中的rev做預測
#10的次方式因有做過對數處理
par(mfrow=c(1,2), cex=0.8)
hist(B$Buy) #購買機率分布直條圖
hist(log(B$Rev,10)) #收入對數分布直條圖
#分群顧客的未來預測資料
BB = new_B %>% filter(status == "B")
DD = new_B %>% filter(status == "D")
FF = new_B %>% filter(status == "F")
par(mfrow=c(1,2), cex=0.8)
hist(BB$Buy)
hist(log(BB$Rev,10))
#地區與分群顧客馬賽克圖
MOSA = function(formula, data) mosaic(formula, data, shade=T,
margins=c(0,1,0,0), labeling_args = list(rot_labels=c(90,0,0,0)),
gp_labels=gpar(fontsize=9), legend_args=list(fontsize=9),
gp_text=gpar(fontsize=7),labeling=labeling_residuals)
MOSA(~status+area, new_B)
DP = function(x,m0,b0,a0) {m0*plogis((10/a0)*(x-b0))}
par(mar=c(4,4,2,1),mfrow=c(1,2),cex=0.7)
curve(DP(x,m=0.20,b=15,a=20), 0, 30, lwd=2, ylim=c(0, 0.25),
main="F( x | m=0.2, b=15, a=20 )", ylab="delta P")
abline(h=seq(0,0.2,0.05),v=seq(0,30,5),col='lightgrey',lty=2)
m=0.1; b=150; a=200; x=200
dp = DP(x,m,b,a)
dp = ifelse(B$Buy+dp>1, 1-B$Buy, dp)
eR = dp*B$Rev - x
hist(eR)
#成本效益參數m, b, a分別為
mm=c(0.12,0.14,0.2)
bb=c(40,60,100)
aa=c(80,60,200)
X = seq(10, 250, 1)
df = do.call(rbind, lapply(1:3, function(i) {
sapply(X, function(x) {
# eR = (min(B$Buy+DP(x,mm[i],bb[i],aa[i]),1) - B$Buy) * B$Rev - x
dp = DP(x,mm[i],bb[i],aa[i])
dp = ifelse(B$Buy+dp>1, 1-B$Buy, dp)
eR = dp*B$Rev - x
c(i=i, x=x, eR.ALL=sum(eR), N=sum(eR>0), eR.SEL=sum(eR[eR > 0]) )
}) %>% t %>% data.frame
}))
df %>% gather('key','value',-i,-x) %>%
mutate(Instrument = paste0('I',i)) %>%
ggplot(aes(x=x, y=value, col=Instrument)) +
geom_hline(yintercept=0, linetype='dashed', col='blue') +
geom_line(size=1.5,alpha=0.5) +
xlab('工具選項(成本)') + ylab('預期報償') +
ggtitle('行銷工具優化','假設行銷工具的效果是其成本的函數') +
facet_wrap(~key,ncol=1,scales='free_y') + theme_bw()
最佳策略
## # A tibble: 3 x 5
## # Groups: i [3]
## i x eR.ALL N eR.SEL
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 61 1061196. 20828 1224614.
## 2 2 79 1084642. 19519 1336093.
## 3 3 146 282097. 13751 1118785.
因此我們針對不同族群祭出不同價值的禮品或折價券,如B族群(只來過一次的大戶)則針對特定節日贈送禮包;D族群(很久沒來買的老顧客)則贈送防疫包;F族群(潛力新顧客)則利用三倍券贈送$140回購券
```