pacman::p_load(caTools, ggplot2, dplyr, d3heatmap, vcd, latex2exp, Matrix, tidyr, arules, arulesViz)
load("data/tf0.rdata")
load("data/tf4.rdata")
load("data/tf3.rdata")
par(mfrow=c(1,2),cex=0.7)
table(A0$age) %>% barplot(las=2,main="Age Groups")
table(A0$area) %>% barplot(las=2,main="Areas")

# By age segment: a39 is the largest group, a64 the smallest
# By area segment: z115 (Nangang) has the most customers, z106 (Da'an) the fewest

Fig-2: Zip Codes


median(B$r) #21
## [1] 21
median(B$f) #2
## [1] 2
median(B$m) # 691.4
## [1] 691.4
# Eight RFM cells (A-H): customers are split at the medians of r, f, and m computed above
Status = function(r,f,m) {
  ifelse(r >= 21,
         ifelse( f >= 2, ifelse( m > 691.4, "D", "C"), ifelse( m >= 691.4, "B", "A")),
         ifelse( f >= 2, ifelse( m > 691.4, "H", "G"), ifelse( m >= 691.4, "F", "E")) 
  ) 
} 
# The median of f is exactly 2, but f = 2 covers a wide range (perhaps the 40th-60th percentiles are all 2)
# Here customers with f > 1 are treated as above the median, and f = 1 (visited only once) as below it
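# A quick sanity check on this claim (a minimal sketch, using the B table loaded above):
mean(B$f == 2)                                 # share of customers with exactly f = 2
c(lo = mean(B$f < 2), hi = mean(B$f <= 2))     # percentile band covered by f = 2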

new_B = B %>% group_by(cust) %>%
  mutate( status = Status(r,f,m) )

table(new_B$status) # Segments A-H follow the whiteboard diagram, ordered left to right
## 
##    A    B    C    D    E    F    G    H 
## 4226 4581 2621 2917 1567 1497 5852 5270
MOSA = function(formula, data) mosaic(formula, data, shade=T, 
  margins=c(0,1,0,0), labeling_args = list(rot_labels=c(90,0,0,0)),
  gp_labels=gpar(fontsize=9), legend_args=list(fontsize=9),
  gp_text=gpar(fontsize=7),labeling=labeling_residuals)

MOSA(~status+age, new_B) # mosaic plot of segment vs. age

new_B %>% filter(status %in% c("B", "D", "F") ) %>% group_by(age, status) %>%
  summarise(total_raw = sum(raw, na.rm=T ) ) # total gross profit by age group within the three key segments
## # A tibble: 33 x 3
## # Groups:   age [11]
##    age   status total_raw
##    <chr> <chr>      <int>
##  1 a24   B          53837
##  2 a24   D          60151
##  3 a24   F          12311
##  4 a29   B         129656
##  5 a29   D         159485
##  6 a29   F          35364
##  7 a34   B         280311
##  8 a34   D         405058
##  9 a34   F          89243
## 10 a39   B         323603
## # … with 23 more rows
B1 = new_B %>% filter(status %in% c("B") ) %>% merge(Z0, by = "cust" ) %>% group_by(prod) %>% summarise(total_raw = sum(raw, na.rm=T) ) # total gross profit by product within segment B (the 卜學亮 persona)

D1 = new_B %>% filter(status %in% c("D") ) %>% merge(Z0, by = "cust" ) %>% group_by(prod) %>% summarise(total_raw = sum(raw, na.rm=T) ) # total gross profit by product within segment D (the 周杰倫 persona)

F1 = new_B %>% filter(status %in% c("F") ) %>% merge(Z0, by = "cust" ) %>% group_by(prod) %>% summarise(total_raw = sum(raw, na.rm=T) ) # total gross profit by product within segment F (the 高爾宣 persona)
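# To see which products drive each segment, the profit tables can be sorted;
# a minimal sketch for segment B (D1 and F1 work the same way):
B1 %>% arrange(desc(total_raw)) %>% head(10)   # top-10 products by gross profit in segment B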

# Product association analysis

tapply((Z0$price-Z0$cost), Z0$prod, sum) %>% sort(dec=T) %>% names -> TOP # products ranked by total gross profit

tr = as(split(Z0[,"prod"], Z0[,"tid"]),"transactions") # one basket of products per transaction id
R <- apriori(tr, parameter=list(supp=0.0001, conf=0.25))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.25    0.1    1 none FALSE            TRUE       5   1e-04      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 11 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[23789 item(s), 119422 transaction(s)] done [0.50s].
## sorting and recoding items ... [10166 item(s)] done [0.02s].
## creating transaction tree ... done [0.09s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 done [0.90s].
## writing ... [9795 rule(s)] done [0.27s].
## creating S4 object  ... done [0.07s].
# Mine association rules with the Apriori algorithm at the given support and confidence thresholds
rx = subset(R, rhs %in% TOP[1:200] & lift >= 50 & count > 100 )
df = inspect(rx)
##      lhs                rhs                  support confidence      lift count
## [1]  {4716114000312} => {4716114000329} 0.0012727973  0.5527273 231.60630   152
## [2]  {4716114000329} => {4716114000312} 0.0012727973  0.5333333 231.60630   152
## [3]  {4710154015138} => {4710154015206} 0.0009964663  0.3742138  52.08551   119
## [4]  {4713754987614} => {4713754987607} 0.0011388186  0.3035714  78.46993   136
## [5]  {4713754987607} => {4713754987614} 0.0011388186  0.2943723  78.46993   136
## [6]  {4710011402026} => {4710011402019} 0.0028219256  0.6740000  90.23591   337
## [7]  {4710088414328} => {4710088414311} 0.0017919646  0.4662309  86.18921   214
## [8]  {4710011401142} => {4710011406123} 0.0015323810  0.4130926  50.33912   183
## [9]  {4710085172702} => {4710085172696} 0.0024283633  0.5400372  62.01185   290
## [10] {4710254049323} => {4710254049521} 0.0020096800  0.4308797  55.56859   240
## [11] {4710011409056} => {4710011406123} 0.0026293313  0.4137022  50.41342   314
## [12] {4710011409056} => {4710011401128} 0.0044464169  0.6996047  51.03738   531
## [13] {4710085120093} => {4710085172696} 0.0037430289  0.4977728  57.15868   447
## [14] {4710011401135} => {4710011401128} 0.0058615665  0.7526882  54.90991   700
## [15] {4710011401142,                                                           
##       4710011409056} => {4710011401128} 0.0011974343  0.7447917  54.33385   143
## [16] {4710011401135,                                                           
##       4710011401142} => {4710011406123} 0.0008876086  0.4840183  58.98207   106
## [17] {4710011401135,                                                           
##       4710011401142} => {4710011401128} 0.0013900286  0.7579909  55.29675   166
## [18] {4710011401142,                                                           
##       4710011405133} => {4710011401128} 0.0012476763  0.6866359  50.09129   149
## [19] {4710011401128,                                                           
##       4710011401142} => {4710011406123} 0.0010132136  0.4566038  55.64136   121
## [20] {4710085120093,                                                           
##       4710085172702} => {4710085172696} 0.0013481603  0.6544715  75.15221   161
## [21] {4710085120093,                                                           
##       4710085172702} => {4710085120628} 0.0012811710  0.6219512  54.73446   153
## [22] {4710085172696,                                                           
##       4710085172702} => {4710085120628} 0.0014905126  0.6137931  54.01651   178
## [23] {4710085120628,                                                           
##       4710085172702} => {4710085172696} 0.0014905126  0.6054422  69.52223   178
## [24] {4710011401135,                                                           
##       4710011409056} => {4710011406123} 0.0015993703  0.4716049  57.46939   191
## [25] {4710011401135,                                                           
##       4710011409056} => {4710011401128} 0.0027214416  0.8024691  58.54152   325
## [26] {4710011405133,                                                           
##       4710011409056} => {4710011406123} 0.0014737653  0.4929972  60.07624   176
## [27] {4710011405133,                                                           
##       4710011409056} => {4710011401128} 0.0022776373  0.7619048  55.58228   272
## [28] {4710011406123,                                                           
##       4710011409056} => {4710011401128} 0.0019929326  0.7579618  55.29463   238
## [29] {4710011401128,                                                           
##       4710011409056} => {4710011406123} 0.0019929326  0.4482109  54.61862   238
## [30] {4710085120093,                                                           
##       4710085172696} => {4710085120628} 0.0021352850  0.5704698  50.20386   255
## [31] {4710085120093,                                                           
##       4710085120628} => {4710085172696} 0.0021352850  0.5391121  61.90561   255
## [32] {4710011401135,                                                           
##       4710011405133} => {4710011406123} 0.0016328650  0.4441913  54.12879   195
## [33] {4710011401135,                                                           
##       4710011405133} => {4710011401128} 0.0028386729  0.7722096  56.33403   339
## [34] {4710011401135,                                                           
##       4710011406123} => {4710011401128} 0.0024534843  0.8027397  58.56126   293
## [35] {4710011401128,                                                           
##       4710011401135} => {4710011406123} 0.0024534843  0.4185714  51.00677   293
## [36] {4710011405133,                                                           
##       4710011406123} => {4710011401128} 0.0022273953  0.7018470  51.20096   266
## [37] {4710011401128,                                                           
##       4710011405133} => {4710011406123} 0.0022273953  0.4290323  52.28152   266
## [38] {4710011401135,                                                           
##       4710011401142,                                                           
##       4710011409056} => {4710011401128} 0.0009462243  0.8560606  62.45111   113
## [39] {4710085120093,                                                           
##       4710085172696,                                                           
##       4710085172702} => {4710085120628} 0.0008792350  0.6521739  57.39419   105
## [40] {4710085120093,                                                           
##       4710085120628,                                                           
##       4710085172702} => {4710085172696} 0.0008792350  0.6862745  78.80411   105
## [41] {4710011401135,                                                           
##       4710011405133,                                                           
##       4710011409056} => {4710011406123} 0.0009964663  0.5265487  64.16479   119
## [42] {4710011401135,                                                           
##       4710011405133,                                                           
##       4710011409056} => {4710011401128} 0.0015742493  0.8318584  60.68552   188
## [43] {4710011401135,                                                           
##       4710011406123,                                                           
##       4710011409056} => {4710011401128} 0.0013397866  0.8376963  61.11141   160
## [44] {4710011401128,                                                           
##       4710011401135,                                                           
##       4710011409056} => {4710011406123} 0.0013397866  0.4923077  59.99221   160
## [45] {4710011405133,                                                           
##       4710011406123,                                                           
##       4710011409056} => {4710011401128} 0.0011639396  0.7897727  57.61530   139
## [46] {4710011401128,                                                           
##       4710011405133,                                                           
##       4710011409056} => {4710011406123} 0.0011639396  0.5110294  62.27363   139
## [47] {4710011401135,                                                           
##       4710011405133,                                                           
##       4710011406123} => {4710011401128} 0.0013565340  0.8307692  60.60606   162
## [48] {4710011401128,                                                           
##       4710011401135,                                                           
##       4710011405133} => {4710011406123} 0.0013565340  0.4778761  58.23359   162
## [49] {4710011401135,                                                           
##       4710011405133,                                                           
##       4710011406123,                                                           
##       4710011409056} => {4710011401128} 0.0008624876  0.8655462  63.14310   103
## [50] {4710011401128,                                                           
##       4710011401135,                                                           
##       4710011405133,                                                           
##       4710011409056} => {4710011406123} 0.0008624876  0.5478723  66.76328   103
# Each rule shows how strongly buying one set of products is associated with buying another
# Matching this table against the popular products in segment B identifies highly associated items for add-on offers or bundle promotions
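# The filtered rules can also be explored visually with arulesViz
# (a sketch; the exact layout depends on the arulesViz version):
plot(rx, method = "graph")                  # network of items linked by the selected rules
inspect(head(sort(rx, by = "lift"), 5))     # the five strongest rules by lift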
new_Z0 = Z0 %>% merge(new_B, by = "cust")  %>% filter( status %in% c('B','D','F') ) %>% 
  mutate(new_area = case_when( area.x == "z221" ~ "z221",
                               area.x == "z115" ~ "z115",
                               TRUE ~ " others") ) 
# Merge Z0 with the customer data (new_B), keep only segments B (卜學亮), D (周杰倫), and F (高爾宣),
# then group by area: keep z221 and z115 and lump everything else into "others"
table(new_Z0$prod) %>% sort %>% tail(9) %>% names -> prod9 # the 9 best-selling products across B, D, and F
new_Z0 %>% filter(prod %in% prod9) %>% 
  group_by(prod, age.x, date) %>% summarise(
    t.qty = sum(qty),
    u.price = sum(price)/t.qty ,
    status = first(status),
    new_area = first(new_area)
  ) %>% 
  ggplot(aes(x=t.qty,y=u.price,col= new_area)) +
  geom_smooth(method='lm',se=F) +
  facet_wrap(~prod,scales="free") + theme_bw() # demand curves (P vs. Q) for the 9 most popular products, colored by area, to compare price elasticity
## `geom_smooth()` using formula 'y ~ x'

# The same product shows different elasticity across areas, so promotions for it can be concentrated in the more price-sensitive areas
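# A rough quantification of what the plot shows: a log-log regression of daily
# quantity on unit price per product and area; the slope approximates the price
# elasticity of demand (a sketch only -- daily aggregates, no other controls).
new_Z0 %>% filter(prod %in% prod9) %>%
  group_by(prod, new_area, date) %>%
  summarise(t.qty = sum(qty), u.price = sum(price)/t.qty) %>%
  group_by(prod, new_area) %>%
  summarise(elasticity = coef(lm(log(t.qty) ~ log(u.price)))[2]) %>%
  arrange(elasticity)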
BDF_prod = new_Z0 %>% filter(status %in% c("B", "D", "F") ) %>% group_by( prod) %>% summarise(count = n() ) 
# sales count per product within segments B (卜學亮), D (周杰倫), and F (高爾宣)
BDF_prod = BDF_prod[order(BDF_prod$count, decreasing = T),] # sort in descending order
Z = read.csv("data/ta_feng_all_months_merged.csv") %>% 
  data.frame %>% setNames(c(
    "date","cust","age","area","cat","prod","qty","cost","price"))
nrow(Z) # number of order line items
## [1] 817741
Z$date = as.Date(Z$date, format="%m/%d/%Y")
par(cex=0.8)
hist(Z$date,'weeks',freq=T,las=2)

age.group = c("<25","25-29","30-34","35-39","40-44",
              "45-49","50-54","55-59","60-64",">65")
Z$age = c(paste0("a",seq(24,69,5)),"a99")[match(Z$age,age.group,11)] # recode to a24..a69; unmatched values (nomatch = 11) become a99
Z$area = paste0("z",Z$area)
par(mfrow=c(1,2),cex=0.7)
table(Z$age, useNA='ifany') %>% barplot(main="Age Groups", las=2)
table(Z$area,useNA='ifany') %>% barplot(main="Areas", las=2)

sapply(Z[,7:9], quantile, prob=c(.99, .999, .9995)) #7:9 -> qty/cost/price
##        qty   cost   price
## 99%      6  858.0 1014.00
## 99.9%   14 2722.0 3135.82
## 99.95%  24 3799.3 3999.00
Z = subset(Z, qty<=24 & cost<=3800 & price<=4000) 
nrow(Z)  # after removing outliers
## [1] 817182
Z$tid = group_indices(Z, date, cust) 
# The raw data has no transaction ID, so all items bought by the same customer on the same day are treated as one order,
# which lets us compute per-order (basket) values later
sapply(Z[c("cust","cat","prod","tid")], n_distinct)
##   cust    cat   prod    tid 
##  32256   2007  23789 119422
X = Z %>% group_by(tid) %>% summarise(
  date = min(date),          # transaction date (one date per order)
  cust = min(cust),          # customer ID (one customer per order)
  age = min(age),            # customer age group (min: constant within a customer)
  area = min(area),          # customer area (min: constant within a customer)
  items = n(),               # number of line items
  pieces = sum(qty),         # total pieces
  total = sum(price),        # total amount
  gross = sum(price - cost)  # gross profit
) %>% data.frame
nrow(X)
## [1] 119422
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
##        items   pieces     total    gross
## 99.9%     54  81.0000  9009.579 1824.737
## 99.95%    62  94.2895 10611.579 2179.817
## 99.99%    82 133.0000 16044.401 3226.548
X = subset(X, items<=62 & pieces<95 & total<16000) # remove outliers

summary(X)
##       tid              date                 cust              age           
##  Min.   :     1   Min.   :2000-11-01   Min.   :    1069   Length:119328     
##  1st Qu.: 29855   1st Qu.:2000-11-29   1st Qu.:  926997   Class :character  
##  Median : 59704   Median :2001-01-01   Median : 1615661   Mode  :character  
##  Mean   : 59712   Mean   :2000-12-31   Mean   : 1401991                     
##  3rd Qu.: 89581   3rd Qu.:2001-02-02   3rd Qu.: 1894479                     
##  Max.   :119422   Max.   :2001-02-28   Max.   :20002000                     
##      area               items            pieces           total        
##  Length:119328      Min.   : 1.000   Min.   : 1.000   Min.   :    5.0  
##  Class :character   1st Qu.: 2.000   1st Qu.: 3.000   1st Qu.:  227.0  
##  Mode  :character   Median : 5.000   Median : 6.000   Median :  510.0  
##                     Mean   : 6.802   Mean   : 9.222   Mean   :  851.6  
##                     3rd Qu.: 9.000   3rd Qu.:12.000   3rd Qu.: 1080.0  
##                     Max.   :62.000   Max.   :94.000   Max.   :15345.0  
##      gross        
##  Min.   :-1645.0  
##  1st Qu.:   21.0  
##  Median :   68.0  
##  Mean   :  130.9  
##  3rd Qu.:  168.0  
##  Max.   : 3389.0
par(cex=0.8)
hist(X$date, "weeks", freq=T, las=2, main="No. Transaction per Week")

d0 = max(X$date) + 1
A = X %>% mutate(
  days = as.integer(difftime(d0, date, units="days"))
  ) %>% group_by(cust) %>% summarise(
    r = min(days),      # recency
    s = max(days),      # seniority
    f = n(),            # frequency
    m = mean(total),    # monetary
    rev = sum(total),   # total revenue contribution
    raw = sum(gross),   # total gross profit contribution
    age = min(age),     # age group
    area = min(area),   # area code
  ) %>% data.frame      
nrow(A) # 32241 customers
## [1] 32241
par(mfrow=c(1,2),cex=0.7)
table(A$age, useNA='ifany') %>% barplot(main="Age Groups",las=2)
table(A$area, useNA='ifany') %>% barplot(main="Areas",las=2)

par(mfrow=c(3,2), mar=c(3,3,4,2))
for(x in c('r','s','f','m')) 
  hist(A[,x],freq=T,main=x,xlab="",ylab="",cex.main=2)
hist(pmin(A$f,10),0:10,freq=T,xlab="",ylab="",cex.main=2)
hist(log(A$m,10),freq=T,xlab="",ylab="",cex.main=2)

is.na(Z) %>% colSums
##  date  cust   age  area   cat  prod   qty  cost price   tid 
##     0     0     0     0     0     0     0     0     0     0
is.na(X) %>% colSums
##    tid   date   cust    age   area  items pieces  total  gross 
##      0      0      0      0      0      0      0      0      0
is.na(A) %>% colSums
## cust    r    s    f    m  rev  raw  age area 
##    0    0    0    0    0    0    0    0    0
A0 = A; X0 = X; Z0 = Z
feb01 = as.Date("2001-02-01")
Z = subset(Z0, date < feb01)    # 618212
# Z now contains only transactions before 2001-02-01
X = group_by(Z, tid) %>% summarise(
  date = first(date),  # transaction date
  cust = first(cust),  # customer ID
  age = first(age),    # customer age group
  area = first(area),  # customer area
  items = n(),                # number of line items
  pieces = sum(qty),          # total pieces
  total = sum(price),         # total amount
  gross = sum(price - cost)   # gross profit
  ) %>% data.frame  # 88387

sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
##          items   pieces     total    gross
## 99.9%  56.0000  84.0000  9378.684 1883.228
## 99.95% 64.0000  98.0000 11261.751 2317.087
## 99.99% 85.6456 137.6456 17699.325 3389.646
# Check the 99.9%, 99.95%, and 99.99% quantiles to locate outliers
X = subset(X, items<=64 & pieces<=98 & total<=11260) # 88387 -> 88295
# remove outliers
d0 = max(X$date) + 1
A = X %>% mutate(
  days = as.integer(difftime(d0, date, units="days"))
  ) %>% 
  group_by(cust) %>% summarise(
    r = min(days),      # recency
    s = max(days),      # seniority
    f = n(),            # frequency
    m = mean(total),    # monetary
    rev = sum(total),   # total revenue contribution
    raw = sum(gross),   # total gross profit contribution
    age = age[1],       # age group
    area = area[1],     # area code
  ) %>% data.frame      # 28584
nrow(A)
## [1] 28584
# Aggregate February transactions by customer
feb = filter(X0, date>= feb01) %>% group_by(cust) %>% 
  summarise(amount = sum(total))  # 16900

# Regression target
A = merge(A, feb, by="cust", all.x=T)
# After merging feb into A, NA means the customer made no purchase in February
# Classification target
# Add a buy column: TRUE if the customer purchased in February, FALSE otherwise
A$buy = !is.na(A$amount)
table(A$buy, !is.na(A$amount))
##        
##         FALSE  TRUE
##   FALSE 15342     0
##   TRUE      0 13242
summary(A)
##       cust                r               s               f         
##  Min.   :    1069   Min.   : 1.00   Min.   : 1.00   Min.   : 1.000  
##  1st Qu.: 1060898   1st Qu.:11.00   1st Qu.:47.00   1st Qu.: 1.000  
##  Median : 1654100   Median :21.00   Median :68.00   Median : 2.000  
##  Mean   : 1461070   Mean   :32.12   Mean   :61.27   Mean   : 3.089  
##  3rd Qu.: 1945003   3rd Qu.:53.00   3rd Qu.:83.00   3rd Qu.: 4.000  
##  Max.   :20002000   Max.   :92.00   Max.   :92.00   Max.   :60.000  
##                                                                     
##        m                rev             raw              age           
##  Min.   :    8.0   Min.   :    8   Min.   : -742.0   Length:28584      
##  1st Qu.:  359.4   1st Qu.:  638   1st Qu.:   70.0   Class :character  
##  Median :  709.5   Median : 1566   Median :  218.0   Mode  :character  
##  Mean   : 1012.4   Mean   : 2711   Mean   :  420.8                     
##  3rd Qu.: 1315.0   3rd Qu.: 3426   3rd Qu.:  535.0                     
##  Max.   :10634.0   Max.   :99597   Max.   :15565.0                     
##                                                                        
##      area               amount         buy         
##  Length:28584       Min.   :    8   Mode :logical  
##  Class :character   1st Qu.:  454   FALSE:15342    
##  Mode  :character   Median :  993   TRUE :13242    
##                     Mean   : 1499                  
##                     3rd Qu.: 1955                  
##                     Max.   :28089                  
##                     NA's   :15342
# Keep the datasets consistent with A
X = subset(X, cust %in% A$cust & date < as.Date("2001-02-01"))
Z = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))
set.seed(2018); spl = sample.split(A$buy, SplitRatio=0.7) 
# sample.split: 70/30 train/test split, stratified on the outcome column buy,
# so the proportion of buyers is the same in both subsets
# spl is a logical vector the same length as A
c(nrow(A), sum(spl), sum(!spl))
## [1] 28584 20008  8576
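# A quick check that the stratified split preserved the buy rate (a minimal sketch):
rbind(train = prop.table(table(A$buy[spl])),
      test  = prop.table(table(A$buy[!spl])))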
cbind(A, spl) %>% filter(buy) %>% 
  ggplot(aes(x=log(amount))) + geom_density(aes(fill=spl), alpha=0.5)

# Sanity check of the split
# Keep only buyers in A2 (used later for the amount model)
A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
# subset(A, buy): customers who bought in February
n = nrow(A2) 
set.seed(2018); spl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(A2), sum(spl2), sum(!spl2))
## [1] 13242  9269  3973
# make sure the train/test distributions match

cbind(A2, spl2) %>% 
  ggplot(aes(x=amount)) + geom_density(aes(fill=spl2), alpha=0.5)

# Model building
TR = subset(A, spl)   # rows where spl is TRUE become the training data
TS = subset(A, !spl)  # rows where spl is FALSE become the testing data
glm1 = glm(buy ~ ., TR[,c(2:9, 11)], family=binomial())
# Logistic regression on the training data: buy as the response, r, s, f, m, rev, raw, age, and area as predictors
# columns 2:9 and 11; column 1 (cust) is excluded
summary(glm1)
## 
## Call:
## glm(formula = buy ~ ., family = binomial(), data = TR[, c(2:9, 
##     11)])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.7931  -0.8733  -0.6991   1.0384   1.8735  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.259e+00  1.261e-01  -9.985  < 2e-16 ***
## r            -1.227e-02  8.951e-04 -13.708  < 2e-16 ***
## s             9.566e-03  9.101e-04  10.511  < 2e-16 ***
## f             2.905e-01  1.593e-02  18.233  < 2e-16 ***
## m            -3.028e-05  2.777e-05  -1.090  0.27559    
## rev           4.086e-05  1.940e-05   2.106  0.03521 *  
## raw          -2.306e-04  8.561e-05  -2.693  0.00708 ** 
## agea29       -4.194e-02  8.666e-02  -0.484  0.62838    
## agea34        1.772e-02  7.992e-02   0.222  0.82456    
## agea39        7.705e-02  7.921e-02   0.973  0.33074    
## agea44        8.699e-02  8.132e-02   1.070  0.28476    
## agea49        1.928e-02  8.457e-02   0.228  0.81962    
## agea54        1.745e-02  9.323e-02   0.187  0.85155    
## agea59        1.752e-01  1.094e-01   1.602  0.10926    
## agea64        6.177e-02  1.175e-01   0.526  0.59904    
## agea69        2.652e-01  1.047e-01   2.533  0.01131 *  
## agea99       -1.419e-01  1.498e-01  -0.947  0.34347    
## areaz106     -4.105e-02  1.321e-01  -0.311  0.75603    
## areaz110     -2.075e-01  1.045e-01  -1.986  0.04703 *  
## areaz114      3.801e-02  1.111e-01   0.342  0.73214    
## areaz115      2.599e-01  9.682e-02   2.684  0.00727 ** 
## areaz221      1.817e-01  9.753e-02   1.863  0.06243 .  
## areazOthers  -4.677e-02  1.045e-01  -0.448  0.65435    
## areazUnknown -1.695e-01  1.232e-01  -1.375  0.16912    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27629  on 20007  degrees of freedom
## Residual deviance: 23295  on 19984  degrees of freedom
## AIC: 23343
## 
## Number of Fisher Scoring iterations: 5
pred =  predict(glm1, TS, type="response")
# Use the fitted logistic model (glm1) to predict purchase probabilities for the testing data (TS)
cm = table(actual = TS$buy, predict = pred > 0.5); cm
##        predict
## actual  FALSE TRUE
##   FALSE  3730  873
##   TRUE   1700 2273
# confusion matrix
acc.ts = cm %>% {sum(diag(.))/sum(.)}
# diag() takes the diagonal; a prediction is correct when
# 1. FALSE is predicted and the actual value is FALSE, or
# 2. TRUE is predicted and the actual value is TRUE,
# so accuracy = sum of the diagonal / total count
# acc.ts : testing accuracy
c(1-mean(TS$buy) , acc.ts)  # 0.69998
## [1] 0.5367304 0.6999767
# 1-mean(TS$buy): one minus the buyers' share, i.e. the accuracy of the null model that predicts nobody buys (0.5367304)
# Compared with that baseline, the model raises accuracy from 0.5367304 to 0.6999767
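# Beyond accuracy, precision and recall for the buyers can be read off the same
# confusion matrix (a minimal sketch):
c(precision = cm["TRUE","TRUE"] / sum(cm[,"TRUE"]),   # predicted buyers who actually bought
  recall    = cm["TRUE","TRUE"] / sum(cm["TRUE",]))   # actual buyers the model caught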
colAUC(pred, TS$buy)        
##                     [,1]
## FALSE vs. TRUE 0.7556038
# AUC: the model's ability to discriminate buyers from non-buyers
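# colAUC can also draw the ROC curve behind this number (a sketch):
colAUC(pred, TS$buy, plotROC = TRUE)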
# Regression model (predicting the purchase amount)
A2 = subset(A, A$buy) %>% mutate_at(c("m","rev","amount"), log10)
# Keep only the buyers from A and log10-transform the m, rev, and amount columns
TR2 = subset(A2, spl2)  # rows where spl2 is TRUE become the training data
TS2 = subset(A2, !spl2) # rows where spl2 is FALSE become the testing data
lm1 = lm(amount ~ ., TR2[,c(2:6,8:10)]) 
# Linear regression on the training data: amount as the response and r, s, f, m, rev, age, and area as predictors; the fitted model is lm1

summary(lm1)
## 
## Call:
## lm(formula = amount ~ ., data = TR2[, c(2:6, 8:10)])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.02874 -0.23292  0.05011  0.28423  1.45108 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.1651034  0.0501997  23.209  < 2e-16 ***
## r             0.0003390  0.0003138   1.080  0.27999    
## s             0.0002380  0.0003164   0.752  0.45200    
## f             0.0266666  0.0018112  14.723  < 2e-16 ***
## m             0.5165187  0.0375332  13.762  < 2e-16 ***
## rev           0.0240517  0.0363911   0.661  0.50868    
## agea29        0.0471837  0.0252728   1.867  0.06194 .  
## agea34        0.0896806  0.0233116   3.847  0.00012 ***
## agea39        0.1203331  0.0229212   5.250 1.56e-07 ***
## agea44        0.1107960  0.0235428   4.706 2.56e-06 ***
## agea49        0.0649780  0.0244296   2.660  0.00783 ** 
## agea54        0.0838574  0.0266168   3.151  0.00163 ** 
## agea59        0.0395519  0.0310826   1.272  0.20323    
## agea64        0.0059463  0.0325943   0.182  0.85525    
## agea69       -0.0399961  0.0289299  -1.383  0.16685    
## agea99        0.0892782  0.0408041   2.188  0.02870 *  
## areaz106      0.0955455  0.0427171   2.237  0.02533 *  
## areaz110      0.0526326  0.0347075   1.516  0.12944    
## areaz114      0.0154046  0.0364721   0.422  0.67277    
## areaz115      0.0193686  0.0317954   0.609  0.54243    
## areaz221      0.0350306  0.0320485   1.093  0.27440    
## areazOthers   0.0385476  0.0344270   1.120  0.26288    
## areazUnknown  0.0130805  0.0387052   0.338  0.73541    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4249 on 9246 degrees of freedom
## Multiple R-squared:  0.2796, Adjusted R-squared:  0.2779 
## F-statistic: 163.1 on 22 and 9246 DF,  p-value: < 2.2e-16
r2.tr = summary(lm1)$r.sq # R-squared of the linear model on the training data
SST = sum((TS2$amount - mean(TR2$amount))^ 2) # total sum of squares of the baseline (mean) model on the test data
SSE = sum((predict(lm1, TS2) -  TS2$amount)^2) # sum of squared errors of the regression model on the test data
r2.ts = 1 - (SSE/SST) # R-squared formula
c(R2train=r2.tr, R2test=r2.ts) # compare training vs. testing R-squared
##   R2train    R2test 
## 0.2795931 0.2845795
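# Since the model was fit on log10(amount), a sketch of the test error back on the dollar scale:
pred.amt   = 10^predict(lm1, TS2)   # predicted February spending, back-transformed
actual.amt = 10^TS2$amount
c(MAE = mean(abs(pred.amt - actual.amt)), RMSE = sqrt(mean((pred.amt - actual.amt)^2)))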
# Aggregate data 2000-12-01 ~ 2001-02-28
d0 = max(X0$date) + 1 # the day after the latest transaction date (reference point for computing day gaps)
B = X0 %>% 
  filter(date >= as.Date("2000-12-01")) %>% 
  mutate(days = as.integer(difftime(d0, date, units="days"))) %>% 
  group_by(cust) %>% summarise(
    r = min(days),      # recency: days since the most recent purchase
    s = max(days),      # seniority: days since the first purchase in the window
    f = n(),            # frequency: number of visits
    m = mean(total),    # monetary: average spending per visit
    rev = sum(total),   # total revenue contribution
    raw = sum(gross),   # total gross profit contribution
    age = age[1],       # customer age group
    area = area[1],     # customer area code
  ) %>% data.frame      # 28531 customers
nrow(B)
## [1] 28531
B$Buy = predict(glm1, B, type="response")
# Predict each customer's purchase probability with the previously fitted glm1

B2 = B %>% mutate_at(c("m","rev"), log10) # log10-transform m and rev in B, as lm1 expects
B$Rev = 10^predict(lm1, B2) # predict the expected purchase amount with lm1
# back-transform with 10^ because the model was fit on log10 values

par(mfrow=c(1,2), cex=0.8)
hist(B$Buy) # histogram of predicted purchase probabilities
hist(log(B$Rev,10)) # histogram of log10 predicted revenue
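# Multiplying the two predictions gives an expected-revenue estimate per customer
# (a sketch; ev is a temporary helper variable introduced here, not used later):
ev = B$Buy * B$Rev   # probability of buying times predicted amount
summary(ev)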

# Predictions for the customer segments
BB = new_B %>% filter(status == "B")
DD = new_B %>% filter(status == "D")
FF = new_B %>%  filter(status == "F")


par(mfrow=c(1,2), cex=0.8)
hist(BB$Buy)
hist(log(BB$Rev,10))

hist(DD$Buy)
hist(log(DD$Rev,10))

hist(FF$Buy)
hist(log(FF$Rev,10))

# Mosaic plot of customer segment vs. area
MOSA = function(formula, data) mosaic(formula, data, shade=T, 
  margins=c(0,1,0,0), labeling_args = list(rot_labels=c(90,0,0,0)),
  gp_labels=gpar(fontsize=9), legend_args=list(fontsize=9),
  gp_text=gpar(fontsize=7),labeling=labeling_residuals)

MOSA(~status+area, new_B)

# Incremental purchase probability as a logistic function of marketing cost x:
# m0 = maximum lift, b0 = cost at half effect, a0 = width of the response range
DP = function(x,m0,b0,a0) {m0*plogis((10/a0)*(x-b0))}

par(mar=c(4,4,2,1),mfrow=c(1,2),cex=0.7)
curve(DP(x,m=0.20,b=15,a=20), 0, 30, lwd=2, ylim=c(0, 0.25),
      main="F( x | m=0.2, b=15, a=20 )", ylab="delta P")
abline(h=seq(0,0.2,0.05),v=seq(0,30,5),col='lightgrey',lty=2)

m=0.1; b=150; a=200; x=200            # example instrument parameters and a cost of 200 per customer
dp = DP(x,m,b,a)                      # incremental purchase probability
dp = ifelse(B$Buy+dp>1, 1-B$Buy, dp)  # cap it so that Buy + dp never exceeds 1
eR = dp*B$Rev - x                     # expected incremental revenue net of cost
hist(eR)                              # distribution of expected payoff across customers

# Cost-effect parameters m, b, a for the three marketing instruments:
mm=c(0.12,0.14,0.2)
bb=c(40,60,100)
aa=c(80,60,200)

X = seq(10, 250, 1)   # candidate cost levels per customer
df = do.call(rbind, lapply(1:3, function(i) {
  sapply(X, function(x) {
    # eR = (min(B$Buy+DP(x,mm[i],bb[i],aa[i]),1) - B$Buy) * B$Rev - x
    dp = DP(x,mm[i],bb[i],aa[i])
    dp = ifelse(B$Buy+dp>1, 1-B$Buy, dp)
    eR = dp*B$Rev - x
    c(i=i, x=x,                # instrument and cost level
      eR.ALL=sum(eR),          # expected payoff if every customer is contacted
      N=sum(eR>0),             # number of customers with a positive expected payoff
      eR.SEL=sum(eR[eR > 0]) ) # expected payoff if only those customers are contacted
    }) %>% t %>% data.frame
  })) 

df %>% gather('key','value',-i,-x) %>% 
  mutate(Instrument = paste0('I',i)) %>% 
  ggplot(aes(x=x, y=value, col=Instrument)) + 
  geom_hline(yintercept=0, linetype='dashed', col='blue') +
  geom_line(size=1.5,alpha=0.5) + 
  xlab('Instrument option (cost)') + ylab('Expected payoff') + 
  ggtitle('Marketing Instrument Optimization','assuming the effect of an instrument is a function of its cost') +
    facet_wrap(~key,ncol=1,scales='free_y') + theme_bw()

Optimal Strategy

group_by(df, i) %>% top_n(1,eR.SEL)
## # A tibble: 3 x 5
## # Groups:   i [3]
##       i     x   eR.ALL     N   eR.SEL
##   <dbl> <dbl>    <dbl> <dbl>    <dbl>
## 1     1    61 1061196. 20828 1224614.
## 2     2    79 1084642. 19519 1336093.
## 3     3   146  282097. 13751 1118785.
# Under the optimal strategy:
# Segment B: at a cost of 61 per customer, the expected payoff is 1,224,614
# Segment D: at a cost of 79 per customer, the expected payoff is 1,336,093
# Segment F: at a cost of 146 per customer, the expected payoff is 1,118,785
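# As a sketch of how the target list for one instrument would be pulled: at the optimal
# cost, the customers with a positive expected payoff are the ones to contact
# (illustrated here for instrument 2 at x = 79; target is a helper variable introduced here).
x = 79; i = 2
dp = DP(x, mm[i], bb[i], aa[i])
dp = ifelse(B$Buy + dp > 1, 1 - B$Buy, dp)
eR = dp * B$Rev - x
target = B$cust[eR > 0]                          # customers worth contacting with instrument 2
c(N = length(target), eR.SEL = sum(eR[eR > 0]))  # should match the table above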

We therefore offer gifts or coupons of different value to each segment: segment B (big spenders who have visited only once) receives holiday-themed gift packs, segment D (long-inactive regular customers) receives epidemic-prevention kits, and segment F (promising new customers) receives a $140 repurchase voucher through the Triple Stimulus Voucher program.
