rm(list=ls(all=TRUE))
::p_load(magrittr, caTools, ggplot2, dplyr)
pacmanload("data/tf0.rdata")
移除最後一期(分割日期之後)的資料
= as.Date("2001-02-01") # 資料分割日期
feb01 = subset(Z0, date < feb01) # 618212 項目 Z
重新匯整交易紀錄
= group_by(Z, tid) %>% summarise(
X date = first(date), # date of transaction
cust = first(cust), # customer id
age = first(age), # age group
area = first(area), # area group
items = n(), # number of items
pieces = sum(qty), # number of pieces
total = sum(price), # total amount
gross = sum(price - cost) # raw profit
%>% data.frame # 88387 交易筆數 )
summary(X)
## tid date cust age
## Min. : 1 Min. :2000-11-01 Length:88387 Length:88387
## 1st Qu.:22098 1st Qu.:2000-11-23 Class :character Class :character
## Median :44194 Median :2000-12-12 Mode :character Mode :character
## Mean :44194 Mean :2000-12-15
## 3rd Qu.:66291 3rd Qu.:2001-01-12
## Max. :88387 Max. :2001-01-31
## area items pieces total
## Length:88387 Min. : 1.000 Min. : 1.000 Min. : 5.0
## Class :character 1st Qu.: 2.000 1st Qu.: 3.000 1st Qu.: 230.0
## Mode :character Median : 5.000 Median : 6.000 Median : 522.0
## Mean : 6.994 Mean : 9.453 Mean : 888.7
## 3rd Qu.: 9.000 3rd Qu.: 12.000 3rd Qu.: 1120.0
## Max. :112.000 Max. :339.000 Max. :30171.0
## gross
## Min. :-1645.0
## 1st Qu.: 23.0
## Median : 72.0
## Mean : 138.3
## 3rd Qu.: 174.0
## Max. : 8069.0
移除不合理的離群資料
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
## items pieces total gross
## 99.9% 56.0000 84.0000 9378.684 1883.228
## 99.95% 64.0000 98.0000 11261.751 2317.087
## 99.99% 85.6456 137.6456 17699.325 3389.646
= subset(X, items<=64 & pieces<=98 & total<=11260) # 88387 -> 88295 X
重新匯整顧客資料
= max(X$date) + 1
d0 = X %>% mutate(
A days = as.integer(difftime(d0, date, units="days"))
%>%
) group_by(cust) %>% summarise(
r = min(days), # recency
s = max(days), # seniority
f = n(), # frequency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = age[1], # age group
area = area[1], # area code
%>% data.frame # 28584 顧客
) nrow(A)
## [1] 28584
匯整最後一期的資料
= filter(X0, date>= feb01) %>% group_by(cust) %>%
feb summarise(amount = sum(total)) # 16900
feb$amount
之中有最後一期來買過的16,900
位顧客的營收貢獻
A$amount
將feb$amount
匯入A
= merge(A, feb, by="cust", all.x=T) A
A$buy
A$amount
是NA
代表這位顧客最後一期沒來買
A$amount
不是NA
代表這位顧客最後一期有來買過
$buy = !is.na(A$amount)
Atable(A$buy, !is.na(A$amount))
##
## FALSE TRUE
## FALSE 15342 0
## TRUE 0 13242
summary(A)
## cust r s f
## Length:28584 Min. : 1.00 Min. : 1.00 Min. : 1.000
## Class :character 1st Qu.:11.00 1st Qu.:47.00 1st Qu.: 1.000
## Mode :character Median :21.00 Median :68.00 Median : 2.000
## Mean :32.12 Mean :61.27 Mean : 3.089
## 3rd Qu.:53.00 3rd Qu.:83.00 3rd Qu.: 4.000
## Max. :92.00 Max. :92.00 Max. :60.000
##
## m rev raw age
## Min. : 8.0 Min. : 8 Min. : -742.0 Length:28584
## 1st Qu.: 359.4 1st Qu.: 638 1st Qu.: 70.0 Class :character
## Median : 709.5 Median : 1566 Median : 218.0 Mode :character
## Mean : 1012.4 Mean : 2711 Mean : 420.8
## 3rd Qu.: 1315.0 3rd Qu.: 3426 3rd Qu.: 535.0
## Max. :10634.0 Max. :99597 Max. :15565.0
##
## area amount buy
## Length:28584 Min. : 8 Mode :logical
## Class :character 1st Qu.: 454 FALSE:15342
## Mode :character Median : 993 TRUE :13242
## Mean : 1499
## 3rd Qu.: 1955
## Max. :28089
## NA's :15342
讓X
與Z
之中的資料範圍與A
一樣
= subset(X, cust %in% A$cust & date < as.Date("2001-02-01"))
X = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01")) Z
A
之中每一筆資料都可以拿來做購買機率模型A$amount
有值的資料可以拿來做購買金額模型 :
A2
A
和A2
都需要在相同的目標變數分佈切割成訓練測試(TR)與測試資料(TS)caTools::sample.split()
和隨機抽樣(sample
)來製作分割向量:spl
,spl2
依指定的目標變數(A$buy
)的分佈切割成訓練測試(TR)與測試資料(TS)製作分割向量:spl
set.seed(2018)
= sample.split(A$buy, SplitRatio=0.7)
spl = A[spl,]; TS = A[!spl,] TR
mean(spl)
## [1] 0.699972
tapply(A$buy,spl,mean)
## FALSE TRUE
## 0.4632696 0.4632647
TR
和TS
之中buy==TRUE
的比例是一致的
cbind(A, spl) %>% filter(buy) %>%
ggplot(aes(x=log(amount))) + geom_density(aes(fill=spl), alpha=0.5)
TR
和TS
之中amount
的分佈也是是一致的
A$amount
有值的資料可以拿來做購買金額模型 :
A2
= subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
A2 = nrow(A2) n
以隨機抽樣(sample
)來製作分割向量:spl2
set.seed(2018); spl2 = 1:n %in% sample(1:n, round(0.7*n))
mean(spl)
## [1] 0.699972
cbind(A2, spl2) %>%
ggplot(aes(x=amount)) + geom_density(aes(fill=spl2), alpha=0.5)
save(Z, X, A, spl, spl2, file="data/tf3.rdata")
Z
、X
:最後一期之前的交易項目紀錄、交易紀錄A
:最後一期之前購買過的顧客
TR=A[spl,]
:購買機率模型的訓練測試TS=A[!spl]
:購買機率模型的測試資料A2
:A
之中在最後一期有購買過的顧客
TR2=A2[spl,]
:購買金額模型的訓練測試TS2=A2[!spl]
:購買金額模型的測試資料