Data Preparation Process 資料整理

Fig-3: Data Preparation


Preparing The Predictors (X) 製作預測變數

rm(list=ls(all=TRUE))
pacman::p_load(magrittr, caTools, ggplot2, dplyr)
load("data/tf0.rdata")
移除最後一期資料

移除最後一期(分割日期之後)的資料

feb01 = as.Date("2001-02-01")   # 資料分割日期 
Z = subset(Z0, date < feb01)    # 618212 項目
Aggregate for the Transaction Records

重新匯整交易紀錄

X = group_by(Z, tid) %>% summarise(
  date = first(date),  # date of transaction
  cust = first(cust),  # customer id
  age = first(age),    # age group
  area = first(area),  # area group
  items = n(),                # number of items
  pieces = sum(qty),          # number of pieces
  total = sum(price),         # total amount
  gross = sum(price - cost)   # raw profit
  ) %>% data.frame  # 88387 交易筆數
summary(X)
##       tid             date                cust               age           
##  Min.   :    1   Min.   :2000-11-01   Length:88387       Length:88387      
##  1st Qu.:22098   1st Qu.:2000-11-23   Class :character   Class :character  
##  Median :44194   Median :2000-12-12   Mode  :character   Mode  :character  
##  Mean   :44194   Mean   :2000-12-15                                        
##  3rd Qu.:66291   3rd Qu.:2001-01-12                                        
##  Max.   :88387   Max.   :2001-01-31                                        
##      area               items             pieces            total        
##  Length:88387       Min.   :  1.000   Min.   :  1.000   Min.   :    5.0  
##  Class :character   1st Qu.:  2.000   1st Qu.:  3.000   1st Qu.:  230.0  
##  Mode  :character   Median :  5.000   Median :  6.000   Median :  522.0  
##                     Mean   :  6.994   Mean   :  9.453   Mean   :  888.7  
##                     3rd Qu.:  9.000   3rd Qu.: 12.000   3rd Qu.: 1120.0  
##                     Max.   :112.000   Max.   :339.000   Max.   :30171.0  
##      gross        
##  Min.   :-1645.0  
##  1st Qu.:   23.0  
##  Median :   72.0  
##  Mean   :  138.3  
##  3rd Qu.:  174.0  
##  Max.   : 8069.0
Check Quantile and Remove Outlier

移除不合理的離群資料

sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
##          items   pieces     total    gross
## 99.9%  56.0000  84.0000  9378.684 1883.228
## 99.95% 64.0000  98.0000 11261.751 2317.087
## 99.99% 85.6456 137.6456 17699.325 3389.646
X = subset(X, items<=64 & pieces<=98 & total<=11260) # 88387 -> 88295
Aggregate for Customer Records

重新匯整顧客資料

d0 = max(X$date) + 1
A = X %>% mutate(
  days = as.integer(difftime(d0, date, units="days"))
  ) %>% 
  group_by(cust) %>% summarise(
    r = min(days),      # recency
    s = max(days),      # seniority
    f = n(),            # frequency
    m = mean(total),    # monetary
    rev = sum(total),   # total revenue contribution
    raw = sum(gross),   # total gross profit contribution
    age = age[1],       # age group
    area = area[1],     # area code
  ) %>% data.frame      # 28584 顧客
nrow(A)
## [1] 28584



Preparing the Target Variables (Y) 製作預測變數

Aggregate Feb’s Transaction by Customer

匯整最後一期的資料

feb = filter(X0, date>= feb01) %>% group_by(cust) %>% 
  summarise(amount = sum(total))  # 16900

feb$amount之中有最後一期來買過的16,900位顧客的營收貢獻

The Target for Regression - A$amount

feb$amount匯入A

A = merge(A, feb, by="cust", all.x=T)
The Target for Classification - A$buy

A$amountNA代表這位顧客最後一期沒來買
A$amount不是NA代表這位顧客最後一期有來買過

A$buy = !is.na(A$amount)  
table(A$buy, !is.na(A$amount))
##        
##         FALSE  TRUE
##   FALSE 15342     0
##   TRUE      0 13242
Summary of the Dataset
summary(A)
##      cust                 r               s               f         
##  Length:28584       Min.   : 1.00   Min.   : 1.00   Min.   : 1.000  
##  Class :character   1st Qu.:11.00   1st Qu.:47.00   1st Qu.: 1.000  
##  Mode  :character   Median :21.00   Median :68.00   Median : 2.000  
##                     Mean   :32.12   Mean   :61.27   Mean   : 3.089  
##                     3rd Qu.:53.00   3rd Qu.:83.00   3rd Qu.: 4.000  
##                     Max.   :92.00   Max.   :92.00   Max.   :60.000  
##                                                                     
##        m                rev             raw              age           
##  Min.   :    8.0   Min.   :    8   Min.   : -742.0   Length:28584      
##  1st Qu.:  359.4   1st Qu.:  638   1st Qu.:   70.0   Class :character  
##  Median :  709.5   Median : 1566   Median :  218.0   Mode  :character  
##  Mean   : 1012.4   Mean   : 2711   Mean   :  420.8                     
##  3rd Qu.: 1315.0   3rd Qu.: 3426   3rd Qu.:  535.0                     
##  Max.   :10634.0   Max.   :99597   Max.   :15565.0                     
##                                                                        
##      area               amount         buy         
##  Length:28584       Min.   :    8   Mode :logical  
##  Class :character   1st Qu.:  454   FALSE:15342    
##  Mode  :character   Median :  993   TRUE :13242    
##                     Mean   : 1499                  
##                     3rd Qu.: 1955                  
##                     Max.   :28089                  
##                     NA's   :15342

XZ之中的資料範圍與A一樣

X = subset(X, cust %in% A$cust & date < as.Date("2001-02-01"))
Z = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))



分割訓練測試(TR)與測試資料(TS)

Data for 購買機率模型

依指定的目標變數(A$buy)的分佈切割成訓練測試(TR)與測試資料(TS)製作分割向量:spl

set.seed(2018)
spl = sample.split(A$buy, SplitRatio=0.7)
TR = A[spl,]; TS = A[!spl,]
mean(spl)
## [1] 0.699972
tapply(A$buy,spl,mean)
##     FALSE      TRUE 
## 0.4632696 0.4632647

TRTS之中buy==TRUE的比例是一致的

cbind(A, spl) %>% filter(buy) %>%
  ggplot(aes(x=log(amount))) + geom_density(aes(fill=spl), alpha=0.5)

TRTS之中amount的分佈也是是一致的

Data for 購買金額模型
  • 只有A$amount有值的資料可以拿來做購買金額模型 : A2
  • 金額欄位需要做對數轉換才會符合常態分佈(線性回歸的假設)
A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
n = nrow(A2)

以隨機抽樣(sample)來製作分割向量:spl2

set.seed(2018); spl2 = 1:n %in% sample(1:n, round(0.7*n))
mean(spl)
## [1] 0.699972
cbind(A2, spl2) %>% 
  ggplot(aes(x=amount)) + geom_density(aes(fill=spl2), alpha=0.5)

儲存做好的資料
save(Z, X, A, spl, spl2, file="data/tf3.rdata")


Fig-4: Data Spliting

  • ZX:最後一期之前的交易項目紀錄、交易紀錄
  • A:最後一期之前購買過的顧客
    • TR=A[spl,]:購買機率模型的訓練測試
    • TS=A[!spl]:購買機率模型的測試資料
  • A2:A之中在最後一期有購買過的顧客
    • TR2=A2[spl,]:購買金額模型的訓練測試
    • TS2=A2[!spl]:購買金額模型的測試資料