Data Preparation Process

Fig-1: Data Preparation

Preparing the Predictors (X)

rm(list=ls(all=TRUE))
pacman::p_load(magrittr, caTools, ggplot2, dplyr)
load("data/tf0.rdata")
The Demarcation Date

Remove data on or after the demarcation date

feb01 = as.Date("2001-02-01")
Z = subset(Z0, date < feb01)    # 618212
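As a quick sanity check, Z should now span 2000-11-01 through 2001-01-31 (matching the summary below), while the February records stay behind in Z0 for building the targets. A minimal sketch:

range(Z$date)          # expect 2000-11-01 ... 2001-01-31
sum(Z0$date >= feb01)  # item records held out for February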
Aggregate Item Records into Transaction Records
X = group_by(Z, tid) %>% summarise(
  date = first(date),  # date of transaction
  cust = first(cust),  # customer id
  age = first(age),    # age group
  area = first(area),  # area group
  items = n(),                # number of items
  pieces = sum(qty),          # number of pieces
  total = sum(price),         # total amount
  gross = sum(price - cost)   # raw profit
  ) %>% data.frame  # 88387
summary(X)
##       tid             date                cust               age           
##  Min.   :    1   Min.   :2000-11-01   Length:88387       Length:88387      
##  1st Qu.:22098   1st Qu.:2000-11-23   Class :character   Class :character  
##  Median :44194   Median :2000-12-12   Mode  :character   Mode  :character  
##  Mean   :44194   Mean   :2000-12-15                                        
##  3rd Qu.:66291   3rd Qu.:2001-01-12                                        
##  Max.   :88387   Max.   :2001-01-31                                        
##      area               items             pieces            total        
##  Length:88387       Min.   :  1.000   Min.   :  1.000   Min.   :    5.0  
##  Class :character   1st Qu.:  2.000   1st Qu.:  3.000   1st Qu.:  230.0  
##  Mode  :character   Median :  5.000   Median :  6.000   Median :  522.0  
##                     Mean   :  6.994   Mean   :  9.453   Mean   :  888.7  
##                     3rd Qu.:  9.000   3rd Qu.: 12.000   3rd Qu.: 1120.0  
##                     Max.   :112.000   Max.   :339.000   Max.   :30171.0  
##      gross        
##  Min.   :-1645.0  
##  1st Qu.:   23.0  
##  Median :   72.0  
##  Mean   :  138.3  
##  3rd Qu.:  174.0  
##  Max.   : 8069.0
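Aggregations like this deserve a spot check against the raw item records. A minimal sketch (the transaction id is picked arbitrarily):

t1 = Z[Z$tid == X$tid[1], ]        # item records of one transaction
c(items = nrow(t1), pieces = sum(t1$qty), total = sum(t1$price))
X[1, c("items","pieces","total")]  # should agree with the vector above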
Check Quantiles and Remove Outliers
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
##          items   pieces     total    gross
## 99.9%  56.0000  84.0000  9378.684 1883.228
## 99.95% 64.0000  98.0000 11261.751 2317.087
## 99.99% 85.6456 137.6456 17699.325 3389.646
# remove outliers
X = subset(X, items<=64 & pieces<=98 & total<=11260) # 88387 -> 88295
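The cut-offs above are hand-copied from the quantile table. As an alternative sketch, the same step can be written without hard-coded numbers (shown instead of, not in addition to, the line above):

cut = sapply(X[, c("items","pieces","total")], quantile, prob=0.9995, names=FALSE)
X = subset(X, items <= cut["items"] & pieces <= cut["pieces"] & total <= cut["total"])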
Aggregate Transactions into Customer Records
d0 = max(X$date) + 1
A = X %>% mutate(
  days = as.integer(difftime(d0, date, units="days"))
  ) %>% 
  group_by(cust) %>% summarise(
    r = min(days),      # recency
    s = max(days),      # seniority
    f = n(),            # frequency
    m = mean(total),    # monetary
    rev = sum(total),   # total revenue contribution
    raw = sum(gross),   # total gross profit contribution
    age = age[1],       # age group
    area = area[1]      # area code
  ) %>% data.frame      # 28584
nrow(A)
## [1] 28584
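The r/s/f/m definitions are easy to mix up, so it is worth verifying one customer by hand. A minimal sketch (the customer id is picked arbitrarily):

cid = A$cust[1]
x = subset(X, cust == cid)
c(r = min(as.integer(d0 - x$date)),   # days since the last purchase
  s = max(as.integer(d0 - x$date)),   # days since the first purchase
  f = nrow(x),                        # number of transactions
  m = mean(x$total))                  # average transaction amount
A[1, c("r","s","f","m")]              # should agree with the vector above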



Preparing the Target Variables (Y)

Aggregate February's Transactions by Customer
feb = filter(Z0, date >= feb01) %>%   # February item records
  group_by(cust) %>% 
  summarise(amount = sum(price))      # Feb spending per customer; 16900
The Target for Regression - A$amount

Merge each customer's February buying amount into A as A$amount

A = merge(A, feb, by="cust", all.x=TRUE)
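Note that feb holds 16900 customers but A gains only 13242 non-missing amounts: the remaining February buyers made their first purchase in February, have no pre-February predictors, and are dropped by the left join (all.x=TRUE). A quick check (sketch):

sum(feb$cust %in% A$cust)  # 13242 Feb buyers with pre-Feb history
sum(!is.na(A$amount))      # the same 13242 survive the merge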
Make the Target Variable for the Buying Probability Model - A$buy
A$buy = !is.na(A$amount)
table(A$buy)  # counts of non-buyers (FALSE) vs. buyers (TRUE)
## 
## FALSE  TRUE 
## 15342 13242
mean(A$buy)   # the base probability of buying
## [1] 0.4632662
Summary of the Dataset
summary(A)
##      cust                 r               s               f         
##  Length:28584       Min.   : 1.00   Min.   : 1.00   Min.   : 1.000  
##  Class :character   1st Qu.:11.00   1st Qu.:47.00   1st Qu.: 1.000  
##  Mode  :character   Median :21.00   Median :68.00   Median : 2.000  
##                     Mean   :32.12   Mean   :61.27   Mean   : 3.089  
##                     3rd Qu.:53.00   3rd Qu.:83.00   3rd Qu.: 4.000  
##                     Max.   :92.00   Max.   :92.00   Max.   :60.000  
##                                                                     
##        m                rev             raw              age           
##  Min.   :    8.0   Min.   :    8   Min.   : -742.0   Length:28584      
##  1st Qu.:  359.4   1st Qu.:  638   1st Qu.:   70.0   Class :character  
##  Median :  709.5   Median : 1566   Median :  218.0   Mode  :character  
##  Mean   : 1012.4   Mean   : 2711   Mean   :  420.8                     
##  3rd Qu.: 1315.0   3rd Qu.: 3426   3rd Qu.:  535.0                     
##  Max.   :10634.0   Max.   :99597   Max.   :15565.0                     
##                                                                        
##      area               amount         buy         
##  Length:28584       Min.   :    8   Mode :logical  
##  Class :character   1st Qu.:  454   FALSE:15342    
##  Mode  :character   Median :  993   TRUE :13242    
##                     Mean   : 1499                  
##                     3rd Qu.: 1955                  
##                     Max.   :28089                  
##                     NA's   :15342



Preparing Train & Test Datasets

Train & Test Dataset
# synchronize the customers among A, X and Z
X = subset(X, cust %in% A$cust & date < feb01)
Z = subset(Z, cust %in% A$cust & date < feb01)
set.seed(2018)  # set seed before splitting
# so that the result of random splitting remains the same
spl = sample.split(A$buy, SplitRatio=0.7) 
c(nrow(A), sum(spl), sum(!spl))
## [1] 28584 20008  8576
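sample.split() from caTools stratifies on its first argument, so both subsets inherit the base buying rate; this matters because an unstratified split could shift the class balance between train and test. A quick check (sketch):

c(train = mean(A$buy[spl]), test = mean(A$buy[!spl]))  # both should be close to 0.463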
# compare the distribution of buying amount between train and test data
cbind(A, spl) %>% filter(buy) %>% 
  ggplot(aes(x=log(amount))) + 
  geom_density(aes(fill=spl), alpha=0.5)

For the regression model we keep only the buyers and log10-transform the right-skewed monetary variables (m, rev, amount) before splitting again:
A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
n = nrow(A2)
set.seed(2018)  # set seed before the random split so it is reproducible
spl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(A2), sum(spl2), sum(!spl2))
## [1] 13242  9269  3973
cbind(A2, spl2) %>% 
  ggplot(aes(x=amount)) + 
  geom_density(aes(fill=spl2), alpha=0.5)

# save the results before closing the session
save(Z, X, A, spl, spl2, file="data/tf3.rdata")
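A later modeling session then only needs to restore these objects; a minimal sketch:

load("data/tf3.rdata")   # brings back Z, X, A, spl, spl2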



The Splitting Process and the Results

Up to this point we have the train and test data for both the buying
probability model and the expected revenue model, as shown below.

Fig-2: Data Splitting