Data Preparation Process

Fig-1: Data Preparation

Preparing the Predictors (X)

rm(list=ls(all=TRUE))
pacman::p_load(magrittr, caTools, ggplot2, dplyr)
load("data/tf0.rdata")
The Demarcation Date

Remove data on or after the demarcation date

feb01 = as.Date("2001-02-01")
Z = subset(Z0, date < feb01)    # 618212
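As a quick sanity check, Z should now span 2000-11-01 through 2001-01-31 (matching the summary below), while the February records stay behind in Z0 for building the targets. A minimal sketch:

range(Z$date)          # expect 2000-11-01 ... 2001-01-31
sum(Z0$date >= feb01)  # item records held out for February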
Aggregate Item Records into Transaction Records
X = group_by(Z, tid) %>% summarise(
  date = first(date),  # date of transaction
  cust = first(cust),  # customer id
  age = first(age),    # age group
  area = first(area),  # area group
  items = n(),                # number of items
  pieces = sum(qty),          # number of pieces
  total = sum(price),         # total amount
  gross = sum(price - cost)   # raw profit
  ) %>% data.frame  # 88387
summary(X)
##       tid             date                cust               age           
##  Min.   :    1   Min.   :2000-11-01   Length:88387       Length:88387      
##  1st Qu.:22098   1st Qu.:2000-11-23   Class :character   Class :character  
##  Median :44194   Median :2000-12-12   Mode  :character   Mode  :character  
##  Mean   :44194   Mean   :2000-12-15                                        
##  3rd Qu.:66291   3rd Qu.:2001-01-12                                        
##  Max.   :88387   Max.   :2001-01-31                                        
##      area               items             pieces            total        
##  Length:88387       Min.   :  1.000   Min.   :  1.000   Min.   :    5.0  
##  Class :character   1st Qu.:  2.000   1st Qu.:  3.000   1st Qu.:  230.0  
##  Mode  :character   Median :  5.000   Median :  6.000   Median :  522.0  
##                     Mean   :  6.994   Mean   :  9.453   Mean   :  888.7  
##                     3rd Qu.:  9.000   3rd Qu.: 12.000   3rd Qu.: 1120.0  
##                     Max.   :112.000   Max.   :339.000   Max.   :30171.0  
##      gross        
##  Min.   :-1645.0  
##  1st Qu.:   23.0  
##  Median :   72.0  
##  Mean   :  138.3  
##  3rd Qu.:  174.0  
##  Max.   : 8069.0
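Aggregations like this deserve a spot check against the raw item records. A minimal sketch (the transaction id is picked arbitrarily):

t1 = Z[Z$tid == X$tid[1], ]        # item records of one transaction
c(items = nrow(t1), pieces = sum(t1$qty), total = sum(t1$price))
X[1, c("items","pieces","total")]  # should agree with the vector above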
Check Quantiles and Remove Outliers
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
##          items   pieces     total    gross
## 99.9%  56.0000  84.0000  9378.684 1883.228
## 99.95% 64.0000  98.0000 11261.751 2317.087
## 99.99% 85.6456 137.6456 17699.325 3389.646
# remove outliers
X = subset(X, items<=64 & pieces<=98 & total<=11260) # 88387 -> 88295
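The cut-offs above are hand-copied from the quantile table. As an alternative sketch, the same step can be written without hard-coded numbers (shown instead of, not in addition to, the line above):

cut = sapply(X[, c("items","pieces","total")], quantile, prob=0.9995, names=FALSE)
X = subset(X, items <= cut["items"] & pieces <= cut["pieces"] & total <= cut["total"])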
Aggregate Transactions into Customer Records
d0 = max(X$date) + 1
A = X %>% mutate(
  days = as.integer(difftime(d0, date, units="days"))
  ) %>% 
  group_by(cust) %>% summarise(
    r = min(days),      # recency
    s = max(days),      # seniority
    f = n(),            # frequency
    m = mean(total),    # monetary
    rev = sum(total),   # total revenue contribution
    raw = sum(gross),   # total gross profit contribution
    age = age[1],       # age group
    area = area[1]      # area code
  ) %>% data.frame      # 28584
nrow(A)
## [1] 28584
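The r/s/f/m definitions are easy to mix up, so it is worth verifying one customer by hand. A minimal sketch (the customer id is picked arbitrarily):

cid = A$cust[1]
x = subset(X, cust == cid)
c(r = min(as.integer(d0 - x$date)),   # days since the last purchase
  s = max(as.integer(d0 - x$date)),   # days since the first purchase
  f = nrow(x),                        # number of transactions
  m = mean(x$total))                  # average transaction amount
A[1, c("r","s","f","m")]              # should agree with the vector above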



Preparing the Target Variables (Y)

Aggregate February's Transactions by Customer
feb = filter(Z0, date >= feb01) %>%   # February item records
  group_by(cust) %>% 
  summarise(amount = sum(price))      # Feb spending per customer; 16900
The Target for Regression - A$amount

Merge each customer's February buying amount into A as A$amount

A = merge(A, feb, by="cust", all.x=TRUE)
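Note that feb holds 16900 customers but A gains only 13242 non-missing amounts: the remaining February buyers made their first purchase in February, have no pre-February predictors, and are dropped by the left join (all.x=TRUE). A quick check (sketch):

sum(feb$cust %in% A$cust)  # 13242 Feb buyers with pre-Feb history
sum(!is.na(A$amount))      # the same 13242 survive the merge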
Make the Target Variable for the Buying Probability Model - A$buy
A$buy = !is.na(A$amount)
table(A$buy)  # counts of non-buyers (FALSE) vs. buyers (TRUE)
## 
## FALSE  TRUE 
## 15342 13242
mean(A$buy)   # the base probability of buying
## [1] 0.4632662
Summary of the Dataset
summary(A)
##      cust                 r               s               f         
##  Length:28584       Min.   : 1.00   Min.   : 1.00   Min.   : 1.000  
##  Class :character   1st Qu.:11.00   1st Qu.:47.00   1st Qu.: 1.000  
##  Mode  :character   Median :21.00   Median :68.00   Median : 2.000  
##                     Mean   :32.12   Mean   :61.27   Mean   : 3.089  
##                     3rd Qu.:53.00   3rd Qu.:83.00   3rd Qu.: 4.000  
##                     Max.   :92.00   Max.   :92.00   Max.   :60.000  
##                                                                     
##        m                rev             raw              age           
##  Min.   :    8.0   Min.   :    8   Min.   : -742.0   Length:28584      
##  1st Qu.:  359.4   1st Qu.:  638   1st Qu.:   70.0   Class :character  
##  Median :  709.5   Median : 1566   Median :  218.0   Mode  :character  
##  Mean   : 1012.4   Mean   : 2711   Mean   :  420.8                     
##  3rd Qu.: 1315.0   3rd Qu.: 3426   3rd Qu.:  535.0                     
##  Max.   :10634.0   Max.   :99597   Max.   :15565.0                     
##                                                                        
##      area               amount         buy         
##  Length:28584       Min.   :    8   Mode :logical  
##  Class :character   1st Qu.:  454   FALSE:15342    
##  Mode  :character   Median :  993   TRUE :13242    
##                     Mean   : 1499                  
##                     3rd Qu.: 1955                  
##                     Max.   :28089                  
##                     NA's   :15342



Preparing Train & Test Datasets

Train & Test Dataset
# synchronize the customers among A, X and Z
X = subset(X, cust %in% A$cust & date < feb01)
Z = subset(Z, cust %in% A$cust & date < feb01)
set.seed(2018)  # set seed before splitting
# so that the result of random splitting remains the same
spl = sample.split(A$buy, SplitRatio=0.7) 
c(nrow(A), sum(spl), sum(!spl))
## [1] 28584 20008  8576
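sample.split() from caTools stratifies on its first argument, so both subsets inherit the base buying rate; this matters because an unstratified split could shift the class balance between train and test. A quick check (sketch):

c(train = mean(A$buy[spl]), test = mean(A$buy[!spl]))  # both should be close to 0.463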
# compare the distribution of buying amount between train and test data
cbind(A, spl) %>% filter(buy) %>% 
  ggplot(aes(x=log(amount))) + 
  geom_density(aes(fill=spl), alpha=0.5)

For the regression model we keep only the buyers and log10-transform the right-skewed monetary variables (m, rev, amount) before splitting again:
A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
n = nrow(A2)
set.seed(2018)  # set seed before the random split so it is reproducible
spl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(A2), sum(spl2), sum(!spl2))
## [1] 13242  9269  3973
cbind(A2, spl2) %>% 
  ggplot(aes(x=amount)) + 
  geom_density(aes(fill=spl2), alpha=0.5)

# save the results before closing the session
save(Z, X, A, spl, spl2, file="data/tf3.rdata")
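A later modeling session then only needs to restore these objects; a minimal sketch:

load("data/tf3.rdata")   # brings back Z, X, A, spl, spl2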



The Splitting Process and the Results

Up to this point we have the train and test data for both the buying
probability model and the expected revenue model, as shown below.

Fig-2: Data Splitting