rm(list=ls(all=TRUE))
::p_load(magrittr, caTools, ggplot2, dplyr)
pacmanload("data/tf0.rdata")
Remove data after the demarcation date
= as.Date("2001-02-01")
feb01 = subset(Z0, date < feb01) # 618212 Z
= group_by(Z, tid) %>% summarise(
X date = first(date), # date of transaction
cust = first(cust), # customer id
age = first(age), # age group
area = first(area), # area group
items = n(), # number of items
pieces = sum(qty), # number of pieces
total = sum(price), # total amount
gross = sum(price - cost) # raw profit
%>% data.frame # 88387 )
summary(X)
## tid date cust age
## Min. : 1 Min. :2000-11-01 Length:88387 Length:88387
## 1st Qu.:22098 1st Qu.:2000-11-23 Class :character Class :character
## Median :44194 Median :2000-12-12 Mode :character Mode :character
## Mean :44194 Mean :2000-12-15
## 3rd Qu.:66291 3rd Qu.:2001-01-12
## Max. :88387 Max. :2001-01-31
## area items pieces total
## Length:88387 Min. : 1.000 Min. : 1.000 Min. : 5.0
## Class :character 1st Qu.: 2.000 1st Qu.: 3.000 1st Qu.: 230.0
## Mode :character Median : 5.000 Median : 6.000 Median : 522.0
## Mean : 6.994 Mean : 9.453 Mean : 888.7
## 3rd Qu.: 9.000 3rd Qu.: 12.000 3rd Qu.: 1120.0
## Max. :112.000 Max. :339.000 Max. :30171.0
## gross
## Min. :-1645.0
## 1st Qu.: 23.0
## Median : 72.0
## Mean : 138.3
## 3rd Qu.: 174.0
## Max. : 8069.0
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
## items pieces total gross
## 99.9% 56.0000 84.0000 9378.684 1883.228
## 99.95% 64.0000 98.0000 11261.751 2317.087
## 99.99% 85.6456 137.6456 17699.325 3389.646
= subset(X, items<=64 & pieces<=98 & total<=11260) # 88387 -> 88295 X
= max(X$date) + 1
d0 = X %>% mutate(
A days = as.integer(difftime(d0, date, units="days"))
%>%
) group_by(cust) %>% summarise(
r = min(days), # recency
s = max(days), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = age[1], # age group
area = area[1], # area code
%>% data.frame # 28584
) nrow(A)
## [1] 28584
= filter(X0, date>= feb01) %>% group_by(cust) %>%
feb summarise(amount = sum(total)) # 16900
A$amount
Simply a Left Joint
= merge(A, feb, by="cust", all.x=T) A
A$buy
$buy = !is.na(A$amount)
Atable(A$buy, !is.na(A$amount))
##
## FALSE TRUE
## FALSE 15342 0
## TRUE 0 13242
summary(A)
## cust r s f
## Length:28584 Min. : 1.00 Min. : 1.00 Min. : 1.000
## Class :character 1st Qu.:11.00 1st Qu.:47.00 1st Qu.: 1.000
## Mode :character Median :21.00 Median :68.00 Median : 2.000
## Mean :32.12 Mean :61.27 Mean : 3.089
## 3rd Qu.:53.00 3rd Qu.:83.00 3rd Qu.: 4.000
## Max. :92.00 Max. :92.00 Max. :60.000
##
## m rev raw age
## Min. : 8.0 Min. : 8 Min. : -742.0 Length:28584
## 1st Qu.: 359.4 1st Qu.: 638 1st Qu.: 70.0 Class :character
## Median : 709.5 Median : 1566 Median : 218.0 Mode :character
## Mean : 1012.4 Mean : 2711 Mean : 420.8
## 3rd Qu.: 1315.0 3rd Qu.: 3426 3rd Qu.: 535.0
## Max. :10634.0 Max. :99597 Max. :15565.0
##
## area amount buy
## Length:28584 Min. : 8 Mode :logical
## Class :character 1st Qu.: 454 FALSE:15342
## Mode :character Median : 993 TRUE :13242
## Mean : 1499
## 3rd Qu.: 1955
## Max. :28089
## NA's :15342
= subset(X, cust %in% A$cust & date < as.Date("2001-02-01"))
X = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))
Z set.seed(2018); spl = sample.split(A$buy, SplitRatio=0.7)
c(nrow(A), sum(spl), sum(!spl))
## [1] 28584 20008 8576
cbind(A, spl) %>% filter(buy) %>%
ggplot(aes(x=log(amount))) + geom_density(aes(fill=spl), alpha=0.5)
= subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
A2 = nrow(A2)
n set.seed(2018); spl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(A2), sum(spl2), sum(!spl2))
## [1] 13242 9269 3973
cbind(A2, spl2) %>%
ggplot(aes(x=amount)) + geom_density(aes(fill=spl2), alpha=0.5)
save(Z, X, A, spl, spl2, file="data/tf3.rdata")