# Start from a clean workspace: remove every object, including hidden
# (dot-prefixed) ones, from the current environment.
rm(list = ls(all.names = TRUE))
# Install-if-missing and attach the packages used below.
pacman::p_load(ggplot2, dplyr, heatmaply)
# Restore the saved objects; the code below relies on A0, X0 and Z0.
load("data/tf0.rdata")
# Row counts of the three tables; vapply() guarantees an integer vector.
vapply(list(cust = A0, tid = X0, items = Z0), nrow, integer(1))
cust tid items
32241 119328 817182
🌷 Modern Marketing is all about
# Colour ramp for `raw`: seagreen -> gold -> orange, with red repeated three
# times so the top of the scale stays red.
col6 <- c("seagreen", "gold", "orange", rep("red", 3))

# Keep the 500 rows of A0 with the largest `raw` values (ties included).
a500 <- top_n(A0, 500, raw)

# Scatter of m (log10 x-axis) against f, coloured by `raw`.
g <- ggplot(a500, aes(x = m, y = f, col = raw)) +
  geom_point(size = 1.25, alpha = 0.7) +
  scale_x_log10() +
  scale_color_gradientn(colors = col6) +
  theme_bw()

# Convert the static ggplot into an interactive plotly widget.
ggplotly(g)
☝️ Interactive Charts help to see the
Group the customers by the quartiles of M and F
# Bin edges for m and f: 0, the three quartiles, and an upper edge placed
# above the observed maximum.
# NOTE: cut() uses right-closed intervals by default, so a value of exactly 0
# would fall outside the first bin and become NA.
bm <- c(0, quantile(A0$m, c(0.25, 0.5, 0.75)), max(A0$m) + 100)
bf <- c(0, quantile(A0$f, c(0.25, 0.5, 0.75)), max(A0$f) + 100)

# Label each row with its M bin (M1..M4), its F bin (F1..F4) and the combined
# group code MF (e.g. "M1F4").
# Columns are referenced by bare name so mutate() works on the rows actually
# flowing through the pipe instead of reaching back into the global A0.
A <- A0 %>% mutate(
  mx = cut(m, bm, labels = paste0("M", 1:4)),
  fx = cut(f, bf, labels = paste0("F", 1:4)),
  MF = paste0(mx, fx)
)

# Cross-tabulate the rows by M group (rows) and F group (columns).
table(A$mx, A$fx)
F1 F2 F3 F4
M1 3465 1477 1379 1748
M2 2470 1475 1571 2536
M3 2569 1557 1807 2134
M4 3388 1790 1695 1180
Find the 100 categories that generate the largest revenue (total price)
# Total `price` per category, largest first, with each category's share of
# the overall total (pc) and the running share (cum.pc); keep the top 100.
cat100 <- count(Z0, cat, wt = price, sort = TRUE) %>%
  mutate(pc = n / sum(n), cum.pc = cumsum(pc)) %>%
  head(100)

# Show the first five and the last five of the 100 rows.
cat100[c(1:5, 96:100), ]
cat n pc cum.pc
1 560201 4329366 0.0422026 0.042203
2 560402 3634174 0.0354259 0.077629
3 500201 2204325 0.0214877 0.099116
4 110217 2201258 0.0214578 0.120574
5 320402 1481172 0.0144385 0.135013
96 100504 229815 0.0022402 0.547202
97 110106 227899 0.0022216 0.549424
98 100418 226905 0.0022119 0.551636
99 100407 224486 0.0021883 0.553824
100 110402 221145 0.0021557 0.555980
Make a CustGrp-Category matrix in revenue - mx0
Joining, by = "cust"
[1] 16 30
Plot the matrix in a heatmap
# Helper that draws an interactive heatmap from a cross-tabulation.
# color9 = c("darkblue","green","gold","orange",rep("red",5))
#
# x    a table/matrix; converted to a wide data frame with
#      as.data.frame.matrix() before plotting.
# ...  further arguments forwarded unchanged to heatmaply().
hmap1 <- function(x, ...) {
  wide <- as.data.frame.matrix(x)
  heatmaply(wide, cexRow = 0.7, cexCol = 0.7, grid_color = "gray70", ...)
}
# create the heatmap of mx0; `col` is forwarded to heatmaply() through hmap1's `...`
hmap1(mx0, col=cool_warm)
☝️ Note that:
🌷
☝ Now we can see distinctive buying patterns across the customer groups
The heatmaply() function is sophisticated; you can …
# Sum of price by customer group (MF) and category, restricted to the first
# 20 categories listed in cat100.
mx2 <- xtabs(price ~ MF + cat, filter(Z, cat %in% cat100$cat[1:20]))

# Express each row as percentages of that row's total.
mx3 <- 100 * mx2 / rowSums(mx2)

# Heatmap showing only the row dendrogram, with k_row = 5 row clusters.
hmap1(mx3, col = cool_warm, show_dendrogram = c(TRUE, FALSE), k_row = 5)
🌷
💡 Matrix and Heatmap
■ Matrix is a key data structure for group comparison
■ Applying statistics by two categories creates a matrix
■ Heatmap is the best visualization tool for matrices
■ Heatmaps not only display values in colors …
■ but also cluster the columns and rows according to their similarity
■ Extreme values or unbalanced groups degrade the effect of visualization …
■ Log transformation or normalization might help to improve the visual quality
💡 Normalization vs Standardization
■ Normalization emphasizes ratios. It’s uni-polar, usually 0 to 1.
■ Standardization emphasizes variation. It is bi-polar and zero-based.
■ Whilst both help to improve visual quality, the latter also helps to balance the weights of variables before applying statistical methods such as cluster analysis or dimension reduction.