# Start from an empty workspace (hidden objects included) so the results do
# not depend on leftovers from an earlier session.
rm(list = ls(all.names = TRUE))
# Load the plotting, data-manipulation and heatmap packages
# (pacman installs any that are missing).
pacman::p_load(ggplot2, dplyr, heatmaply)
# Restore the saved objects; the code below uses A0, X0 and Z0 from this file.
load("data/tf0.rdata")
# Row count of each table. vapply() pins the result to exactly one integer per
# table, whereas sapply() silently changes its return type on unexpected input.
vapply(list(cust = A0, tid = X0, items = Z0), nrow, integer(1))
  cust    tid  items 
 32241 119328 817182 


The Value of Customers

🌷 Modern Marketing is all about Customer Centrality & Value Orientation

Top500 Customers
# Colour ramp for the `raw` gradient: green at the low end, red at the high end.
col6 <- c("seagreen", "gold", "orange", rep("red", 3))

# The 500 customers with the largest `raw` (ties at the cut-off are kept).
a500 <- top_n(A0, 500, raw)

# Scatter of f against m (log10 x-axis), points coloured by `raw`.
g <- ggplot(a500, aes(x = m, y = f, col = raw))
g <- g + geom_point(size = 1.25, alpha = 0.7)
g <- g + scale_x_log10()
g <- g + scale_color_gradientn(colors = col6)
g <- g + theme_bw()

# Turn the static ggplot into an interactive widget.
ggplotly(g)

☝️ Interactive charts help us see the global patterns as well as the individual details in large data sets.



Customer Groups (Segments)

Rule Based segmentation

Group the customers by the quartiles of M and F

# Bin edges for M and F: 0, the three quartiles, and a ceiling 100 above the
# maximum so the largest observation falls inside the last (right-closed) bin.
bm <- c(0, quantile(A0$m, c(0.25, 0.5, 0.75)), max(A0$m) + 100)
bf <- c(0, quantile(A0$f, c(0.25, 0.5, 0.75)), max(A0$f) + 100)

# Label every customer M1..M4 and F1..F4, then paste both into one MF code.
# The columns are referenced through the data mask (m, f) rather than A0$m and
# A0$f, so the binning always uses the rows that are actually being mutated.
A <- A0 %>% mutate(
  mx = cut(m, bm, labels = paste0("M", 1:4)),
  fx = cut(f, bf, labels = paste0("F", 1:4)),
  MF = paste0(mx, fx)
)

# Cross-tabulate the two groupings to check the cell sizes.
table(A$mx, A$fx)
    
       F1   F2   F3   F4
  M1 3465 1477 1379 1748
  M2 2470 1475 1571 2536
  M3 2569 1557 1807 2134
  M4 3388 1790 1695 1180

Find the 100 categories that generate the largest revenue

# Total `price` per category, largest first; `n` holds the weighted sum.
# pc is each category's share of the overall total and cum.pc the running
# share; both are computed over all categories before keeping the top 100.
# TRUE is spelled out because T is an ordinary, reassignable variable.
cat100 <- Z0 %>%
  count(cat, wt = price, sort = TRUE) %>%
  mutate(pc = n / sum(n), cum.pc = cumsum(pc)) %>%
  head(100)

# Show the first and the last five rows of the top-100 table.
cat100[c(1:5, 96:100), ]
       cat       n        pc   cum.pc
1   560201 4329366 0.0422026 0.042203
2   560402 3634174 0.0354259 0.077629
3   500201 2204325 0.0214877 0.099116
4   110217 2201258 0.0214578 0.120574
5   320402 1481172 0.0144385 0.135013
96  100504  229815 0.0022402 0.547202
97  110106  227899 0.0022216 0.549424
98  100418  226905 0.0022119 0.551636
99  100407  224486 0.0021883 0.553824
100 110402  221145 0.0021557 0.555980

Make a CustGrp-Category matrix in revenue - mx0

# Attach each customer's MF segment to the item records. The key is named
# explicitly so the join cannot silently pick up other shared column names,
# and no "Joining, by = ..." message is printed.
Z <- inner_join(Z0, A[, c("cust", "MF")], by = "cust")

# Segment-by-category totals of `price` for the 30 top categories.
mx0 <- xtabs(price ~ MF + cat, filter(Z, cat %in% cat100$cat[1:30]))
dim(mx0)
[1] 16 30

Plot the matrix in a heatmap

# Helper: draw an interactive heatmap of a two-way table or matrix `x`.
# Anything passed in `...` is forwarded unchanged to heatmaply().
# color9 = c("darkblue","green","gold","orange",rep("red",5))
hmap1 <- function(x, ...) {
  # Treat `x` as a plain matrix: one data-frame column per matrix column.
  wide <- as.data.frame.matrix(x)
  heatmaply(wide, cexRow = 0.7, cexCol = 0.7, grid_color = "gray70", ...)
}

# Heatmap of the raw segment-by-category matrix, using the cool_warm palette
hmap1(mx0, col = cool_warm)

☝️ Note that:

  • The heatmap tool sorts the columns and rows by hierarchical clustering
  • The color spectrum is heavily skewed
  • It only highlights the largest categories and customer groups

🌷 Normalization :

  • make the customer groups comparable by converting amounts into ratios (each category's share of the group's total)
# Row-wise normalization in a single call: each cell becomes its share of the row total
mx1 <- prop.table(mx0, margin = 1)
hmap1(mx1, col = cool_warm)

☝ Now we can see distinctive buying patterns across the customer groups

The heatmaply() function is sophisticated; you can …

  • specify your own colors
  • enable/disable the row and column clustering/sorting
  • decide whether to display the dendrogram(s)
  • cut the dendrogram into groups
  • see online help (press F1) for more options …
# Same matrix restricted to the 20 top categories, expressed as row percentages.
mx2 <- xtabs(price ~ MF + cat, filter(Z, cat %in% cat100$cat[1:20]))
mx3 <- 100 * mx2 / rowSums(mx2)

# Show only the row dendrogram and cut the rows into 5 clusters.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables.
hmap1(mx3, col = cool_warm, show_dendrogram = c(TRUE, FALSE), k_row = 5)



Comparability

🌷 EDA is all about comparison, and comparability is the key.

💡 Matrix and Heatmap
■ Matrix is a key data structure for group comparison
■ Applying statistics by two categories creates a matrix
■ Heatmap is the best visualization tool for matrices
■ Heatmaps not only display values in colors …
■ but also cluster the columns and rows according to their similarity
■ Extreme values or unbalanced groups degrade the effect of visualization …
■ Log transformation or normalization might help to improve the visual quality


💡 Normalization vs Standardization
■ Normalization emphasizes ratios. It’s uni-polar, usually ranging from 0 to 1.
■ Standardization emphasizes variation. It is bi-polar and centered at zero.
■ While both help to improve visual quality, the latter also helps to balance the weights of variables before applying statistical methods such as cluster analysis or dimension reduction.