rm(list=ls(all=TRUE))
pacman::p_load(ggplot2,dplyr,heatmaply)
load("data/tf0.rdata")
sapply(list(cust=A0,tid=X0,items=Z0), nrow)
  cust    tid  items 
 32241 119328 817182 


The Value of Customers

🌷 Modern Marketing is all about Customer Centrality & Value Orientation

Top500 Customers
col6 = c('seagreen','gold','orange',rep('red',3))
# a500 = A0 %>% top_n(500, raw)
g = A0 %>% top_n(2000, raw) %>% filter(r > 45) %>% 
  ggplot(aes(x=m, y=f, col=r, size=raw, label=cust)) + 
  geom_point(alpha=0.7) +
  scale_x_log10() + scale_color_gradientn(colors=col6[1:4]) + 
  theme_bw()
ggplotly(g)

☝️ Iterative Charts help to see the global patterns as well as individual details in large scales.



Customer Groups (Segments)

Rule Based segmentation

Group the customer by the quantiles of M and F

bm = c(0, quantile(A0$m,c(.25,0.5,.75)), max(A0$m)+100)
bf = c(0, quantile(A0$f,c(.25,0.5,.75)), max(A0$f)+100)
A = A0 %>% mutate(
  mx = cut(A0$m, bm, labels=paste0('M',1:4)),
  fx = cut(A0$f, bf, labels=paste0('F',1:4)),
  MF = paste0(mx, fx)
  )
table(A$mx, A$fx)
    
       F1   F2   F3   F4
  M1 3465 1477 1379 1748
  M2 2470 1475 1571 2536
  M3 2569 1557 1807 2134
  M4 3388 1790 1695 1180

Find 100 categories that generate the largest profit

cat100 = count(Z0, cat, wt=price, sort=T) %>% mutate(
  pc=n/sum(n), cum.pc=cumsum(pc)) %>% head(100)
cat100[c(1:5,96:100), ]
       cat       n        pc   cum.pc
1   560201 4329366 0.0422026 0.042203
2   560402 3634174 0.0354259 0.077629
3   500201 2204325 0.0214877 0.099116
4   110217 2201258 0.0214578 0.120574
5   320402 1481172 0.0144385 0.135013
96  100504  229815 0.0022402 0.547202
97  110106  227899 0.0022216 0.549424
98  100418  226905 0.0022119 0.551636
99  100407  224486 0.0021883 0.553824
100 110402  221145 0.0021557 0.555980

Make a CustGrp-Category matrix in revenue - mx0

Z = inner_join(Z0, A[,c('cust','MF')])
Joining, by = "cust"
# filter(Z, cat %in% cat100$cat[1:30]) %>% group_by(MF, cat) %>% 
#   summarise(price = sum(price))
mx0 = xtabs(price~MF+cat, filter(Z, cat %in% cat100$cat[1:30]))
mx0 
      cat
MF     100102 100205 100311 100401 100505 100516 110117 110217 110333 110401
  M1F1   8838  13761   2510   2365   5843   2106   4934  24421      0   8243
  M1F2   8249  10736   3474   5751   4747   1517   6750  30304    856   7187
  M1F3   9925  18373   4151  11131   9248   7422  12421  58318    696  11567
  M1F4  42832  69774  21403  64689  41155  32028  45471 258828  15920  45618
  M2F1  14625  23014  12414  17914   8948   6994   9875  32088   2638  12715
  M2F2  20112  24682   9449  22476  12039   6944  11641  43623   1767  18202
  M2F3  27733  45010  15326  43022  17966  14566  20862  79021   7730  26670
  M2F4 131826 198403  85901 227739  95395 108992  92609 419308  69539 138536
  M3F1  26768  36804  15916  35089  16559  18251  14499  40820  10819  21632
  M3F2  27501  37129  17417  41426  17402  17314  16969  54455   6568  27037
  M3F3  46334  76497  28164  77411  32897  37980  32280 122668  33798  52493
  M3F4 135702 215942 107877 213700  90389 148272  97712 394413 108446 138584
  M4F1  54862  83818  51619  76446  37804  50495  43142 100050  59826  52023
  M4F2  47215  79230  45432  80947  36393  41276  39646  93859  54262  56189
  M4F3  81093 115364  52319 115243  63516  77863  60426 186007  74982  77688
  M4F4 135534 171807  76516 161189  82224 115087  82721 260531 118940 104585
      cat
MF     110507 120103 130204 130206 320402 470103 470105 500201 500202 500210
  M1F1   3510   5987  10629  10976      0    135      0  14252   7307   7766
  M1F2   3356   6993  10390  10445    342      0      0  12493   6619   9440
  M1F3   9455  13934  18251  17797    385    384      0  29572  10959  14528
  M1F4  31349  56434 101631  96456  18754    270   4995 105291  33596  55780
  M2F1   9668  10550  13313  13494      0    945      0  38117   9734  16481
  M2F2  11088  14510  18604  17115   1719   1800  10557  44647  12899  18531
  M2F3  15602  27084  33576  37827   6214   1485      0  85787  21671  37399
  M2F4  85313 145780 243144 203981 119167  43369  55060 315253  83262 152243
  M3F1  19320  15377  19696  17573   2722   2315   2988  61329  22789  28947
  M3F2  14303  14662  22690  23624  21806   1836   4270  89243  18810  37276
  M3F3  36331  35254  52205  52976  80263  13231  18648 136550  40276  63095
  M3F4  90862 118451 183030 178049 338318 127726 125335 373478 103444 163157
  M4F1  46069  30773  30485  32561 140688  55994  61718 171202  56916  66087
  M4F2  47468  29987  38965  37840 136722  56926  65393 156320  53731  66021
  M4F3  54241  52047  57347  57750 231429 109330 105486 256533  75745 109051
  M4F4  73631  86769  96617 102516 380339 243370 325412 313300  91937 132314
      cat
MF     500804 520457 530101 530105 530110 530114 530403 530404 560201 560402
  M1F1   3499   3794   5831   3601   4089   1880   8128   3396   2733    866
  M1F2   3428   3493   6820   4390   5989   3459   9585   2549   8614   2278
  M1F3   8676   6360   7850   8292   9980   3329  10944   5566  16322   5384
  M1F4  32106  20495  45863  36374  45407  20332  38069  22586  68722  34346
  M2F1   9197  14412  11154  13959  16600   9680   9881   8234  38313  10496
  M2F2  10044   8925  17748  13069  17750  12318  12004  11789  46891  17623
  M2F3  22636  27883  33708  22706  27688  23224  27977  20486  80048  38561
  M2F4  85683 102004 154876 118697 160314  90811 104867  87388 440339 391757
  M3F1  21848  20142  27232  25235  28952  23357  16290  20254  93190  51984
  M3F2  19120  28767  36856  22117  43238  26984  20559  22418 168309  69592
  M3F3  39345  61519  69117  48670  73606  47143  41382  37926 251225 160887
  M3F4 118252 144922 209412 144736 205220 135618 125949 111369 705864 642854
  M4F1  36991  77717 103668  86930 121605  96961  61648  79013 411030 336043
  M4F2  32136  65624 100320  74907 103055  75536  55394  61803 430239 398902
  M4F3  90790  96023 144090 104225 143773 113944  75480 117300 683142 632498
  M4F4 116443 125865 186287 133446 181903 135660 107134 135702 884385 840103

Plot the matrix in a heatmap

# define a helper function to create interactive heatmap
# color9 = c("darkblue","green","gold","orange",rep("red",5))
hmap1 = function(x, ...) { heatmaply(
  as.data.frame.matrix(x), cexRow=0.7, cexCol=0.7, 
  grid_color='gray70', ...)
  }  

# create the heatmap
hmap1(mx0, col=cool_warm)

☝️ Note that:

  • The heatmap tool sort the columns and rows by hierarchical clustering
  • the color spectrum is heavily skewed
  • It only highlight the largest categories and customer groups

🌷 Normalization :

  • make the customer groups comparable by converting amounts to ratio
mx1 = mx0/rowSums(mx0)    # normalization can be done in a line of code
hmap1(mx1, col=cool_warm)

☝ Now we can see distinctive buying patterns across the customer groups

The heatmaply() function is sophisticated, you can …

  • specify your own colors
  • enable/disable the row and column clustering/sorting
  • decide whether to display the dendrogram(s)
  • cut the dendrogram into groups
  • see online help (press F1) for more options …
mx2 = xtabs(price~MF+cat, filter(Z, cat %in% cat100$cat[1:20]))
mx3 = 100*mx2/rowSums(mx2)
hmap1(mx3, col=cool_warm, show_dendrogram=c(T,F),k_row=5)



Comparability

🌷 EDA is all about Comparison whereof Comparability is the key.

💡 Matrix and Heatmap
■ Matrix is a key data structure for group comparison
■ Applying statistics by two categories creates a matrix
■ Heatmap is the best visualization tool for matrices
■ Heatmaps not only display values in colors …
■ but also clustering the columns and rows according to their similarity
■ Extreme values or unbalance groups degrade the effect of visualization …
■ Log transformation or normalization might help to improve the visual quality


💡 Normalization vs Standardization
■ Normalization emphasize ratios. It’s uni-polar, usually 0 t0 1.
■ Standardization emphasize variation. It is bi-polar and zero based.
■ Whilst both help to improve visual quality, the latter also helps to balance the weights of variables before applying statistical method such as clustering analysis or dimension reduction.