rm(list=ls(all=TRUE))
pacman::p_load(ggplot2,dplyr,heatmaply)
load("data/tf0.rdata")
sapply(list(cust=A0,tid=X0,items=Z0), nrow)
cust tid items
32241 119328 817182
🌷 Modern Marketing is all about
col6 = c('seagreen','gold','orange',rep('red',3))
# a500 = A0 %>% top_n(500, raw)
g = A0 %>% top_n(2000, raw) %>% filter(r > 45) %>%
ggplot(aes(x=m, y=f, col=r, size=raw, label=cust)) +
geom_point(alpha=0.7) +
scale_x_log10() + scale_color_gradientn(colors=col6[1:4]) +
theme_bw()
ggplotly(g)
☝️ Iterative Charts help to see the
Group the customer by the quantiles of M
and F
bm = c(0, quantile(A0$m,c(.25,0.5,.75)), max(A0$m)+100)
bf = c(0, quantile(A0$f,c(.25,0.5,.75)), max(A0$f)+100)
A = A0 %>% mutate(
mx = cut(A0$m, bm, labels=paste0('M',1:4)),
fx = cut(A0$f, bf, labels=paste0('F',1:4)),
MF = paste0(mx, fx)
)
table(A$mx, A$fx)
F1 F2 F3 F4
M1 3465 1477 1379 1748
M2 2470 1475 1571 2536
M3 2569 1557 1807 2134
M4 3388 1790 1695 1180
Find 100 categories that generate the largest profit
cat100 = count(Z0, cat, wt=price, sort=T) %>% mutate(
pc=n/sum(n), cum.pc=cumsum(pc)) %>% head(100)
cat100[c(1:5,96:100), ]
cat n pc cum.pc
1 560201 4329366 0.0422026 0.042203
2 560402 3634174 0.0354259 0.077629
3 500201 2204325 0.0214877 0.099116
4 110217 2201258 0.0214578 0.120574
5 320402 1481172 0.0144385 0.135013
96 100504 229815 0.0022402 0.547202
97 110106 227899 0.0022216 0.549424
98 100418 226905 0.0022119 0.551636
99 100407 224486 0.0021883 0.553824
100 110402 221145 0.0021557 0.555980
Make a CustGrp-Category matrix in revenue - mx0
Joining, by = "cust"
# filter(Z, cat %in% cat100$cat[1:30]) %>% group_by(MF, cat) %>%
# summarise(price = sum(price))
mx0 = xtabs(price~MF+cat, filter(Z, cat %in% cat100$cat[1:30]))
mx0
cat
MF 100102 100205 100311 100401 100505 100516 110117 110217 110333 110401
M1F1 8838 13761 2510 2365 5843 2106 4934 24421 0 8243
M1F2 8249 10736 3474 5751 4747 1517 6750 30304 856 7187
M1F3 9925 18373 4151 11131 9248 7422 12421 58318 696 11567
M1F4 42832 69774 21403 64689 41155 32028 45471 258828 15920 45618
M2F1 14625 23014 12414 17914 8948 6994 9875 32088 2638 12715
M2F2 20112 24682 9449 22476 12039 6944 11641 43623 1767 18202
M2F3 27733 45010 15326 43022 17966 14566 20862 79021 7730 26670
M2F4 131826 198403 85901 227739 95395 108992 92609 419308 69539 138536
M3F1 26768 36804 15916 35089 16559 18251 14499 40820 10819 21632
M3F2 27501 37129 17417 41426 17402 17314 16969 54455 6568 27037
M3F3 46334 76497 28164 77411 32897 37980 32280 122668 33798 52493
M3F4 135702 215942 107877 213700 90389 148272 97712 394413 108446 138584
M4F1 54862 83818 51619 76446 37804 50495 43142 100050 59826 52023
M4F2 47215 79230 45432 80947 36393 41276 39646 93859 54262 56189
M4F3 81093 115364 52319 115243 63516 77863 60426 186007 74982 77688
M4F4 135534 171807 76516 161189 82224 115087 82721 260531 118940 104585
cat
MF 110507 120103 130204 130206 320402 470103 470105 500201 500202 500210
M1F1 3510 5987 10629 10976 0 135 0 14252 7307 7766
M1F2 3356 6993 10390 10445 342 0 0 12493 6619 9440
M1F3 9455 13934 18251 17797 385 384 0 29572 10959 14528
M1F4 31349 56434 101631 96456 18754 270 4995 105291 33596 55780
M2F1 9668 10550 13313 13494 0 945 0 38117 9734 16481
M2F2 11088 14510 18604 17115 1719 1800 10557 44647 12899 18531
M2F3 15602 27084 33576 37827 6214 1485 0 85787 21671 37399
M2F4 85313 145780 243144 203981 119167 43369 55060 315253 83262 152243
M3F1 19320 15377 19696 17573 2722 2315 2988 61329 22789 28947
M3F2 14303 14662 22690 23624 21806 1836 4270 89243 18810 37276
M3F3 36331 35254 52205 52976 80263 13231 18648 136550 40276 63095
M3F4 90862 118451 183030 178049 338318 127726 125335 373478 103444 163157
M4F1 46069 30773 30485 32561 140688 55994 61718 171202 56916 66087
M4F2 47468 29987 38965 37840 136722 56926 65393 156320 53731 66021
M4F3 54241 52047 57347 57750 231429 109330 105486 256533 75745 109051
M4F4 73631 86769 96617 102516 380339 243370 325412 313300 91937 132314
cat
MF 500804 520457 530101 530105 530110 530114 530403 530404 560201 560402
M1F1 3499 3794 5831 3601 4089 1880 8128 3396 2733 866
M1F2 3428 3493 6820 4390 5989 3459 9585 2549 8614 2278
M1F3 8676 6360 7850 8292 9980 3329 10944 5566 16322 5384
M1F4 32106 20495 45863 36374 45407 20332 38069 22586 68722 34346
M2F1 9197 14412 11154 13959 16600 9680 9881 8234 38313 10496
M2F2 10044 8925 17748 13069 17750 12318 12004 11789 46891 17623
M2F3 22636 27883 33708 22706 27688 23224 27977 20486 80048 38561
M2F4 85683 102004 154876 118697 160314 90811 104867 87388 440339 391757
M3F1 21848 20142 27232 25235 28952 23357 16290 20254 93190 51984
M3F2 19120 28767 36856 22117 43238 26984 20559 22418 168309 69592
M3F3 39345 61519 69117 48670 73606 47143 41382 37926 251225 160887
M3F4 118252 144922 209412 144736 205220 135618 125949 111369 705864 642854
M4F1 36991 77717 103668 86930 121605 96961 61648 79013 411030 336043
M4F2 32136 65624 100320 74907 103055 75536 55394 61803 430239 398902
M4F3 90790 96023 144090 104225 143773 113944 75480 117300 683142 632498
M4F4 116443 125865 186287 133446 181903 135660 107134 135702 884385 840103
Plot the matrix in a heatmap
# define a helper function to create interactive heatmap
# color9 = c("darkblue","green","gold","orange",rep("red",5))
hmap1 = function(x, ...) { heatmaply(
as.data.frame.matrix(x), cexRow=0.7, cexCol=0.7,
grid_color='gray70', ...)
}
# create the heatmap
hmap1(mx0, col=cool_warm)
☝️ Note that:
🌷
☝ Now we can see distinctive buying patterns across the customer groups
The heatmaply()
function is sophisticated, you can …
mx2 = xtabs(price~MF+cat, filter(Z, cat %in% cat100$cat[1:20]))
mx3 = 100*mx2/rowSums(mx2)
hmap1(mx3, col=cool_warm, show_dendrogram=c(T,F),k_row=5)
🌷
💡 Matrix and Heatmap
■ Matrix is a key data structure for group comparison
■ Applying statistics by two categories creates a matrix
■ Heatmap is the best visualization tool for matrices
■ Heatmaps not only display values in colors …
■ but also clustering the columns and rows according to their similarity
■ Extreme values or unbalance groups degrade the effect of visualization …
■ Log transformation or normalization might help to improve the visual quality
💡 Normalization vs Standardization
■ Normalization emphasize ratios. It’s uni-polar, usually 0 t0 1.
■ Standardization emphasize variation. It is bi-polar and zero based.
■ Whilst both help to improve visual quality, the latter also helps to balance the weights of variables before applying statistical method such as clustering analysis or dimension reduction.