🌷 Most of the information in a POS dataset is carried in the product column.

rm(list=ls(all=TRUE))                               # clear the workspace
pacman::p_load(ggplot2, dplyr, heatmaply, arules)   # load the required packages
load("data/tf0.rdata")                              # load the pre-processed data frames (Z0, A0)

Contribution of Products and Categories

Z0 %>% summarise_at(vars(cust, prod, cat, tid), n_distinct)   # distinct customers, products, categories, transactions
   cust  prod  cat    tid
1 32256 23789 2007 119422
Product Category Analysis

Identify the top-100 categories that contribute the largest profit

col4 = c('seagreen','gold','orange','red')       # color scale for the margin
gg = group_by(Z0, cat) %>% summarise(
  solds = n(), qty = sum(qty), rev = sum(price), cost = sum(cost), 
  profit = rev - cost, margin = 100*profit/rev
  ) %>% 
  top_n(100, profit) %>%                         # keep the 100 most profitable categories
  ggplot(aes(x=profit, y=rev, col=margin, label=cat)) + 
  geom_point(size=1.5, alpha=0.8) + 
  scale_y_log10() + scale_x_log10() +
  scale_color_gradientn(colors=col4) +
  theme_bw()
ggplotly(gg)                                     # interactive scatter plot

🚴 Practice
Modify the above code chunk to

  • visualize the top-100 categories that contribute the largest revenue
  • filter out the categories with a negative margin (a sketch for these two items follows)
  • The larger the revenue, the larger the profit; is that always true?
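A minimal sketch for the first two items, reusing Z0 and col4 from above; it ranks the categories by revenue instead of profit and drops those with a negative margin (gg2 is just an illustrative name):

gg2 = group_by(Z0, cat) %>% summarise(
  solds = n(), qty = sum(qty), rev = sum(price), cost = sum(cost), 
  profit = rev - cost, margin = 100*profit/rev
  ) %>% 
  filter(margin >= 0) %>%            # drop the categories with a negative margin
  top_n(100, rev) %>%                # rank by revenue instead of profit
  ggplot(aes(x=profit, y=rev, col=margin, label=cat)) + 
  geom_point(size=1.5, alpha=0.8) + 
  scale_y_log10() + scale_x_log10() +
  scale_color_gradientn(colors=col4) +
  theme_bw()
ggplotly(gg2)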



Product Item Analysis

🚴 More Practice
Try to do the contribution analysis on the top 300 product items … (a sketch follows)
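A minimal sketch under the same assumptions, grouping by prod instead of cat and keeping the 300 most profitable items (gg3 is just an illustrative name):

gg3 = group_by(Z0, prod) %>% summarise(
  solds = n(), qty = sum(qty), rev = sum(price), cost = sum(cost), 
  profit = rev - cost, margin = 100*profit/rev
  ) %>% 
  top_n(300, profit) %>%             # the 300 most profitable product items
  ggplot(aes(x=profit, y=rev, col=margin, label=prod)) + 
  geom_point(size=1.5, alpha=0.8) + 
  scale_y_log10() + scale_x_log10() +
  scale_color_gradientn(colors=col4) +
  theme_bw()
ggplotly(gg3)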



Customer Segments by Buying Pattern

Top50 (Penetrating) Product Categories - cm

M=50
cm = Z0 %>% group_by(cat) %>% 
  summarise(r = n_distinct(cust)/nrow(A0)) %>%   # penetration rate
  arrange(desc(r)) %>% pull(cat) %>% head(M)

Customer-Category Matrix - x

x = xtabs(~cust+cat, filter(Z0, cat%in%cm)) %>% as.data.frame.matrix   # purchase counts per customer per category
x = x[,order(-colSums(x))]                                             # order the categories by popularity
dim(x)
[1] 29474    50

K-means Clustering - K=160

K=160
set.seed(1111)
kg = kmeans(x, K, iter.max=30)$cluster   # the cluster label of each customer
table(kg) %>% sort                       # cluster sizes, sorted by group size
kg
  25   35   38    6   49   75   62  129   26   33   43    1   32   89  124   73 
   1    1    1    3    4    4    5    5    7    7    7    8   12   13   14   15 
  88  102   27   50   93  125    7  105  138   96  146   92   94   36  128  149 
  15   15   17   17   17   17   20   21   22   24   24   26   26   27   27   27 
  24    9  117   23   70  107  130  103  110    2  150  152  122   11   57   76 
  29   31   31   33   33   33   33   34   34   35   35   35   36   37   37   37 
  58  160   61   80   97   40   68   46   64   79   84   98  148  147  123  154 
  38   38   41   41   41   42   42   47   49   49   50   52   53   54   56   57 
  81  112  156   85  109   71   20   90   48   29   31   44  155   42  120  126 
  61   61   64   66   69   72   74   76   78   83   84   86   86   87   89   92 
  82   39   52   37  131   56  100  132   34  108   10    8  143  101   15  157 
  94   95   99  108  117  118  118  118  119  119  120  122  122  125  127  127 
  51  137   17   65    3   77   16  142   19  104   66  141   95   21  111  134 
 128  133  134  134  140  140  142  143  146  149  152  155  156  158  164  168 
  22  121   41   28  127  139  115  153  151  136   99  159   13   55   53   45 
 173  174  181  190  202  210  213  214  218  220  244  268  285  289  294  307 
  86   78  140  145  106   30   67  116    4  133  144   63   54   69  158   12 
 312  328  335  366  376  377  406  409  412  422  425  430  444  467  476  483 
  60   72   83    5   59   47  135  113  114   14  119  118   74   91   87   18 
 490  503  521  522  535  553  565  616  644  690  735  746  752  794  814 2684 

Rather than plotting all of the groups, we only plot the groups of an appropriate size.

ckg = tibble(cust = rownames(x), kg = kg)    # customer-to-cluster mapping
gdf = inner_join(A0, ckg, by = "cust") %>% 
  group_by(kg) %>% summarise(
  gsize = n(), ttRev = sum(rev), ttProfit = sum(raw),
  avRev = mean(rev), avProfit = mean(raw),
  avRecent = mean(r), avFreq = mean(f), avMoney = mean(m)
  )  
filter(gdf, gsize >= 200, gsize <= 1000) %>% 
  ggplot(aes(avMoney,avFreq,col=ttProfit,size=gsize,label=kg)) + 
  geom_point(alpha=0.6) +
  scale_color_gradientn(colors=c("seagreen","gold","tomato")) +
  theme_bw() -> g
ggplotly(g)

Besides RFM, we can also examine the product preferences of each group.

# choose the groups of appropriate size
g = filter(gdf, gsize >= 200, gsize <= 800) %>% pull(kg)

# calculate the group means over the 30 most popular categories
a = sapply(split(x[kg %in% g, 1:30], kg[kg %in% g]), colMeans)  

# define color and helper function
color9 = c("darkblue","green","gold","orange",rep("red",5))
hmap1 = function(x, ...) { heatmaply(
  as.data.frame.matrix(x), cexRow=0.7, cexCol=0.7, 
  grid_color='gray70', ...)
} 

# generate the heatmap
hmap1(a, colors=color9, show_dendrogram=c(F,F))



Shopping Cart Analysis

Shopping cart analysis is very easy once you know how to use the arules package …

p2k = count(Z0, prod, sort=T) %>% pull(prod) %>% head(2000)   # the 2000 best-selling products
Z = filter(Z0, prod %in% p2k)
tr = as(split(Z[,"prod"], Z[,"tid"]), "transactions"); tr     # one transaction per tid
transactions in sparse format with
 107797 transactions (rows) and
 2000 items (columns)
rules <- apriori(tr, parameter=list(supp=0.0002, conf=0.5))
Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
        0.5    0.1    1 none FALSE            TRUE       5  0.0002      1
 maxlen target  ext
     10  rules TRUE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 21 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[2000 item(s), 107797 transaction(s)] done [0.15s].
sorting and recoding items ... [2000 item(s)] done [0.01s].
creating transaction tree ... done [0.04s].
checking subsets of size 1 2 3 4 5 6 done [0.05s].
writing ... [1001 rule(s)] done [0.01s].
creating S4 object  ... done [0.01s].
summary(rules)
set of 1001 rules

rule length distribution (lhs + rhs):sizes
  2   3   4   5   6 
 85 558 289  64   5 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   2.00    3.00    3.00    3.35    4.00    6.00 

summary of quality measures:
    support           confidence       coverage             lift        
 Min.   :0.000204   Min.   :0.500   Min.   :0.000232   Min.   :   6.64  
 1st Qu.:0.000232   1st Qu.:0.557   1st Qu.:0.000371   1st Qu.:  60.48  
 Median :0.000297   Median :0.636   Median :0.000482   Median :  91.25  
 Mean   :0.000519   Mean   :0.652   Mean   :0.000829   Mean   : 211.00  
 3rd Qu.:0.000501   3rd Qu.:0.736   3rd Qu.:0.000789   3rd Qu.: 281.70  
 Max.   :0.006494   Max.   :0.966   Max.   :0.009648   Max.   :1131.30  
     count    
 Min.   : 22  
 1st Qu.: 25  
 Median : 32  
 Mean   : 56  
 3rd Qu.: 54  
 Max.   :700  

mining info:
 data ntransactions support confidence
   tr        107797  0.0002        0.5
rx = subset(rules, subset = lift > 100 & count > 100)   # keep the strong and frequent rules
inspect(rx) # %>% View
     lhs                              rhs             support    confidence
[1]  {4710030346110}               => {4710030346097} 0.00094622 0.55135   
[2]  {4710030346110}               => {4710030346103} 0.00111320 0.64865   
[3]  {4710030346110}               => {4710030346059} 0.00105754 0.61622   
[4]  {4716114000312}               => {4716114000329} 0.00141006 0.55273   
[5]  {4716114000329}               => {4716114000312} 0.00141006 0.53333   
[6]  {4710030346097}               => {4710030346103} 0.00109465 0.52444   
[7]  {4710030346097}               => {4710030346059} 0.00121525 0.58222   
[8]  {4719859015061}               => {4713080626225} 0.00105754 0.50667   
[9]  {4713080626225}               => {4719859015061} 0.00105754 0.50442   
[10] {4711524000617}               => {4711524000419} 0.00094622 0.54545   
[11] {4710030346103}               => {4710030346059} 0.00155848 0.65370   
[12] {4710030346059}               => {4710030346103} 0.00155848 0.51220   
[13] {4710008241119}               => {4710008241218} 0.00094622 0.52041   
[14] {4711524000457}               => {4711524000419} 0.00095550 0.50490   
[15] {4711524000457}               => {4711524000396} 0.00101116 0.53431   
[16] {0076150430530}               => {0076150215281} 0.00108537 0.51770   
[17] {4719090701051}               => {4719090790000} 0.00163270 0.57705   
[18] {4711524000471}               => {4711524000396} 0.00108537 0.51092   
[19] {4711524000495}               => {4711524000396} 0.00115031 0.50000   
[20] {4711524000419}               => {4711524000396} 0.00153065 0.67901   
[21] {4710321861209}               => {4710321861186} 0.00207798 0.60870   
[22] {4710321871260}               => {4710321861186} 0.00185534 0.54054   
[23] {4711524000907}               => {4711524000891} 0.00143789 0.56364   
[24] {4711524000907}               => {4711524001041} 0.00166053 0.65091   
[25] {0719859796124}               => {0719859796117} 0.00232845 0.69529   
[26] {4719090790017}               => {4719090790000} 0.00325612 0.80876   
[27] {4719090790000}               => {4719090790017} 0.00325612 0.62124   
[28] {4711524000891}               => {4711524001041} 0.00195738 0.59943   
[29] {4711524001041}               => {4711524000891} 0.00195738 0.52750   
[30] {4719090701051,4719090790017} => {4719090790000} 0.00102971 0.84733   
[31] {4719090701051,4719090790000} => {4719090790017} 0.00102971 0.63068   
[32] {4710321861209,4710321871260} => {4710321861186} 0.00102044 0.63584   
[33] {4710321861186,4710321871260} => {4710321861209} 0.00102044 0.55000   
[34] {4711524000891,4711524000907} => {4711524001041} 0.00110393 0.76774   
[35] {4711524000907,4711524001041} => {4711524000891} 0.00110393 0.66480   
[36] {4711524000891,4711524001041} => {4711524000907} 0.00110393 0.56398   
     coverage  lift   count
[1]  0.0017162 264.15 102  
[2]  0.0017162 272.07 120  
[3]  0.0017162 202.52 114  
[4]  0.0025511 209.06 152  
[5]  0.0026439 209.06 152  
[6]  0.0020873 219.97 118  
[7]  0.0020873 191.35 131  
[8]  0.0020873 241.67 114  
[9]  0.0020965 241.67 114  
[10] 0.0017347 241.97 102  
[11] 0.0023841 214.84 168  
[12] 0.0030428 214.84 168  
[13] 0.0018182 177.53 102  
[14] 0.0018924 223.98 103  
[15] 0.0018924 155.67 109  
[16] 0.0020965 169.11 117  
[17] 0.0028294 110.10 176  
[18] 0.0021244 148.85 117  
[19] 0.0023006 145.67 124  
[20] 0.0022542 197.83 165  
[21] 0.0034138 124.51 224  
[22] 0.0034324 110.57 200  
[23] 0.0025511 172.61 155  
[24] 0.0025511 175.42 179  
[25] 0.0033489 132.89 251  
[26] 0.0040261 154.30 351  
[27] 0.0052413 154.30 351  
[28] 0.0032654 161.54 211  
[29] 0.0037107 161.54 211  
[30] 0.0012152 161.66 111  
[31] 0.0016327 156.65 111  
[32] 0.0016049 130.06 110  
[33] 0.0018553 161.11 110  
[34] 0.0014379 206.90 119  
[35] 0.0016605 203.59 119  
[36] 0.0019574 221.07 119  

☝️ The Association Rules …
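As a follow-up, a minimal sketch (using the rule set rx selected above): rank the rules by lift and inspect the strongest few with the standard arules sort() and inspect() functions.

inspect(head(sort(rx, by="lift"), 10))   # the ten rules with the highest lift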