🌷 Most of the information in POS dataset is carried in the product column.

rm(list=ls(all=TRUE))
pacman::p_load(ggplot2,dplyr,heatmaply,arules)
load("data/tf0.rdata")

Contribution of Products and Categories

Z0 %>% summarise_at(vars(cust,prod,cat, tid),n_distinct)
   cust  prod  cat    tid
1 32256 23789 2007 119422
Product Categories Analysis

Identify top-100 categories that contributes to the largest profit

col4 = c('seagreen','gold','orange','red')
gg= group_by(Z0, cat) %>% summarise(
  solds = n(), qty = sum(qty), rev = sum(price), cost = sum(cost), 
  profit = rev - cost, margin = 100*profit/rev
  ) %>% 
  top_n(100, profit) %>% 
  ggplot(aes(x=profit, y=rev, col=margin, label=cat)) + 
  geom_point(size=1.5,alpha=0.8) + 
  scale_y_log10() + scale_x_log10() +
  scale_color_gradientn(colors=col4) +
  theme_bw()
ggplotly(gg)

🚴 Practice
Modifying the above code chuck to visualize

  • the top-100 categories that contributes to the largest revenue
  • filter out the categories with negative margin
  • The larger revenue, the larger profit; Is that always true?



Product Items Anlysis

🚴 More Practices
Try to do the contribution analysis on the top 300 product items …

Warning in geom_text(aes(x = 10000, y = 1000000, label = paste("top-300", : All aesthetics have length 1, but the data has 300 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
  a single row.
Warning in geom_text(aes(x = 10000, y = 1000000, label = paste("top-300", : All aesthetics have length 1, but the data has 276 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
  a single row.



Customer Segments by Buying Pattern

Top50 (Penetrating) Product Categories - cm

M=50
cm = Z0 %>% group_by(cat) %>% 
  summarise(r = n_distinct(cust)/nrow(A0)) %>%   # penetration rate
  arrange(desc(r)) %>% pull(cat) %>% head(M)

Customer-Category Matrix - x

x = xtabs(~cust+cat, filter(Z0, cat%in%cm)) %>% as.data.frame.matrix
x = x[,order(-colSums(x))]
dim(x)
[1] 29474    50

K-means Clustering - K=160

K=160
set.seed(1111)
kg = kmeans(x, K, iter.max=30)$cluster 
table(kg) %>% sort  
kg
  25   35   38    6   49   75   62  129   26   33   43    1   32   89  124   73 
   1    1    1    3    4    4    5    5    7    7    7    8   12   13   14   15 
  88  102   27   50   93  125    7  105  138   96  146   92   94   36  128  149 
  15   15   17   17   17   17   20   21   22   24   24   26   26   27   27   27 
  24    9  117   23   70  107  130  103  110    2  150  152  122   11   57   76 
  29   31   31   33   33   33   33   34   34   35   35   35   36   37   37   37 
  58  160   61   80   97   40   68   46   64   79   84   98  148  147  123  154 
  38   38   41   41   41   42   42   47   49   49   50   52   53   54   56   57 
  81  112  156   85  109   71   20   90   48   29   31   44  155   42  120  126 
  61   61   64   66   69   72   74   76   78   83   84   86   86   87   89   92 
  82   39   52   37  131   56  100  132   34  108   10    8  143  101   15  157 
  94   95   99  108  117  118  118  118  119  119  120  122  122  125  127  127 
  51  137   17   65    3   77   16  142   19  104   66  141   95   21  111  134 
 128  133  134  134  140  140  142  143  146  149  152  155  156  158  164  168 
  22  121   41   28  127  139  115  153  151  136   99  159   13   55   53   45 
 173  174  181  190  202  210  213  214  218  220  244  268  285  289  294  307 
  86   78  140  145  106   30   67  116    4  133  144   63   54   69  158   12 
 312  328  335  366  376  377  406  409  412  422  425  430  444  467  476  483 
  60   72   83    5   59   47  135  113  114   14  119  118   74   91   87   18 
 490  503  521  522  535  553  565  616  644  690  735  746  752  794  814 2684 
# sort by the sizes of the groups 

Rather tan plotting all of the groups, we only plot the groups of appropriate size.

ckg = tibble(cust=rownames(x),kg=kg,by="cust")
gdf = inner_join(A0, ckg) %>% 
  group_by(kg) %>% summarise(
  gsize = n(), ttRev = sum(rev), ttProfit = sum(raw),
  avRev = mean(rev), avProfit = mean(raw),
  avRecent = mean(r), avFreq = mean(f), avMoney = mean(m)
  )  
filter(gdf, gsize >= 200, gsize <= 1000) %>% 
  ggplot(aes(avMoney,avFreq,col=ttProfit,size=gsize,label=kg)) + 
  geom_point(alpha=0.6) +
  scale_color_gradientn(colors=c("seagreen","gold","tomato")) +
  theme_bw() -> g
ggplotly(g)

Besides the RFM, we also know the product preference of each groups specifically

# choose the groups of appropriate size
g = filter(gdf, gsize >= 200, gsize <= 800) %>% pull(kg)

# calculate the group means
a = sapply(split(x[kg %in% g,1:30], kg[kg %in% g]), colMeans)  

# define color and helper function
color9 = c("darkblue","green","gold","orange",rep("red",5))
hmap1 = function(x, ...) { heatmaply(
  as.data.frame.matrix(x), cexRow=0.7, cexCol=0.7, 
  grid_color='gray70', ...)
} 

# generate the heatmap
hmap1(a, col=color9, show_dendrogram=c(F,F))



Shopping Cart Analysis

It is very easy if you know how to use the arules package …

# p2k = count(Z0, prod, sort=T) %>% pull(prod) %>% head(2000)
# Z = filter(Z0, prod %in% p2k)
# tr = as(split(Z[,"prod"], Z[,"tid"]), "transactions"); tr
# rules <- apriori(tr, parameter=list(supp=0.0002, conf=0.5))
# summary(rules)
# rx = subset(rules, subset = lift > 100 & count > 100) 
# inspect(rx) # %>% View

☝️ The Association Rules …