TF4_Product_Information

🌷 Most of the information in POS dataset is carried in the product column.

rm(list=ls(all=TRUE))
pacman::p_load(ggplot2,dplyr,heatmaply,arules)
load("data/tf0.rdata")

Contribution of Products and Categories

# count the number of distinct customers, products, categories, transactions
Z0 %>% summarise_at(vars(cust,prod,cat,tid),n_distinct)

   cust  prod  cat    tid
1 32256 23789 2007 119422

Product Categories Analysis

Identify top-100 categories that contributes to the largest profit

col4 = c('seagreen','gold','orange','red')
gg= group_by(Z0, cat) %>% summarise(
  solds = n(), qty = sum(qty), rev = sum(price), cost = sum(cost), 
  profit = rev - cost, margin = 100*profit/rev
  ) %>% 
  top_n(100, profit) %>% 
  ggplot(aes(x=profit, y=rev, col=margin, label=cat)) + 
  geom_point(size=1.5,alpha=0.8) + 
  scale_y_log10() + scale_x_log10() +
  scale_color_gradientn(colors=col4) +
  theme_bw()
ggplotly(gg)

🚴 Practice
Modifying the above code chuck to visualize

the top-100 categories that contributes to the largest revenue
filter out the categories with negative margin
Is it true that “The larger the revenue, the larger the profit.”?

Product Items Anlysis

🚴 More Practices
Try to do the contribution analysis on the top 300 product items …

Customer Segments by Buying Pattern

Top50 (Penetrating) Product Categories - cm

M=50
cm = Z0 %>% group_by(cat) %>% 
  summarise(r = n_distinct(cust)/nrow(A0)) %>%   # penetration rate
  arrange(desc(r)) %>% pull(cat) %>% head(M)

Customer-Category Matrix: x, the number of times a customer by a category

x = xtabs(~cust+cat, filter(Z0, cat%in%cm)) %>%  
  as.data.frame.matrix       # build the cust-cat matrix
x = x[,order(-colSums(x))]   # order the categories by the no. buys
dim(x)

[1] 29474    50

K-means Clustering (K=160 groups)
Clustering: group subject (customer) by similarity (buying pattern)
We’ll discuss more about clustering methods next week.

K=160
set.seed(1111)
kg = kmeans(x, K, iter.max=30)$cluster 
table(kg) %>% sort

kg
  25   35   38    6   49   75   62  129   26   33   43    1   32   89  124   73 
   1    1    1    3    4    4    5    5    7    7    7    8   12   13   14   15 
  88  102   27   50   93  125    7  105  138   96  146   92   94   36  128  149 
  15   15   17   17   17   17   20   21   22   24   24   26   26   27   27   27 
  24    9  117   23   70  107  130  103  110    2  150  152  122   11   57   76 
  29   31   31   33   33   33   33   34   34   35   35   35   36   37   37   37 
  58  160   61   80   97   40   68   46   64   79   84   98  148  147  123  154 
  38   38   41   41   41   42   42   47   49   49   50   52   53   54   56   57 
  81  112  156   85  109   71   20   90   48   29   31   44  155   42  120  126 
  61   61   64   66   69   72   74   76   78   83   84   86   86   87   89   92 
  82   39   52   37  131   56  100  132   34  108   10    8  143  101   15  157 
  94   95   99  108  117  118  118  118  119  119  120  122  122  125  127  127 
  51  137   17   65    3   77   16  142   19  104   66  141   95   21  111  134 
 128  133  134  134  140  140  142  143  146  149  152  155  156  158  164  168 
  22  121   41   28  127  139  115  153  151  136   99  159   13   55   53   45 
 173  174  181  190  202  210  213  214  218  220  244  268  285  289  294  307 
  86   78  140  145  106   30   67  116    4  133  144   63   54   69  158   12 
 312  328  335  366  376  377  406  409  412  422  425  430  444  467  476  483 
  60   72   83    5   59   47  135  113  114   14  119  118   74   91   87   18 
 490  503  521  522  535  553  565  616  644  690  735  746  752  794  814 2684

# calculate sizes of the groups and sort by the sizes

Rather tan plotting all of the groups, we only plot the groups of appropriate size.

ckg = tibble(cust=rownames(x),kg=kg,by="cust")
gdf = inner_join(A0, ckg) %>% 
  group_by(kg) %>% summarise(
  gsize = n(), ttRev = sum(rev), ttProfit = sum(raw),
  avRev = mean(rev), avProfit = mean(raw),
  avRecent = mean(r), avFreq = mean(f), avMoney = mean(m)
  )  

# Usually we're interested in the groups ofmedium size
filter(gdf, gsize >= 200, gsize <= 1000) %>% 
  ggplot(aes(avMoney,avFreq,col=ttProfit,size=gsize,label=kg)) + 
  geom_point(alpha=0.6) +
  scale_color_gradientn(colors=c("seagreen","gold","tomato")) +
  theme_bw() -> g
ggplotly(g)

Besides the RFM, we also know the product preference of each groups specifically

# choose the groups of appropriate size
g = filter(gdf, gsize >= 200, gsize <= 800) %>% pull(kg)

# calculate the group means
a = sapply(split(x[kg %in% g,1:30], kg[kg %in% g]), colMeans)  

# define color and helper function
color9 = c("darkblue","green","gold","orange",rep("red",5))
hmap1 = function(x, ...) { heatmaply(
  as.data.frame.matrix(x), cexRow=0.7, cexCol=0.7, 
  grid_color='gray70', ...)
} 

# generate the heatmap
hmap1(a, col=color9, show_dendrogram=c(F,F))

In average …

each members of group-135(139) buy category-110401 2.35(4.63) times
but they buy category-100205 only 0.36(0.78) times

The clustering method group the customers of similar buying behaviors
together, so that we can design specific marketing schemes to meet
customers’ distinctive preferences.

Shopping Cart Analysis

According to R version compatibility, this section in temporary suspended.

It is very easy if you know how to use the arules package …

# p2k = count(Z0, prod, sort=T) %>% pull(prod) %>% head(2000)
# Z = filter(Z0, prod %in% p2k)
# tr = as(split(Z[,"prod"], Z[,"tid"]), "transactions"); tr
# rules <- apriori(tr, parameter=list(supp=0.0002, conf=0.5))
# summary(rules)

# rx = subset(rules, subset = lift > 100 & count > 100) 
# inspect(rx) # %>% View

☝️ The Association Rules …

if the lhs item(s) is in a cart
the rhs item(s) tend to be also in that cart
confidence := the probability that the rhs are also in cart
support := the ratio that this rule actually occurred
count := the number of occurrences of this rule