rm(list=ls(all=TRUE))
pacman::p_load(Matrix, vcd, magrittr, readr, caTools, ggplot2, dplyr)
load("data/tf0.rdata")
sapply(list(cust=A0,tid=X0,items=Z0), nrow)
##   cust    tid  items 
##  32241 119328 817182


Age & Zip Codes

par(mfrow=c(1,2),cex=0.7)
table(A0$age) %>% barplot(las=2,main="Age Groups")
table(A0$area) %>% barplot(las=2,main="Areas")

Fig-2: Zip Codes


Subgroup Associations

Mosaic plot is a tool to visualize Association between Categorical Variables.

  • Size of the mosaics represent the counts
  • Red(Blue) color represents the count is signifacantly smaller (larger) than the expected value
  • Expected value is the product of marginal probabilities
  • p-value of the Chi-square analysis is shown in the lower right
table(A0$area, A0$age)
##           
##             a24  a29  a34  a39  a44  a49  a54  a59  a64  a69  a99
##   z105       27   62  196  188  168  109   89   45   22   54   12
##   z106       21   64  172  212  138  114   75   39   25   58   15
##   z110      134  350  643  778  648  453  284  116   98  149   30
##   z114       90  188  405  540  392  279  125   50   40   36   21
##   z115      689 1104 1697 1853 1664 1389  844  428  401  614  117
##   z221      444  934 1751 1902 1311  861  526  265  205  288   52
##   zOthers   174  400  860  885  567  378  156   88   52   94   41
##   zUnknown   50   91  191  222  192  136   84   43   36   70  338
# define a helper function for default format
MOSA = function(formula, data) mosaic(formula, data, shade=T, 
  margins=c(0,1,0,0), labeling_args = list(rot_labels=c(90,0,0,0)),
  gp_labels=gpar(fontsize=9), legend_args=list(fontsize=9),
  gp_text=gpar(fontsize=7),labeling=labeling_residuals)

# an example
MOSA(~area+age, A0)

💡 Major Finding:
※ High association between age and area
    § z115 has fewer a34 & a39 than expected
    §z221 and zOthers has more a34 & a39



Bubble Chart

Characteristics of Age Groups
A0 %>% group_by(age) %>% summarise(
  Group.Size = n(),              # group size
  avg.Freq = mean(f),            # average frequency
  avg.Revenue = sum(f*m)/sum(f)  # average revenue per transaction
  ) %>% 
  ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
  geom_point(aes(col=age, size=Group.Size), alpha=0.5) +
  geom_text(aes(label=age)) +
  scale_size(range=c(5,25)) +
  theme_bw() + theme(legend.position="none") +
  ggtitle("Characteristics of Age Groups (group size)") + 
  ylab("average frequency") + xlab("average revenue per transaction")

mean(A0$age == "a99")
## [1] 0.01941627

Filtering out the small and unique group (a99) helps to compare the major

A0 %>% filter(age!="a99") %>%    # filter out 'a99'
  group_by(age) %>% summarise(
  Group.Size = n(),              # 
  avg.Freq = mean(f),            # 
  avg.Revenue = sum(f*m)/sum(f)  # 
  ) %>% 
  ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
  geom_point(aes(col=age, size=Group.Size), alpha=0.5) +
  geom_text(aes(label=age)) +
  scale_size(range=c(5,25)) +
  theme_bw() + theme(legend.position="none") +
  ggtitle("Characteristics of Age Groups (group size)") + 
  ylab("average frequency") + xlab("average revenue per transaction")

Characteristics of Area Groups
A0 %>% filter(age!="a99") %>%    # 
  group_by(area) %>% summarise(
  Group.Size = n(),              # 
  avg.Freq = mean(f),            # 
  avg.Revenue = sum(f*m)/sum(f)  # 
  ) %>% 
  ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
  geom_point(aes(col=area, size=Group.Size), alpha=0.5) +
  geom_text(aes(label=area)) +
  scale_size(range=c(5,25)) +
  theme_bw() + theme(legend.position="none") +
  ggtitle("Characteristics of Area Groups (group size)") + 
  ylab("average frequency") + xlab("average revenue per transaction")

💡 Major Finding:
※ Frequency is negatively corellated with Average Revenue
    § Near customers buy frequently but less
    § a34 and a39 buy less frequently but buy more each time.



Product Information

cats = Z0 %>% group_by(cat) %>% summarise(  # group by categories
  noProd = n_distinct(prod),
  totalQty = sum(qty),
  totalRev = sum(price),
  totalGross = sum(price) - sum(cost),
  grossMargin = totalGross/totalRev,
  avgPrice = totalRev/totalQty
  )
Revenue and Profit Contributions per Category
par(mfrow=c(2,1), cex=0.7)
cats$totalRev %>% sort(dec=T) %>% {cumsum(.)[1:40]/sum(.)} %>% 
  barplot(names=1:40,las=2,main="acc. percentage of reveune")
abline(h=seq(0,1,0.1),col='green')

cats$totalRev %>% sort(dec=T) %>% {cumsum(.)[1:40]/sum(.)} %>% 
  barplot(names=1:40,las=2,main="acc. percentage of gross profit")
abline(h=seq(0,1,0.1),col='green')

+ The best selling 10 categories contribute ~20% of revenue + The most profitable 10 categories contribute ~20% of profit + Are the best selling categories the same as the most profitable’s?


Association among Age, Area and Categories
top20 = tapply(Z0$qty,Z0$cat,sum) %>% sort %>% tail(20) %>% names
MOSA(~cat+age, Z0[Z0$cat %in% top20,])

MOSA(~cat+area, Z0[Z0$cat %in% top20,])

Different Age or Area Groups buy different Categories !

Weekend vs. Weekdays

X0$wday = format(X0$date, "%u")
par(cex=0.7, mar=c(2,3,2,1))
table(X0$wday) %>% barplot(main="No. Transactions in Week Days")

##### Age Groups vs. Weekdays

MOSA(~wday+age, X0)

##### Categories vs. Weekdays

df = Z0 %>% filter(cat %in% top20) %>% mutate(wday = format(date, '%u'))
MOSA(~cat+wday, df)