rm(list=ls(all=TRUE))
::p_load(Matrix, vcd, magrittr, readr, caTools, ggplot2, dplyr)
pacmanload("data/tf0.rdata")
sapply(list(cust=A0,tid=X0,items=Z0), nrow)
## cust tid items
## 32241 119328 817182
par(mfrow=c(1,2),cex=0.7)
table(A0$age) %>% barplot(las=2,main="Age Groups")
table(A0$area) %>% barplot(las=2,main="Areas")
Mosaic plot is a tool to visualize Association between Categorical Variables.
table(A0$area, A0$age)
##
## a24 a29 a34 a39 a44 a49 a54 a59 a64 a69 a99
## z105 27 62 196 188 168 109 89 45 22 54 12
## z106 21 64 172 212 138 114 75 39 25 58 15
## z110 134 350 643 778 648 453 284 116 98 149 30
## z114 90 188 405 540 392 279 125 50 40 36 21
## z115 689 1104 1697 1853 1664 1389 844 428 401 614 117
## z221 444 934 1751 1902 1311 861 526 265 205 288 52
## zOthers 174 400 860 885 567 378 156 88 52 94 41
## zUnknown 50 91 191 222 192 136 84 43 36 70 338
# define a helper function for default format
= function(formula, data) mosaic(formula, data, shade=T,
MOSA margins=c(0,1,0,0), labeling_args = list(rot_labels=c(90,0,0,0)),
gp_labels=gpar(fontsize=9), legend_args=list(fontsize=9),
gp_text=gpar(fontsize=7),labeling=labeling_residuals)
# an example
MOSA(~area+age, A0)
💡 Major Finding:
※ High
association between age
and area
    §
z115
has fewer a34
& a39
than
expected
    §z221
and zOthers
has more
a34
& a39
%>% group_by(age) %>% summarise(
A0 Group.Size = n(), # group size
avg.Freq = mean(f), # average frequency
avg.Revenue = sum(f*m)/sum(f) # average revenue per transaction
%>%
) ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
geom_point(aes(col=age, size=Group.Size), alpha=0.5) +
geom_text(aes(label=age)) +
scale_size(range=c(5,25)) +
theme_bw() + theme(legend.position="none") +
ggtitle("Characteristics of Age Groups (group size)") +
ylab("average frequency") + xlab("average revenue per transaction")
mean(A0$age == "a99")
## [1] 0.01941627
Filtering out the small and unique group (a99
) helps to
compare the major
%>% filter(age!="a99") %>% # filter out 'a99'
A0 group_by(age) %>% summarise(
Group.Size = n(), #
avg.Freq = mean(f), #
avg.Revenue = sum(f*m)/sum(f) #
%>%
) ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
geom_point(aes(col=age, size=Group.Size), alpha=0.5) +
geom_text(aes(label=age)) +
scale_size(range=c(5,25)) +
theme_bw() + theme(legend.position="none") +
ggtitle("Characteristics of Age Groups (group size)") +
ylab("average frequency") + xlab("average revenue per transaction")
%>% filter(age!="a99") %>% #
A0 group_by(area) %>% summarise(
Group.Size = n(), #
avg.Freq = mean(f), #
avg.Revenue = sum(f*m)/sum(f) #
%>%
) ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
geom_point(aes(col=area, size=Group.Size), alpha=0.5) +
geom_text(aes(label=area)) +
scale_size(range=c(5,25)) +
theme_bw() + theme(legend.position="none") +
ggtitle("Characteristics of Area Groups (group size)") +
ylab("average frequency") + xlab("average revenue per transaction")
💡 Major Finding:
※ Frequency
is negatively corellated with Average Revenue
    § Near customers
buy frequently but less
    § a34
and a39
buy less frequently but buy more each time.
= Z0 %>% group_by(cat) %>% summarise( # group by categories
cats noProd = n_distinct(prod),
totalQty = sum(qty),
totalRev = sum(price),
totalGross = sum(price) - sum(cost),
grossMargin = totalGross/totalRev,
avgPrice = totalRev/totalQty
)
par(mfrow=c(2,1), cex=0.7)
$totalRev %>% sort(dec=T) %>% {cumsum(.)[1:40]/sum(.)} %>%
catsbarplot(names=1:40,las=2,main="acc. percentage of reveune")
abline(h=seq(0,1,0.1),col='green')
$totalRev %>% sort(dec=T) %>% {cumsum(.)[1:40]/sum(.)} %>%
catsbarplot(names=1:40,las=2,main="acc. percentage of gross profit")
abline(h=seq(0,1,0.1),col='green')
+ The best selling 10 categories contribute ~20% of revenue + The most profitable 10 categories contribute ~20% of profit + Are the best selling categories the same as the most profitable’s?
= tapply(Z0$qty,Z0$cat,sum) %>% sort %>% tail(20) %>% names top20
MOSA(~cat+age, Z0[Z0$cat %in% top20,])
MOSA(~cat+area, Z0[Z0$cat %in% top20,])
$wday = format(X0$date, "%u")
X0par(cex=0.7, mar=c(2,3,2,1))
table(X0$wday) %>% barplot(main="No. Transactions in Week Days")
##### Age Groups vs. Weekdays
MOSA(~wday+age, X0)
##### Categories vs. Weekdays
= Z0 %>% filter(cat %in% top20) %>% mutate(wday = format(date, '%u'))
df MOSA(~cat+wday, df)