# Start from an empty workspace, attach the packages used in this report
# (pacman::p_load installs any that are missing), then load the data file.
rm(list=ls(all=TRUE))
pacman::p_load(Matrix, vcd, magrittr, readr, caTools, ggplot2, dplyr)
# The .rdata file is expected to provide the three tables A0, X0 and Z0.
load("data/tf0.rdata")
# Row counts of the three tables, labelled cust / tid / items.
sapply(list(cust=A0,tid=X0,items=Z0), nrow)
## cust tid items
## 32241 119328 817182
# Two panels side by side, with text scaled down to 70%.
par(mfrow=c(1,2), cex=0.7)
# Bar charts of the number of A0 rows per age group and per area;
# las=2 draws the axis labels perpendicular to the axis.
barplot(table(A0$age), las=2, main="Age Groups")
barplot(table(A0$area), las=2, main="Areas")
A mosaic plot is a tool for visualizing the association between categorical variables.
# Helper that draws a vcd mosaic plot with this report's default format.
#   formula: one-sided formula naming the categorical variables, e.g. ~area+age
#   data:    the data set holding those variables
# Cells are shaded (shade=TRUE) and labelled via labeling_residuals;
# small font sizes keep labels, legend and cell text readable.
MOSA = function(formula, data) mosaic(formula, data, shade=TRUE,
  margins=c(0,1,0,0), labeling_args = list(rot_labels=c(90,0,0,0)),
  gp_labels=gpar(fontsize=9), legend_args=list(fontsize=9),
  gp_text=gpar(fontsize=7), labeling=labeling_residuals)

# an example
MOSA(~area+age, A0)
💡 Major Finding:
※ High association between `age` and `area`
    § `z115` has fewer `a34` & `a39` than expected
    § `z221` and `zOthers` have more `a34` & `a39`
# Summarise A0 by age group, then plot average frequency against average
# revenue per transaction; point size encodes the group size.
A0 %>% group_by(age) %>% summarise(
  Group.Size = n(),              # group size
  avg.Freq = mean(f),            # average frequency
  avg.Revenue = sum(f*m)/sum(f)  # average revenue per transaction
  ) %>%
  ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
  geom_point(aes(col=age, size=Group.Size), alpha=0.5) +
  geom_text(aes(label=age)) +
  scale_size(range=c(5,25)) +
  theme_bw() + theme(legend.position="none") +
  ggtitle("Characteristics of Age Groups (group size)") +
  ylab("average frequency") + xlab("average revenue per transaction")
# Proportion of A0 rows whose age group is "a99".
with(A0, mean(age == "a99"))
## [1] 0.01941627
Filtering out the small and unique group (`a99`) helps to compare the major groups.
# Same age-group plot as before, but with the "a99" group removed so the
# remaining groups are easier to compare.
A0 %>% filter(age!="a99") %>%    # filter out 'a99'
  group_by(age) %>% summarise(
  Group.Size = n(),              # group size
  avg.Freq = mean(f),            # average frequency
  avg.Revenue = sum(f*m)/sum(f)  # average revenue per transaction
  ) %>%
  ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
  geom_point(aes(col=age, size=Group.Size), alpha=0.5) +
  geom_text(aes(label=age)) +
  scale_size(range=c(5,25)) +
  theme_bw() + theme(legend.position="none") +
  ggtitle("Characteristics of Age Groups (group size)") +
  ylab("average frequency") + xlab("average revenue per transaction")
# Summarise A0 by area (still excluding the "a99" age group), then plot
# average frequency against average revenue per transaction; point size
# encodes the group size.
A0 %>% filter(age!="a99") %>%    # filter out 'a99'
  group_by(area) %>% summarise(
  Group.Size = n(),              # group size
  avg.Freq = mean(f),            # average frequency
  avg.Revenue = sum(f*m)/sum(f)  # average revenue per transaction
  ) %>%
  ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
  geom_point(aes(col=area, size=Group.Size), alpha=0.5) +
  geom_text(aes(label=area)) +
  scale_size(range=c(5,25)) +
  theme_bw() + theme(legend.position="none") +
  ggtitle("Characteristics of Area Groups (group size)") +
  ylab("average frequency") + xlab("average revenue per transaction")
💡 Major Finding:
※ Frequency is negatively correlated with Average Revenue
    § Nearby customers buy frequently but buy less each time
    § `a34` and `a39` buy less frequently but buy more each time.
# Per-category summary of the item table Z0.
cats = Z0 %>% group_by(cat) %>% summarise(   # group by categories
  noProd = n_distinct(prod),                 # number of distinct products
  totalQty = sum(qty),                       # total quantity
  totalRev = sum(price),                     # total revenue
  totalGross = sum(price) - sum(cost),       # total gross profit
  grossMargin = totalGross/totalRev,         # gross profit / revenue
  avgPrice = totalRev/totalQty               # revenue per unit
  )
# Two stacked panels: cumulative share of the 40 largest categories,
# first by revenue, then by gross profit.
par(mfrow=c(2,1), cex=0.7)
# Sort descending, accumulate, and divide by the grand total; keep the top 40.
cats$totalRev %>% sort(decreasing=TRUE) %>% {cumsum(.)[1:40]/sum(.)} %>%
  barplot(names.arg=1:40, las=2, main="acc. percentage of revenue")
abline(h=seq(0,1,0.1), col='green')   # reference lines every 10%
# The gross-profit panel must use totalGross (not totalRev) to match its title.
cats$totalGross %>% sort(decreasing=TRUE) %>% {cumsum(.)[1:40]/sum(.)} %>%
  barplot(names.arg=1:40, las=2, main="acc. percentage of gross profit")
abline(h=seq(0,1,0.1), col='green')   # reference lines every 10%
+ The best-selling 10 categories contribute ~20% of revenue
+ The most profitable 10 categories contribute ~20% of profit
+ Are the best-selling categories the same as the most profitable ones?
# Names of the 20 categories with the largest total quantity sold
# (ascending sort, so tail(20) holds the largest totals).
top20 = tapply(Z0$qty, Z0$cat, sum) %>% sort %>% tail(20) %>% names
# Mosaic plots of those categories against age group and against area.
MOSA(~cat+age, Z0[Z0$cat %in% top20,])
MOSA(~cat+area, Z0[Z0$cat %in% top20,])
# Add a weekday column to X0; format "%u" gives the weekday as a
# number 1-7 with Monday as 1.
X0$wday = format(X0$date, "%u")
par(cex=0.7, mar=c(2,3,2,1))
# Number of X0 rows (transactions) per weekday.
table(X0$wday) %>% barplot(main="No. Transactions in Week Days")
##### Age Groups vs. Weekdays
MOSA(~wday+age, X0)
##### Categories vs. Weekdays
# Keep only items in the top-20 categories and derive their weekday
# (1-7, Monday = 1) from the date column.
df = Z0 %>% filter(cat %in% top20) %>% mutate(wday = format(date, '%u'))
MOSA(~cat+wday, df)