🌷
cust prod cat tid
1 32256 23789 2007 119422
Identify top-100 categories that contributes to the largest profit
col4 = c('seagreen','gold','orange','red')
gg= group_by(Z0, cat) %>% summarise(
solds = n(), qty = sum(qty), rev = sum(price), cost = sum(cost),
profit = rev - cost, margin = 100*profit/rev
) %>%
top_n(100, profit) %>%
ggplot(aes(x=profit, y=rev, col=margin, label=cat)) +
geom_point(size=1.5,alpha=0.8) +
scale_y_log10() + scale_x_log10() +
scale_color_gradientn(colors=col4) +
theme_bw()
ggplotly(gg)
🚴 Practice
Modifying the above code chuck to visualize
🚴 More Practices
Try to do the contribution analysis on the top 300 product items …
Warning in geom_text(aes(x = 10000, y = 1000000, label = paste("top-300", : All aesthetics have length 1, but the data has 300 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
a single row.
Warning in geom_text(aes(x = 10000, y = 1000000, label = paste("top-300", : All aesthetics have length 1, but the data has 276 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
a single row.
Top50 (Penetrating) Product Categories - cm
M=50
cm = Z0 %>% group_by(cat) %>%
summarise(r = n_distinct(cust)/nrow(A0)) %>% # penetration rate
arrange(desc(r)) %>% pull(cat) %>% head(M)
Customer-Category Matrix - x
x = xtabs(~cust+cat, filter(Z0, cat%in%cm)) %>% as.data.frame.matrix
x = x[,order(-colSums(x))]
dim(x)
[1] 29474 50
K-means Clustering - K=160
kg
25 35 38 6 49 75 62 129 26 33 43 1 32 89 124 73
1 1 1 3 4 4 5 5 7 7 7 8 12 13 14 15
88 102 27 50 93 125 7 105 138 96 146 92 94 36 128 149
15 15 17 17 17 17 20 21 22 24 24 26 26 27 27 27
24 9 117 23 70 107 130 103 110 2 150 152 122 11 57 76
29 31 31 33 33 33 33 34 34 35 35 35 36 37 37 37
58 160 61 80 97 40 68 46 64 79 84 98 148 147 123 154
38 38 41 41 41 42 42 47 49 49 50 52 53 54 56 57
81 112 156 85 109 71 20 90 48 29 31 44 155 42 120 126
61 61 64 66 69 72 74 76 78 83 84 86 86 87 89 92
82 39 52 37 131 56 100 132 34 108 10 8 143 101 15 157
94 95 99 108 117 118 118 118 119 119 120 122 122 125 127 127
51 137 17 65 3 77 16 142 19 104 66 141 95 21 111 134
128 133 134 134 140 140 142 143 146 149 152 155 156 158 164 168
22 121 41 28 127 139 115 153 151 136 99 159 13 55 53 45
173 174 181 190 202 210 213 214 218 220 244 268 285 289 294 307
86 78 140 145 106 30 67 116 4 133 144 63 54 69 158 12
312 328 335 366 376 377 406 409 412 422 425 430 444 467 476 483
60 72 83 5 59 47 135 113 114 14 119 118 74 91 87 18
490 503 521 522 535 553 565 616 644 690 735 746 752 794 814 2684
Rather tan plotting all of the groups, we only plot the groups of appropriate size.
ckg = tibble(cust=rownames(x),kg=kg,by="cust")
gdf = inner_join(A0, ckg) %>%
group_by(kg) %>% summarise(
gsize = n(), ttRev = sum(rev), ttProfit = sum(raw),
avRev = mean(rev), avProfit = mean(raw),
avRecent = mean(r), avFreq = mean(f), avMoney = mean(m)
)
filter(gdf, gsize >= 200, gsize <= 1000) %>%
ggplot(aes(avMoney,avFreq,col=ttProfit,size=gsize,label=kg)) +
geom_point(alpha=0.6) +
scale_color_gradientn(colors=c("seagreen","gold","tomato")) +
theme_bw() -> g
ggplotly(g)
Besides the RFM, we also know the product preference of each groups specifically
# choose the groups of appropriate size
g = filter(gdf, gsize >= 200, gsize <= 800) %>% pull(kg)
# calculate the group means
a = sapply(split(x[kg %in% g,1:30], kg[kg %in% g]), colMeans)
# define color and helper function
color9 = c("darkblue","green","gold","orange",rep("red",5))
hmap1 = function(x, ...) { heatmaply(
as.data.frame.matrix(x), cexRow=0.7, cexCol=0.7,
grid_color='gray70', ...)
}
# generate the heatmap
hmap1(a, col=color9, show_dendrogram=c(F,F))
It is very easy if you know how to use the arules
package …
# p2k = count(Z0, prod, sort=T) %>% pull(prod) %>% head(2000)
# Z = filter(Z0, prod %in% p2k)
# tr = as(split(Z[,"prod"], Z[,"tid"]), "transactions"); tr
# rules <- apriori(tr, parameter=list(supp=0.0002, conf=0.5))
# summary(rules)
☝️ The Association Rules …
lhs
item(s) is in a cartrhs
item(s) tend to be also in that cartconfidence
:= the probability that the rhs
are also in cartsupport
:= the ratio that this rule actually
occurredcount
:= the number of occurrences of this rule