Association Rules for Market Basket Analysis (R) cited & modified from: http://www.informit.com/promotions/code-files-modeling-techniques-in-predictive-analytics-141177
if(!require(arules)) install.packages("arules")
## Warning: package 'arules' was built under R version 4.0.5
if(!require(arulesViz)) install.packages("arulesViz")
## Warning: package 'arulesViz' was built under R version 4.0.5
if(!require(RColorBrewer)) install.packages("RColorBrewer")
# library(arules) # association rules
# library(arulesViz) # data visualization of association rules
# library(RColorBrewer) # color palettes for plots
data(Groceries) # grocery transactions object from arules package
dim(Groceries) # 9835 baskets x 169 items
## [1] 9835 169
inspect(Groceries[1:5])
## items
## [1] {citrus fruit,
## semi-finished bread,
## margarine,
## ready soups}
## [2] {tropical fruit,
## yogurt,
## coffee}
## [3] {whole milk}
## [4] {pip fruit,
## yogurt,
## cream cheese ,
## meat spreads}
## [5] {other vegetables,
## whole milk,
## condensed milk,
## long life bakery product}
# as(Groceries, "data.frame")
par(cex=0.8)
itemFrequencyPlot(Groceries, support = 0.025, xlim = c(0,0.3),
type = "relative", horiz = TRUE, col = "dark red", las = 1,
xlab = paste0(
"Proportion of Market Baskets Containing Item\n",
"(Item Relative Frequency or Support)"))
= itemInfo(Groceries)
df str(df) # levels 10, 55
## 'data.frame': 169 obs. of 3 variables:
## $ labels: chr "frankfurter" "sausage" "liver loaf" "ham" ...
## $ level2: Factor w/ 55 levels "baby food","bags",..: 44 44 44 44 44 44 44 42 42 41 ...
## $ level1: Factor w/ 10 levels "canned food",..: 6 6 6 6 6 6 6 6 6 6 ...
<- aggregate(Groceries, itemInfo(Groceries)[["level2"]])
groceries # dim(groceries) # 9835 baskets x 55 items
itemFrequencyPlot(groceries, support = 0.025, xlim = c(0,0.5),
type = "relative", horiz = TRUE, col = "blue", las = 1,
xlab = paste0("Proportion of Market Baskets Containing Item\n",
"(Item Relative Frequency or Support)"))
This is done by setting very low criteria for support and confidence:
<- apriori(groceries,
first.rules parameter = list(support = 0.001, confidence = 0.05))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.05 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 9
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[55 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [54 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 done [0.01s].
## writing ... [69921 rule(s)] done [0.01s].
## creating S4 object ... done [0.02s].
summary(first.rules) # yields 69,921 rules... too many
## set of 69921 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2 3 4 5 6 7 8
## 21 1205 10467 23895 22560 9888 1813 72
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 4.000 4.502 5.000 8.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.001017 Min. :0.0500 Min. :0.001017 Min. : 0.4475
## 1st Qu.:0.001118 1st Qu.:0.2110 1st Qu.:0.002440 1st Qu.: 1.8315
## Median :0.001525 Median :0.4231 Median :0.004372 Median : 2.2573
## Mean :0.002488 Mean :0.4364 Mean :0.009687 Mean : 2.5382
## 3rd Qu.:0.002339 3rd Qu.:0.6269 3rd Qu.:0.009354 3rd Qu.: 2.9662
## Max. :0.443010 Max. :1.0000 Max. :1.000000 Max. :16.1760
## count
## Min. : 10.00
## 1st Qu.: 11.00
## Median : 15.00
## Mean : 24.47
## 3rd Qu.: 23.00
## Max. :4357.00
##
## mining info:
## data ntransactions support confidence
## groceries 9835 0.001 0.05
<- apriori(groceries,
second.rules parameter = list(support = 0.025, confidence = 0.05))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.05 0.1 1 none FALSE TRUE 5 0.025 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 245
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[55 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [32 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [344 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
summary(second.rules) # yields 344 rules
## set of 344 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2 3 4
## 21 162 129 32
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 2.0 2.0 2.5 3.0 4.0
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.02542 Min. :0.05043 Min. :0.03427 Min. :0.6669
## 1st Qu.:0.03030 1st Qu.:0.18202 1st Qu.:0.07626 1st Qu.:1.2498
## Median :0.03854 Median :0.39522 Median :0.11657 Median :1.4770
## Mean :0.05276 Mean :0.37658 Mean :0.21184 Mean :1.4831
## 3rd Qu.:0.05236 3rd Qu.:0.51271 3rd Qu.:0.27300 3rd Qu.:1.7094
## Max. :0.44301 Max. :0.79841 Max. :1.00000 Max. :2.4073
## count
## Min. : 250.0
## 1st Qu.: 298.0
## Median : 379.0
## Mean : 518.9
## 3rd Qu.: 515.0
## Max. :4357.0
##
## mining info:
## data ntransactions support confidence
## groceries 9835 0.025 0.05
plot(second.rules, cex=0.8,
control=list(jitter=2, col = rev(brewer.pal(9, "RdYlGn"))),
shading = "lift")
## Warning: Unknown control parameters: cex
## Available control parameters (with default values):
## main = Scatter plot for 344 rules
## colors = c("#EE0000FF", "#EEEEEEFF")
## jitter = NA
## engine = ggplot2
## verbose = FALSE
plot(second.rules, method="grouped",
control=list(col = rev(brewer.pal(9, "RdYlGn"))))
<- subset(second.rules, subset = rhs %pin% "vegetables")
vegie.rules inspect(vegie.rules) # 41 rules
## lhs rhs support confidence coverage lift count
## [1] {} => {vegetables} 0.27300458 0.2730046 1.00000000 1.0000000 2685
## [2] {poultry} => {vegetables} 0.02897814 0.5745968 0.05043213 2.1047148 285
## [3] {pork} => {vegetables} 0.03009659 0.5220459 0.05765125 1.9122238 296
## [4] {staple foods} => {vegetables} 0.02613116 0.5160643 0.05063549 1.8903136 257
## [5] {eggs} => {vegetables} 0.03141840 0.4951923 0.06344687 1.8138608 309
## [6] {games/books/hobby} => {vegetables} 0.02785968 0.3145809 0.08856126 1.1522918 274
## [7] {long-life bakery products} => {vegetables} 0.02907982 0.3492063 0.08327402 1.2791227 286
## [8] {perfumery} => {vegetables} 0.03213015 0.4056483 0.07920691 1.4858662 316
## [9] {beef} => {vegetables} 0.04585663 0.5595533 0.08195221 2.0496116 451
## [10] {bags} => {vegetables} 0.03141840 0.3175745 0.09893238 1.1632571 309
## [11] {vinegar/oils} => {vegetables} 0.04199288 0.4666667 0.08998475 1.7093731 413
## [12] {chocolate} => {vegetables} 0.03192679 0.2934579 0.10879512 1.0749195 314
## [13] {beer} => {vegetables} 0.03406202 0.2189542 0.15556685 0.8020168 335
## [14] {frozen foods} => {vegetables} 0.04738180 0.4052174 0.11692933 1.4842879 466
## [15] {cheese} => {vegetables} 0.05531266 0.4365971 0.12669039 1.5992300 544
## [16] {sausage} => {vegetables} 0.07625826 0.4032258 0.18912049 1.4769929 750
## [17] {fruit} => {vegetables} 0.10706660 0.4297959 0.24911032 1.5743176 1053
## [18] {non-alc. drinks} => {vegetables} 0.09456024 0.2974097 0.31794611 1.0893944 930
## [19] {bread and backed goods} => {vegetables} 0.11621759 0.3363743 0.34550076 1.2321198 1143
## [20] {dairy produce} => {vegetables} 0.17041179 0.3846683 0.44300966 1.4090180 1676
## [21] {beef,
## dairy produce} => {vegetables} 0.02989324 0.6074380 0.04921200 2.2250104 294
## [22] {dairy produce,
## vinegar/oils} => {vegetables} 0.03141840 0.5355286 0.05866802 1.9616103 309
## [23] {dairy produce,
## frozen foods} => {vegetables} 0.03436706 0.5121212 0.06710727 1.8758704 338
## [24] {cheese,
## fruit} => {vegetables} 0.02674123 0.5197628 0.05144891 1.9038613 263
## [25] {bread and backed goods,
## cheese} => {vegetables} 0.02887646 0.4536741 0.06365023 1.6617821 284
## [26] {cheese,
## dairy produce} => {vegetables} 0.04219624 0.4987981 0.08459583 1.8270686 415
## [27] {fruit,
## sausage} => {vegetables} 0.03426538 0.5290424 0.06476868 1.9378517 337
## [28] {non-alc. drinks,
## sausage} => {vegetables} 0.03029995 0.4156206 0.07290290 1.5223944 298
## [29] {bread and backed goods,
## sausage} => {vegetables} 0.04382308 0.4229637 0.10360956 1.5492916 431
## [30] {dairy produce,
## sausage} => {vegetables} 0.05266904 0.4905303 0.10737163 1.7967842 518
## [31] {fruit,
## non-alc. drinks} => {vegetables} 0.04361973 0.4657980 0.09364514 1.7061914 429
## [32] {bread and backed goods,
## fruit} => {vegetables} 0.05124555 0.4763705 0.10757499 1.7449177 504
## [33] {dairy produce,
## fruit} => {vegetables} 0.07869853 0.5032510 0.15638027 1.8433793 774
## [34] {bread and backed goods,
## non-alc. drinks} => {vegetables} 0.04636502 0.3731588 0.12425013 1.3668590 456
## [35] {dairy produce,
## non-alc. drinks} => {vegetables} 0.06446365 0.4243641 0.15190646 1.5544213 634
## [36] {bread and backed goods,
## dairy produce} => {vegetables} 0.08195221 0.4366197 0.18769700 1.5993128 806
## [37] {dairy produce,
## fruit,
## sausage} => {vegetables} 0.02714794 0.5741935 0.04728012 2.1032378 267
## [38] {bread and backed goods,
## dairy produce,
## sausage} => {vegetables} 0.03284189 0.5135135 0.06395526 1.8809704 323
## [39] {dairy produce,
## fruit,
## non-alc. drinks} => {vegetables} 0.03304525 0.5183413 0.06375191 1.8986543 325
## [40] {bread and backed goods,
## dairy produce,
## fruit} => {vegetables} 0.04077275 0.5276316 0.07727504 1.9326840 401
## [41] {bread and backed goods,
## dairy produce,
## non-alc. drinks} => {vegetables} 0.03345196 0.4627286 0.07229283 1.6949480 329
<- head(sort(vegie.rules, decreasing = TRUE, by = "lift"), 10)
top.vegie.rules inspect(top.vegie.rules)
## lhs rhs support confidence coverage lift count
## [1] {beef,
## dairy produce} => {vegetables} 0.02989324 0.6074380 0.04921200 2.225010 294
## [2] {poultry} => {vegetables} 0.02897814 0.5745968 0.05043213 2.104715 285
## [3] {dairy produce,
## fruit,
## sausage} => {vegetables} 0.02714794 0.5741935 0.04728012 2.103238 267
## [4] {beef} => {vegetables} 0.04585663 0.5595533 0.08195221 2.049612 451
## [5] {dairy produce,
## vinegar/oils} => {vegetables} 0.03141840 0.5355286 0.05866802 1.961610 309
## [6] {fruit,
## sausage} => {vegetables} 0.03426538 0.5290424 0.06476868 1.937852 337
## [7] {bread and backed goods,
## dairy produce,
## fruit} => {vegetables} 0.04077275 0.5276316 0.07727504 1.932684 401
## [8] {pork} => {vegetables} 0.03009659 0.5220459 0.05765125 1.912224 296
## [9] {cheese,
## fruit} => {vegetables} 0.02674123 0.5197628 0.05144891 1.903861 263
## [10] {dairy produce,
## fruit,
## non-alc. drinks} => {vegetables} 0.03304525 0.5183413 0.06375191 1.898654 325
plot(top.vegie.rules, method="graph",
control=list(type="items", alpha=1, labelCol="blue"),
shading = "lift")
## Warning: Unknown control parameters: type, alpha, labelCol
## Available control parameters (with default values):
## layout = list(fun = function (graph, dim = 2, ...) { if ("layout" %in% graph_attr_names(graph)) { lay <- graph_attr(graph, "layout") if (is.function(lay)) { lay(graph, ...) } else { lay } } else if (all(c("x", "y") %in% vertex_attr_names(graph))) { if ("z" %in% vertex_attr_names(graph)) { cbind(V(graph)$x, V(graph)$y, V(graph)$z) } else { cbind(V(graph)$x, V(graph)$y) } } else if (vcount(graph) < 1000) { layout_with_fr(graph, dim = dim, ...) } else { layout_with_drl(graph, dim = dim, ...) } }, call_str = c("layout_nicely(<graph>, input = \"C:/Users/Sophia/Documents/R_BusinessAnalytics/BasketAnal.rmd\", ", " encoding = \"UTF-8\")"), args = list())
## edges = <environment>
## nodes = <environment>
## nodetext = <environment>
## colors = c("#EE0000FF", "#EEEEEEFF")
## engine = ggplot2
## max = 100
## verbose = FALSE
Suppose your client is someone other than the local farmer,