Association Rules for Market Basket Analysis (R) cited & modified from: http://www.informit.com/promotions/code-files-modeling-techniques-in-predictive-analytics-141177

if(!require(arules)) install.packages("arules")
## Warning: package 'arules' was built under R version 4.0.5
if(!require(arulesViz)) install.packages("arulesViz")
## Warning: package 'arulesViz' was built under R version 4.0.5
if(!require(RColorBrewer)) install.packages("RColorBrewer")

# library(arules)  # association rules
# library(arulesViz)  # data visualization of association rules
# library(RColorBrewer)  # color palettes for plots

data(Groceries)  # grocery transactions object from arules package
dim(Groceries)   # 9835 baskets x 169 items  
## [1] 9835  169
inspect(Groceries[1:5])
##     items                     
## [1] {citrus fruit,            
##      semi-finished bread,     
##      margarine,               
##      ready soups}             
## [2] {tropical fruit,          
##      yogurt,                  
##      coffee}                  
## [3] {whole milk}              
## [4] {pip fruit,               
##      yogurt,                  
##      cream cheese ,           
##      meat spreads}            
## [5] {other vegetables,        
##      whole milk,              
##      condensed milk,          
##      long life bakery product}
# as(Groceries, "data.frame") 


Examine frequency for each item with support greater than 0.025

par(cex=0.8)
itemFrequencyPlot(Groceries, support = 0.025, xlim = c(0,0.3),
  type = "relative", horiz = TRUE, col = "dark red", las = 1,
  xlab = paste0(
    "Proportion of Market Baskets Containing Item\n",
    "(Item Relative Frequency or Support)"))

Explore possibilities for combining similar items

df = itemInfo(Groceries)
str(df) # levels 10, 55
## 'data.frame':    169 obs. of  3 variables:
##  $ labels: chr  "frankfurter" "sausage" "liver loaf" "ham" ...
##  $ level2: Factor w/ 55 levels "baby food","bags",..: 44 44 44 44 44 44 44 42 42 41 ...
##  $ level1: Factor w/ 10 levels "canned food",..: 6 6 6 6 6 6 6 6 6 6 ...

Aggregate items using the 55 level2 levels for food categories

groceries <- aggregate(Groceries, itemInfo(Groceries)[["level2"]])  
# dim(groceries) # 9835 baskets x 55 items  
itemFrequencyPlot(groceries, support = 0.025, xlim = c(0,0.5),
  type = "relative", horiz = TRUE, col = "blue", las = 1,
  xlab = paste0("Proportion of Market Baskets Containing Item\n",
    "(Item Relative Frequency or Support)"))

Obtain large set of association rules for items by category and all shoppers

This is done by setting very low criteria for support and confidence:

first.rules <- apriori(groceries, 
  parameter = list(support = 0.001, confidence = 0.05))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.05    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 9 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[55 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [54 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 done [0.01s].
## writing ... [69921 rule(s)] done [0.01s].
## creating S4 object  ... done [0.02s].
summary(first.rules)  # yields 69,921 rules... too many
## set of 69921 rules
## 
## rule length distribution (lhs + rhs):sizes
##     1     2     3     4     5     6     7     8 
##    21  1205 10467 23895 22560  9888  1813    72 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   4.000   4.000   4.502   5.000   8.000 
## 
## summary of quality measures:
##     support           confidence        coverage             lift        
##  Min.   :0.001017   Min.   :0.0500   Min.   :0.001017   Min.   : 0.4475  
##  1st Qu.:0.001118   1st Qu.:0.2110   1st Qu.:0.002440   1st Qu.: 1.8315  
##  Median :0.001525   Median :0.4231   Median :0.004372   Median : 2.2573  
##  Mean   :0.002488   Mean   :0.4364   Mean   :0.009687   Mean   : 2.5382  
##  3rd Qu.:0.002339   3rd Qu.:0.6269   3rd Qu.:0.009354   3rd Qu.: 2.9662  
##  Max.   :0.443010   Max.   :1.0000   Max.   :1.000000   Max.   :16.1760  
##      count        
##  Min.   :  10.00  
##  1st Qu.:  11.00  
##  Median :  15.00  
##  Mean   :  24.47  
##  3rd Qu.:  23.00  
##  Max.   :4357.00  
## 
## mining info:
##       data ntransactions support confidence
##  groceries          9835   0.001       0.05

Select association rules using thresholds for support and confidence

second.rules <- apriori(groceries, 
  parameter = list(support = 0.025, confidence = 0.05))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.05    0.1    1 none FALSE            TRUE       5   0.025      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 245 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[55 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [32 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [344 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
summary(second.rules)  # yields 344 rules
## set of 344 rules
## 
## rule length distribution (lhs + rhs):sizes
##   1   2   3   4 
##  21 162 129  32 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0     2.0     2.0     2.5     3.0     4.0 
## 
## summary of quality measures:
##     support          confidence         coverage            lift       
##  Min.   :0.02542   Min.   :0.05043   Min.   :0.03427   Min.   :0.6669  
##  1st Qu.:0.03030   1st Qu.:0.18202   1st Qu.:0.07626   1st Qu.:1.2498  
##  Median :0.03854   Median :0.39522   Median :0.11657   Median :1.4770  
##  Mean   :0.05276   Mean   :0.37658   Mean   :0.21184   Mean   :1.4831  
##  3rd Qu.:0.05236   3rd Qu.:0.51271   3rd Qu.:0.27300   3rd Qu.:1.7094  
##  Max.   :0.44301   Max.   :0.79841   Max.   :1.00000   Max.   :2.4073  
##      count       
##  Min.   : 250.0  
##  1st Qu.: 298.0  
##  Median : 379.0  
##  Mean   : 518.9  
##  3rd Qu.: 515.0  
##  Max.   :4357.0  
## 
## mining info:
##       data ntransactions support confidence
##  groceries          9835   0.025       0.05

Data visualization of association rules in scatter plot

plot(second.rules, cex=0.8,
  control=list(jitter=2, col = rev(brewer.pal(9, "RdYlGn"))),
  shading = "lift")   
## Warning: Unknown control parameters: cex
## Available control parameters (with default values):
## main  =  Scatter plot for 344 rules
## colors    =  c("#EE0000FF", "#EEEEEEFF")
## jitter    =  NA
## engine    =  ggplot2
## verbose   =  FALSE

Grouped matrix of rules

plot(second.rules, method="grouped",    
  control=list(col = rev(brewer.pal(9, "RdYlGn"))))

Select rules with vegetables in consequent (right-hand-side) item subsets

vegie.rules <- subset(second.rules, subset = rhs %pin% "vegetables")
inspect(vegie.rules)  # 41 rules
##      lhs                            rhs             support confidence   coverage      lift count
## [1]  {}                          => {vegetables} 0.27300458  0.2730046 1.00000000 1.0000000  2685
## [2]  {poultry}                   => {vegetables} 0.02897814  0.5745968 0.05043213 2.1047148   285
## [3]  {pork}                      => {vegetables} 0.03009659  0.5220459 0.05765125 1.9122238   296
## [4]  {staple foods}              => {vegetables} 0.02613116  0.5160643 0.05063549 1.8903136   257
## [5]  {eggs}                      => {vegetables} 0.03141840  0.4951923 0.06344687 1.8138608   309
## [6]  {games/books/hobby}         => {vegetables} 0.02785968  0.3145809 0.08856126 1.1522918   274
## [7]  {long-life bakery products} => {vegetables} 0.02907982  0.3492063 0.08327402 1.2791227   286
## [8]  {perfumery}                 => {vegetables} 0.03213015  0.4056483 0.07920691 1.4858662   316
## [9]  {beef}                      => {vegetables} 0.04585663  0.5595533 0.08195221 2.0496116   451
## [10] {bags}                      => {vegetables} 0.03141840  0.3175745 0.09893238 1.1632571   309
## [11] {vinegar/oils}              => {vegetables} 0.04199288  0.4666667 0.08998475 1.7093731   413
## [12] {chocolate}                 => {vegetables} 0.03192679  0.2934579 0.10879512 1.0749195   314
## [13] {beer}                      => {vegetables} 0.03406202  0.2189542 0.15556685 0.8020168   335
## [14] {frozen foods}              => {vegetables} 0.04738180  0.4052174 0.11692933 1.4842879   466
## [15] {cheese}                    => {vegetables} 0.05531266  0.4365971 0.12669039 1.5992300   544
## [16] {sausage}                   => {vegetables} 0.07625826  0.4032258 0.18912049 1.4769929   750
## [17] {fruit}                     => {vegetables} 0.10706660  0.4297959 0.24911032 1.5743176  1053
## [18] {non-alc. drinks}           => {vegetables} 0.09456024  0.2974097 0.31794611 1.0893944   930
## [19] {bread and backed goods}    => {vegetables} 0.11621759  0.3363743 0.34550076 1.2321198  1143
## [20] {dairy produce}             => {vegetables} 0.17041179  0.3846683 0.44300966 1.4090180  1676
## [21] {beef,                                                                                      
##       dairy produce}             => {vegetables} 0.02989324  0.6074380 0.04921200 2.2250104   294
## [22] {dairy produce,                                                                             
##       vinegar/oils}              => {vegetables} 0.03141840  0.5355286 0.05866802 1.9616103   309
## [23] {dairy produce,                                                                             
##       frozen foods}              => {vegetables} 0.03436706  0.5121212 0.06710727 1.8758704   338
## [24] {cheese,                                                                                    
##       fruit}                     => {vegetables} 0.02674123  0.5197628 0.05144891 1.9038613   263
## [25] {bread and backed goods,                                                                    
##       cheese}                    => {vegetables} 0.02887646  0.4536741 0.06365023 1.6617821   284
## [26] {cheese,                                                                                    
##       dairy produce}             => {vegetables} 0.04219624  0.4987981 0.08459583 1.8270686   415
## [27] {fruit,                                                                                     
##       sausage}                   => {vegetables} 0.03426538  0.5290424 0.06476868 1.9378517   337
## [28] {non-alc. drinks,                                                                           
##       sausage}                   => {vegetables} 0.03029995  0.4156206 0.07290290 1.5223944   298
## [29] {bread and backed goods,                                                                    
##       sausage}                   => {vegetables} 0.04382308  0.4229637 0.10360956 1.5492916   431
## [30] {dairy produce,                                                                             
##       sausage}                   => {vegetables} 0.05266904  0.4905303 0.10737163 1.7967842   518
## [31] {fruit,                                                                                     
##       non-alc. drinks}           => {vegetables} 0.04361973  0.4657980 0.09364514 1.7061914   429
## [32] {bread and backed goods,                                                                    
##       fruit}                     => {vegetables} 0.05124555  0.4763705 0.10757499 1.7449177   504
## [33] {dairy produce,                                                                             
##       fruit}                     => {vegetables} 0.07869853  0.5032510 0.15638027 1.8433793   774
## [34] {bread and backed goods,                                                                    
##       non-alc. drinks}           => {vegetables} 0.04636502  0.3731588 0.12425013 1.3668590   456
## [35] {dairy produce,                                                                             
##       non-alc. drinks}           => {vegetables} 0.06446365  0.4243641 0.15190646 1.5544213   634
## [36] {bread and backed goods,                                                                    
##       dairy produce}             => {vegetables} 0.08195221  0.4366197 0.18769700 1.5993128   806
## [37] {dairy produce,                                                                             
##       fruit,                                                                                     
##       sausage}                   => {vegetables} 0.02714794  0.5741935 0.04728012 2.1032378   267
## [38] {bread and backed goods,                                                                    
##       dairy produce,                                                                             
##       sausage}                   => {vegetables} 0.03284189  0.5135135 0.06395526 1.8809704   323
## [39] {dairy produce,                                                                             
##       fruit,                                                                                     
##       non-alc. drinks}           => {vegetables} 0.03304525  0.5183413 0.06375191 1.8986543   325
## [40] {bread and backed goods,                                                                    
##       dairy produce,                                                                             
##       fruit}                     => {vegetables} 0.04077275  0.5276316 0.07727504 1.9326840   401
## [41] {bread and backed goods,                                                                    
##       dairy produce,                                                                             
##       non-alc. drinks}           => {vegetables} 0.03345196  0.4627286 0.07229283 1.6949480   329

Sort by lift and identify the top 10 rules

top.vegie.rules <- head(sort(vegie.rules, decreasing = TRUE, by = "lift"), 10)
inspect(top.vegie.rules) 
##      lhs                         rhs             support confidence   coverage     lift count
## [1]  {beef,                                                                                  
##       dairy produce}          => {vegetables} 0.02989324  0.6074380 0.04921200 2.225010   294
## [2]  {poultry}                => {vegetables} 0.02897814  0.5745968 0.05043213 2.104715   285
## [3]  {dairy produce,                                                                         
##       fruit,                                                                                 
##       sausage}                => {vegetables} 0.02714794  0.5741935 0.04728012 2.103238   267
## [4]  {beef}                   => {vegetables} 0.04585663  0.5595533 0.08195221 2.049612   451
## [5]  {dairy produce,                                                                         
##       vinegar/oils}           => {vegetables} 0.03141840  0.5355286 0.05866802 1.961610   309
## [6]  {fruit,                                                                                 
##       sausage}                => {vegetables} 0.03426538  0.5290424 0.06476868 1.937852   337
## [7]  {bread and backed goods,                                                                
##       dairy produce,                                                                         
##       fruit}                  => {vegetables} 0.04077275  0.5276316 0.07727504 1.932684   401
## [8]  {pork}                   => {vegetables} 0.03009659  0.5220459 0.05765125 1.912224   296
## [9]  {cheese,                                                                                
##       fruit}                  => {vegetables} 0.02674123  0.5197628 0.05144891 1.903861   263
## [10] {dairy produce,                                                                         
##       fruit,                                                                                 
##       non-alc. drinks}        => {vegetables} 0.03304525  0.5183413 0.06375191 1.898654   325
plot(top.vegie.rules, method="graph", 
  control=list(type="items", alpha=1, labelCol="blue"), 
  shading = "lift")
## Warning: Unknown control parameters: type, alpha, labelCol
## Available control parameters (with default values):
## layout    =  list(fun = function (graph, dim = 2, ...)  {     if ("layout" %in% graph_attr_names(graph)) {         lay <- graph_attr(graph, "layout")         if (is.function(lay)) {             lay(graph, ...)         }         else {             lay         }     }     else if (all(c("x", "y") %in% vertex_attr_names(graph))) {         if ("z" %in% vertex_attr_names(graph)) {             cbind(V(graph)$x, V(graph)$y, V(graph)$z)         }         else {             cbind(V(graph)$x, V(graph)$y)         }     }     else if (vcount(graph) < 1000) {         layout_with_fr(graph, dim = dim, ...)     }     else {         layout_with_drl(graph, dim = dim, ...)     } }, call_str = c("layout_nicely(<graph>, input = \"C:/Users/Sophia/Documents/R_BusinessAnalytics/BasketAnal.rmd\", ", "    encoding = \"UTF-8\")"), args = list())
## edges     =  <environment>
## nodes     =  <environment>
## nodetext  =  <environment>
## colors    =  c("#EE0000FF", "#EEEEEEFF")
## engine    =  ggplot2
## max   =  100
## verbose   =  FALSE



Practice:

Suppose your client is someone other than the local farmer,