# Packages required by this analysis.
packages <- c(
  "dplyr", "ggplot2", "googleVis", "devtools", "magrittr", "slam", "irlba",
  "plotly", "arules", "arulesViz", "Matrix", "recommenderlab"
)
# Work out which packages are missing. system.file() returns "" when a
# package cannot be found, which avoids building the full
# installed.packages() table just to test for presence.
is_installed <- vapply(
  packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
missing_pkgs <- packages[!is_installed]
# install.packages() accepts a character vector, so a single call replaces
# the per-package loop; skip it entirely when nothing is missing.
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
# NOTE(review): this removes every object (hidden ones included) from the
# workspace, not only the temporaries created above. Kept because later
# code may rely on starting from an empty workspace -- confirm and consider
# removing only the temporaries instead.
rm(list = ls(all.names = TRUE))
# Flag that is not read in the visible part of the script; presumably used
# further down -- TODO confirm.
LOAD <- FALSE
library(dplyr)
library(ggplot2)
library(googleVis)
library(Matrix)
library(slam)
library(irlba)
library(plotly)
library(arules)
library(arulesViz)
library(recommenderlab)
A. 檢視資料-敘述性統計
# Load the Groceries transactions data set that ships with arules, then
# show the internal structure of the object.
data("Groceries")
str(Groceries)
## Formal class 'transactions' [package "arules"] with 3 slots
## ..@ data :Formal class 'ngCMatrix' [package "Matrix"] with 5 slots
## .. .. ..@ i : int [1:43367] 13 60 69 78 14 29 98 24 15 29 ...
## .. .. ..@ p : int [1:9836] 0 4 7 8 12 16 21 22 27 28 ...
## .. .. ..@ Dim : int [1:2] 169 9835
## .. .. ..@ Dimnames:List of 2
## .. .. .. ..$ : NULL
## .. .. .. ..$ : NULL
## .. .. ..@ factors : list()
## ..@ itemInfo :'data.frame': 169 obs. of 3 variables:
## .. ..$ labels: chr [1:169] "frankfurter" "sausage" "liver loaf" "ham" ...
## .. ..$ level2: Factor w/ 55 levels "baby food","bags",..: 44 44 44 44 44 44 44 42 42 41 ...
## .. ..$ level1: Factor w/ 10 levels "canned food",..: 6 6 6 6 6 6 6 6 6 6 ...
## ..@ itemsetInfo:'data.frame': 0 obs. of 0 variables
# Summarise the transactions: dimensions, density, most frequent items,
# basket-size distribution and a sample of the item information.
summary(Groceries)
## transactions as itemMatrix in sparse format with
## 9835 rows (elements/itemsets/transactions) and
## 169 columns (items) and a density of 0.02609146
##
## most frequent items:
## whole milk other vegetables rolls/buns soda
## 2513 1903 1809 1715
## yogurt (Other)
## 1372 34055
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 2159 1643 1299 1005 855 645 545 438 350 246 182 117 78 77 55
## 16 17 18 19 20 21 22 23 24 26 27 28 29 32
## 46 29 14 14 9 11 4 6 1 1 1 1 3 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 4.409 6.000 32.000
##
## includes extended item information - examples:
## labels level2 level1
## 1 frankfurter sausage meat and sausage
## 2 sausage sausage meat and sausage
## 3 liver loaf sausage meat and sausage
# Report the size of the transactions object: rows are transactions and
# columns are items.
print(dim(Groceries))
## [1] 9835 169
print(nrow(Groceries)) # 9835 transactions
## [1] 9835
print(ncol(Groceries)) # 169 products
## [1] 169
inspect(Groceries[seq_len(5)]) # look at the first five transactions
## items
## [1] {citrus fruit,
## semi-finished bread,
## margarine,
## ready soups}
## [2] {tropical fruit,
## yogurt,
## coffee}
## [3] {whole milk}
## [4] {pip fruit,
## yogurt,
## cream cheese ,
## meat spreads}
## [5] {other vegetables,
## whole milk,
## condensed milk,
## long life bakery product}
# Save a bar chart of the 20 most frequent items (absolute counts) to a
# PDF file.
pdf(file = "fig_market_basket_initial_item_support.pdf",
    width = 8.5, height = 11)
itemFrequencyPlot(Groceries, topN = 20, type = "absolute",
                  col = "dark red", cex = 0.8)
# Close the PDF device so the file is finalised on disk and the plots that
# follow are drawn on the default device instead of being appended to it.
# invisible() suppresses the device name that dev.off() would print.
invisible(dev.off())
# Visualise the item matrix of the first ten transactions.
image(Groceries[1:10])

# Show the most recent warning messages, if any.
warnings()
## NULL
# Draw 100 transactions at random and plot their item matrix. No seed is
# set in the visible code, so the selection differs between runs.
image(x = sample(Groceries, size = 100))

# Look at the item metadata to see how similar items could be grouped:
# level1 has 10 categories and level2 has 55.
df <- itemInfo(Groceries)
str(object = df)
## 'data.frame': 169 obs. of 3 variables:
## $ labels: chr "frankfurter" "sausage" "liver loaf" "ham" ...
## $ level2: Factor w/ 55 levels "baby food","bags",..: 44 44 44 44 44 44 44 42 42 41 ...
## $ level1: Factor w/ 10 levels "canned food",..: 6 6 6 6 6 6 6 6 6 6 ...
- 找出相似產品
- 有兩個 levels(level1 有 10 類、level2 有 55 類),選擇適合的 level:10 類太少,因此採用 level2
# Collapse the individual items into the 55 level2 food categories so the
# baskets are described by a smaller, more meaningful set of items.
groceries <- aggregate(Groceries, itemInfo(Groceries)$level2)
print(nrow(groceries)) # 9835 market baskets for shopping trips
## [1] 9835
print(ncol(groceries)) # 55 final store items (categories)
## [1] 55
# Horizontal bar chart of relative item frequency, restricted to the
# categories that reach the 0.025 support threshold.
itemFrequencyPlot(
  groceries,
  support = 0.025,
  type = "relative",
  horiz = TRUE,
  col = "blue",
  las = 1,
  cex.names = 1.0,
  xlim = c(0, 0.5),
  xlab = paste("Proportion of Market Baskets Containing Item",
               "\n(Item Relative Frequency or Support)")
)

B. 關聯規則
- support : 品項(或品項組合)被購買的基礎機率;對規則 A => B 而言,即 A 與 B 同時被購買的機率
- confidence : A品項被購買時B被購買的機率
- lift : A品項被購買時,B被購買所增加機率的倍數
- count : 交易筆數(交易筆數如果太少,分析就沒有實質意義)
# Minimum support and confidence have to be chosen before mining;
# start with deliberately low thresholds.
first.rules <- apriori(
  groceries,
  parameter = list(support = 0.001, confidence = 0.05)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.05 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 9
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[55 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [54 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 done [0.02s].
## writing ... [69921 rule(s)] done [0.02s].
## creating S4 object ... done [0.03s].
# Summarise the rule set mined with the low thresholds: number of rules,
# rule-length distribution and quality measures.
print(summary(first.rules))
## set of 69921 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2 3 4 5 6 7 8
## 21 1205 10467 23895 22560 9888 1813 72
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 4.000 4.502 5.000 8.000
##
## summary of quality measures:
## support confidence lift count
## Min. :0.001017 Min. :0.0500 Min. : 0.4475 Min. : 10.00
## 1st Qu.:0.001118 1st Qu.:0.2110 1st Qu.: 1.8315 1st Qu.: 11.00
## Median :0.001525 Median :0.4231 Median : 2.2573 Median : 15.00
## Mean :0.002488 Mean :0.4364 Mean : 2.5382 Mean : 24.47
## 3rd Qu.:0.002339 3rd Qu.:0.6269 3rd Qu.: 2.9662 3rd Qu.: 23.00
## Max. :0.443010 Max. :1.0000 Max. :16.1760 Max. :4357.00
##
## mining info:
## data ntransactions support confidence
## groceries 9835 0.001 0.05
# Mine again with a higher minimum support (0.025) and the same minimum
# confidence (0.05).
second.rules <- apriori(
  groceries,
  parameter = list(support = 0.025, confidence = 0.05)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.05 0.1 1 none FALSE TRUE 5 0.025 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 245
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[55 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [32 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [344 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Summarise the rule set mined with the higher support threshold.
print(summary(second.rules))
## set of 344 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2 3 4
## 21 162 129 32
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 2.0 2.0 2.5 3.0 4.0
##
## summary of quality measures:
## support confidence lift count
## Min. :0.02542 Min. :0.05043 Min. :0.6669 Min. : 250.0
## 1st Qu.:0.03030 1st Qu.:0.18202 1st Qu.:1.2498 1st Qu.: 298.0
## Median :0.03854 Median :0.39522 Median :1.4770 Median : 379.0
## Mean :0.05276 Mean :0.37658 Mean :1.4831 Mean : 518.9
## 3rd Qu.:0.05236 3rd Qu.:0.51271 3rd Qu.:1.7094 3rd Qu.: 515.0
## Max. :0.44301 Max. :0.79841 Max. :2.4073 Max. :4357.0
##
## mining info:
## data ntransactions support confidence
## groceries 9835 0.025 0.05
# Interactive plot of the rules, rendered with the htmlwidget engine.
plot(
  second.rules,
  engine = "htmlwidget",
  colors = c("red", "green"),
  marker = list(opacity = 0.6, size = 8)
)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
## Warning: package 'bindrcpp' was built under R version 3.4.4
# Inspect the first 10 association rules. head() returns whatever is
# available when the rule set holds fewer than 10 rules, instead of
# indexing past the end with a fixed 1:10 range.
inspect(head(second.rules, 10))
## lhs rhs support confidence lift count
## [1] {} => {poultry} 0.05043213 0.05043213 1 496
## [2] {} => {pork} 0.05765125 0.05765125 1 567
## [3] {} => {staple foods} 0.05063549 0.05063549 1 498
## [4] {} => {coffee} 0.06487036 0.06487036 1 638
## [5] {} => {eggs} 0.06344687 0.06344687 1 624
## [6] {} => {games/books/hobby} 0.08856126 0.08856126 1 871
## [7] {} => {long-life bakery products} 0.08327402 0.08327402 1 819
## [8] {} => {perfumery} 0.07920691 0.07920691 1 779
## [9] {} => {beef} 0.08195221 0.08195221 1 806
## [10] {} => {bags} 0.09893238 0.09893238 1 973
- 產生 344 條規則,數量適中,比較容易檢視及選擇
- 利用圖形,我們很快地可以練習看圖說故事的能力
- 互動式圖形,可以幫助我們改善修正我們要的標準,ex:zoom in 某一區域
# Interactive matrix view of the rules, shaded by lift.
plot(
  second.rules,
  method = "matrix",
  engine = "htmlwidget",
  shading = "lift",
  colors = c("red", "green")
)
- 同一個水平軸上和同一個垂直軸上皆是同一項產品
- 這樣的互動式圖形可以幫助我們在選擇產品上,找出不同的關聯規則
# Inspect the five rules with the highest lift. head() is used, as for the
# top vegetable rules later in the script, so a rule set with fewer than
# five rules does not index past the end.
inspect(head(sort(second.rules, by = "lift"), 5))
## lhs rhs support confidence
## [1] {bread and backed goods,cheese} => {sausage} 0.02897814 0.4552716
## [2] {dairy produce,sausage} => {cheese} 0.03111337 0.2897727
## [3] {beef,dairy produce} => {vegetables} 0.02989324 0.6074380
## [4] {bread and backed goods,sausage} => {cheese} 0.02897814 0.2796860
## [5] {bread and backed goods,fruit} => {cheese} 0.02958821 0.2750473
## lift count
## [1] 2.407310 285
## [2] 2.287251 306
## [3] 2.225010 294
## [4] 2.207634 285
## [5] 2.171019 291
- 我們也可以查看前5個最高的Lift的規則有哪些
- lhs(left-hand side)
- rhs(right-hand side)
C. 篩選產品- 以蔬菜產品為例
# Select the rules whose consequent (right-hand side) is the "vegetables"
# category. %in% matches the item label exactly; the partial match done by
# %pin% would also pick up any other label that merely contains the
# substring "vegetables".
vegie.rules <- subset(second.rules, subset = rhs %in% "vegetables")
inspect(vegie.rules)
## lhs rhs support confidence lift count
## [1] {} => {vegetables} 0.27300458 0.2730046 1.0000000 2685
## [2] {poultry} => {vegetables} 0.02897814 0.5745968 2.1047148 285
## [3] {pork} => {vegetables} 0.03009659 0.5220459 1.9122238 296
## [4] {staple foods} => {vegetables} 0.02613116 0.5160643 1.8903136 257
## [5] {eggs} => {vegetables} 0.03141840 0.4951923 1.8138608 309
## [6] {games/books/hobby} => {vegetables} 0.02785968 0.3145809 1.1522918 274
## [7] {long-life bakery products} => {vegetables} 0.02907982 0.3492063 1.2791227 286
## [8] {perfumery} => {vegetables} 0.03213015 0.4056483 1.4858662 316
## [9] {beef} => {vegetables} 0.04585663 0.5595533 2.0496116 451
## [10] {bags} => {vegetables} 0.03141840 0.3175745 1.1632571 309
## [11] {vinegar/oils} => {vegetables} 0.04199288 0.4666667 1.7093731 413
## [12] {chocolate} => {vegetables} 0.03192679 0.2934579 1.0749195 314
## [13] {beer} => {vegetables} 0.03406202 0.2189542 0.8020168 335
## [14] {frozen foods} => {vegetables} 0.04738180 0.4052174 1.4842879 466
## [15] {cheese} => {vegetables} 0.05531266 0.4365971 1.5992300 544
## [16] {sausage} => {vegetables} 0.07625826 0.4032258 1.4769929 750
## [17] {fruit} => {vegetables} 0.10706660 0.4297959 1.5743176 1053
## [18] {non-alc. drinks} => {vegetables} 0.09456024 0.2974097 1.0893944 930
## [19] {bread and backed goods} => {vegetables} 0.11621759 0.3363743 1.2321198 1143
## [20] {dairy produce} => {vegetables} 0.17041179 0.3846683 1.4090180 1676
## [21] {beef,
## dairy produce} => {vegetables} 0.02989324 0.6074380 2.2250104 294
## [22] {dairy produce,
## vinegar/oils} => {vegetables} 0.03141840 0.5355286 1.9616103 309
## [23] {dairy produce,
## frozen foods} => {vegetables} 0.03436706 0.5121212 1.8758704 338
## [24] {cheese,
## fruit} => {vegetables} 0.02674123 0.5197628 1.9038613 263
## [25] {bread and backed goods,
## cheese} => {vegetables} 0.02887646 0.4536741 1.6617821 284
## [26] {cheese,
## dairy produce} => {vegetables} 0.04219624 0.4987981 1.8270686 415
## [27] {fruit,
## sausage} => {vegetables} 0.03426538 0.5290424 1.9378517 337
## [28] {non-alc. drinks,
## sausage} => {vegetables} 0.03029995 0.4156206 1.5223944 298
## [29] {bread and backed goods,
## sausage} => {vegetables} 0.04382308 0.4229637 1.5492916 431
## [30] {dairy produce,
## sausage} => {vegetables} 0.05266904 0.4905303 1.7967842 518
## [31] {fruit,
## non-alc. drinks} => {vegetables} 0.04361973 0.4657980 1.7061914 429
## [32] {bread and backed goods,
## fruit} => {vegetables} 0.05124555 0.4763705 1.7449177 504
## [33] {dairy produce,
## fruit} => {vegetables} 0.07869853 0.5032510 1.8433793 774
## [34] {bread and backed goods,
## non-alc. drinks} => {vegetables} 0.04636502 0.3731588 1.3668590 456
## [35] {dairy produce,
## non-alc. drinks} => {vegetables} 0.06446365 0.4243641 1.5544213 634
## [36] {bread and backed goods,
## dairy produce} => {vegetables} 0.08195221 0.4366197 1.5993128 806
## [37] {dairy produce,
## fruit,
## sausage} => {vegetables} 0.02714794 0.5741935 2.1032378 267
## [38] {bread and backed goods,
## dairy produce,
## sausage} => {vegetables} 0.03284189 0.5135135 1.8809704 323
## [39] {dairy produce,
## fruit,
## non-alc. drinks} => {vegetables} 0.03304525 0.5183413 1.8986543 325
## [40] {bread and backed goods,
## dairy produce,
## fruit} => {vegetables} 0.04077275 0.5276316 1.9326840 401
## [41] {bread and backed goods,
## dairy produce,
## non-alc. drinks} => {vegetables} 0.03345196 0.4627286 1.6949480 329
# Keep the ten vegetable rules with the largest lift and display them.
top.vegie.rules <- head(
  sort(vegie.rules, by = "lift", decreasing = TRUE),
  n = 10
)
inspect(top.vegie.rules)
## lhs rhs support confidence lift count
## [1] {beef,
## dairy produce} => {vegetables} 0.02989324 0.6074380 2.225010 294
## [2] {poultry} => {vegetables} 0.02897814 0.5745968 2.104715 285
## [3] {dairy produce,
## fruit,
## sausage} => {vegetables} 0.02714794 0.5741935 2.103238 267
## [4] {beef} => {vegetables} 0.04585663 0.5595533 2.049612 451
## [5] {dairy produce,
## vinegar/oils} => {vegetables} 0.03141840 0.5355286 1.961610 309
## [6] {fruit,
## sausage} => {vegetables} 0.03426538 0.5290424 1.937852 337
## [7] {bread and backed goods,
## dairy produce,
## fruit} => {vegetables} 0.04077275 0.5276316 1.932684 401
## [8] {pork} => {vegetables} 0.03009659 0.5220459 1.912224 296
## [9] {cheese,
## fruit} => {vegetables} 0.02674123 0.5197628 1.903861 263
## [10] {dairy produce,
## fruit,
## non-alc. drinks} => {vegetables} 0.03304525 0.5183413 1.898654 325
- 41個rules
- 並且依照lift大小排序前10個關聯規則
- 藉由這樣的方法,我們可以清楚知道哪些產品被購買之後,蔬菜被購買的機率提升的倍數最高
- 然而 lift 太高的規則往往是顯而易見的,我們要找出的是不易察覺的“秘密”
# Interactive graph view of the vegetable rules.
plot(
  vegie.rules,
  method = "graph",
  engine = "htmlwidget",
  itemCol = "cyan"
)
- 泡泡大小:support: 規則 A => B 中 A 與 B 同時被購買的機率 (規則的基礎機率)
- 泡泡顏色:lift: A被購買時,B被購買的機率增加的倍數 (與B的基礎機率相比)