# Load the packages used below for data handling, clustering and plotting.
pacman::p_load(dplyr, ggplot2, readr, FactoMineR, factoextra, dendextend)
# Read the data set from data/wholesales.csv.
W = read.csv('data/wholesales.csv')
# Prefix the Channel / Region codes with "Ch" / "Reg" and store them as factors.
W$Channel = factor( paste0("Ch",W$Channel) )
W$Region = factor( paste0("Reg",W$Region) )
# Replace columns 3-8 with their base-10 logarithms.
W[3:8] = lapply(W[3:8], log, base=10)
# Print per-column summaries of the transformed data.
summary(W)
Channel Region Fresh Milk Grocery
Ch1:298 Reg1: 77 Min. :0.477 Min. :1.74 Min. :0.477
Ch2:142 Reg2: 47 1st Qu.:3.495 1st Qu.:3.19 1st Qu.:3.333
Reg3:316 Median :3.930 Median :3.56 Median :3.677
Mean :3.792 Mean :3.53 Mean :3.666
3rd Qu.:4.229 3rd Qu.:3.86 3rd Qu.:4.028
Max. :5.050 Max. :4.87 Max. :4.968
Frozen Detergents_Paper Delicassen
Min. :1.40 Min. :0.477 Min. :0.477
1st Qu.:2.87 1st Qu.:2.409 1st Qu.:2.611
Median :3.18 Median :2.912 Median :2.985
Mean :3.17 Mean :2.947 Mean :2.895
3rd Qu.:3.55 3rd Qu.:3.594 3rd Qu.:3.260
Max. :4.78 Max. :4.611 Max. :4.681
集群分析:將分析對象依其相似性分群
💡 層級式集群分析的步驟:
■ scale() : 標準化 Standardization
■ dist() : 距離矩陣
■ hclust() : 層級式集群分析
■ plot() : 畫出樹狀圖
■ rect.hclust() : 依據 dendrogram 做切割
■ cutree() : 產生分群向量
為了方便解釋,我們先使用兩個分群變數做層級式集群分析
# Standardize columns 3-4, compute the distance matrix between rows, and fit a
# hierarchical clustering on it (hclust defaults are used).
hc = W[,3:4] %>% scale %>% dist %>% hclust
樹狀圖的判讀與切割
# Draw the dendrogram, then outline k clusters on it with red rectangles.
plot(hc)
k = 6
rect.hclust(hc, k=k, border="red")
產生分群向量
# Cut the tree into 3 clusters and store the membership as a factor column.
W$group = cutree(hc, k=3) %>% factor
# Count the observations in each cluster.
table(W$group)
1 2 3
286 148 6
將分析對象畫在分群變數空間上面
# Scatter plot of Fresh against Milk, one semi-transparent point per row,
# coloured by the cluster stored in `group`.
ggplot(
  data = W,
  mapping = aes(x = Fresh, y = Milk, colour = group)
) +
  geom_point(alpha = 0.5, size = 3)
# Standardize columns 3-7, compute the distance matrix between rows, and fit a
# hierarchical clustering on it.
# NOTE(review): only columns 3-7 are used here, so column 8 is left out of the
# clustering -- confirm this is intended.
hc = W[,3:7] %>% scale %>% dist %>% hclust
# Draw the dendrogram and outline k clusters with red rectangles.
plot(hc)
k = 6
rect.hclust(hc, k, border="red")
# Store the k-cluster membership as a factor column.
W$group = cutree(hc, k) %>% factor
# Redraw the tree as a dendrogram with branches coloured per cluster; the
# labels are coloured white.
hc %>% as.dendrogram %>% color_branches(k) %>% color_labels(k,col='white') %>% plot
降維的方法(工具):
+ 主成分分析:Principal Component Analysis - PCA()
+ 多元尺度分析:Multi-Dimensional Scaling - cmdscale()
# Rename columns 3-8 with Chinese labels so they appear on the plot.
names(W)[3:8] = c('生鮮','奶製品','雜貨','冷凍','清潔用品','熟食')
# Run a PCA on columns 3-8 (without its default graphs) and draw a biplot.
W[,3:8] %>% PCA(graph=FALSE) %>% fviz_pca_biplot(
  col.ind=W$group,   # colour each observation by its cluster in W$group
  label="var", pointshape=19, mean.point=FALSE,
  addEllipses=TRUE, ellipse.level=0.7,
  ellipse.type = "convex", palette="ucscgb",
  repel=TRUE
)
💡 學習重點:
■ 集群分析的基本觀念
■ 距離矩陣:Distance Matrix
■ 層級式集群分析:Hierarchical Cluster Analysis
■ 樹狀圖(Dendrogram)的判讀
■ 依據樹狀圖決定要分多少群
■ 集群分析與尺度縮減的綜合應用
■ 現代化的資料視覺化工具套件