::p_load(dplyr, ggplot2, car, vcd, GGally, mvtnorm) pacman
= read.csv('data/wholesales.csv')
W $Channel = factor( paste0("Ch",W$Channel) )
W$Region = factor( paste0("Reg",W$Region) )
W3:8] = lapply(W[3:8], log, base=10)
W[summary(W)
Channel Region Fresh Milk Grocery
Ch1:298 Reg1: 77 Min. :0.477 Min. :1.74 Min. :0.477
Ch2:142 Reg2: 47 1st Qu.:3.495 1st Qu.:3.19 1st Qu.:3.333
Reg3:316 Median :3.930 Median :3.56 Median :3.677
Mean :3.792 Mean :3.53 Mean :3.666
3rd Qu.:4.229 3rd Qu.:3.86 3rd Qu.:4.028
Max. :5.050 Max. :4.87 Max. :4.968
Frozen Detergents_Paper Delicassen
Min. :1.40 Min. :0.477 Min. :0.477
1st Qu.:2.87 1st Qu.:2.409 1st Qu.:2.611
Median :3.18 Median :2.912 Median :2.985
Mean :3.17 Mean :2.947 Mean :2.895
3rd Qu.:3.55 3rd Qu.:3.594 3rd Qu.:3.260
Max. :4.78 Max. :4.611 Max. :4.681
B1a. 點狀圖 Simple Scatter Plot
par(cex=0.7, mar=c(4,4,2,2))
plot(W$Milk, W$Grocery)
B1b. 點狀圖+回歸線 Scatter Plot with Regrssion Line
ggplot(W, aes(x=Milk, y=Grocery)) +
geom_point(alpha=0.3) +
geom_smooth(method="lm")
`geom_smooth()` using formula = 'y ~ x'
B2. 回歸係數 Correlation \[r_{xy}=\frac{Cov(x,y)}{\sigma_x \sigma_y} =\frac{\Sigma_{i=1}^n(x_i - \bar{x})(y_i - \bar{y})} {\sqrt{\Sigma_{i=1}^n(x_i - \bar{x})^2} \sqrt{\Sigma_{i=1}^n(y_i - \bar{y})^2}}\]
cor(W$Milk, W$Grocery)
[1] 0.75885
B3. 回歸係數檢定 Correlation Test
cor.test(W$Milk, W$Grocery)
Pearson's product-moment correlation
data: W$Milk and W$Grocery
t = 24.4, df = 438, p-value <0.0000000000000002
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.71617 0.79588
sample estimates:
cor
0.75885
💡 : 簡單講,\(p-value\)大致可以視為「沒有關係」的機率(準確的說它並不是這樣)
💡 : 如果「沒有關係」的機率很小,我們就可以推論這關係是「顯著」的
💡 : \(p-value\): 給定虛無假設(\(H_0:r=0\)),檢定統計量(\(t\))大於觀察值(\(24.4\))的機率
🗿 : 當\(p = 0.05\)時,對立假設(\(H_A\))為真的機率是?
💡 : 貝氏定理:\(P(A|B) = P(B|A) \cdot P(A) / P(B)\)
B4. Simulating Bi-Variate Normal Distibution
par(cex=0.7, mar=c(1,1,1,1), mfrow=c(3,3))
for(r in seq(-1,1,0.25)) {
= c(0,0)
mu = matrix(c(1,r,r,1),nrow=2) # covariance matrix
sigma rmvnorm(500, mu, sigma) %>% plot(col='gray')
text(0,0,r,cex=3,col='blue',font=2)
}
C1. Matrix of Correlation Coefficients
cor(W[,3:8]) %>% round(3)
Fresh Milk Grocery Frozen Detergents_Paper Delicassen
Fresh 1.000 -0.020 -0.133 0.384 -0.156 0.255
Milk -0.020 1.000 0.759 -0.055 0.678 0.338
Grocery -0.133 0.759 1.000 -0.165 0.796 0.236
Frozen 0.384 -0.055 -0.165 1.000 -0.212 0.255
Detergents_Paper -0.156 0.678 0.796 -0.212 1.000 0.167
Delicassen 0.255 0.338 0.236 0.255 0.167 1.000
💡 : 相關性矩陣:(a)對角線等於1;(b)左下、右上對稱
C1. Matrix of Scatter Plots
::scatterplotMatrix(W[,3:8]) car
D1. 關聯性卡方檢定 Chisq-Test for Association
table(W$Channel, W$Region) %>% chisq.test()
Pearson's Chi-squared test
data: .
X-squared = 4.35, df = 2, p-value = 0.11
Channel
and Region
is not significantChannel
does not depends on
Region
and vice versa.D2. 馬賽克圖 Mosaic Plot
library(vcd)
structable(Channel ~ Region, W) %>%
mosaic(shade=T, labeling=labeling_residuals)
D3. Another Example
<- margin.table(HairEyeColor, 1:2)
haireye haireye
Eye
Hair Brown Blue Hazel Green
Black 68 20 15 5
Brown 119 84 54 29
Red 26 17 14 14
Blond 7 94 10 16
頭髮顏色的邊際機率
mpHair = rowSums(haireye)/sum(haireye)) # marginal prob. (
Black Brown Red Blond
0.18243 0.48311 0.11993 0.21453
眼睛顏色的邊際機率
mpEye = colSums(haireye)/sum(haireye)) # marginal prob. (
Brown Blue Hazel Green
0.37162 0.36318 0.15709 0.10811
若頭髮、眼睛顏色是不相關的(Independent),則頭髮、眼睛的聯合機率分佈為
expProb = mpHair %o% mpEye) (
Brown Blue Hazel Green
Black 0.067796 0.066255 0.028659 0.019722
Brown 0.179533 0.175453 0.075894 0.052228
Red 0.044569 0.043557 0.018841 0.012966
Blond 0.079723 0.077911 0.033701 0.023192
sum(expProb)
[1] 1
若頭髮、眼睛顏色是不相關的(Independent),則各頭髮、眼睛組合的期望值是
expVal = expProb * sum(haireye)) (
Brown Blue Hazel Green
Black 40.135 39.223 16.966 11.6757
Brown 106.284 103.868 44.929 30.9189
Red 26.385 25.785 11.154 7.6757
Blond 47.196 46.123 19.951 13.7297
標準化殘差(Standardized Residual)
s.res = (haireye - expVal) / sqrt(expVal)) (
Eye
Hair Brown Blue Hazel Green
Black 4.398399 -3.069377 -0.477352 -1.953684
Brown 1.233458 -1.949477 1.353284 -0.345100
Red -0.074978 -1.730125 0.852253 2.282737
Blond -5.850997 7.049590 -2.227844 0.612698
馬賽克圖
mosaic(haireye, shade=T, labeling=labeling_residuals)
💡 學習重點:
■ The
Association between Hair
and Eye
is
significant
■ For Black-Hair:
■ the propotion of Brown-Eye
is significantly higher than the expected
■ the
propotion of Blue-Eye is significantly lower
■ For Red-Hair:
■ he propotion of Green-Eye is significantly higher
■ For
Blond-Hair:
■ the propotion of Brown-Eye is significantly
lower
■ the propotion of Blue-Eye is significantly higher
■ the propotion of Hazel-Eye is significantly lower
D4. The Expected Value
in Contingent
Table
expected = independence_table(haireye) ) (
Eye
Hair Brown Blue Hazel Green
Black 40.135 39.223 16.966 11.6757
Brown 106.284 103.868 44.929 30.9189
Red 26.385 25.785 11.154 7.6757
Blond 47.196 46.123 19.951 13.7297
It is derived from the marginal probability in both directions
rowSums(haireye) %o% colSums(haireye)) / sum(haireye) (
Brown Blue Hazel Green
Black 40.135 39.223 16.966 11.6757
Brown 106.284 103.868 44.929 30.9189
Red 26.385 25.785 11.154 7.6757
Blond 47.196 46.123 19.951 13.7297
D5. Residuals
residuals = haireye - expected ) (
Eye
Hair Brown Blue Hazel Green
Black 27.86486 -19.22297 -1.96622 -6.67568
Brown 12.71622 -19.86824 9.07095 -1.91892
Red -0.38514 -8.78547 2.84628 6.32432
Blond -40.19595 47.87669 -9.95101 2.27027
D6. Standardized Residuals
std.residuals = residuals / sqrt(expected) ) (
Eye
Hair Brown Blue Hazel Green
Black 4.398399 -3.069377 -0.477352 -1.953684
Brown 1.233458 -1.949477 1.353284 -0.345100
Red -0.074978 -1.730125 0.852253 2.282737
Blond -5.850997 7.049590 -2.227844 0.612698
🧙 熱圖與馬賽克圖的比較 Heatmap.Rmd
library(GGally)
ggpairs(iris, aes(colour = Species, alpha=0.4),
lower=list(combo = wrap("facethist", binwidth = 0.2)))
par(cex=0.8, mar=c(2,3,1,1), mfrow=c(1,1))
table(W$Channel) %>% barplot
par(cex=0.8, mar=c(4,3,1,1))
hist(W$Fresh)
structable(Channel ~ Region, W) %>%
mosaic(shade=T, labeling=labeling_residuals)
💡 :
如果類別變數裡面有太多分類(或各分類的數量很不平衡),我們應該考慮使用熱圖
類別和數量變數擺在一起,就可以做分類統計
tapply(W$Milk, W$Region, sum)
Reg1 Reg2 Reg3
270.08 163.29 1118.47
分類平均
tapply(W$Milk, list(W$Channel, W$Region), mean)
Reg1 Reg2 Reg3
Ch1 3.3686 3.2274 3.3470
Ch2 3.9628 3.8382 3.9263
分類總和
tapply(W$Milk, list(W$Channel, W$Region), sum)
Reg1 Reg2 Reg3
Ch1 198.750 90.366 706.21
Ch2 71.331 72.926 412.26
另一種做法
xtabs(Milk ~ Channel + Region, data=W)
Region
Channel Reg1 Reg2 Reg3
Ch1 198.750 90.366 706.214
Ch2 71.331 72.926 412.260
ggplot(W, aes(x=log(Milk))) +
geom_histogram(aes(fill=Region), alpha=0.5, bins=20) +
facet_grid(Channel~Region) +
labs(title="Distribtion of Sales of Milk")
ggplot(W, aes(x=log(Milk))) +
geom_density(aes(fill=Region), alpha=0.5) + # compare the density of Region
facet_grid(~Channel) + # by Channel
labs(title="Density of Sales of Milk")
ggplot(W, aes(x=log(Milk))) +
geom_density(aes(fill=Channel), alpha=0.5) + # compare the density of Channel
facet_grid(~Region) + # by Region
labs(title="Density of Sales of Milk")
ggplot(W, aes(x=log(Milk), y=log(Fresh))) +
geom_point() +
stat_smooth(method="lm",se=F)
`geom_smooth()` using formula = 'y ~ x'
ggplot(W, aes(x=log(Milk), y=log(Fresh))) +
geom_point() +
stat_smooth(method="lm", se=F) +
facet_grid(~Region)
`geom_smooth()` using formula = 'y ~ x'
ggplot(W, aes(x=log(Milk), y=log(Fresh))) +
geom_point() +
stat_smooth(method="lm", se=F, col='red') +
facet_grid(Channel~Region)
`geom_smooth()` using formula = 'y ~ x'
💡 學習重點:
存在整個族群的關係和個別分群之中的關係有可能是不同(甚至是相反)的!