pacman::p_load(dplyr, ggplot2, car, vcd, GGally, mvtnorm)


【A】批發商資料集

# Read the wholesale data set; the path is relative to the working directory.
W <- read.csv("data/wholesales.csv")

# Turn the Channel and Region codes into labelled factors ("Ch<code>", "Reg<code>").
W$Channel <- factor(paste0("Ch", W$Channel))
W$Region <- factor(paste0("Reg", W$Region))

# Replace columns 3 to 8 with their base-10 logarithms.
W[3:8] <- lapply(W[3:8], function(v) log(v, base = 10))

# Print per-column summaries of the transformed data.
summary(W)
 Channel    Region        Fresh            Milk         Grocery     
 Ch1:298   Reg1: 77   Min.   :0.477   Min.   :1.74   Min.   :0.477  
 Ch2:142   Reg2: 47   1st Qu.:3.495   1st Qu.:3.19   1st Qu.:3.333  
           Reg3:316   Median :3.930   Median :3.56   Median :3.677  
                      Mean   :3.792   Mean   :3.53   Mean   :3.666  
                      3rd Qu.:4.229   3rd Qu.:3.86   3rd Qu.:4.028  
                      Max.   :5.050   Max.   :4.87   Max.   :4.968  
     Frozen     Detergents_Paper   Delicassen   
 Min.   :1.40   Min.   :0.477    Min.   :0.477  
 1st Qu.:2.87   1st Qu.:2.409    1st Qu.:2.611  
 Median :3.18   Median :2.912    Median :2.985  
 Mean   :3.17   Mean   :2.947    Mean   :2.895  
 3rd Qu.:3.55   3rd Qu.:3.594    3rd Qu.:3.260  
 Max.   :4.78   Max.   :4.611    Max.   :4.681  


【B】連續變數的相關性(係數) Correlation

B1a. 點狀圖 Simple Scatter Plot

# Set margins (bottom, left, top, right) and shrink text to 70%.
par(mar = c(4, 4, 2, 2), cex = 0.7)
# Base-graphics scatter plot: Milk on the x axis, Grocery on the y axis.
plot(x = W$Milk, y = W$Grocery)

B1b. 點狀圖+回歸線 Scatter Plot with Regression Line

# Scatter plot of Grocery against Milk with a fitted linear-model line on top.
ggplot(data = W, mapping = aes(x = Milk, y = Grocery)) +
  geom_point(alpha = 0.3) +     # semi-transparent points
  geom_smooth(method = "lm")    # smoother fitted with lm
`geom_smooth()` using formula = 'y ~ x'

B2. 相關係數 Correlation Coefficient \[r_{xy}=\frac{Cov(x,y)}{\sigma_x \sigma_y} =\frac{\sum_{i=1}^n(x_i - \bar{x})(y_i - \bar{y})} {\sqrt{\sum_{i=1}^n(x_i - \bar{x})^2} \sqrt{\sum_{i=1}^n(y_i - \bar{y})^2}}\]

# Correlation coefficient between Milk and Grocery (cor() defaults to Pearson).
with(W, cor(Milk, Grocery))
[1] 0.75885

B3. 相關係數檢定 Correlation Test

# Test H0: the true correlation between Milk and Grocery is zero.
# The arguments below are cor.test()'s defaults, written out explicitly.
cor.test(
  W$Milk, W$Grocery,
  alternative = "two.sided",
  method = "pearson",
  conf.level = 0.95
)

    Pearson's product-moment correlation

data:  W$Milk and W$Grocery
t = 24.4, df = 438, p-value <0.0000000000000002
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.71617 0.79588
sample estimates:
    cor 
0.75885 

💡 : 簡單講,\(p-value\)大致可以視為「沒有關係」的機率(準確的說它並不是這樣)

💡 : 如果「沒有關係」的機率很小,我們就可以推論這關係是「顯著」的

💡 : \(p-value\): 給定虛無假設(\(H_0:\rho=0\))為真,檢定統計量的絕對值(\(|t|\))大於或等於觀察值(\(24.4\))的機率(雙尾檢定)

🗿 : \(p = 0.05\)時,對立假設(\(H_A\))為真的機率是?

💡 : 貝氏定理:\(P(A|B) = P(B|A) \cdot P(A) / P(B)\)

B4. Simulating Bi-Variate Normal Distribution

# Simulate 500 draws from a bivariate normal distribution for each correlation
# r in -1, -0.75, ..., 1 and show the nine scatter plots in a 3 x 3 grid.
#
# mfrow is listed before cex on purpose: changing the layout resets cex, so a
# cex given ahead of mfrow in the same par() call would be discarded.
# The previous settings are saved and restored at the end so that later base
# plots do not inherit the 3 x 3 layout and the tiny margins.
old_par <- par(mfrow = c(3, 3), mar = c(1, 1, 1, 1), cex = 0.7)
mu <- c(0, 0)                                  # both means are zero
for (r in seq(-1, 1, 0.25)) {
  sigma <- matrix(c(1, r, r, 1), nrow = 2)     # unit variances, so covariance = r
  rmvnorm(500, mu, sigma) %>% plot(col = "gray")
  text(0, 0, r, cex = 3, col = "blue", font = 2)   # label the panel with its r
}
par(old_par)


【C】相關性矩陣

C1. Matrix of Correlation Coefficients

# Pairwise correlation matrix of columns 3 to 8, rounded to three decimals.
round(cor(W[, 3:8]), digits = 3)
                  Fresh   Milk Grocery Frozen Detergents_Paper Delicassen
Fresh             1.000 -0.020  -0.133  0.384           -0.156      0.255
Milk             -0.020  1.000   0.759 -0.055            0.678      0.338
Grocery          -0.133  0.759   1.000 -0.165            0.796      0.236
Frozen            0.384 -0.055  -0.165  1.000           -0.212      0.255
Detergents_Paper -0.156  0.678   0.796 -0.212            1.000      0.167
Delicassen        0.255  0.338   0.236  0.255            0.167      1.000

💡 : 相關性矩陣:(a)對角線等於1;(b)左下、右上對稱

C2. Matrix of Scatter Plots

# Scatter-plot matrix of columns 3 to 8 of W, drawn with car's helper.
car::scatterplotMatrix(W[, 3:8])