pacman::p_load(dplyr, ggplot2, plotly, ggpubr)

The Wholesale Customers Dataset

# Load the wholesale customers data and prepare it for modelling.
W <- read.csv("data/wholesales.csv")
W$Channel <- factor(paste0("Ch", W$Channel))   # encode channel id as a factor
W$Region  <- factor(paste0("Reg", W$Region))   # encode region id as a factor
# Log10-transform the numeric columns 3:8 — presumably the spending
# variables — to reduce right skew; TODO confirm column layout of the CSV.
W[3:8] <- lapply(W[3:8], log, base = 10)

[A] Linear Regression in R

Fitting the model

# Fit a simple linear regression of (log10) Milk on (log10) Grocery.
md <- lm(Milk ~ Grocery, data = W)

Check Model Summary

# Inspect the fit: coefficient table, residual summary, R-squared, F-test.
summary(md)

Call:
lm(formula = Milk ~ Grocery, data = W)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.1371 -0.1875  0.0219  0.1593  1.8732 

Coefficients:
            Estimate Std. Error t value             Pr(>|t|)    
(Intercept)   0.8318     0.1115    7.46     0.00000000000047 ***
Grocery       0.7352     0.0301   24.39 < 0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.306 on 438 degrees of freedom
Multiple R-squared:  0.576, Adjusted R-squared:  0.575 
F-statistic:  595 on 1 and 438 DF,  p-value: <0.0000000000000002


[B] The Theoretical Model and Its Estimations

y  <- W$Milk              # target (response) variable
x  <- W$Grocery           # explanatory variable
b0 <- coef(md)[1]         # estimated intercept (coef() avoids $coef partial matching)
b1 <- coef(md)[2]         # estimated slope
# Predicted values from the fitted line;
# also available in md$fitted.values
yhat <- b0 + b1 * x
# Residuals = observed minus predicted;
# also available in md$residuals
er <- y - yhat


[C] Plot the Regression Line

\[ Milk_i = b_0 + b_1 Grocery_i\]

# Base-graphics scatter plot of the data with the fitted line overlaid.
par(cex = 0.8, mar = c(4, 4, 1, 1))
plot(x = W$Grocery, y = W$Milk, pch = 20, col = "#80808080")  # data points
abline(a = b0, b = b1, col = "red")                           # regression line

We can plot the regression line with ggplot and make it interactive via plotly

# Scatter plot with the fitted line and its 95% confidence band,
# rendered interactively via plotly.
# (Left-assignment `<-` replaces the error-prone right-assignment `->`.)
p <- ggplot(aes(Grocery, Milk), data = W) +
  geom_point(alpha = 0.4, size = 0.8) +
  geom_smooth(method = "lm", level = 0.95, lwd = 0.2) +
  theme_bw()
ggplotly(p)
`geom_smooth()` using formula 'y ~ x'

🌻 The grey area indicates the 95% confidence interval of \(\bar{y}|x\)



[D] Prediction

In order to make predictions, we need some new data

# New Grocery values (on the log10 scale) at which to predict Milk.
new_data <- tibble(Grocery = seq(1, 5, 0.5))
new_data
# A tibble: 9 x 1
  Grocery
    <dbl>
1     1  
2     1.5
3     2  
4     2.5
5     3  
6     3.5
7     4  
8     4.5
9     5  

🌻 simply predicting \(\bar y\)

# Point estimates of the conditional mean of Milk at each new Grocery value.
point.est <- predict(md, new_data)
point.est
     1      2      3      4      5      6      7      8      9 
1.5670 1.9345 2.3021 2.6697 3.0373 3.4049 3.7725 4.1401 4.5077 

🌻 predicting \(\bar y\) with 95% confidence interval

# Mean prediction with a 95% confidence interval
# (uncertainty about the fitted line itself).
conf <- predict(md, new_data, interval = "confidence")
conf
     fit    lwr    upr
1 1.5670 1.4064 1.7275
2 1.9345 1.8030 2.0661
3 2.3021 2.1993 2.4049
4 2.6697 2.5949 2.7445
5 3.0373 2.9885 3.0861
6 3.4049 3.3746 3.4352
7 3.7725 3.7377 3.8074
8 4.1401 4.0830 4.1973
9 4.5077 4.4236 4.5918

🌻 predicting \(y\) with 95% prediction interval

# Prediction with a 95% prediction interval
# (uncertainty about an individual new observation — wider than `conf`).
pred <- predict(md, new_data, interval = "prediction")
pred
     fit     lwr    upr
1 1.5670 0.94409 2.1898
2 1.9345 1.31853 2.5506
3 2.3021 1.69161 2.9127
4 2.6697 2.06329 3.2762
5 3.0373 2.43354 3.6411
6 3.4049 2.80235 4.0075
7 3.7725 3.16969 4.3753
8 4.1401 3.53559 4.7446
9 4.5077 3.90004 5.1154
# Combine everything into one data frame:
# x, fitted value, confidence bounds, prediction bounds.
df <- cbind(new_data, conf, pred[, -1])
names(df) <- c("x", "yhat", "c.lwr", "c.upr", "p.lwr", "p.upr")
df
    x   yhat  c.lwr  c.upr   p.lwr  p.upr
1 1.0 1.5670 1.4064 1.7275 0.94409 2.1898
2 1.5 1.9345 1.8030 2.0661 1.31853 2.5506
3 2.0 2.3021 2.1993 2.4049 1.69161 2.9127
4 2.5 2.6697 2.5949 2.7445 2.06329 3.2762
5 3.0 3.0373 2.9885 3.0861 2.43354 3.6411
6 3.5 3.4049 3.3746 3.4352 2.80235 4.0075
7 4.0 3.7725 3.7377 3.8074 3.16969 4.3753
8 4.5 4.1401 4.0830 4.1973 3.53559 4.7446
9 5.0 4.5077 4.4236 4.5918 3.90004 5.1154
# Fitted line (red) with its 95% confidence band (orange) and
# 95% prediction band (magenta, dashed), overlaid on the data.
fig <- ggplot(df, aes(x)) +
  geom_point(aes(y = yhat), col = "red", size = 2) +
  geom_line(aes(y = yhat), col = "red") +
  geom_line(aes(y = c.lwr), col = "orange") +
  geom_line(aes(y = c.upr), col = "orange") +
  geom_line(aes(y = p.lwr), col = "magenta", linetype = "dashed") +
  geom_line(aes(y = p.upr), col = "magenta", linetype = "dashed") +
  geom_point(data = W, aes(x = Grocery, y = Milk), size = 0.8, alpha = 0.5) +
  xlim(1, 5) + ylim(1, 5) + theme_bw()
print(fig)

🌻 Confidence Interval covers 95% of \(\bar y\) 🌻 Prediction Interval covers 95% of \(y\)


[E] Model Summary

# Revisit the model summary before interpreting the coefficients below.
summary(md)

Call:
lm(formula = Milk ~ Grocery, data = W)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.1371 -0.1875  0.0219  0.1593  1.8732 

Coefficients:
            Estimate Std. Error t value             Pr(>|t|)    
(Intercept)   0.8318     0.1115    7.46     0.00000000000047 ***
Grocery       0.7352     0.0301   24.39 < 0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.306 on 438 degrees of freedom
Multiple R-squared:  0.576, Adjusted R-squared:  0.575 
F-statistic:  595 on 1 and 438 DF,  p-value: <0.0000000000000002

🌻 Two readings indicate the accuracy of the model: the residual standard error and the R-squared

🌻 Coefficients: \(b_1\) indicates \(x\)’s marginal effect to \(y\)


§ Plot \(b_1\) as a Random Variable
# Sampling distribution of the slope estimate, using the point estimate
# (0.7352) and standard error (0.0301) reported in the model summary.
curve(
  dnorm(x, 0.7352, 0.0301),
  from = -0.1, to = 1, n = 400,
  xlab = bquote(italic(b[1])), ylab = "",
  main = bquote("The Distribution of Random Variable: " ~ italic(b[1]))
)
# Mark the 2.5% and 97.5% quantiles, i.e. the 95% confidence interval.
abline(v = qnorm(c(0.025, 0.975), 0.7352, 0.0301), col = "red")

🌻 Conventionally, a coefficient is significant

  • if its 95% (98% or 99%) confidence interval does not cover zero
  • i.e., we have high confidence that the relationship is not negligible