CASE : Quality of Medical Care
pacman::p_load(caTools, ggplot2, dplyr)
D = read.csv("data/quality.csv") # Read in dataset
set.seed(88)
split = sample.split(D$PoorCare, SplitRatio = 0.75) # split vector
TR = subset(D, split == TRUE)
TS = subset(D, split == FALSE)
glm1 = glm(PoorCare ~ OfficeVisits + Narcotics, TR, family=binomial)
summary(glm1)p = 0.5)Fig 10C.1 - Confusion Matrix & Model Accuracy
Predicted Probability (Training)
par(cex=0.8)
pred = predict(glm1, type="response")
hist(pred)
abline(v=0.5, col='red')Confusion Matrix (Training)
cmx = table(
Acture=TR$PoorCare, # actual y
Predict=pred > 0.5 # predicted y with threshold p=0.5
)
cmx Predict
Acture FALSE TRUE
0 70 4
1 15 10
Accuracy Metrics (Training)
# define a helper function for repeated usages
AccuracyMetrices = function(x, k=3) { c(
accuracy = sum(diag(x))/sum(x), #
sensitivity = as.numeric(x[2,2]/rowSums(x)[2]), #
specificity = as.numeric(x[1,1]/rowSums(x)[1]) #
) %>% round(k) }
# it takes a confusion matrix and product a vector of accuracy measures
AccuracyMetrices(cmx) accuracy sensitivity specificity
0.808 0.400 0.946
Predicted Probability (Testing)
par(cex=0.8)
pred2 = predict(glm1, newdata=TS, type="response")
hist(pred2, 10)
abline(v=0.5, col='red')Confusion Matrix (Testing)
cmx2 = table(Acture=TS$PoorCare, Predict=pred2 > 0.5)
cmx2 Predict
Acture FALSE TRUE
0 23 1
1 5 3
Accuracy Matrices (Testing)
# with the helper function, we can compare the training and testing accuracy at once
sapply(list(Train=cmx, Test=cmx2), AccuracyMetrices) Train Test
accuracy 0.808 0.812
sensitivity 0.400 0.375
specificity 0.946 0.958
Fig 12.2 - Distribution of Predicted Probability (DPP)
Distribution of Predicted Probability (Train)
data.frame(y=factor(TR$PoorCare), pred=pred) %>%
ggplot(aes(x=pred, fill=y)) +
geom_histogram(bins=20, col='white', position="stack", alpha=0.5) +
ggtitle("Distribution of Predicted Probability (DPP,Train)") +
xlab("predicted probability")
Distribution of Predicted Probability (Test)
#
#🌻 Notes on Moving Threshold :
DPPSIM.R🌻 colAUC() : a tool that draw ROC and calculate AUC
conveniently
ROC - Receiver Operation Curve
par(mfrow=c(1,2), cex=0.8)
trAUC = colAUC(pred, y=TR$PoorCare, plotROC=T)
tsAUC = colAUC(pred2, y=TS$PoorCare, plotROC=T)AUC - Area Under Curve
c(trAUC, tsAUC)[1] 0.77459 0.79948
🗿 Quiz:
Use all of the
columns except TR$MemberID to build a model that paredict
PoorCare, and :
【A】 Create Training and Testing
Confusion Matrix respectively
【B】 Calculate
Training and Testing ACC、SENS and
SPEC respectively
【C】 Draw Training and Testing
DPP respectively
【D】 Estimate Training and Testing
AUC respectively
【E】 Is this model more accurate
than the model with two predictors?
【F】 Why is it more (or less)
accurate?