CASE : Quality of Medical Care
::p_load(caTools, ggplot2, dplyr)
pacman= read.csv("data/quality.csv") # Read in dataset
D set.seed(88)
= sample.split(D$PoorCare, SplitRatio = 0.75) # split vector
split = subset(D, split == TRUE)
TR = subset(D, split == FALSE)
TS = glm(PoorCare ~ OfficeVisits + Narcotics, TR, family=binomial)
glm1 summary(glm1)
p = 0.5
)Predicted Probability (Training)
par(cex=0.8)
= predict(glm1, type="response")
pred hist(pred)
abline(v=0.5, col='red')
Confusion Matrix (Training)
= table(
cmx Acture=TR$PoorCare, # actual y
Predict=pred > 0.5 # predicted y with threshold p=0.5
) cmx
Predict
Acture FALSE TRUE
0 70 4
1 15 10
Accuracy Metrics (Training)
# define a helper function for repeated usages
= function(x, k=3) { c(
AccuracyMetrices accuracy = sum(diag(x))/sum(x), #
sensitivity = as.numeric(x[2,2]/rowSums(x)[2]), #
specificity = as.numeric(x[1,1]/rowSums(x)[1]) #
%>% round(k) }
) # it takes a confusion matrix and product a vector of accuracy measures
AccuracyMetrices(cmx)
accuracy sensitivity specificity
0.808 0.400 0.946
Predicted Probability (Testing)
par(cex=0.8)
= predict(glm1, newdata=TS, type="response")
pred2 hist(pred2, 10)
abline(v=0.5, col='red')
Confusion Matrix (Testing)
= table(Acture=TS$PoorCare, Predict=pred2 > 0.5)
cmx2 cmx2
Predict
Acture FALSE TRUE
0 23 1
1 5 3
Accuracy Matrices (Testing)
# with the helper function, we can compare the training and testing accuracy at once
sapply(list(Train=cmx, Test=cmx2), AccuracyMetrices)
Train Test
accuracy 0.808 0.812
sensitivity 0.400 0.375
specificity 0.946 0.958
Distribution of Predicted Probability (Train)
data.frame(y=factor(TR$PoorCare), pred=pred) %>%
ggplot(aes(x=pred, fill=y)) +
geom_histogram(bins=20, col='white', position="stack", alpha=0.5) +
ggtitle("Distribution of Predicted Probability (DPP,Train)") +
xlab("predicted probability")
Distribution of Predicted Probability (Test)
#
#
🌻 Notes on Moving Threshold :
DPPSIM.R
🌻 colAUC()
: a tool that draw ROC and calculate AUC
conveniently
ROC - Receiver Operation Curve
par(mfrow=c(1,2), cex=0.8)
= colAUC(pred, y=TR$PoorCare, plotROC=T)
trAUC = colAUC(pred2, y=TS$PoorCare, plotROC=T) tsAUC
AUC - Area Under Curve
c(trAUC, tsAUC)
[1] 0.77459 0.79948
🗿 Quiz:
Use all of the
columns except TR$MemberID
to build a model that paredict
PoorCare
, and :
【A】 Create Training and Testing
Confusion Matrix
respectively
【B】 Calculate
Training and Testing ACC
、SENS
and
SPEC
respectively
【C】 Draw Training and Testing
DPP
respectively
【D】 Estimate Training and Testing
AUC
respectively
【E】 Is this model more accurate
than the model with two predictors?
【F】 Why is it more (or less)
accurate?