CASE : Quality of Medical Care

pacman::p_load(caTools, ggplot2, dplyr)
D = read.csv("data/quality.csv")  # Read in dataset
set.seed(88)
split = sample.split(D$PoorCare, SplitRatio = 0.75)  # split vector
TR = subset(D, split == TRUE)
TS = subset(D, split == FALSE)
glm1 = glm(PoorCare ~ OfficeVisits + Narcotics, TR, family=binomial)
summary(glm1)


【A】Accuracy with Fix Threshold (p = 0.5)

Fig 10C.1 - Confusion Matrix & Model Accuracy
Fig 10C.1 - Confusion Matrix & Model Accuracy


Training Data

Distribution of Predicted Probability (Training)

par(cex=0.8)
pred = predict(glm1, type="response")
hist(pred)
abline(v=0.5, col='red')

Confusion Matrix (Training)

cmx = table(
  Acture=TR$PoorCare,   # actual y
  Predict=pred > 0.5    # predicted y with threshold p=0.5
  )
cmx
      Predict
Acture FALSE TRUE
     0    70    4
     1    15   10

Accuracy Metrics (Training)

# define a helper function for repeated usages
AccuracyMetrices = function(x, k=3) { c(
  accuracy = sum(diag(x))/sum(x),                    # 
  sensitivity = as.numeric(x[2,2]/rowSums(x)[2]),    # 
  specificity = as.numeric(x[1,1]/rowSums(x)[1])     # 
  ) %>% round(k) }
# it takes a confusion matrix and product a vector of accuracy measures
AccuracyMetrices(cmx)
   accuracy sensitivity specificity 
      0.808       0.400       0.946 


Testing Data

Distribution of Predicted Probability (Testing)

par(cex=0.8)
pred2 = predict(glm1, newdata=TS, type="response")
hist(pred2, 10)
abline(v=0.5, col='red')

Confusion Matrix (Testing)

cmx2 = table(Acture=TS$PoorCare, Predict=pred2 > 0.5)
cmx2
      Predict
Acture FALSE TRUE
     0    23    1
     1     5    3

Accuracy Matrices (Testing)

# with the helper function, we can compare the training and testing accuracy at once
sapply(list(Train=cmx, Test=cmx2), AccuracyMetrices)
            Train  Test
accuracy    0.808 0.812
sensitivity 0.400 0.375
specificity 0.946 0.958



【B】Classified Distribution of Predicted Probability (cDPP)

Fig 12.2 - Classified Distribution of Predicted Probability (cDPP)
Fig 12.2 - Classified Distribution of Predicted Probability (cDPP)


Classified Distribution of Predicted Probability (Train)

data.frame(y=factor(TR$PoorCare), pred=pred) %>% 
  ggplot(aes(x=pred, fill=y)) + 
  geom_histogram(bins=20, col='white', position="identity", alpha=0.3) +
  ggtitle("Classified Distribution of Predicted Probability (cDPP,Train)") +
  xlab("predicted probability") +
  theme_bw()


Classified Distribution of Predicted Probability (Test)

#
#




【C】ROC & AUC

🌻 Notes on Moving Threshold :


🌻 colAUC() : a tool that draw ROC and calculate AUC conveniently

ROC - Receiver Operation Curve

par(mfrow=c(1,2), cex=0.8)
trAUC = colAUC(pred, y=TR$PoorCare, plotROC=T)
tsAUC = colAUC(pred2, y=TS$PoorCare, plotROC=T)

AUC - Area Under Curve

c(trAUC, tsAUC)
[1] 0.77459 0.79948


🗿 Quiz:
Use all of the columns except TR$MemberID to build a model that paredict PoorCare, and :
  【A】 Create Training and Testing Confusion Matrix respectively
  【B】 Calculate Training and Testing ACCSENS and SPEC respectively
  【C】 Draw Training and Testing DPP respectively
  【D】 Estimate Training and Testing AUC respectively
  【E】 Is this model more accurate than the model with two predictors?
  【F】 Why is it more (or less) accurate?



glm2 = glm(PoorCare ~ ., TR[,-c(1,8)], family=binomial)
summary(glm2)

Call:
glm(formula = PoorCare ~ ., family = binomial, data = TR[, -c(1, 
    8)])

Coefficients:
                         Estimate Std. Error z value Pr(>|z|)   
(Intercept)              -1.52721    1.24887   -1.22   0.2214   
InpatientDays             0.13978    0.09725    1.44   0.1506   
ERVisits                 -0.32645    0.24856   -1.31   0.1891   
OfficeVisits              0.09852    0.04687    2.10   0.0355 * 
Narcotics                 0.04695    0.04567    1.03   0.3039   
DaysSinceLastERVisit     -0.00389    0.00188   -2.06   0.0391 * 
Pain                      0.00227    0.01538    0.15   0.8827   
ProviderCount             0.02847    0.03330    0.85   0.3927   
MedicalClaims             0.01568    0.02441    0.64   0.5205   
ClaimLines               -0.01055    0.00924   -1.14   0.2535   
StartedOnCombinationTRUE  2.34587    1.66703    1.41   0.1594   
AcuteDrugGapSmall         0.26808    0.10246    2.62   0.0089 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 111.888  on 98  degrees of freedom
Residual deviance:  72.527  on 87  degrees of freedom
AIC: 96.53

Number of Fisher Scoring iterations: 6
pTR = predict(glm2, type="response")
pTS = predict(glm2, TS, type="response")
cmTR = table(Acture=TR$PoorCare, Predict=pTR > 0.5)
cmTS = table(Acture=TS$PoorCare, Predict=pTS > 0.5)
sapply(list(Train1=cmx, Train2=cmTR, Test1=cmx2, Test2=cmTS), AccuracyMetrices)
            Train1 Train2 Test1 Test2
accuracy     0.808  0.798 0.812 0.844
sensitivity  0.400  0.440 0.375 0.500
specificity  0.946  0.919 0.958 0.958
c(testAUC1 = colAUC(pred2, y=TS$PoorCare), 
  testAUC1 = colAUC(pTS, y=TS$PoorCare))
testAUC1 testAUC1 
 0.79948  0.86458