pacman::p_load(caTools, ggplot2, dplyr)
D = read.csv("data/quality.csv")  # Read in dataset
set.seed(88)
split = sample.split(D$PoorCare, SplitRatio = 0.75)  # split vector
TR = subset(D, split == TRUE)
TS = subset(D, split == FALSE)
glm1 = glm(PoorCare ~ OfficeVisits + Narcotics, TR, family=binomial)
summary(glm1)


【A】From Prediction to Decision

Fig 12.3 - Prediction to Decision



【B】Distribution of Predicted Probabilities (DPP)

Usually we’d use test data to create the DPP. However, we’re using the entire data set because this dataset is very small.

pred = predict(glm1, D, type="response")
y = D$PoorCare
data.frame(pred, y) %>% 
  ggplot(aes(x=pred, fill=factor(y))) + 
  geom_histogram(bins=20, col='white', position="stack", alpha=0.5) +
  ggtitle("Distribution of Predicted Probability (DPP,FULL)") +
  xlab("predicted probability")


【C】Estimating the Expected Payoff

The Payoff Matrix

The Expected Payoff


Create the Payoff Matrix according to the business context

payoff = matrix(c(0,-100,-10,-50),2,2)
rownames(payoff) = c("FALSE","TRUE")
colnames(payoff) = c("NoAct","Act")
payoff
      NoAct Act
FALSE     0 -10
TRUE   -100 -50

Estimate the Expected Payoff for each Confusion Matrix derived from each Threshold of Action

cutoff = seq(0, 1, 0.01)
result = sapply(cutoff, function(p) {
  cm = table(factor(y==1, c(F,T)), factor(pred>p, c(F,T))) 
  sum(cm * payoff) # sum of confusion * payoff matrix
  })
i = which.max(result)
par(cex=0.7, mar=c(4,4,3,1))
plot(cutoff, result, type='l', col='cyan', lwd=2, main=sprintf(
  "Optomal Expected Result: $%d @ %.2f",result[i],cutoff[i]))
abline(v=seq(0,1,0.1),h=seq(-6000,0,100),col='lightgray',lty=3)
points(cutoff[i], result[i], pch=20, col='red', cex=2)


🌷 Because the dataset is too small, the result of simulation looks very chopy. Here we create a larger test dataset (10K) to fit the normal business situation.

library(MASS)

Attaching package: 'MASS'
The following object is masked from 'package:dplyr':

    select
d0 = subset(D, PoorCare==0)
A0 = mvrnorm(7480, colMeans(d0[,4:5]), cov(d0[,4:5]))
d1 = subset(D, PoorCare==1)
A1 = mvrnorm(2520, colMeans(d1[,4:5]), cov(d1[,4:5]))
A = rbind(A0, A1) %>% as.data.frame() 
A$PoorCare = c(rep(0L, 7480), rep(1L, 2520))
table(A$PoorCare)

   0    1 
7480 2520 

There are 10K patients in the test dataset. We make the prediction and save the relevant data objects for latter use.

y = A$PoorCare
pred = predict(glm1, A, type="response")
save(y, pred, payoff, file="Sim11B.Rdata")

Now we can run the simulation with the large dataset …

cutoff = seq(0, 1, 0.01)
result = sapply(cutoff, function(p) {
  cm = table(factor(y==1, c(F,T)), factor(pred>p, c(F,T))) 
  sum(cm * payoff) # sum of confusion * payoff matrix
  }) 
i = which.max(result)
par(cex=0.7, mar=c(4,4,3,1))
plot(cutoff, result, type='l', col='cyan', lwd=2, main=sprintf(
  "Optomal Expected Result: K$%.2f @ %.2f",result[i],cutoff[i]))
abline(v=seq(0,1,0.1),h=seq(-300000,-100000,25000),col='lightgray',lty=3)
points(cutoff[i], result[i], pch=20, col='red', cex=2)

🌷 As you can see, business data sets of normal size usually would lead to a smooth curve in simulation.



【D】Interactive Simulation

In RStudio, we can make an interactive simulator with the manipulate package. However, the manipulated code cannot be executed in R Markdown. We need to run the code in Sim11B.R.

library(manipulate)
manipulate({
  payoff = matrix(c(TN,FN,FP,TP),2,2)
  cutoff = seq(0, 1, 0.01)
  result = sapply(cutoff, function(p) {
    cm = table(factor(y==1, c(F,T)), factor(pred>p, c(F,T))) 
    sum(cm * payoff) # sum of confusion * payoff matrix
  }) / 1000
  i = which.max(result)
  par(cex=0.7)
  plot(cutoff, result, type='l', col='cyan', lwd=2, main=sprintf(
    "Optomal Expected Result: K$%.2f @ %.2f",result[i],cutoff[i]),
    ylim=c(-320, -120))
  abline(v=seq(0,1,0.1),h=seq(-400,-100,25),col='lightgray',lty=3)
  points(cutoff[i], result[i], pch=20, col='red', cex=2)
},
TN = slider(-100,0,   0,"TN(無行動)",step=5),
FN = slider(-100,0,-100,"FN(平均風險成本)",step=5),
FP = slider(-100,0, -10,"FP(行動成本)",step=5),
TP = slider(-100,0, -50,"TP(行動成本+較小風險成本)",step=5)
) 


🗿 Preactice:
RunSim12.R and answer the following questions with the default setting …
  【A】 What is the optimal threshold? What is the corresponding expected outcome?
  【B】 What are the threshold and outcome if we do nothing?
  【C】 What are the threshold and outcome if we take action to every patients?
  【D】 Is the optimal threshold better than the NO_ACT and the ALL_IN strategy?

If we have a series of actions that cost $5, $10, $15, $20, $35 per patient and can reduce the risk from $100 to $70, $60, $50, $40, $30 respectively …
  【H】 What are their optimal thresholds and expected payoffs respectively?
  【I】 Which action has the best expectd payoff?



🌞 Reference Code

pacman::p_load(plotly)

PMX = list(
  c(0,-100, -5,-75), c(0,-100,-10,-70), c(0,-100,-15,-65),
  c(0,-100,-20,-60), c(0,-100,-35,-65)) %>% 
  lapply(matrix, nrow=2, ncol=2)

do.call(rbind, lapply(1:length(PMX), function(i) {
  r = sapply(cutoff, function(p) {
    cm = table(factor(y==1,c(F,T)), factor(pred>p,c(F,T))) 
    sum(cm * PMX[[i]]) })
  data.frame(drug=paste0("drug_",i), cutoff=cutoff, exp.payoff=r)
  })) %>% 
  ggplot(aes(x=cutoff, y=exp.payoff, col=drug)) +
  geom_line() + theme_bw() -> p

ggplotly(p)