UNIT10D Model Prediction vs. Business Decision

pacman::p_load(caTools, ggplot2, dplyr)
D = read.csv("data/quality.csv")  # Read in dataset
set.seed(88)
split = sample.split(D$PoorCare, SplitRatio = 0.75)  # split vector
TR = subset(D, split == TRUE)
TS = subset(D, split == FALSE)
glm1 = glm(PoorCare ~ OfficeVisits + Narcotics, TR, family=binomial)
summary(glm1)

【A】From Prediction to Decision

Fig 10D.1 - From Prediction to Decision

【B】預測機率分佈 (DPP)

Usually we’d use test data to create the DPP. However, we’re using the entire data set because it is very small.

pred = predict(glm1, D, type="response")
y = D$PoorCare
data.frame(pred, y) %>% 
  ggplot(aes(x=pred, fill=factor(y))) + 
  geom_histogram(bins=20, col='white', position="stack", alpha=0.5) +
  ggtitle("Distribution of Predicted Probability (DPP,FULL)") +
  xlab("predicted probability")

【C】Estimating the Expected Payoff

Expected_Payoff = Sum( Payoff_Matrix * Confusion_Matrix )

TN: NoAction, GoodCare; nothing
FN: NoAction, PoorCare; the average risk of poor care
FP: Action, GoodCare; the cost of predictive action
TP: Action, PoorCare; FR + the reduced risk after taking action

Create the Payoff Matrix according to the business context

payoff = matrix(c(0,-100,-10,-50),2,2)
rownames(payoff) = c("FALSE","TRUE")
colnames(payoff) = c("NoAct","Act")
payoff

      NoAct Act
FALSE     0 -10
TRUE   -100 -50

Estimate the Expected Payoff for each Confusion Matrix derived from each Threshold of Action

cutoff = seq(0, 1, 0.01)
result = sapply(cutoff, function(p) {
  cm = table(factor(y==1, c(F,T)), factor(pred>p, c(F,T))) 
  sum(cm * payoff) # sum of confusion * payoff matrix
  })
i = which.max(result)
par(cex=0.7, mar=c(4,4,3,1))
plot(cutoff, result, type='l', col='cyan', lwd=2, main=sprintf(
  "Optomal Expected Result: $%d @ %.2f",result[i],cutoff[i]))
abline(v=seq(0,1,0.1),h=seq(-6000,0,100),col='lightgray',lty=3)
points(cutoff[i], result[i], pch=20, col='red', cex=2)

【D】Strategic Simulation

When we’re not sure about the figures in the payoff matrix, we can treated them as adjustable variables and write a simulation code to examine how the optimal threshold and expected outcome may vary with those variables.

使用manipulate套件做策略模擬

library(manipulate)
manipulate({
  payoff = matrix(c(TN,FN,FP,TP),2,2)
  cutoff = seq(0, 1, 0.01)
  result = sapply(cutoff, function(p) {
    cm = table(factor(y==1, c(F,T)), factor(pred>p, c(F,T))) 
    sum(cm * payoff) # sum of confusion * payoff matrix
    })
  i = which.max(result)
  par(cex=0.7)
  plot(cutoff, result, type='l', col='cyan', lwd=2, main=sprintf(
    "Optomal Expected Result: $%d @ %.2f",result[i],cutoff[i]))
  abline(v=seq(0,1,0.1),h=seq(-10000,0,100),col='lightgray',lty=3)
  points(cutoff[i], result[i], pch=20, col='red', cex=2)
  },
  TN = slider(-100,0,   0,step=5),
  FN = slider(-100,0,-100,step=5),
  FP = slider(-100,0, -10,step=5),
  TP = slider(-100,0, -50,step=5)
  )

🗿 Preactice：
RunSim12.R and answer the following questions with the default setting …
【A】 What is the optimal threshold? What is the corresponding expected outcome?
【B】 What are the threshold and outcome if we do nothing?
【C】 What are the threshold and outcome if we take action to every patients?
【D】 Is the optimal threshold better than the NO_ACT and the ALL_IN strategy?

If we have a series of actions that cost $5, $10, $15, $20, $35 per patient and can reduce the risk from $100 to $70, $60, $50, $40, $30 respectively …
【H】 What are their optimal thresholds and expected payoffs respectively?
【I】 Which action has the best expectd payoff?

🌞 Reference Code

pacman::p_load(plotly)

PMX = list(
  c(0,-100, -5,-75), c(0,-100,-10,-70), c(0,-100,-15,-65),
  c(0,-100,-20,-60), c(0,-100,-35,-65)) %>% 
  lapply(matrix, nrow=2, ncol=2)

do.call(rbind, lapply(1:length(PMX), function(i) {
  r = sapply(cutoff, function(p) {
    cm = table(factor(y==1,c(F,T)), factor(pred>p,c(F,T))) 
    sum(cm * PMX[[i]]) })
  data.frame(drug=paste0("drug_",i), cutoff=cutoff, exp.payoff=r)
  })) %>% 
  ggplot(aes(x=cutoff, y=exp.payoff, col=drug)) +
  geom_line() + theme_bw() -> p

ggplotly(p)

【E】From Prediction to Decision

🔑 Key Points：
■ Predictions do not generate value by itself
■ Combining Prediction, Assumption and Simulation helps to …
。choose among alternative solutions
。identify the optimal threshold of action
。identify the targeted subjects
。estimate the expected payoff