::p_load(caTools, ggplot2, dplyr)
pacman= read.csv("data/quality.csv") # Read in dataset
D set.seed(88)
= sample.split(D$PoorCare, SplitRatio = 0.75) # split vector
split = subset(D, split == TRUE)
TR = subset(D, split == FALSE)
TS = glm(PoorCare ~ OfficeVisits + Narcotics, TR, family=binomial)
glm1 summary(glm1)
Usually we’d use test data to create the DPP. However, we’re using the entire data set because it is very small.
= predict(glm1, D, type="response")
pred = D$PoorCare
y data.frame(pred, y) %>%
ggplot(aes(x=pred, fill=factor(y))) +
geom_histogram(bins=20, col='white', position="stack", alpha=0.5) +
ggtitle("Distribution of Predicted Probability (DPP,FULL)") +
xlab("predicted probability")
Expected_Payoff = Sum( Payoff_Matrix * Confusion_Matrix )
Create the Payoff Matrix according to the business context
= matrix(c(0,-100,-10,-50),2,2)
payoff rownames(payoff) = c("FALSE","TRUE")
colnames(payoff) = c("NoAct","Act")
payoff
NoAct Act
FALSE 0 -10
TRUE -100 -50
Estimate the Expected Payoff for each Confusion Matrix derived from each Threshold of Action
= seq(0, 1, 0.01)
cutoff = sapply(cutoff, function(p) {
result = table(factor(y==1, c(F,T)), factor(pred>p, c(F,T)))
cm sum(cm * payoff) # sum of confusion * payoff matrix
})= which.max(result)
i par(cex=0.7, mar=c(4,4,3,1))
plot(cutoff, result, type='l', col='cyan', lwd=2, main=sprintf(
"Optomal Expected Result: $%d @ %.2f",result[i],cutoff[i]))
abline(v=seq(0,1,0.1),h=seq(-6000,0,100),col='lightgray',lty=3)
points(cutoff[i], result[i], pch=20, col='red', cex=2)
When we’re not sure about the figures in the payoff matrix, we can treated them as adjustable variables and write a simulation code to examine how the optimal threshold and expected outcome may vary with those variables.
使用manipulate
套件做策略模擬
library(manipulate)
manipulate({
= matrix(c(TN,FN,FP,TP),2,2)
payoff = seq(0, 1, 0.01)
cutoff = sapply(cutoff, function(p) {
result = table(factor(y==1, c(F,T)), factor(pred>p, c(F,T)))
cm sum(cm * payoff) # sum of confusion * payoff matrix
})= which.max(result)
i par(cex=0.7)
plot(cutoff, result, type='l', col='cyan', lwd=2, main=sprintf(
"Optomal Expected Result: $%d @ %.2f",result[i],cutoff[i]))
abline(v=seq(0,1,0.1),h=seq(-10000,0,100),col='lightgray',lty=3)
points(cutoff[i], result[i], pch=20, col='red', cex=2)
},TN = slider(-100,0, 0,step=5),
FN = slider(-100,0,-100,step=5),
FP = slider(-100,0, -10,step=5),
TP = slider(-100,0, -50,step=5)
)
🗿 Preactice:
RunSim12.R
and answer the following questions with the default setting …
【A】 What is the optimal threshold? What is the corresponding expected outcome?
【B】 What are the threshold and outcome if we do nothing?
【C】 What are the threshold and outcome if we take action to every patients?
【D】 Is the optimal threshold better than the NO_ACT and the ALL_IN strategy?
If we have a series of actions that cost $5, $10, $15, $20, $35
per patient and can reduce the risk from $100
to $70, $60, $50, $40, $30
respectively …
【H】 What are their optimal thresholds and expected payoffs respectively?
【I】 Which action has the best expectd payoff?
🌞 Reference Code
::p_load(plotly)
pacman
= list(
PMX c(0,-100, -5,-75), c(0,-100,-10,-70), c(0,-100,-15,-65),
c(0,-100,-20,-60), c(0,-100,-35,-65)) %>%
lapply(matrix, nrow=2, ncol=2)
do.call(rbind, lapply(1:length(PMX), function(i) {
= sapply(cutoff, function(p) {
r = table(factor(y==1,c(F,T)), factor(pred>p,c(F,T)))
cm sum(cm * PMX[[i]]) })
data.frame(drug=paste0("drug_",i), cutoff=cutoff, exp.payoff=r)
%>%
})) ggplot(aes(x=cutoff, y=exp.payoff, col=drug)) +
geom_line() + theme_bw() -> p
ggplotly(p)
🔑 Key Points:
■ Predictions do not generate value by itself
■ Combining Prediction, Assumption and Simulation helps to …
。choose among alternative solutions
。identify the optimal threshold of action
。identify the targeted subjects
。estimate the expected payoff