::p_load(caTools, ggplot2, dplyr)
pacman= read.csv("data/quality.csv") # Read in dataset
D set.seed(88)
= sample.split(D$PoorCare, SplitRatio = 0.75) # split vector
split = subset(D, split == TRUE)
TR = subset(D, split == FALSE)
TS = glm(PoorCare ~ OfficeVisits + Narcotics, TR, family=binomial)
glm1 summary(glm1)
Usually we’d use test data to create the DPP. However, we’re using the entire data set because this dataset is very small.
= predict(glm1, D, type="response")
pred = D$PoorCare
y data.frame(pred, y) %>%
ggplot(aes(x=pred, fill=factor(y))) +
geom_histogram(bins=20, col='white', position="stack", alpha=0.5) +
ggtitle("Distribution of Predicted Probability (DPP,FULL)") +
xlab("predicted probability")
The Payoff Matrix
The Expected Payoff
Create the Payoff Matrix according to the business context
= matrix(c(0,-100,-10,-50),2,2)
payoff rownames(payoff) = c("FALSE","TRUE")
colnames(payoff) = c("NoAct","Act")
payoff
NoAct Act
FALSE 0 -10
TRUE -100 -50
Estimate the Expected Payoff for each Confusion Matrix derived from each Threshold of Action
= seq(0, 1, 0.01)
cutoff = sapply(cutoff, function(p) {
result = table(factor(y==1, c(F,T)), factor(pred>p, c(F,T)))
cm sum(cm * payoff) # sum of confusion * payoff matrix
})= which.max(result)
i par(cex=0.7, mar=c(4,4,3,1))
plot(cutoff, result, type='l', col='cyan', lwd=2, main=sprintf(
"Optomal Expected Result: $%d @ %.2f",result[i],cutoff[i]))
abline(v=seq(0,1,0.1),h=seq(-6000,0,100),col='lightgray',lty=3)
points(cutoff[i], result[i], pch=20, col='red', cex=2)
🌷 Because the dataset is too small, the result of simulation looks very chopy. Here we create a larger test dataset (10K) to fit the normal business situation.
library(MASS)
Attaching package: 'MASS'
The following object is masked from 'package:dplyr':
select
= subset(D, PoorCare==0)
d0 = mvrnorm(7480, colMeans(d0[,4:5]), cov(d0[,4:5]))
A0 = subset(D, PoorCare==1)
d1 = mvrnorm(2520, colMeans(d1[,4:5]), cov(d1[,4:5]))
A1 = rbind(A0, A1) %>% as.data.frame()
A $PoorCare = c(rep(0L, 7480), rep(1L, 2520))
Atable(A$PoorCare)
0 1
7480 2520
There are 10K patients in the test dataset. We make the prediction and save the relevant data objects for latter use.
= A$PoorCare
y = predict(glm1, A, type="response")
pred save(y, pred, payoff, file="Sim11B.Rdata")
Now we can run the simulation with the large dataset …
= seq(0, 1, 0.01)
cutoff = sapply(cutoff, function(p) {
result = table(factor(y==1, c(F,T)), factor(pred>p, c(F,T)))
cm sum(cm * payoff) # sum of confusion * payoff matrix
}) = which.max(result)
i par(cex=0.7, mar=c(4,4,3,1))
plot(cutoff, result, type='l', col='cyan', lwd=2, main=sprintf(
"Optomal Expected Result: K$%.2f @ %.2f",result[i],cutoff[i]))
abline(v=seq(0,1,0.1),h=seq(-300000,-100000,25000),col='lightgray',lty=3)
points(cutoff[i], result[i], pch=20, col='red', cex=2)
🌷 As you can see, business data sets of normal size usually would lead to a smooth curve in simulation.
In RStudio, we can make an interactive simulator with the
manipulate
package. However, the manipulated code cannot be
executed in R Markdown. We need to run the code in
Sim11B.R
.
library(manipulate)
manipulate({
= matrix(c(TN,FN,FP,TP),2,2)
payoff = seq(0, 1, 0.01)
cutoff = sapply(cutoff, function(p) {
result = table(factor(y==1, c(F,T)), factor(pred>p, c(F,T)))
cm sum(cm * payoff) # sum of confusion * payoff matrix
/ 1000
}) = which.max(result)
i par(cex=0.7)
plot(cutoff, result, type='l', col='cyan', lwd=2, main=sprintf(
"Optomal Expected Result: K$%.2f @ %.2f",result[i],cutoff[i]),
ylim=c(-320, -120))
abline(v=seq(0,1,0.1),h=seq(-400,-100,25),col='lightgray',lty=3)
points(cutoff[i], result[i], pch=20, col='red', cex=2)
},TN = slider(-100,0, 0,"TN(無行動)",step=5),
FN = slider(-100,0,-100,"FN(平均風險成本)",step=5),
FP = slider(-100,0, -10,"FP(行動成本)",step=5),
TP = slider(-100,0, -50,"TP(行動成本+較小風險成本)",step=5)
)
🗿 Preactice:
RunSim12.R
and answer the following questions with the
default setting …
【A】 What is the optimal threshold? What is the
corresponding expected outcome?
【B】 What are the threshold and
outcome if we do nothing?
【C】 What are the threshold and outcome
if we take action to every patients?
【D】 Is the optimal
threshold better than the NO_ACT and the ALL_IN strategy?
If we
have a series of actions that cost $5, $10, $15, $20, $35
per patient and can reduce the risk from $100
to
$70, $60, $50, $40, $30
respectively …
【H】 What are
their optimal thresholds and expected payoffs respectively?
【I】
Which action has the best expectd payoff?
🌞 Reference Code
::p_load(plotly)
pacman
= list(
PMX c(0,-100, -5,-75), c(0,-100,-10,-70), c(0,-100,-15,-65),
c(0,-100,-20,-60), c(0,-100,-35,-65)) %>%
lapply(matrix, nrow=2, ncol=2)
do.call(rbind, lapply(1:length(PMX), function(i) {
= sapply(cutoff, function(p) {
r = table(factor(y==1,c(F,T)), factor(pred>p,c(F,T)))
cm sum(cm * PMX[[i]]) })
data.frame(drug=paste0("drug_",i), cutoff=cutoff, exp.payoff=r)
%>%
})) ggplot(aes(x=cutoff, y=exp.payoff, col=drug)) +
geom_line() + theme_bw() -> p
ggplotly(p)