::p_load(caTools, ggplot2, dplyr)
pacman= read.csv("data/quality.csv") # Read in dataset D
set.seed(88)
= sample.split(D$PoorCare, SplitRatio = 0.75) # split vector
split = subset(D, split == TRUE)
TR = subset(D, split == FALSE)
TS = glm(PoorCare ~ OfficeVisits + Narcotics, TR, family=binomial) glm1
summary(glm1)
因為這個資料集很小,我們使用全部的資料來做模擬 (通常我們是使用測試資料集)
= predict(glm1, D, type="response")
pred = D$PoorCare
y data.frame(pred, y) %>%
ggplot(aes(x=pred, fill=factor(y))) +
geom_histogram(bins=20, col='white', position="stack", alpha=0.5) +
ggtitle("Distribution of Predicted Probability (DPP,FULL)") +
xlab("predicted probability")
報酬矩陣 Payoff Matrix
= matrix(c(0,-100,-10,-50),2,2)
payoff rownames(payoff) = c("FALSE","TRUE")
colnames(payoff) = c("NoAct","Act")
payoff
NoAct Act
FALSE 0 -10
TRUE -100 -50
期望報酬 Expected Payoff
= seq(0, 1, 0.01)
cutoff = sapply(cutoff, function(p) {
result = table(factor(y==1, c(F,T)), factor(pred>p, c(F,T)))
cm sum(cm * payoff) # sum of confusion * payoff matrix
})= which.max(result)
i par(cex=0.7, mar=c(4,4,3,1))
plot(cutoff, result, type='l', col='cyan', lwd=2, main=sprintf(
"Optomal Expected Result: $%d @ %.2f",result[i],cutoff[i]))
abline(v=seq(0,1,0.1),h=seq(-6000,0,100),col='lightgray',lty=3)
points(cutoff[i], result[i], pch=20, col='red', cex=2)
🌷 因為資料點太少,以上模擬的曲線很不平整,以下我們將資料點數擴大為10K
library(MASS)
Attaching package: 'MASS'
The following object is masked from 'package:dplyr':
select
= subset(D, PoorCare==0)
d0 = mvrnorm(7480, colMeans(d0[,4:5]), cov(d0[,4:5]))
A0 = subset(D, PoorCare==1)
d1 = mvrnorm(2520, colMeans(d1[,4:5]), cov(d1[,4:5]))
A1 = rbind(A0, A1) %>% as.data.frame()
A $PoorCare = c(rep(0L, 7480), rep(1L, 2520))
Atable(A$PoorCare)
0 1
7480 2520
資料框A
之中有一萬個保戶的資料
= A$PoorCare
y = predict(glm1, A, type="response")
pred #save(y, pred, payoff, file="Sim11B.Rdata")
= seq(0, 1, 0.01)
cutoff = sapply(cutoff, function(p) {
result = table(factor(y==1, c(F,T)), factor(pred>p, c(F,T)))
cm sum(cm * payoff) # sum of confusion * payoff matrix
}) = which.max(result)
i par(cex=0.7, mar=c(4,4,3,1))
plot(cutoff, result, type='l', col='cyan', lwd=2, main=sprintf(
"Optomal Expected Result: K$%.2f @ %.2f",result[i],cutoff[i]))
abline(v=seq(0,1,0.1),h=seq(-300000,-100000,25000),col='lightgray',lty=3)
points(cutoff[i], result[i], pch=20, col='red', cex=2)
🌷 商務資料的通常都相當大,所以模擬曲線一般都比較平整
使用manipulate
套件,我們可以在RStudio之中做動態的模擬
…
使用manipulate
套件做策略模擬
library(manipulate)
manipulate({
= matrix(c(TN,FN,FP,TP),2,2)
payoff = seq(0, 1, 0.01)
cutoff = sapply(cutoff, function(p) {
result = table(factor(y==1, c(F,T)), factor(pred>p, c(F,T)))
cm sum(cm * payoff) # sum of confusion * payoff matrix
/ 1000
}) = which.max(result)
i par(cex=0.7)
plot(cutoff, result, type='l', col='cyan', lwd=2, main=sprintf(
"Optomal Expected Result: K$%.2f @ %.2f",result[i],cutoff[i]),
ylim=c(-320, -120))
abline(v=seq(0,1,0.1),h=seq(-400,-100,25),col='lightgray',lty=3)
points(cutoff[i], result[i], pch=20, col='red', cex=2)
},TN = slider(-100,0, 0,"TN(無行動)",step=5),
FN = slider(-100,0,-100,"FN(平均風險成本)",step=5),
FP = slider(-100,0, -10,"FP(行動成本)",step=5),
TP = slider(-100,0, -50,"TP(行動成本+較小風險成本)",step=5)
)
🗿 練習:
執行Sim12.R
,先依預設的報酬矩陣回答下列問題:
【A】
最佳臨界機率是? 它所對應的期望報酬是多少?
【B】
什麼都不做時,臨界機率和期望報酬各是多少?
【C】
每位保戶都做時,臨界機率和期望報酬各是多少?
【D】
以上哪一種做法期的望報酬比較高?
【E】
在所有的商務情境都是這種狀況嗎?
藉由調整報酬矩陣:
【F】
模擬出「全不做」比「全做」還要好的狀況
【G】
並舉出一個會發生這種狀況的商務情境
有五種成本分別為$5, $10, $15, $20, $35
的介入方法,它們分別可以將風險成本從$100
降低到$70, $60, $50, $40, $30
…
【H】 它們的最佳期望報酬分別是多少?
【I】
哪一種介入方法的最佳期望報酬是最大的呢?
🌞 參考模擬程式
::p_load(plotly)
pacman
= list(
PMX c(0,-100, -5,-75), c(0,-100,-10,-70), c(0,-100,-15,-65),
c(0,-100,-20,-60), c(0,-100,-35,-65)) %>%
lapply(matrix, nrow=2, ncol=2)
= do.call(rbind, lapply(1:length(PMX), function(i) {
df = sapply(cutoff, function(p) {
r = table(factor(y==1,c(F,T)), factor(pred>p,c(F,T)))
cm sum(cm * PMX[[i]]) })
data.frame(drug=paste0("drug_",i), cutoff=cutoff, exp.payoff=r)
}))
top_n(df,1,exp.payoff)
drug cutoff exp.payoff
1 drug_4 0.43 -207100
ggplot(df, aes(x=cutoff, y=exp.payoff, col=drug)) +
geom_line() + theme_bw() -> p
ggplotly(p)