OLIST08b：預測與決策

載入套件與資料

pacman::p_load(plotly,ggplot2,tidyr,dplyr,sf)
load("data/A.rdata")
load("data/B.rdata")

流量最大的6個州(`top6`)

top6 = intersect(count(A, from, sort=T) %>% head(6) %>% pull(from),
          count(A, to, sort=T) %>% head(6) %>% pull(to))

cols = c('gray90','cyan','green1','orange','red','red')
filter(A, from%in%top6, to%in%top6) %>% 
  ggplot(aes(t.estimate, t.total)) +
  xlab('預計到達天數') + ylab('實際到達天數') + 
  geom_hex(bins=20,col='lightgray',lwd=0.1) + xlim(0,40) + ylim(0,40) +
  scale_fill_gradientn(colors=cols) +
  geom_abline(slope=1,intercept=c(0,-10),col='pink') +
  facet_grid(from~to) + theme_bw() +
  theme(strip.text=element_text(size=8, margin=margin(1,1,1,1)))

🌻 SP周內的物流量很大

`SP`州內的物流狀況

load("data/SP.rdata")
sp = filter(sp, !is.na(t.total)) %>%
  mutate(yymm = format(order_purchase_timestamp,"%Y.%m")) %>% 
  filter(yymm >= "2017.01", yymm <= "2018.08")
with(sp, c(
  mean(t.total > t.estimate, na.rm=T),
  mean(t.estimate > t.total + 10, na.rm=T),
  cor(t.estimate, t.total, use="complete.obs")
  ))

[1] 0.0611 0.4779 0.3336

🌻 即使在SP本周之內，預計到達天數的估計也很不準確

雖然延遲到貨的比例不高(6.1%，實際到達天數 > 預計到達天數)
但是到貨日期的估計太保守(47.8%，預計到達天數 > 實際到達天數 + 10)
預計和實際到貨天數之間的相關係數只有33.4%

顧客、廠商與訂單分佈

sf %>% select(
  `顧客數`=c_cust, `廠商數`=s_seller,
  `送達訂單`=c_order, `送出訂單`=s_order, ) %>%
  mutate_at(vars(2,4), replace_na, 1) %>% 
  mutate_at(vars(1:4),log10) %>% plot

🌻 顧客與廠商的分佈(log trans.)類似但不完全相同

col5 = c('seagreen', 'green', 'white', 'yellow', 'orange', 'red')
z0 = c("c_day","s_day","c_delay","s_delay")
z1 = c("顧客到貨天數","廠商到貨天數","顧客延遲天數","廠商延遲天數")

df = sf %>% mutate(nome = as.character(nome)) %>% 
  select(nome,c_day,c_delay,s_day,s_delay) %>% 
  gather('key',value,2:5) %>% 
  mutate(key = factor(key, z0, z1), value=round(value,1)) %>% 
  group_by(key) %>% mutate(
    title = paste0("<i><b>",nome," : ",value,"</b></i>"),  # 各小區的實際值
    `標準值(z)` = scale(value) %>% round(2)        # 使用統一的標準化色階
    ) %>% ungroup

highlight_key(df, ~nome) %>%
  ggplot(aes(text=title, fill=`標準值(z)`)) + geom_sf() +
  scale_fill_gradientn(colors=col5) + 
  facet_wrap(~key, ncol=2) + 
  theme(axis.text=element_blank(),axis.ticks=element_blank()) -> g
ggplotly(g) %>% highlight("plotly_hover")

🌻 顧客的到貨天數和延遲天數的樣態是不相容的

有些到貨天數較低的地方，延遲天數偏高
有些到貨天數很高的地方，延遲天數卻很低

預計到達和實際到達天數的相關性

ggplot(sp, aes(t.estimate, t.total)) +
  xlab('預計到達天數') + ylab('實際到達天數') + 
  geom_hex(bins=20,col='lightgray',lwd=0.1) + 
  xlim(0,42) + ylim(0,42) +
  scale_fill_gradientn(colors=cols) +
  geom_abline(slope=1,intercept=c(0,-10),col='pink') +
  facet_wrap(~yymm,ncol=4) + theme_bw() -> p

df = do.call(rbind, lapply(split(sp, sp$yymm), function(d) {tibble(
  x=c(4,40,40), y=c(40,2,40), val=with(d, c(
    mean(t.total > t.estimate, na.rm=T),
    mean(t.estimate > t.total + 10, na.rm=T),
    cor(t.total, t.estimate, "complete.obs")
    )))})) %>% 
  mutate(val = round(val*100,1))
df$yymm = substr(rownames(df),1,7)
p + geom_text(data=df,aes(x,y,label=val), size=3.5) +
  theme(strip.text=element_text(size=8, margin=margin(1,1,1,1)))

🌻 看來OLIST一直在調整SP州內的物流機制和到貨時間的估計方法

左上方的數字代表延遲到貨的比例(%，實際到達天數 > 預計到達天數)
右下方的數字代表到貨日期的估計太保守(%，預計到達天數 > 實際到達天數 + 10)
右上方的數字是預計和實際到貨天數之間的相關係數(%)
2018.08的到貨日期估計有很大的誤差
- 延遲到貨的比例：17.8%
- 估計太保守的比例：11.6%
- 相關係數只有：28.8%

使用模型估計到貨天數

§ 整理資料

我們用2018.05~2018.07三個月的資料
來估計2018.08訂單的到貨時間

tr = filter(sp, yymm>="2018.05", yymm<="2018.07")
ts = filter(sp, yymm=="2018.08")
sid = group_by(tr, seller_id) %>% 
  summarise(a.days = mean(t.total, na.rm=T))
tr = left_join(tr,sid)
ts = left_join(ts,sid)
ts1 = filter(ts, complete.cases(ts))

§ 建立模型

用 …

a.day：廠商的平均到貨天數 (前三個月的平均值)
c_day：顧客所在位置的平均到貨天數 (前六個月的平均值)
freight_value：運費 (代表距離、重量、大小)

來估計 …

t.total：實際到貨天數

mod = lm(t.total~a.days+c_day+freight_value, tr)
summary(mod)$coef

              Estimate Std. Error t value Pr(>|t|)
(Intercept)    -4.4715    0.34404  -13.00 3.21e-38
a.days          0.9708    0.01519   63.92 0.00e+00
c_day           0.5568    0.04265   13.06 1.50e-38
freight_value   0.0191    0.00484    3.95 8.03e-05

§ 估計模型的準確性

對訓練資料的準確性
對測試資料的準確性

c(cor(predict(mod), tr$t.total),       # training correlation
  cor(predict(mod, ts1), ts1$t.total)  # testing correlation
  )

[1] 0.622 0.403

🌻 訓練與測試的準確性差很多，但這個問題在我們課程的討論範圍之外

使用模型做策略規劃

§ 設定策略、估計效果

🎯 策略一：預期到貨日期 = 模型預測值+5天

SLACK = 5  # prediction with a slack 
d = tibble(actual=ts1$t.total, estimate=predict(mod, ts1)+SLACK)

df = tibble(x=c(4,40,40), y=c(40,2,40), v=with(d, c(
  mean(actual>estimate), mean(estimate>actual+10), cor(actual, estimate)
  ) %>% {round(.*100, 1)} ) ) 

ggplot(d, aes(estimate, actual)) +
  geom_hex(bins=20,col='lightgray',lwd=0.1) + 
  scale_fill_gradientn(colors=cols) +
  geom_abline(slope=1,intercept=c(0,-10),col='pink') +
  xlim(0,42) + ylim(0,42) + theme_bw() +
  geom_text(data=df, aes(x, y, label=v), size=5)

🌻 即使是很簡單的模型，對估計的準確度也能有相當大的幫助

延遲到貨的比例從 17.8% 下降到 6.2%
估計太保守的比例從 11.6% 下降到 4.9%
相關係數從 28.8% 提高到 40.3%

§ 策略模擬、策略目標、策略優化

🎯 策略模擬： 試著調整「鬆弛時間(SLACK)」的值

如果「預期到貨日期 = 模型預測值 + 7天」，延遲到貨比例會變成？
如果我們希望「延遲到貨比例」不大於5%，「鬆弛時間」要設為多少天？
鬆弛時間越大越好嗎？
你覺得鬆弛時間設在哪裡最好呢？為什麼？

🎯 策略模擬： 如果我們希望最大化預期到貨+10 > 實際到貨 > 預期到貨的比例

pred = predict(mod, ts1); act=ts1$t.total
sapply(seq(2,8,0.5), function(p) { 
  est = pred + p
  c(plus=p, delay=mean(act>est), slack=mean(est>act+10) 
  )}) %>% t %>% as.data.frame %>% 
  mutate(deviate = delay+slack) %>% 
  gather('key','value',-1) %>% 
  ggplot(aes(plus, value, col=key)) + 
  geom_point(size=1.5) + geom_line(size=1) -> g
ggplotly(g)

OLIST08b：預測與決策

中山大學管理學院卓雍然

2021-04-28 08:52:08

流量最大的6個州(`top6`)

`SP`州內的物流狀況

預計到達和實際到達天數的相關性

使用模型估計到貨天數

§ 整理資料

§ 建立模型

§ 估計模型的準確性

使用模型做策略規劃

§ 設定策略、估計效果

§ 策略模擬、策略目標、策略優化

OLIST08b：預測與決策

中山大學管理學院 卓雍然

2021-04-28 08:52:08

流量最大的6個州(top6)

SP州內的物流狀況

預計到達和實際到達天數的相關性

使用模型估計到貨天數

§ 整理資料

§ 建立模型

§ 估計模型的準確性

使用模型做策略規劃

§ 設定策略、估計效果

§ 策略模擬、策略目標、策略優化

中山大學管理學院卓雍然

流量最大的6個州(`top6`)

`SP`州內的物流狀況