載入套件與資料
top6
)top6 = intersect(count(A, from, sort=T) %>% head(6) %>% pull(from),
count(A, to, sort=T) %>% head(6) %>% pull(to))
cols = c('gray90','cyan','green1','orange','red','red')
filter(A, from%in%top6, to%in%top6) %>%
ggplot(aes(t.estimate, t.total)) +
xlab('預計到達天數') + ylab('實際到達天數') +
geom_hex(bins=20,col='lightgray',lwd=0.1) + xlim(0,40) + ylim(0,40) +
scale_fill_gradientn(colors=cols) +
geom_abline(slope=1,intercept=c(0,-10),col='pink') +
facet_grid(from~to) + theme_bw() +
theme(strip.text=element_text(size=8, margin=margin(1,1,1,1)))
🌻 SP
周內的物流量很大
SP
州內的物流狀況load("data/SP.rdata")
sp = filter(sp, !is.na(t.total)) %>%
mutate(yymm = format(order_purchase_timestamp,"%Y.%m")) %>%
filter(yymm >= "2017.01", yymm <= "2018.08")
with(sp, c(
mean(t.total > t.estimate, na.rm=T),
mean(t.estimate > t.total + 10, na.rm=T),
cor(t.estimate, t.total, use="complete.obs")
))
[1] 0.0611 0.4779 0.3336
🌻 即使在SP
本周之內,預計到達天數的估計也很不準確
6.1%
,實際到達天數 > 預計到達天數)47.8%
,預計到達天數 > 實際到達天數 + 10)33.4%
顧客、廠商與訂單分佈
sf %>% select(
`顧客數`=c_cust, `廠商數`=s_seller,
`送達訂單`=c_order, `送出訂單`=s_order, ) %>%
mutate_at(vars(2,4), replace_na, 1) %>%
mutate_at(vars(1:4),log10) %>% plot
🌻 顧客與廠商的分佈(log trans.)類似但不完全相同
col5 = c('seagreen', 'green', 'white', 'yellow', 'orange', 'red')
z0 = c("c_day","s_day","c_delay","s_delay")
z1 = c("顧客到貨天數","廠商到貨天數","顧客延遲天數","廠商延遲天數")
df = sf %>% mutate(nome = as.character(nome)) %>%
select(nome,c_day,c_delay,s_day,s_delay) %>%
gather('key',value,2:5) %>%
mutate(key = factor(key, z0, z1), value=round(value,1)) %>%
group_by(key) %>% mutate(
title = paste0("<i><b>",nome," : ",value,"</b></i>"), # 各小區的實際值
`標準值(z)` = scale(value) %>% round(2) # 使用統一的標準化色階
) %>% ungroup
highlight_key(df, ~nome) %>%
ggplot(aes(text=title, fill=`標準值(z)`)) + geom_sf() +
scale_fill_gradientn(colors=col5) +
facet_wrap(~key, ncol=2) +
theme(axis.text=element_blank(),axis.ticks=element_blank()) -> g
ggplotly(g) %>% highlight("plotly_hover")
🌻 顧客的到貨天數和延遲天數的樣態是不相容的
ggplot(sp, aes(t.estimate, t.total)) +
xlab('預計到達天數') + ylab('實際到達天數') +
geom_hex(bins=20,col='lightgray',lwd=0.1) +
xlim(0,42) + ylim(0,42) +
scale_fill_gradientn(colors=cols) +
geom_abline(slope=1,intercept=c(0,-10),col='pink') +
facet_wrap(~yymm,ncol=4) + theme_bw() -> p
df = do.call(rbind, lapply(split(sp, sp$yymm), function(d) {tibble(
x=c(4,40,40), y=c(40,2,40), val=with(d, c(
mean(t.total > t.estimate, na.rm=T),
mean(t.estimate > t.total + 10, na.rm=T),
cor(t.total, t.estimate, "complete.obs")
)))})) %>%
mutate(val = round(val*100,1))
df$yymm = substr(rownames(df),1,7)
p + geom_text(data=df,aes(x,y,label=val), size=3.5) +
theme(strip.text=element_text(size=8, margin=margin(1,1,1,1)))
🌻 看來OLIST一直在調整SP
州內的物流機制和到貨時間的估計方法
2018.08
的到貨日期估計有很大的誤差
17.8%
11.6%
28.8%
2018.05~2018.07
三個月的資料2018.08
訂單的到貨時間用 …
a.day
:廠商的平均到貨天數 (前三個月的平均值)c_day
:顧客所在位置的平均到貨天數 (前六個月的平均值)freight_value
:運費 (代表距離、重量、大小)來估計 …
t.total
:實際到貨天數 Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.4715 0.34404 -13.00 3.21e-38
a.days 0.9708 0.01519 63.92 0.00e+00
c_day 0.5568 0.04265 13.06 1.50e-38
freight_value 0.0191 0.00484 3.95 8.03e-05
c(cor(predict(mod), tr$t.total), # training correlation
cor(predict(mod, ts1), ts1$t.total) # testing correlation
)
[1] 0.622 0.403
🌻 訓練與測試的準確性差很多,但這個問題在我們課程的討論範圍之外
🎯 策略一:預期到貨日期 = 模型預測值+5天
SLACK = 5 # prediction with a slack
d = tibble(actual=ts1$t.total, estimate=predict(mod, ts1)+SLACK)
df = tibble(x=c(4,40,40), y=c(40,2,40), v=with(d, c(
mean(actual>estimate), mean(estimate>actual+10), cor(actual, estimate)
) %>% {round(.*100, 1)} ) )
ggplot(d, aes(estimate, actual)) +
geom_hex(bins=20,col='lightgray',lwd=0.1) +
scale_fill_gradientn(colors=cols) +
geom_abline(slope=1,intercept=c(0,-10),col='pink') +
xlim(0,42) + ylim(0,42) + theme_bw() +
geom_text(data=df, aes(x, y, label=v), size=5)
🌻 即使是很簡單的模型,對估計的準確度也能有相當大的幫助
17.8%
下降到 6.2%
11.6%
下降到 4.9%
28.8%
提高到 40.3%
🎯 策略模擬: 試著調整「鬆弛時間(SLACK
)」的值
🎯 策略模擬: 如果我們希望最大化預期到貨+10 > 實際到貨 > 預期到貨
的比例
pred = predict(mod, ts1); act=ts1$t.total
sapply(seq(2,8,0.5), function(p) {
est = pred + p
c(plus=p, delay=mean(act>est), slack=mean(est>act+10)
)}) %>% t %>% as.data.frame %>%
mutate(deviate = delay+slack) %>%
gather('key','value',-1) %>%
ggplot(aes(plus, value, col=key)) +
geom_point(size=1.5) + geom_line(size=1) -> g
ggplotly(g)