學習重點:使用dplyr套件做資料框整理

載入套件與資料

pacman::p_load(dplyr,tidyr,ggplot2)
D = readRDS("data/counties.rds")
D
# A tibble: 3,138 x 40
  census_id state   county  region metro   population   men women hispanic white
  <chr>     <chr>   <chr>   <chr>  <chr>        <dbl> <dbl> <dbl>    <dbl> <dbl>
1 1001      Alabama Autauga South  Metro        55221 26745 28476      2.6  75.8
2 1003      Alabama Baldwin South  Metro       195121 95314 99807      4.5  83.1
3 1005      Alabama Barbour South  Nonmet~      26932 14497 12435      4.6  46.2
4 1007      Alabama Bibb    South  Metro        22604 12073 10531      2.2  74.5
# i 3,134 more rows
# i 30 more variables: black <dbl>, native <dbl>, asian <dbl>, pacific <dbl>,
#   citizens <dbl>, income <dbl>, income_err <dbl>, income_per_cap <dbl>,
#   income_per_cap_err <dbl>, poverty <dbl>, child_poverty <dbl>,
#   professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
#   production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
#   other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, ...


1. 資料轉換

class(D)  # 列出物件的資料結構
[1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame" 

tbl_df 可以看成加強版的 data_frmae;原有資料框的操作之外,它有一些附加的功能

🌻 glimpse 加強版的str

glimpse(D)
Rows: 3,138
Columns: 40
$ census_id          <chr> "1001", "1003", "1005", "1007", "1009", "1011", "10~
$ state              <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabam~
$ county             <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", ~
$ region             <chr> "South", "South", "South", "South", "South", "South~
$ metro              <chr> "Metro", "Metro", "Nonmetro", "Metro", "Metro", "No~
$ population         <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 1~
$ men                <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 5627~
$ women              <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 603~
$ hispanic           <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7~
$ white              <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.~
$ black              <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3,~
$ native             <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0~
$ asian              <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0~
$ pacific            <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0~
$ citizens           <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88~
$ income             <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41~
$ income_err         <dbl> 2391, 1263, 2973, 3995, 3141, 5884, 1793, 925, 2949~
$ income_per_cap     <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21~
$ income_per_cap_err <dbl> 1080, 711, 798, 1618, 708, 2055, 714, 489, 1366, 15~
$ poverty            <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.~
$ child_poverty      <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.~
$ professional       <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.~
$ service            <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.~
$ office             <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.~
$ construction       <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5~
$ production         <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.~
$ drive              <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.~
$ carpool            <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, ~
$ transit            <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0~
$ walk               <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1~
$ other_transp       <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1~
$ work_at_home       <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1~
$ mean_commute       <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.~
$ employed           <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401,~
$ private_work       <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.~
$ public_work        <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.~
$ self_employed      <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4~
$ family_work        <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0~
$ unemployment       <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.~
$ land_area          <dbl> 594.4, 1589.8, 884.9, 622.6, 644.8, 622.8, 776.8, 6~

🌞 D : 全美國3,138個郡(county)的統計資料

🌻 select() 選擇欄位(變數)

D %>% select(state,county,population,unemployment)
# A tibble: 3,138 x 4
  state   county  population unemployment
  <chr>   <chr>        <dbl>        <dbl>
1 Alabama Autauga      55221          7.6
2 Alabama Baldwin     195121          7.5
3 Alabama Barbour      26932         17.6
4 Alabama Bibb         22604          8.3
# i 3,134 more rows
D1 = D %>% select(state,county,population,unemployment)

🌻 arrange() 重新排列資料(rows)

D1 %>% arrange(population) %>% head
# A tibble: 6 x 4
  state    county    population unemployment
  <chr>    <chr>          <dbl>        <dbl>
1 Hawaii   Kalawao           85          0  
2 Texas    King             267          5.1
3 Nebraska McPherson        433          0.9
4 Montana  Petroleum        443          6.6
5 Nebraska Arthur           448          4  
6 Nebraska Loup             548          0.7
# D1[order(D1$population),] %>% head
D1 %>% arrange(desc(population)) %>% head   # 降冪排列
# A tibble: 6 x 4
  state      county      population unemployment
  <chr>      <chr>            <dbl>        <dbl>
1 California Los Angeles   10038388         10  
2 Illinois   Cook           5236393         10.7
3 Texas      Harris         4356362          7.5
4 Arizona    Maricopa       4018143          7.7
5 California San Diego      3223096          8.7
6 California Orange         3116069          7.6
# D1[order(-D1$population),] %>% head


❓ 請列舉這幾個功能的主要異同 sort(), order() and arrange()?

🌻 filter() 篩選資料

D1 %>% 
  filter(state == "New York", unemployment < 8) %>% 
  arrange(desc(population))
# A tibble: 39 x 4
  state    county      population unemployment
  <chr>    <chr>            <dbl>        <dbl>
1 New York New York       1629507          7.5
2 New York Suffolk        1501373          6.4
3 New York Nassau         1354612          6.4
4 New York Westchester     967315          7.6
# i 35 more rows

🌻 mutate() : 定義新欄位

D1 %>% 
  mutate(n_unemp = unemployment*population/100) %>%
  arrange(desc(n_unemp)) %>% 
  head(4)
# A tibble: 4 x 5
  state      county      population unemployment  n_unemp
  <chr>      <chr>            <dbl>        <dbl>    <dbl>
1 California Los Angeles   10038388         10   1003839.
2 Illinois   Cook           5236393         10.7  560294.
3 Texas      Harris         4356362          7.5  326727.
4 Arizona    Maricopa       4018143          7.7  309397.

以上的這些功能用我們已經學過的R內建功能(配合索引)也都做得到,所以為什麼還要學dplyr呢? 🤔

🌞 dplyr的好處在於:



2. 資料彙總

🌻 count() 分類計算數量或加總

count(D)             # 總資料筆數
# A tibble: 1 x 1
      n
  <int>
1  3138
count(D1, state)      # 各州的資料筆數 No. Counties
# A tibble: 50 x 2
  state        n
  <chr>    <int>
1 Alabama     67
2 Alaska      28
3 Arizona     15
4 Arkansas    75
# i 46 more rows
# table(D$state)

count() 可以說是加強版的 table()

D1 %>% count(state, wt=population, sort=T)
# A tibble: 50 x 2
  state             n
  <chr>         <dbl>
1 California 38421464
2 Texas      26538497
3 New York   19673174
4 Florida    19645772
# i 46 more rows

With the wt and sort arguments, it can

in a single line of code.

🌻 summarise() 資料彙總,可以一次定義很多個彙總值,

D %>% summarise(
  totalPop = sum(population),
  avgPop = mean(population)
  )
# A tibble: 1 x 2
   totalPop  avgPop
      <dbl>   <dbl>
1 315845353 100652.
# sum(D$population)
# mean(D$population)

🌻 group_by() %>% summarise() 分類(群)彙總

D1 %>% group_by(state) %>% summarise(
  totalPop = sum(population),
  avgPop = mean(population)
  ) %>% 
  arrange(desc(avgPop))
# A tibble: 50 x 3
  state         totalPop  avgPop
  <chr>            <dbl>   <dbl>
1 California    38421464 662439.
2 Massachusetts  6705586 478970.
3 Connecticut    3593222 449153.
4 Arizona        6641928 442795.
# i 46 more rows


❓ 我們之前有學過用 tapply()做分群運算,跟相比較group_by() %>% summarise()`有甚麼好處呢?

tapply() 一樣,我們也可以一次 group_by 很多個分群變數

D %>% group_by(state, metro) %>% summarise(
  totalPop = sum(population),
  avgPop = mean(population)
  ) %>% 
  arrange(desc(avgPop))
`summarise()` has grouped output by 'state'. You can override using the
`.groups` argument.
# A tibble: 97 x 4
# Groups:   state [50]
  state         metro totalPop   avgPop
  <chr>         <chr>    <dbl>    <dbl>
1 California    Metro 37587429 1015876.
2 Arizona       Metro  6295145  786893.
3 Nevada        Metro  2529002  632250.
4 Massachusetts Metro  6606838  600622.
# i 93 more rows

🌷 但是 …


🌻 top_n(x, n, wt) select the top-n rows by wt from x

# select from D1 the three rows that has the largest populations 
D1 %>% top_n(n=3, wt=population)
# A tibble: 3 x 4
  state      county      population unemployment
  <chr>      <chr>            <dbl>        <dbl>
1 California Los Angeles   10038388         10  
2 Illinois   Cook           5236393         10.7
3 Texas      Harris         4356362          7.5
# select from each `state` the three rows that has the largest populations 
group_by(D1, state) %>% top_n(3, population) %>% head(9)
# A tibble: 9 x 4
# Groups:   state [3]
  state   county                       population unemployment
  <chr>   <chr>                             <dbl>        <dbl>
1 Alabama Jefferson                        659026          9.1
2 Alabama Madison                          346438          8.5
3 Alabama Mobile                           414251          9.8
4 Alaska  Anchorage Municipality           299107          6.7
5 Alaska  Fairbanks North Star Borough      99705          7.9
6 Alaska  Matanuska-Susitna Borough         96178          9.8
7 Arizona Maricopa                        4018143          7.7
8 Arizona Pima                             998537         10  
9 Arizona Pinal                            389772         10.6

❓ 上面兩段程式的差別在哪裡呢?



3. 更靈活的語法 Select, Transmute and Rename

select() 可以選擇一個範圍的欄位

D %>% select(state, county, drive:work_at_home)
# A tibble: 3,138 x 8
  state   county  drive carpool transit  walk other_transp work_at_home
  <chr>   <chr>   <dbl>   <dbl>   <dbl> <dbl>        <dbl>        <dbl>
1 Alabama Autauga  87.5     8.8     0.1   0.5          1.3          1.8
2 Alabama Baldwin  84.7     8.8     0.1   1            1.4          3.9
3 Alabama Barbour  83.8    10.9     0.4   1.8          1.5          1.6
4 Alabama Bibb     83.2    13.5     0.5   0.6          1.5          0.7
# i 3,134 more rows

也可以用字串比對做選擇,如 starts_with, ends_with or contains

D %>% select(state, county, contains("work"))
# A tibble: 3,138 x 6
  state   county  work_at_home private_work public_work family_work
  <chr>   <chr>          <dbl>        <dbl>       <dbl>       <dbl>
1 Alabama Autauga          1.8         73.6        20.9         0  
2 Alabama Baldwin          3.9         81.5        12.3         0.4
3 Alabama Barbour          1.6         71.8        20.8         0.1
4 Alabama Bibb             0.7         76.8        16.1         0.4
# i 3,134 more rows
# D %>% select(state, county, starts_with("work"))

也可以刪去某些欄位

D %>% select(-contains("work")) %>% ncol
[1] 36

names() 列出所有欄位名稱

names(D1)
[1] "state"        "county"       "population"   "unemployment"

🌻 rename() 變更欄位名稱

D1 %>% rename(unemp = unemployment) %>% head(3)
# A tibble: 3 x 4
  state   county  population unemp
  <chr>   <chr>        <dbl> <dbl>
1 Alabama Autauga      55221   7.6
2 Alabama Baldwin     195121   7.5
3 Alabama Barbour      26932  17.6

select()可以同時對多個欄位做選擇、排序和變更名稱

D1 %>% select(state, county, unemp = unemployment, population) %>% head(3)
# A tibble: 3 x 4
  state   county  unemp population
  <chr>   <chr>   <dbl>      <dbl>
1 Alabama Autauga   7.6      55221
2 Alabama Baldwin   7.5     195121
3 Alabama Barbour  17.6      26932

🌻 transmute() 結合select()mutate()的功能,可以同時做欄位選擇和定義新欄位

D %>% transmute(state, county, pop=population, fracM = men/population)
# A tibble: 3,138 x 4
  state   county     pop fracM
  <chr>   <chr>    <dbl> <dbl>
1 Alabama Autauga  55221 0.484
2 Alabama Baldwin 195121 0.488
3 Alabama Barbour  26932 0.538
4 Alabama Bibb     22604 0.534
# i 3,134 more rows