dplyr
Package
dplyr
Package
::p_load(dplyr,tidyr,ggplot2) pacman
Read US Counties data from .rds
= readRDS("data/counties.rds") D
class(D)
[1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
tbl_df
is similar to data frame with some
enhancements
🌻 glimpse
str
glimpse(D)
Rows: 3,138
Columns: 40
$ census_id <chr> "1001", "1003", "1005", "1007", "1009", "1011", "10~
$ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabam~
$ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", ~
$ region <chr> "South", "South", "South", "South", "South", "South~
$ metro <chr> "Metro", "Metro", "Nonmetro", "Metro", "Metro", "No~
$ population <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 1~
$ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 5627~
$ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 603~
$ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7~
$ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.~
$ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3,~
$ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0~
$ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0~
$ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0~
$ citizens <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88~
$ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41~
$ income_err <dbl> 2391, 1263, 2973, 3995, 3141, 5884, 1793, 925, 2949~
$ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21~
$ income_per_cap_err <dbl> 1080, 711, 798, 1618, 708, 2055, 714, 489, 1366, 15~
$ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.~
$ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.~
$ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.~
$ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.~
$ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.~
$ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5~
$ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.~
$ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.~
$ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, ~
$ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0~
$ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1~
$ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1~
$ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1~
$ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.~
$ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401,~
$ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.~
$ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.~
$ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4~
$ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0~
$ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.~
$ land_area <dbl> 594.4, 1589.8, 884.9, 622.6, 644.8, 622.8, 776.8, 6~
🌻 select()
%>% select(state,county,population,unemployment) D
# A tibble: 3,138 x 4
state county population unemployment
<chr> <chr> <dbl> <dbl>
1 Alabama Autauga 55221 7.6
2 Alabama Baldwin 195121 7.5
3 Alabama Barbour 26932 17.6
4 Alabama Bibb 22604 8.3
# ... with 3,134 more rows
# i Use `print(n = ...)` to see more rows
= D %>% select(state,county,population,unemployment) D1
🌻 arrange()
%>% arrange(population) D1
# A tibble: 3,138 x 4
state county population unemployment
<chr> <chr> <dbl> <dbl>
1 Hawaii Kalawao 85 0
2 Texas King 267 5.1
3 Nebraska McPherson 433 0.9
4 Montana Petroleum 443 6.6
# ... with 3,134 more rows
# i Use `print(n = ...)` to see more rows
# D1[order(D1$population),]
%>% arrange(desc(population)) D1
# A tibble: 3,138 x 4
state county population unemployment
<chr> <chr> <dbl> <dbl>
1 California Los Angeles 10038388 10
2 Illinois Cook 5236393 10.7
3 Texas Harris 4356362 7.5
4 Arizona Maricopa 4018143 7.7
# ... with 3,134 more rows
# i Use `print(n = ...)` to see more rows
# D1[order(-D1$population),]
❓ Can you distinguish the functions sort()
,
order()
and arrange()
?
🌻 filter()
%>%
D1 filter(state == "New York", unemployment < 8) %>%
arrange(desc(population))
# A tibble: 39 x 4
state county population unemployment
<chr> <chr> <dbl> <dbl>
1 New York New York 1629507 7.5
2 New York Suffolk 1501373 6.4
3 New York Nassau 1354612 6.4
4 New York Westchester 967315 7.6
# ... with 35 more rows
# i Use `print(n = ...)` to see more rows
🌻 mutate()
%>%
D1 mutate(unempRate = 100 * unemployment/population) %>%
arrange(desc(unempRate)) %>%
head(4)
# A tibble: 4 x 5
state county population unemployment unempRate
<chr> <chr> <dbl> <dbl> <dbl>
1 Colorado San Juan 606 13.8 2.28
2 Texas King 267 5.1 1.91
3 Texas McMullen 778 14.1 1.81
4 Montana Petroleum 443 6.6 1.49
You might have notice we can do all of the above with R’s indexing
and build-in functions. Why dplyr
then? 🤔
🌞 The benefits of dplyr are …
%>%
🌻 count()
count(D) # count the number of rows in D
# A tibble: 1 x 1
n
<int>
1 3138
count(D, state) # count the number of rows by states
# A tibble: 50 x 2
state n
<chr> <int>
1 Alabama 67
2 Alaska 28
3 Arizona 15
4 Arkansas 75
# ... with 46 more rows
# i Use `print(n = ...)` to see more rows
# table(D$state)
count()
is a very useful function …
%>% count(state,wt=population,sort=T) D
# A tibble: 50 x 2
state n
<chr> <dbl>
1 California 38421464
2 Texas 26538497
3 New York 19673174
4 Florida 19645772
# ... with 46 more rows
# i Use `print(n = ...)` to see more rows
With the wt
and sort
arguments, it can
population
by state
’sin a single line of code.
🌻 summarise()
%>% summarise(
D totalPop = sum(population),
avgPop = mean(population)
)
# A tibble: 1 x 2
totalPop avgPop
<dbl> <dbl>
1 315845353 100652.
# sum(D$population)
# mean(D$population)
🌻 group_by() %>% summarise()
%>% group_by(state) %>% summarise(
D totalPop = sum(population),
avgPop = mean(population)
%>%
) arrange(desc(avgPop))
# A tibble: 50 x 3
state totalPop avgPop
<chr> <dbl> <dbl>
1 California 38421464 662439.
2 Massachusetts 6705586 478970.
3 Connecticut 3593222 449153.
4 Arizona 6641928 442795.
# ... with 46 more rows
# i Use `print(n = ...)` to see more rows
❓ We have learned to make group summaries with tapply()
…
tapply
to implement the above
operation?tapply
, what is the benefit of
group_by
and summarise
?%>% group_by(state, metro) %>% summarise(
D totalPop = sum(population),
avgPop = mean(population),
.groups = "drop"
%>%
) arrange(desc(avgPop))
# A tibble: 97 x 4
state metro totalPop avgPop
<chr> <chr> <dbl> <dbl>
1 California Metro 37587429 1015876.
2 Arizona Metro 6295145 786893.
3 Nevada Metro 2529002 632250.
4 Massachusetts Metro 6606838 600622.
# ... with 93 more rows
# i Use `print(n = ...)` to see more rows
❓ What happen if we did not drop
the
groups
🌷 Sometimes the (hidden) group structure would make troubles. We’ll see it latter.
🌻 top_n(x, n, wt)
n
rows by
wt
from x
# select from D1 the three rows that has the largest populations
%>% top_n(3, population) D1
# A tibble: 3 x 4
state county population unemployment
<chr> <chr> <dbl> <dbl>
1 California Los Angeles 10038388 10
2 Illinois Cook 5236393 10.7
3 Texas Harris 4356362 7.5
# select from each `state` the three rows that has the largest populations
group_by(D1, state) %>% top_n(3, population)
# A tibble: 150 x 4
# Groups: state [50]
state county population unemployment
<chr> <chr> <dbl> <dbl>
1 Alabama Jefferson 659026 9.1
2 Alabama Madison 346438 8.5
3 Alabama Mobile 414251 9.8
4 Alaska Anchorage Municipality 299107 6.7
# ... with 146 more rows
# i Use `print(n = ...)` to see more rows
❓ Compare the above two code chucks, you will see the effect of the group structure.
We can select
a range of columns
%>% select(state, county, drive:work_at_home) D
# A tibble: 3,138 x 8
state county drive carpool transit walk other_transp work_at_home
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Alabama Autauga 87.5 8.8 0.1 0.5 1.3 1.8
2 Alabama Baldwin 84.7 8.8 0.1 1 1.4 3.9
3 Alabama Barbour 83.8 10.9 0.4 1.8 1.5 1.6
4 Alabama Bibb 83.2 13.5 0.5 0.6 1.5 0.7
# ... with 3,134 more rows
# i Use `print(n = ...)` to see more rows
select columns that starts_with
, ends_with
or contains
certain patterns,
%>% select(state, county, contains("work")) D
# A tibble: 3,138 x 6
state county work_at_home private_work public_work family_work
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Alabama Autauga 1.8 73.6 20.9 0
2 Alabama Baldwin 3.9 81.5 12.3 0.4
3 Alabama Barbour 1.6 71.8 20.8 0.1
4 Alabama Bibb 0.7 76.8 16.1 0.4
# ... with 3,134 more rows
# i Use `print(n = ...)` to see more rows
# D %>% select(state, county, starts_with("work"))
or exclude columns with certain patterns.
%>% select(-contains("work")) %>% ncol D
[1] 36
names()
lists the column names
names(D1)
[1] "state" "county" "population" "unemployment"
🌻 rename()
%>% rename(unemp = unemployment) D1
# A tibble: 3,138 x 4
state county population unemp
<chr> <chr> <dbl> <dbl>
1 Alabama Autauga 55221 7.6
2 Alabama Baldwin 195121 7.5
3 Alabama Barbour 26932 17.6
4 Alabama Bibb 22604 8.3
# ... with 3,134 more rows
# i Use `print(n = ...)` to see more rows
We can select, re-position and rename in a select()
statement
%>% select(state, county, unemp = unemployment, population) %>% head(3) D1
# A tibble: 3 x 4
state county unemp population
<chr> <chr> <dbl> <dbl>
1 Alabama Autauga 7.6 55221
2 Alabama Baldwin 7.5 195121
3 Alabama Barbour 17.6 26932
🌻 transmute()
select()
and
mutate()
%>% transmute(state, county, fracM = men/population) D
# A tibble: 3,138 x 3
state county fracM
<chr> <chr> <dbl>
1 Alabama Autauga 0.484
2 Alabama Baldwin 0.488
3 Alabama Barbour 0.538
4 Alabama Bibb 0.534
# ... with 3,134 more rows
# i Use `print(n = ...)` to see more rows
Let’s make a data frame that distinguish the four functions
data.frame(
function_name = c("select()","mutate()","rename()","transmute()"),
unselected_columns = c("removed","reserved","reserved","removed"),
make_new_columns = c("no","yes","no","yes")
)
function_name unselected_columns make_new_columns
1 select() removed no
2 mutate() reserved yes
3 rename() reserved no
4 transmute() removed yes