Data Manipulation with dplyr
James Chapman
Curriculum Manager, DataCamp
# Collapse the whole table into one row holding the sum of `population`.
counties %>%
  summarize(total_population = sum(population))
# A tibble: 1 x 1
total_population
<dbl>
1 315845353
# Several aggregates can be computed in a single summarize() call; each
# named argument becomes one column of the one-row result.
counties %>%
  summarize(
    total_population = sum(population),
    average_unemployment = mean(unemployment)
  )
# A tibble: 1 x 2
total_population average_unemployment
<dbl> <dbl>
1 315845353 7.80
sum()
mean()
median()
min()
max()
n()
# Aggregate within each state: group first, then summarize, so the result
# has one row per state.
# NOTE(review): the average_unemployment values printed below (e.g. Alabama
# 758., Arizona 180.) look like per-state sums rather than means; the sorted
# output later in this file shows 11.3 and 12.0 for the same states.
# Confirm against a fresh run of this code.
counties %>%
  group_by(state) %>%
  summarize(
    total_pop = sum(population),
    average_unemployment = mean(unemployment)
  )
# A tibble: 50 x 3
state total_pop average_unemployment
<chr> <dbl> <dbl>
1 Alabama 4830620 758.
2 Alaska 725461 257.
3 Arizona 6641928 180.
4 Arkansas 2958208 674.
5 California 38421464 626.
6 Colorado 5278906 477.
7 Connecticut 3593222 65.3
8 Delaware 926454 23.8
9 Florida 19645772 696.
10 Georgia 10006693 1586.
# … with 40 more rows
# Same per-state summary, then sorted so the states with the highest
# average unemployment come first.
counties %>%
  group_by(state) %>%
  summarize(
    total_pop = sum(population),
    average_unemployment = mean(unemployment)
  ) %>%
  arrange(desc(average_unemployment))
# A tibble: 50 x 3
state total_pop average_unemployment
<chr> <dbl> <dbl>
1 Mississippi 2988081 12.0
2 Arizona 6641928 12.0
3 South Carolina 4777576 11.3
4 Alabama 4830620 11.3
5 California 38421464 10.8
6 Nevada 2798636 10.5
7 North Carolina 9845333 10.5
8 Florida 19645772 10.4
9 Georgia 10006693 9.97
10 Michigan 9900571 9.96
# … with 40 more rows
# Keep only the four columns used in the grouping examples that follow.
counties %>%
  select(state, metro, county, population)
# A tibble: 3,138 x 4
state metro county population
<chr> <chr> <chr> <dbl>
1 Alabama Metro Autauga 55221
2 Alabama Metro Baldwin 195121
3 Alabama Nonmetro Barbour 26932
4 Alabama Metro Bibb 22604
5 Alabama Metro Blount 57710
6 Alabama Nonmetro Bullock 10678
7 Alabama Nonmetro Butler 20354
8 Alabama Metro Calhoun 116648
9 Alabama Nonmetro Chambers 34079
10 Alabama Nonmetro Cherokee 26008
# … with 3,128 more rows
# Group by two columns at once: one output row per state/metro combination.
# The printed header below ("# Groups: state [50]") shows that the result is
# still grouped by state after summarize().
counties %>%
  group_by(state, metro) %>%
  summarize(total_pop = sum(population))
# A tibble: 97 x 3
# Groups: state [50]
state metro total_pop
<chr> <chr> <dbl>
1 Alabama Metro 3671377
2 Alabama Nonmetro 1159243
3 Alaska Metro 494990
4 Alaska Nonmetro 230471
5 Arizona Metro 6295145
6 Arizona Nonmetro 346783
7 Arkansas Metro 1806867
8 Arkansas Nonmetro 1151341
9 California Metro 37587429
10 California Nonmetro 834035
# … with 87 more rows
# Same two-column summary, followed by ungroup() so the result carries no
# grouping (the "# Groups:" header no longer appears in the output below).
counties %>%
  group_by(state, metro) %>%
  summarize(total_pop = sum(population)) %>%
  ungroup()
# A tibble: 97 x 3
state metro total_pop
<chr> <chr> <dbl>
1 Alabama Metro 3671377
2 Alabama Nonmetro 1159243
3 Alaska Metro 494990
4 Alaska Nonmetro 230471
5 Arizona Metro 6295145
6 Arizona Nonmetro 346783
7 Arkansas Metro 1806867
8 Arkansas Nonmetro 1151341
9 California Metro 37587429
10 California Nonmetro 834035
# … with 87 more rows
Data Manipulation with dplyr