Programming with dplyr
Dr. Chester Ismay
Educator, Data Scientist, and R/Python Consultant
# Express the college-completion percentage as a proportion.
# `.keep = "used"` keeps only the column read by the expression and the
# newly created column.
world_bank_data %>%
  mutate(
    prop_perc_college_complete = perc_college_complete / 100,
    .keep = "used"
  )
# A tibble: 300 x 2
perc_college_complete prop_perc_college_complete
<dbl> <dbl>
1 7.26 0.0726
2 20.4 0.204
3 18.0 0.180
# ... with 297 more rows
# Same conversion written with across(): the lambda divides the selected
# column by 100 and `.names` prefixes the output column with "prop_".
world_bank_data %>%
  mutate(
    across(
      .cols = perc_college_complete,
      .fns = ~ .x / 100,
      .names = "prop_{.col}"
    ),
    .keep = "used"
  )
# A tibble: 300 x 2
perc_college_complete prop_perc_college_complete
<dbl> <dbl>
1 7.26 0.0726
2 20.4 0.204
3 18.0 0.180
# ... with 297 more rows
# Apply the percentage-to-proportion conversion to every column whose name
# starts with "perc"; each result is stored as "prop_<original name>".
world_bank_with_prop <- world_bank_data %>%
  mutate(
    across(
      .cols = starts_with("perc"),
      .fns = ~ .x / 100,
      .names = "prop_{.col}"
    ),
    .keep = "used"
  )

# Inspect the original columns alongside the derived ones.
glimpse(world_bank_with_prop)
Rows: 300
Columns: 8
$ perc_electric_access <dbl> 100.000000, 100.000000, 100.000000, 100.000000...
$ perc_college_complete <dbl> 7.25665, 20.35655, 18.04557, 7.57332, 7.69954, 8.94607...
$ perc_cvd_crd_70 <dbl> 15.8, 26.2, 28.1, 15.5, 15.0, 14.8, 14.0, 23.8, 25.6, 33.3, 14.2...
$ perc_rural_pop <dbl> 45.601, 35.615, 30.834, 44.956, 44.334, 43.713, 43.093, 2.910....
$ prop_perc_electric_access <dbl> 1.00000000, 1.00000000, 1.00000000, 1.00000000...
$ prop_perc_college_complete <dbl> 0.0725665, 0.2035655, 0.1804557, 0.0757332, 0.0769954...
$ prop_perc_cvd_crd_70 <dbl> 0.158, 0.262, 0.281, 0.155, 0.150, 0.148, 0.140, 0.238, 0.256...
$ prop_perc_rural_pop <dbl> 0.45601, 0.35615, 0.30834, 0.44956, 0.44334, 0.43713, 0.43093...
Renaming the prop_perc prefix to prop
# Shorten the "prop_perc_" prefix produced by across() to "prop_".
# This renames the columns of world_bank_with_prop, the object created
# earlier in the deck. The pattern is anchored with ^ so that only a
# leading prefix is rewritten, never a match in the middle of a name.
names(world_bank_with_prop) <- sub(
  pattern = "^prop_perc_",
  replacement = "prop_",
  x = names(world_bank_with_prop)
)

# Print the updated column names to confirm the rename.
names(world_bank_with_prop)
[1] "perc_electric_access" "perc_college_complete" "perc_cvd_crd_70"
[4] "perc_rural_pop" "prop_electric_access" "prop_college_complete"
[7] "prop_cvd_crd_70" "prop_rural_pop"
# Median of every column whose name ends in "rate", using only the rows
# where year is 2015. Results are named "median_<original name>".
world_bank_data %>%
  filter(year == 2015) %>%
  summarize(
    across(
      .cols = ends_with("rate"),
      .fns = median,
      .names = "median_{.col}"
    )
  )
# A tibble: 1 x 3
median_infant_mortality_rate median_fertility_rate median_unemployment_rate
<dbl> <dbl> <dbl>
1 5.7 1.87 6.40
# Number of rows for each country/continent combination.
world_bank_data %>%
  count(country, continent)
# A tibble: 101 x 3
country continent n
<chr> <fct> <int>
1 Albania Europe 2
2 Angola Africa 1
3 Armenia Asia 3
4 Australia Oceania 4
5 Austria Europe 3
# ... with 96 more rows
# Count rows by every non-numeric column at once: across() with
# !where(is.numeric) selects all columns that are not numeric.
world_bank_data %>%
  count(
    across(.cols = !where(is.numeric))
  )
# A tibble: 101 x 5
iso country continent region n
<chr> <chr> <fct> <fct> <int>
1 AGO Angola Africa Middle Africa 1
2 ALB Albania Europe Southern Europe 2
3 ARE United Arab Emirates Asia Western Asia 1
# ... with 98 more rows
# Same grouping by all non-numeric columns; sort = TRUE lists the
# largest groups first.
world_bank_data %>%
  count(
    across(.cols = !where(is.numeric)),
    sort = TRUE
  )
# A tibble: 101 x 5
iso country continent region n
<chr> <chr> <fct> <fct> <int>
1 PRT Portugal Europe Southern Europe 17
2 BGR Bulgaria Europe Eastern Europe 12
3 SGP Singapore Asia South-Eastern Asia 11
4 COL Colombia Americas South America 10
5 ECU Ecuador Americas South America 10
# ... with 96 more rows
Programming with dplyr