Programming with dplyr
Dr. Chester Ismay
Educator, Data Scientist, and R/Python Consultant
is.na()
sum()
some_vector <- c(5, NA, 2, NA, 10)
is.na(some_vector)
[1] FALSE TRUE FALSE TRUE FALSE
sum(is.na(some_vector))
[1] 2
glimpse(world_bank_data)
Rows: 300
Columns: 12
$ iso <chr> "PRT", "ARM", "BGR", "PRT", "PRT", "PRT", "PRT",...
$ country <chr> "Portugal", "Armenia", "Bulgaria", "Portugal", "...
$ continent <fct> Europe, Asia, Europe, Europe, Europe, Europe, Eu...
$ region <fct> Southern Europe, Western Asia, Eastern Europe, S...
$ year <dbl> 2000, 2001, 2001, 2001, 2002, 2003, 2004, 2004, ...
$ infant_mortality_rate <dbl> 5.5, 25.3, 17.1, 5.2, 4.7, 4.3, 4.0, 9.2, 17.2, ...
$ fertility_rate <dbl> 1.47, 1.20, 1.20, 1.46, 1.45, 1.44, 1.43, 2.70, ...
$ perc_electric_access <dbl> 100.00000, 100.00000, 100.00000, 100.00000, 100....
$ perc_college_complete <dbl> 7.25665, 20.35655, 18.04557, 7.57332, 7.69954, 8...
$ perc_cvd_crd_70 <dbl> 15.8, 26.2, 28.1, 15.5, 15.0, 14.8, 14.0, 23.8, ...
$ unemployment_rate <dbl> 3.81, 10.91, 19.92, 3.83, 4.50, 6.13, 6.32, 0.71...
$ perc_rural_pop <dbl> 45.601, 35.615, 30.834, 44.956, 44.334, 43.713, ...
world_bank_data %>% rowwise() %>% mutate(num_missing = sum(is.na(
c_across(infant_mortality_rate:last_col())) )) %>%
select(country:year, num_missing) %>% arrange(desc(num_missing))
# A tibble: 300 x 5
# Rowwise:
country continent region year num_missing
<chr> <fct> <fct> <dbl> <int>
1 Australia Oceania Australia and New Zealand 2016 2
2 Austria Europe Western Europe 2016 2
3 Azerbaijan Asia Western Asia 2016 2
4 Bahrain Asia Western Asia 2016 2
5 Bangladesh Asia Southern Asia 2016 2
# ... with 295 more rows
world_bank_data %>% filter(country == "Australia", year == 2016) %>%
glimpse()
Rows: 1
Columns: 12
$ iso <chr> "AUS"
$ country <chr> "Australia"
$ continent <fct> Oceania
$ region <fct> Australia and New Zealand
$ year <dbl> 2016
$ infant_mortality_rate <dbl> NA
$ fertility_rate <dbl> NA
$ perc_electric_access <dbl> 100
$ perc_college_complete <dbl> 30.02981
$ perc_cvd_crd_70 <dbl> 9
$ unemployment_rate <dbl> 5.71
$ perc_rural_pop <dbl> 14.2
world_bank_data %>% filter(if_any(
.cols = starts_with("perc"),
.fns = ~ .x < 5)) %>%
select(country, year, starts_with("perc"))
# A tibble: 60 x 6
country year perc_electric_access perc_college_complete perc_cvd_crd_70 perc_rural_pop
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Qatar 2004 100 20.9 23.8 2.91
2 Pakistan 2005 83.8 3.92 33.3 66.0
3 Singapore 2006 100 19.6 12.5 0
4 Honduras 2007 73.5 4.23 17.3 50.1
5 Qatar 2007 100 25.1 21.4 2.08
# ... with 55 more rows
world_bank_data %>%
filter(if_all(
.cols = starts_with("perc"),
.fns = ~ .x >= 25)) %>%
select(country, year, starts_with("perc"))
# A tibble: 4 x 6
country year perc_electric_access perc_college_complete perc_cvd_crd_70 perc_rural_pop
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Russia 2010 100 59.3 30.9 26.3
2 Georgia 2012 100 30.2 25.3 43.7
3 Georgia 2014 100 30.9 25.1 42.9
4 Georgia 2016 100 32.8 26.5 42.2
Programming with dplyr