Programming with dplyr
Dr. Chester Ismay
Educator, Data Scientist, and R/Python Consultant
glimpse(world_bank_data)
Rows: 300
Columns: 12
$ iso <chr> "PRT", "ARM", "BGR", "PRT", "PRT", "PRT", "PRT...
$ country <chr> "Portugal", "Armenia", "Bulgaria", "Portugal",...
$ continent <fct> Europe, Asia, Europe, Europe, Europe, Europe, ...
$ region <fct> Southern Europe, Western Asia, Eastern Europe,...
$ year <dbl> 2000, 2001, 2001, 2001, 2002, 2003, 2004, 2004...
$ infant_mortality_rate <dbl> 5.5, 25.3, 17.1, 5.2, 4.7, 4.3, 4.0, 9.2, 17.2...
$ fertility_rate <dbl> 1.47, 1.20, 1.20, 1.46, 1.45, 1.44, 1.43, 2.70...
$ perc_electric_access <dbl> 100.00000, 100.00000, 100.00000, 100.00000, 10...
$ perc_college_complete <dbl> 7.25665, 20.35655, 18.04557, 7.57332, 7.69954,...
$ perc_cvd_crd_70 <dbl> 15.8, 26.2, 28.1, 15.5, 15.0, 14.8, 14.0, 23.8...
$ unemployment_rate <dbl> 3.81, 10.91, 19.92, 3.83, 4.50, 6.13, 6.32, 0....
$ perc_rural_pop <dbl> 45.601, 35.615, 30.834, 44.956, 44.334, 43.713...
world_bank_data %>%
select(country, year, infant_mortality_rate, fertility_rate, unemployment_rate)
# A tibble: 300 x 5
country year infant_mortality_rate fertility_rate unemployment_rate
<chr> <dbl> <dbl> <dbl> <dbl>
1 Portugal 2000 5.5 1.47 3.81
2 Armenia 2001 25.3 1.2 10.9
3 Bulgaria 2001 17.1 1.2 19.9
4 Portugal 2001 5.2 1.46 3.83
5 Pakistan 2005 80 3.79 0.610
# ... with 295 more rows
country, year, infant_mortality_rate, fertility_rate, and unemployment_rate?
world_bank_data %>%
select(contains("y"))
# A tibble: 300 x 5
country year infant_mortality_rate fertility_rate unemployment_rate
<chr> <dbl> <dbl> <dbl> <dbl>
1 Portugal 2000 5.5 1.47 3.81
2 Armenia 2001 25.3 1.2 10.9
3 Bulgaria 2001 17.1 1.2 19.9
4 Portugal 2001 5.2 1.46 3.83
5 Pakistan 2005 80 3.79 0.610
# ... with 295 more rows
Tokens:
| - For matching either of two strings
^ - For specifying the start of a string
$ - For specifying the end of a string
Use matches() when looking for a regular expression
world_bank_data %>%
select(matches("y|perc"))
# A tibble: 300 x 9
country year infant_mortality_rate fertility_rate perc_electric_access
<chr> <dbl> <dbl> <dbl> <dbl>
1 Portugal 2000 5.5 1.47 100
2 Armenia 2001 25.3 1.2 100
3 Bulgaria 2001 17.1 1.2 100
4 Portugal 2001 5.2 1.46 100
5 Pakistan 2005 80 3.79 83.8
# ... with 295 more rows, and 4 more variables: perc_college_complete <dbl>,
# perc_cvd_crd_70 <dbl>, unemployment_rate <dbl>, perc_rural_pop <dbl>
world_bank_data %>%
select(matches("^co"))
# A tibble: 300 x 2
country continent
<chr> <fct>
1 Portugal Europe
2 Armenia Asia
3 Bulgaria Europe
4 Portugal Europe
5 Pakistan Asia
# ... with 295 more rows
world_bank_data %>%
select(country, matches("on$"))
# A tibble: 300 x 2
country region
<chr> <fct>
1 Portugal Southern Europe
2 Armenia Western Asia
3 Bulgaria Eastern Europe
4 Portugal Southern Europe
5 Pakistan Southern Asia
# ... with 295 more rows
Programming with dplyr