That has crossed the line

Programming with dplyr

Dr. Chester Ismay

Educator, Data Scientist, and R/Python Consultant

Creating a new column based on another

# Turn the college-completion percentage into a 0-1 proportion.
# `.keep = "used"` retains only the column read by the expression
# together with the newly created one.
mutate(
  world_bank_data,
  prop_perc_college_complete = perc_college_complete / 100,
  .keep = "used"
)

# A tibble: 300 x 2
   perc_college_complete prop_perc_college_complete
                   <dbl>                      <dbl>
 1                  7.26                     0.0726
 2                 20.4                      0.204 
 3                 18.0                      0.180 
# ... with 297 more rows

Say hello to across()

# Same conversion expressed with across(): the function is applied to
# the selected column and the result is named via the "prop_{.col}"
# template, where {.col} is replaced by the source column's name.
mutate(
  world_bank_data,
  across(
    .cols = perc_college_complete,
    .fns = function(pct) pct / 100,
    .names = "prop_{.col}"
  ),
  .keep = "used"
)

# A tibble: 300 x 2
   perc_college_complete prop_perc_college_complete
                   <dbl>                      <dbl>
 1                  7.26                     0.0726
 2                 20.4                      0.204 
 3                 18.0                      0.180 
# ... with 297 more rows

Computing across multiple columns

# Divide every column whose name starts with "perc" by 100, writing each
# result to a new "prop_<original name>" column. `.keep = "used"` keeps
# only the source columns and their derived counterparts.
world_bank_with_prop <- mutate(
  world_bank_data,
  across(
    .cols = starts_with("perc"),
    .fns = function(pct) pct / 100,
    .names = "prop_{.col}"
  ),
  .keep = "used"
)

# Inspect the resulting columns and their types.
glimpse(world_bank_with_prop)

Rows: 300
Columns: 8
$ perc_electric_access       <dbl> 100.000000, 100.000000, 100.000000, 100.000000...
$ perc_college_complete      <dbl> 7.25665, 20.35655, 18.04557, 7.57332, 7.69954, 8.94607...
$ perc_cvd_crd_70            <dbl> 15.8, 26.2, 28.1, 15.5, 15.0, 14.8, 14.0, 23.8, 25.6, 33.3, 14.2...
$ perc_rural_pop             <dbl> 45.601, 35.615, 30.834, 44.956, 44.334, 43.713, 43.093, 2.910....
$ prop_perc_electric_access  <dbl> 1.00000000, 1.00000000, 1.00000000, 1.00000000...
$ prop_perc_college_complete <dbl> 0.0725665, 0.2035655, 0.1804557, 0.0757332, 0.0769954...
$ prop_perc_cvd_crd_70       <dbl> 0.158, 0.262, 0.281, 0.155, 0.150, 0.148, 0.140, 0.238, 0.256...
$ prop_perc_rural_pop        <dbl> 0.45601, 0.35615, 0.30834, 0.44956, 0.44334, 0.43713, 0.43093...

Tweaking column names

Change prop_perc to prop

# Start from the data frame built with across() so that
# `world_bank_new_cols` exists before its names are rewritten; the
# original object keeps its "prop_perc_*" names untouched.
world_bank_new_cols <- world_bank_with_prop

# Shorten the "prop_perc" prefix to "prop". sub() replaces only the first
# match in each name, and names without the pattern are left as they are.
names(world_bank_new_cols) <- sub(
  pattern = "prop_perc",
  replacement = "prop",
  x = names(world_bank_new_cols)
)

# Show the updated column names.
names(world_bank_new_cols)

[1] "perc_electric_access"  "perc_college_complete" "perc_cvd_crd_70"      
[4] "perc_rural_pop"        "prop_electric_access"  "prop_college_complete"
[7] "prop_cvd_crd_70"       "prop_rural_pop"

across() with summarize()

# Restrict to rows where year is 2015, then compute the median of every
# column whose name ends in "rate". Each summary column is named
# "median_<original name>".
world_bank_data %>%
  filter(year == 2015) %>%
  summarize(
    across(ends_with("rate"), median, .names = "median_{.col}")
  )

# A tibble: 1 x 3
  median_infant_mortality_rate median_fertility_rate median_unemployment_rate
                         <dbl>                 <dbl>                    <dbl>
1                          5.7                  1.87                     6.40

count() how many rows are in each combination

# Tally the number of rows for each country/continent pair; the tally
# is returned in column `n`.
count(world_bank_data, country, continent)

# A tibble: 101 x 3
   country    continent     n
   <chr>      <fct>     <int>
 1 Albania    Europe        2
 2 Angola     Africa        1
 3 Armenia    Asia          3
 4 Australia  Oceania       4
 5 Austria    Europe        3
# ... with 96 more rows

count() with across() and introducing where()

# Count rows for every combination of the non-numeric columns.
# where(is.numeric) selects numeric columns; `!` inverts that selection.
count(
  world_bank_data,
  across(.cols = !where(is.numeric))
)

# A tibble: 101 x 5
   iso   country              continent region                        n
   <chr> <chr>                <fct>     <fct>                     <int>
 1 AGO   Angola               Africa    Middle Africa                 1
 2 ALB   Albania              Europe    Southern Europe               2
 3 ARE   United Arab Emirates Asia      Western Asia                  1
# ... with 98 more rows

Sorted result

# Same tally over the non-numeric columns, with sort = TRUE so the
# largest counts are listed first.
count(
  world_bank_data,
  across(.cols = !where(is.numeric)),
  sort = TRUE
)

# A tibble: 101 x 5
   iso   country    continent region                 n
   <chr> <chr>      <fct>     <fct>              <int>
 1 PRT   Portugal   Europe    Southern Europe       17
 2 BGR   Bulgaria   Europe    Eastern Europe        12
 3 SGP   Singapore  Asia      South-Eastern Asia    11
 4 COL   Colombia   Americas  South America         10
 5 ECU   Ecuador    Americas  South America         10
# ... with 96 more rows

Let's practice!

Programming with dplyr