The filter and arrange verbs

Data Manipulation with dplyr

James Chapman

Curriculum Manager, DataCamp

counties_selected <- counties %>%
  select(state, county, population, unemployment)

counties_selected
# A tibble: 3,138 x 4
   state   county   population unemployment
   <chr>   <chr>         <dbl>        <dbl>
 1 Alabama Autauga       55221          7.6
 2 Alabama Baldwin      195121          7.5
 3 Alabama Barbour       26932         17.6
 4 Alabama Bibb          22604          8.3
 5 Alabama Blount        57710          7.7
 6 Alabama Bullock       10678         18  
 7 Alabama Butler        20354         10.9
 8 Alabama Calhoun      116648         12.3
 9 Alabama Chambers      34079          8.9
10 Alabama Cherokee      26008          7.9
# … with 3,128 more rows
Data Manipulation with dplyr

arrange()

  • Sorts observations based on one or more variables
counties_selected %>%
  arrange(population)
# A tibble: 3,138 x 4
   state      county    population unemployment
   <chr>      <chr>          <dbl>        <dbl>
 1 Hawaii     Kalawao           85          0  
 2 Texas      King             267          5.1
 3 Nebraska   McPherson        433          0.9
 4 Montana    Petroleum        443          6.6
 5 Nebraska   Arthur           448          4  
 6 Nebraska   Loup             548          0.7
 7 Nebraska   Blaine           551          0.7
 8 New Mexico Harding          565          6  
 9 Texas      Kenedy           565          0  
10 Colorado   San Juan         606         13.8
# … with 3,128 more rows
Data Manipulation with dplyr

desc()

counties_selected %>%
  arrange(desc(population))
# A tibble: 3,138 x 4
   state      county      population unemployment
   <chr>      <chr>            <dbl>        <dbl>
 1 California Los Angeles   10038388         10  
 2 Illinois   Cook           5236393         10.7
 3 Texas      Harris         4356362          7.5
 4 Arizona    Maricopa       4018143          7.7
 5 California San Diego      3223096          8.7
 6 California Orange         3116069          7.6
 7 Florida    Miami-Dade     2639042         10  
 8 New York   Kings          2595259         10  
 9 Texas      Dallas         2485003          7.6
10 New York   Queens         2301139          8.6
# … with 3,128 more rows
Data Manipulation with dplyr

filter()

  • Extracts observations based on conditions
counties_selected %>%
  arrange(desc(population)) %>%
  filter(state == "New York")
# A tibble: 62 x 4
   state    county      population unemployment
   <chr>    <chr>            <dbl>        <dbl>
 1 New York Kings          2595259         10  
 2 New York Queens         2301139          8.6
 3 New York New York       1629507          7.5
 4 New York Suffolk        1501373          6.4
 5 New York Bronx          1428357         14  
 6 New York Nassau         1354612          6.4
 7 New York Westchester     967315          7.6
 8 New York Erie            921584          7  
 9 New York Monroe          749356          7.7
10 New York Richmond        472481          6.9
# … with 52 more rows
Data Manipulation with dplyr

filter()

counties_selected %>%
  arrange(desc(population)) %>%
  filter(unemployment < 6)
# A tibble: 949 x 4
   state    county       population unemployment
   <chr>    <chr>             <dbl>        <dbl>
 1 Virginia Fairfax         1128722          4.9
 2 Utah     Salt Lake       1078958          5.8
 3 Hawaii   Honolulu         984178          5.6
 4 Texas    Collin           862215          4.9
 5 Texas    Denton           731851          5.7
 6 Texas    Fort Bend        658331          5.1
 7 Kansas   Johnson          566814          4.5
 8 Maryland Anne Arundel     555280          5.9
 9 Colorado Jefferson        552344          5.9
10 Utah     Utah             551957          5.5
# … with 939 more rows
Data Manipulation with dplyr

Combining conditions

counties_selected %>%
  arrange(desc(population)) %>%
  filter(state == "New York",
         unemployment < 6)
# A tibble: 5 x 4
  state    county     population unemployment
  <chr>    <chr>           <dbl>        <dbl>
1 New York Tompkins       103855          5.9
2 New York Chemung         88267          5.4
3 New York Madison         72427          5.1
4 New York Livingston      64801          5.4
5 New York Seneca          35144          5.5
Data Manipulation with dplyr

Let's practice!

Data Manipulation with dplyr

Preparing Video For Download...