Data Manipulation with dplyr
James Chapman
Curriculum Manager, DataCamp
# A small numeric vector used to demonstrate what lag() does.
v <- c(1, 3, 6, 14)
v
1 3 6 14
# lag() moves every element one position later; the first slot has no
# predecessor, so it becomes NA (see the output below).
# NOTE(review): this output is what dplyr's lag() produces; presumably dplyr
# is attached so that it masks stats::lag() -- confirm.
lag(v)
NA 1 3 6
v
1 3 6 14
lag(v)
NA 1 3 6
# Subtracting the lagged vector gives the change from each element to the
# next (3 - 1, 6 - 3, 14 - 6); the first result is NA because there is no
# previous value to compare against.
v - lag(v)
NA 2 3 8
# For every row, compute the total `number` within its year and the fraction
# of that total the row accounts for. Both columns are created in a single
# grouped mutate(); the grouping is dropped afterwards so later steps operate
# on the whole table.
babynames_fraction <- babynames %>%
  group_by(year) %>%
  mutate(
    year_total = sum(number),
    fraction = number / year_total
  ) %>%
  ungroup()
# Keep only the rows for the name "Matthew" and list them in ascending
# order of year.
babynames_fraction %>%
  filter(name == "Matthew") %>%
  arrange(year)
# A tibble: 28 x 5
year name number year_total fraction
<dbl> <chr> <int> <int> <dbl>
1 1880 Matthew 113 201478 0.000561
2 1885 Matthew 111 240822 0.000461
3 1890 Matthew 86 301352 0.000285
4 1895 Matthew 112 350934 0.000319
5 1900 Matthew 130 450148 0.000289
6 1905 Matthew 107 423875 0.000252
7 1910 Matthew 197 590607 0.000334
8 1915 Matthew 798 1830351 0.000436
9 1920 Matthew 967 2259494 0.000428
10 1925 Matthew 840 2330750 0.000360
# … with 18 more rows
# Change in Matthew's fraction from one recorded year to the next.
# The rows must be sorted by year first, because lag() compares each row
# with the row immediately before it.
babynames_fraction %>%
  filter(name == "Matthew") %>%
  arrange(year) %>%
  mutate(
    # Namespace-qualified so it cannot be confused with stats::lag().
    difference = fraction - dplyr::lag(fraction)
  )
# A tibble: 28 x 6
year name number year_total fraction difference
<dbl> <chr> <int> <int> <dbl> <dbl>
1 1880 Matthew 113 201478 0.000561 NA
2 1885 Matthew 111 240822 0.000461 -0.0000999
3 1890 Matthew 86 301352 0.000285 -0.000176
4 1895 Matthew 112 350934 0.000319 0.0000338
5 1900 Matthew 130 450148 0.000289 -0.0000304
6 1905 Matthew 107 423875 0.000252 -0.0000364
7 1910 Matthew 197 590607 0.000334 0.0000811
8 1915 Matthew 798 1830351 0.000436 0.000102
9 1920 Matthew 967 2259494 0.000428 -0.00000801
10 1925 Matthew 840 2330750 0.000360 -0.0000676
# … with 18 more rows
# Same year-over-year difference for "Matthew", then sorted so the largest
# increases in fraction come first.
babynames_fraction %>%
  filter(name == "Matthew") %>%
  arrange(year) %>%
  mutate(
    # Namespace-qualified so it cannot be confused with stats::lag().
    difference = fraction - dplyr::lag(fraction)
  ) %>%
  arrange(desc(difference))
# A tibble: 28 x 6
year name number year_total fraction difference
<dbl> <chr> <int> <int> <dbl> <dbl>
1 1975 Matthew 28665 3014943 0.00951 0.00389
2 1970 Matthew 20265 3604252 0.00562 0.00286
3 1985 Matthew 47367 3563364 0.0133 0.00223
4 1980 Matthew 38054 3439117 0.0111 0.00156
5 1965 Matthew 10015 3624610 0.00276 0.00109
6 1960 Matthew 6942 4152075 0.00167 0.000853
7 1955 Matthew 3287 4012691 0.000819 0.000447
8 1915 Matthew 798 1830351 0.000436 0.000102
9 1950 Matthew 1303 3502592 0.000372 0.0000967
10 1910 Matthew 197 590607 0.000334 0.0000811
# … with 18 more rows
# Year-over-year change in fraction for every name, largest increases first.
babynames_fraction %>%
  # Sort by name and then year so that, within each name, the preceding row
  # is the preceding recorded year.
  arrange(name, year) %>%
  # Group by name so lag() never compares one name's row with another's;
  # the first row of each name gets NA.
  group_by(name) %>%
  mutate(difference = fraction - dplyr::lag(fraction)) %>%
  ungroup() %>%
  arrange(desc(difference))
# A tibble: 332,595 × 6
year name number year_total fraction difference
<dbl> <chr> <int> <int> <dbl> <dbl>
1 1935 Shirley 42790 2088487 0.0205 0.0137
2 1985 Ashley 47509 3563364 0.0133 0.0110
3 1955 Debra 50630 4012691 0.0126 0.0109
4 1975 Jason 52486 3014943 0.0174 0.00981
5 1970 Jennifer 46276 3604252 0.0128 0.00863
6 1965 Lisa 60443 3624610 0.0167 0.00854
7 1940 Judith 22462 2301630 0.00976 0.00790
8 1925 Betty 32897 2330750 0.0141 0.00790
9 1950 Deborah 29111 3502592 0.00831 0.00776
10 1945 Linda 41572 2652029 0.0157 0.00767
# … with 332,585 more rows
Data Manipulation with dplyr