Reshaping Data with tidyr
Jeroen Boeye
Head of Machine Learning, Faktion
netflix_df
# A tibble: 637 x 3
title type duration
<chr> <chr> <chr>
1 Article 15 Movie 125 min
2 Kill Me If You Dare Movie 100 min
3 The Spy TV Show 1 Seasons
4 The World We Make Movie 108 min
5 Watchman Movie 93 min
netflix_df %>%
separate(duration, into = c("value", "unit"), convert = TRUE)
# A tibble: 5 x 4
title type value unit
<chr> <chr> <int> <chr>
1 Article 15 Movie 125 min
2 Kill Me If You Dare Movie 100 min
3 The Spy TV Show 1 Seasons
4 The World We Make Movie 108 min
5 Watchman Movie 93 min
netflix_df %>%
separate(duration, into = c("value", "unit"), convert = TRUE) %>%
group_by(type, unit) %>%
summarize(mean_duration = mean(value))
# A tibble: 2 x 3
# Groups: type [2]
type unit mean_duration
<chr> <chr> <dbl>
1 Movie min 98.6
2 TV Show Seasons 1.85
star_wars_df
# A tibble: 4 x 2
given_name family_name
<chr> <chr>
1 Luke Skywalker
2 Han Solo
3 Leia Organa
4 R2 D2
star_wars_df %>%
unite("name", given_name, family_name)
# A tibble: 4 x 1
name
<chr>
1 Luke_Skywalker
2 Han_Solo
3 Leia_Organa
4 R2_D2
star_wars_df %>%
unite("name", given_name, family_name, sep = " ")
# A tibble: 4 x 1
name
<chr>
1 Luke Skywalker
2 Han Solo
3 Leia Organa
4 R2 D2
drink_df
# A tibble: 2 x 2
drink ingredients
<chr> <chr>
1 Chocolate milk milk, chocolate, sugar
2 Orange juice oranges, sugar
drink_df %>%
separate_rows(ingredients, sep = ", ")
# A tibble: 5 x 2
drink ingredients
<chr> <chr>
1 Chocolate milk milk
2 Chocolate milk chocolate
3 Chocolate milk sugar
4 Orange juice oranges
5 Orange juice sugar
drink_df %>%
separate_rows(ingredients, sep = ", ") %>%
count(drink)
# A tibble: 2 x 2
drink n
<chr> <int>
1 Chocolate milk 3
2 Orange juice 2
drink_df %>%
separate_rows(ingredients, sep = ", ") %>%
count(ingredients)
# A tibble: 4 x 2
ingredients n
<chr> <int>
1 chocolate 1
2 milk 1
3 oranges 1
4 sugar 2
drink_df %>%
separate_rows(ingredients, sep = ", ") %>%
ggplot(aes(x=drink, fill=ingredients)) +
geom_bar()
Reshaping Data with tidyr