Joining Data with dplyr
Chris Cardillo
Data Scientist
questions
# A tibble: 294,735 x 3
id creation_date score
<int> <date> <int>
1 22557677 2014-03-21 1
2 22557707 2014-03-21 2
3 22558084 2014-03-21 2
4 22558395 2014-03-21 2
5 22558613 2014-03-21 0
6 22558677 2014-03-21 2
7 22558887 2014-03-21 8
8 22559180 2014-03-21 1
9 22559312 2014-03-21 0
10 22559322 2014-03-21 2
# … with 294,725 more rows
answers
# A tibble: 380,635 x 4
id creation_date question_id score
<int> <date> <int> <int>
1 39143713 2016-08-25 39143518 3
2 39143869 2016-08-25 39143518 1
3 39143935 2016-08-25 39142481 0
4 39144014 2016-08-25 39024390 0
5 39144252 2016-08-25 39096741 6
6 39144375 2016-08-25 39143885 5
7 39144430 2016-08-25 39144077 0
8 39144625 2016-08-25 39142728 1
9 39144794 2016-08-25 39043648 0
10 39145033 2016-08-25 39133170 1
# … with 380,625 more rows
questions %>%
bind_rows(answers)
# A tibble: 675,370 x 4
id creation_date score question_id
<int> <date> <int> <int>
1 22557677 2014-03-21 1 NA
2 22557707 2014-03-21 2 NA
3 22558084 2014-03-21 2 NA
4 22558395 2014-03-21 2 NA
5 22558613 2014-03-21 0 NA
6 22558677 2014-03-21 2 NA
7 22558887 2014-03-21 8 NA
8 22559180 2014-03-21 1 NA
9 22559312 2014-03-21 0 NA
10 22559322 2014-03-21 2 NA
# … with 675,360 more rows
questions_type <- questions %>%
mutate(type = "question")
answers_type <- answers %>%
mutate(type = "answer")
posts <- bind_rows(questions_type, answers_type)
posts
# A tibble: 675,370 x 5
id creation_date score type question_id
<int> <date> <int> <chr> <int>
1 22557677 2014-03-21 1 question NA
2 22557707 2014-03-21 2 question NA
3 22558084 2014-03-21 2 question NA
4 22558395 2014-03-21 2 question NA
5 22558613 2014-03-21 0 question NA
6 22558677 2014-03-21 2 question NA
7 22558887 2014-03-21 8 question NA
8 22559180 2014-03-21 1 question NA
9 22559312 2014-03-21 0 question NA
10 22559322 2014-03-21 2 question NA
# … with 675,360 more rows
posts %>%
group_by(type) %>%
summarize(average_score = mean(score))
# A tibble: 2 x 2
type average_score
<chr> <dbl>
1 answer 2.88
2 question 1.90
library(lubridate)
posts %>%
mutate(year = year(creation_date))
# A tibble: 675,370 x 6
id creation_date score type question_id year
<int> <date> <int> <chr> <int> <dbl>
1 22557677 2014-03-21 1 question NA 2014
2 22557707 2014-03-21 2 question NA 2014
3 22558084 2014-03-21 2 question NA 2014
4 22558395 2014-03-21 2 question NA 2014
5 22558613 2014-03-21 0 question NA 2014
6 22558677 2014-03-21 2 question NA 2014
7 22558887 2014-03-21 8 question NA 2014
8 22559180 2014-03-21 1 question NA 2014
9 22559312 2014-03-21 0 question NA 2014
10 22559322 2014-03-21 2 question NA 2014
# … with 675,360 more rows
posts %>%
mutate(year = year(creation_date)) %>%
count(year, type)
# A tibble: 24 x 3
year type n
<dbl> <chr> <int>
1 2008 answer 27
2 2008 question 8
3 2009 answer 1356
4 2009 question 524
5 2010 answer 4846
6 2010 question 2264
7 2011 answer 11077
8 2011 question 5837
9 2012 answer 18967
10 2012 question 12210
# … with 14 more rows
questions_answers_year <- posts %>%
mutate(year = year(creation_date)) %>%
count(year, type)
ggplot(questions_answers_year, aes(year, n, color = type)) +
geom_line()
Joining Data with dplyr