Introduction to Text Analysis in R
Maham Faisal Khan
Senior Data Science Content Developer
# Build a one-row-per-word table from the raw reviews.
tidy_review <- review_data %>%
  # Tag each review with its row position so every word can be traced back
  # to the review it came from.
  mutate(id = row_number()) %>%
  # Split the `review` text column into one `word` per row.
  unnest_tokens(word, review) %>%
  # Drop stop words. The join key is named explicitly so the result does not
  # depend on which column names the two tables happen to share, and so no
  # "Joining, by = ..." message is emitted.
  anti_join(stop_words, by = "word")
tidy_review
# A tibble: 78,868 x 5
id date product stars word
<int> <chr> <chr> <dbl> <chr>
1 2 1/12/15 iRobot Roomba 650 for Pets 4 walk
2 2 1/12/15 iRobot Roomba 650 for Pets 4 rest
3 3 12/26/13 iRobot Roomba 650 for Pets 5 roomba
4 3 12/26/13 iRobot Roomba 650 for Pets 5 proof
5 3 12/26/13 iRobot Roomba 650 for Pets 5 house
# … with 78,863 more rows
# Tally how often each word occurs, most frequent first.
word_counts <- tidy_review %>%
  count(word) %>%
  arrange(desc(n))

# Bar chart with one bar per distinct word.
ggplot(data = word_counts, mapping = aes(x = word, y = n)) +
  geom_col()
# Same tally as before, restricted to words seen more than 300 times and
# ordered from most to least frequent.
word_counts2 <- count(tidy_review, word) %>%
  filter(n > 300) %>%
  arrange(desc(n))
word_counts2
# A tibble: 25 x 2
word n
<chr> <int>
1 roomba 2286
2 clean 1204
3 vacuum 989
4 hair 900
5 cleaning 809
# … with 20 more rows
# Bar chart of the frequent-word counts with a title.
ggplot(data = word_counts2, mapping = aes(x = word, y = n)) +
  geom_col() +
  # Flip the axes so the words run along the vertical axis.
  coord_flip() +
  ggtitle("Review Word Counts")
Introduction to Text Analysis in R