Introduction to Natural Language Processing in R
Kasey Jones
Research Data Scientist
# Load tidytext, which supplies the sentiment lexicons used throughout.
library(tidytext)
# Print the combined lexicon table; the output below shows its columns
# (word, sentiment, lexicon, score).
sentiments
# A tibble: 27,314 x 4
word sentiment lexicon score
<chr> <chr> <chr> <int>
1 abacus trust nrc NA
2 abandon fear nrc NA
3 abandon negative nrc NA
4 abandon sadness nrc NA
5 abandoned anger nrc NA
AFINN: scores words from -5 (extremely negative) to 5 (extremely positive)
bing: positive/negative label for all words
nrc: labels words as fear, joy, anger, etc.
library(tidytext)
# Retrieve only the AFINN lexicon: one row per word with its integer score.
get_sentiments("afinn")
# A tibble: 2,476 x 2
1 abandon -2
2 abandoned -2
3 abandons -2
...
# Read the data. stringsAsFactors = FALSE keeps the text columns as character
# vectors on every R version (the default was TRUE before R 4.0), which is
# what the tokenizer and the later joins on `word` expect.
animal_farm <- read.csv("animal_farm.csv", stringsAsFactors = FALSE)
animal_farm <- as_tibble(animal_farm)

# Tokenize and remove stop words: split `text_column` into one word per row
# (new column `word`), then drop every row whose word is listed in tidytext's
# `stop_words` table. The join key is named explicitly so the match does not
# depend on which column names the two tables happen to share.
animal_farm_tokens <- animal_farm %>%
  unnest_tokens(output = "word", token = "words", input = text_column) %>%
  anti_join(stop_words, by = "word")
# Keep only the tokens that appear in the AFINN lexicon and attach their
# score. The key is spelled out so the join cannot silently pick up any other
# column name the two tables might share, and no "Joining, by = ..." message
# is emitted.
animal_farm_tokens %>%
  inner_join(get_sentiments("afinn"), by = "word")
# A tibble: 1,175 x 3
chapter word score
<chr> <chr> <int>
1 Chapter 1 drunk -2
2 Chapter 1 strange -1
3 Chapter 1 dream 1
4 Chapter 1 agreed 1
5 Chapter 1 safely 1
# Net AFINN sentiment per chapter: match tokens to their AFINN score, add the
# scores up within each chapter, and list the most negative chapters first.
# The join key is explicit so the match is on `word` only.
animal_farm_tokens %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(chapter) %>%
  summarise(sentiment = sum(score)) %>%
  arrange(sentiment)
# A tibble: 10 x 2
chapter sentiment
<chr> <int>
1 Chapter 7 -166
2 Chapter 8 -158
3 Chapter 4 -84
# Number of (stop-word-free) tokens in each chapter: columns `chapter`, `n`.
word_totals <- animal_farm_tokens %>%
  count(chapter)

# Share of each chapter's tokens that the bing lexicon labels negative,
# highest share first.
#
# The chapter totals are attached with a join on `chapter` rather than by
# dividing by `word_totals$n` positionally: positional division is only
# correct when both tables contain exactly the same chapters in the same
# order, and would silently misalign (or recycle) if any chapter had no
# negative words.
animal_farm_tokens %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(chapter, sentiment) %>%
  filter(sentiment == "negative") %>%
  left_join(rename(word_totals, total = n), by = "chapter") %>%
  mutate(p = n / total) %>%
  # Drop the helper column so the result keeps the original four columns.
  select(-total) %>%
  arrange(desc(p)) %>%
  # The original pipeline went through transform() and therefore produced a
  # plain data.frame; keep that return type.
  as.data.frame()
chapter sentiment n p
1 Chapter 7 negative 154 0.11711027
2 Chapter 6 negative 106 0.10750507
3 Chapter 4 negative 68 0.10559006
4 Chapter 10 negative 117 0.10372340
5 Chapter 8 negative 155 0.10006456
6 Chapter 9 negative 121 0.09152799
7 Chapter 3 negative 65 0.08843537
8 Chapter 1 negative 77 0.08603352
9 Chapter 5 negative 93 0.08462238
10 Chapter 2 negative 67 0.07395143
# Count how many NRC lexicon entries carry each sentiment label, most common
# label first. table() tabulates the `sentiment` column and as.data.frame()
# turns the table into a two-column data frame; `Var1` (the label) and `Freq`
# (the count) are the column names seen in the output below.
as.data.frame(table(get_sentiments("nrc")$sentiment)) %>%
arrange(desc(Freq))
Var1 Freq
1 negative 3324
2 positive 2312
3 fear 1476
4 anger 1247
5 trust 1231
6 sadness 1191
...
# NRC lexicon entries labelled "fear".
fear <- get_sentiments("nrc") %>%
  filter(sentiment == "fear")

# Count how often each fear word occurs in the tokenized text, most frequent
# first. The join key is explicit so the match is on `word` only and does not
# depend on which column names the two tables happen to share.
animal_farm_tokens %>%
  inner_join(fear, by = "word") %>%
  count(word, sort = TRUE)
# A tibble: 220 x 2
word n
<chr> <int>
1 rebellion 29
2 death 19
3 gun 19
4 terrible 15
5 bad 14
6 enemy 12
7 broke 11
...
Introduction to Natural Language Processing in R