Introduction to Natural Language Processing in R
Kasey Jones
Research Data Scientist
# Tokenize text_column of animal_farm into one word per row, drop stop words,
# and count the remaining words, most frequent first.
animal_farm %>%
  unnest_tokens(output = "word", token = "words", input = text_column) %>%
  # Name the join key explicitly: without `by`, dplyr joins on every shared
  # column name and prints a "Joining, by = ..." message.
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)
# A tibble: 3,611 x 2
word n
<chr> <int>
1 animals 248
2 farm 163
...
# Three one-sentence example texts that differ only in their first word.
text1 <- "Few words are important."
text2 <- "All words are important."
text3 <- "Most words are important."
Unique Words:
text1
text2
text3
# Vocabulary: the lowercased words of the three texts, stop words excluded.
word_vector <- c("few", "all", "most", "words", "important")

# text1 as a 0/1 indicator over word_vector (1 = the word occurs in the text).
text1 <- "Few words are important."
text1_vector <- as.numeric(word_vector %in% c("few", "words", "important"))

# text2 as a 0/1 indicator over word_vector.
text2 <- "All words are important."
text2_vector <- as.numeric(word_vector %in% c("all", "words", "important"))

# text3 as a 0/1 indicator over word_vector.
text3 <- "Most words are important."
text3_vector <- as.numeric(word_vector %in% c("most", "words", "important"))
# Per-chapter word counts for animal_farm: tokenize text_column into words,
# drop stop words, then count each (chapter, word) pair, largest counts first.
words <- animal_farm %>%
  unnest_tokens(output = "word", token = "words", input = text_column) %>%
  # Explicit key: avoids the "Joining, by = ..." message and guarantees the
  # join never picks up another column that happens to share a name.
  anti_join(stop_words, by = "word") %>%
  count(chapter, word, sort = TRUE)

# Print the result.
words
# A tibble: 6,807 x 3
chapter word n
<chr> <chr> <int>
1 Chapter 8 napoleon 43
2 Chapter 8 animals 41
3 Chapter 9 boxer 34
...
# Keep only the rows for the word "napoleon" and order them from the
# highest count to the lowest.
words %>%
  filter(word == "napoleon") %>%
  arrange(desc(n))
# A tibble: 9 x 3
chapter word n
<chr> <chr> <int>
1 Chapter 8 napoleon 43
2 Chapter 7 napoleon 24
3 Chapter 5 napoleon 22
...
8 Chapter 3 napoleon 3
9 Chapter 4 napoleon 1
library(tidytext); library(dplyr)
# Read the tweets. stringsAsFactors = FALSE keeps text columns as character
# on R < 4.0 as well; a factor `content` column cannot be tokenized.
russian_tweets <- read.csv("russian_1.csv", stringsAsFactors = FALSE)
russian_tweets <- as_tibble(russian_tweets)

# One row per word of `content`, with stop words removed. The join key is
# named explicitly so no other shared column name can take part in the join.
tidy_tweets <- russian_tweets %>%
  unnest_tokens(word, content) %>%
  anti_join(stop_words, by = "word")

# Word frequencies over all tweets, most frequent first.
tidy_tweets %>%
  count(word, sort = TRUE)
# A tibble: 43,666 x 2
...
Sparse Matrix
Sparse matrix example:
Introduction to Natural Language Processing in R