Introduction to Natural Language Processing in R
Kasey Jones
Research Data Scientist
Napoleon
Boxer
# Split the raw text into one row per sentence
sentences <- animal_farm %>%
  unnest_tokens(
    output = "sentence",
    input = text_column,
    token = "sentences"
  )
# Flag each sentence by the animal it mentions, then mask the name so the
# classifier cannot simply read it. Sentences were lower-cased by
# unnest_tokens, so lower-case patterns suffice.
for (animal in c("boxer", "napoleon")) {
  sentences[[animal]] <- grepl(animal, sentences$sentence)
  sentences$sentence <- gsub(animal, "animal X", sentences$sentence)
}
# Keep only the sentences that mention exactly one of the two animals
mentions_one <- xor(sentences$boxer, sentences$napoleon)
animal_sentences <- sentences[mentions_one, ]
animal_sentences$Name <-
  as.factor(ifelse(animal_sentences$boxer, "boxer", "napoleon"))
# Balance the classes: keep the first 75 sentences for each animal.
# head() is used instead of `[1:75, ]` because indexing past the end of a
# data frame pads the result with NA rows when fewer than 75 matches exist.
n_per_animal <- 75
boxer_rows <- animal_sentences[animal_sentences$Name == "boxer", ]
napoleon_rows <- animal_sentences[animal_sentences$Name == "napoleon", ]
animal_sentences <- rbind(
  head(boxer_rows, n_per_animal),
  head(napoleon_rows, n_per_animal)
)
# Unique per-sentence id, used later as the document id of the DTM
animal_sentences$sentence_id <- seq_len(nrow(animal_sentences))
library(tm); library(tidytext)
library(dplyr); library(SnowballC)
# Tokenize each sentence into words, remove stop words, and stem.
animal_tokens <- animal_sentences %>%
  unnest_tokens(output = "word", token = "words", input = sentence) %>%
  # State the join column explicitly instead of relying on anti_join's
  # implicit common-column guess (which also emits a "Joining, by" message)
  anti_join(stop_words, by = "word") %>%
  mutate(word = wordStem(word))
# Count words per sentence, then cast to a TF-IDF weighted
# document-term matrix with one document per sentence.
word_counts <- animal_tokens %>%
  count(sentence_id, word)
animal_matrix <- word_counts %>%
  cast_dtm(
    document = sentence_id,
    term = word,
    value = n,
    weighting = tm::weightTfIdf
  )
animal_matrix
<<DocumentTermMatrix (documents: 150, terms: 694)>>
Non-/sparse entries: 1235/102865
Sparsity : 99%
Maximal term length: 17
Weighting : term frequency - inverse document frequency
Solution: removeSparseTerms()
removeSparseTerms(animal_matrix, sparse = .90)
<<DocumentTermMatrix (documents: 150, terms: 4)>>
Non-/sparse entries: 207/393
Sparsity : 66%
removeSparseTerms(animal_matrix, sparse = .99)
<<DocumentTermMatrix (documents: 150, terms: 172)>>
Non-/sparse entries: 713/25087
Sparsity : 97%
Introduction to Natural Language Processing in R