Text Mining with Bag-of-Words in R
Ted Kwartler
Instructor
# Commonality cloud: plot the terms that the coffee and chardonnay tweets share.

# Combine both corpora: all_tweets
# Each tweet set is collapsed into one document. A single-space separator keeps
# the last word of one tweet from fusing with the first word of the next.
all_coffee <- paste(coffee_tweets$text, collapse = " ")
all_chardonnay <- paste(chardonnay_tweets$text, collapse = " ")
all_tweets <- c(all_coffee, all_chardonnay)

# Clean all_tweets
# clean_corpus() is not defined in this section; it is applied to the
# two-document corpus before the term-document matrix is built.
all_tweets <- VectorSource(all_tweets)
all_corpus <- VCorpus(all_tweets)
all_clean <- clean_corpus(all_corpus)
# The matrix is read back as all_tdm on the next line, so it must be stored
# under that name (it was previously assigned to all_dm).
all_tdm <- TermDocumentMatrix(all_clean)
all_m <- as.matrix(all_tdm)

# Make commonality cloud
commonality.cloud(all_m, colors = "steelblue1", max.words = 100)
# Comparison cloud: contrast the terms used in the coffee and chardonnay tweets.

# Combine both corpora: all_tweets
# Each tweet set is collapsed into one document. A single-space separator keeps
# the last word of one tweet from fusing with the first word of the next.
all_coffee <- paste(coffee_tweets$text, collapse = " ")
all_chardonnay <- paste(chardonnay_tweets$text, collapse = " ")
all_tweets <- c(all_coffee, all_chardonnay)

# Clean all_tweets
# clean_corpus() is not defined in this section; it is applied to the
# two-document corpus before the term-document matrix is built.
all_tweets <- VectorSource(all_tweets)
all_corpus <- VCorpus(all_tweets)
all_clean <- clean_corpus(all_corpus)
all_tdm <- TermDocumentMatrix(all_clean)

# Name the two document columns in the order the documents were combined.
colnames(all_tdm) <- c("coffee", "chardonnay")
all_m <- as.matrix(all_tdm)

# Make comparison cloud
comparison.cloud(all_m, colors = c("orange", "blue"), max.words = 50)
# Identify terms shared by both documents
# all_m is the term-by-document matrix (one column per document). Keep only the
# rows whose count is non-zero in both columns.
# NOTE(review): this previously read `all_tdm_m`, which is not defined in this
# script; `all_m` is the matrix form of all_tdm -- confirm no separate
# all_tdm_m object was intended.
in_both <- all_m[, 1] > 0 & all_m[, 2] > 0
common_words <- all_m[in_both, , drop = FALSE]

# Find most commonly shared words
# Rank the shared terms by the absolute gap between their two counts.
difference <- abs(common_words[, 1] - common_words[, 2])
common_words <- cbind(common_words, difference)
rank_order <- order(common_words[, 3], decreasing = TRUE)
# drop = FALSE keeps a matrix even when only one shared term remains.
common_words <- common_words[rank_order, , drop = FALSE]

# Take at most 25 rows; head() does not fail when fewer terms are shared.
top_words <- head(common_words, 25)
top25_df <- data.frame(
  x = top_words[, 1],
  y = top_words[, 2],
  labels = rownames(top_words)
)
# Make pyramid plot
# Plots top25_df$x and top25_df$y as opposing horizontal bars, one row per
# entry in top25_df$labels.
# NOTE(review): top.labels is given as left / centre / right, so the x counts
# are drawn under "Chardonnay" -- confirm that x holds the chardonnay column.
pyramid.plot(
  top25_df$x, top25_df$y,
  labels = top25_df$labels,
  main = "Words in Common",
  gap = 8,
  # The argument is `laxlab`; the misspelling `laxly` is rejected as unused.
  laxlab = NULL,
  raxlab = NULL,
  unit = NULL,
  top.labels = c("Chardonnay", "Words", "Coffee")
)
# Create word network
# Draw the network of terms that co-occur with "barista" in the coffee tweets.
# Top200Words plus the terms "coffee" and "amp" are passed as stopwords.
word_associate(
  coffee_tweets$text,
  match.string = "barista",
  network.plot = TRUE,
  cloud.colors = c("gray85", "darkred"),
  stopwords = c(Top200Words, "coffee", "amp")
)

# Add title
title(main = "Barista Coffee Tweet Associations")
Text Mining with Bag-of-Words in R