Text Mining with Bag-of-Words in R
Ted Kwartler
Instructor
# Create bigram TDM
# NOTE(review): `tokenizer` and `amzn_pros_corp` are defined outside this
# snippet; `tokenizer` is presumably a bigram tokenizer -- confirm.
amzn_p_tdm <- TermDocumentMatrix(
  amzn_pros_corp,
  control = list(tokenize = tokenizer)
)

# Convert TDM to matrix
amzn_p_m <- as.matrix(amzn_p_tdm)

# Compute term frequencies (row sums of the matrix, one total per row name)
amzn_p_freq <- rowSums(amzn_p_m)

# Sort in decreasing order of frequency
term_frequency <- sort(amzn_p_freq, decreasing = TRUE)

# View the top 5 most frequent bigrams
# head() returns at most 5 entries and does not pad with NA when fewer exist.
head(term_frequency, 5)
      good pay great benefits   smart people
            25             24             20
    place work     fast paced
            17             16
# Find common words: keep rows of all_tdm_m that are non-zero in both of the
# first two columns, then rank them by the absolute difference of those columns.
common_words <- subset(all_tdm_m, all_tdm_m[, 1] > 0 & all_tdm_m[, 2] > 0)
difference <- abs(common_words[, 1] - common_words[, 2])
common_words <- cbind(common_words, difference)
# drop = FALSE keeps a matrix even when only one common row remains.
common_words <- common_words[
  order(common_words[, 3], decreasing = TRUE), , drop = FALSE
]

# Create data frame: top 15 words
# Cap the selection at the number of available rows so fewer than 15 common
# words does not raise a "subscript out of bounds" error.
top_idx <- seq_len(min(15, nrow(common_words)))
top15_df <- data.frame(
  x = common_words[top_idx, 1],
  y = common_words[top_idx, 2],
  labels = rownames(common_words)[top_idx]
)

# Make pyramid plot
pyramid.plot(
  top15_df$x, top15_df$y,
  labels = top15_df$labels,
  gap = 12,
  main = "Words in Common",
  unit = NULL,
  top.labels = c("Amzn", "Cons Words", "Google")
)
Text Mining with Bag-of-Words in R