Analyzing Social Media Data in R
Vivek Vijayaraghavan
Data Science Coach
search_tweets()
library(rtweet)
# Search for tweets tagged "#google", requesting up to 18000 results and
# leaving retweets out; the result is not stored, so it is printed directly.
search_tweets(
  q = "#google",
  n = 18000,
  include_rts = FALSE
)
status_id created_at screen_name
<chr> <S3: POSIXct> <chr>
1164921105066463232 2019-08-23 15:23:29 catapanoannal
1164921037143699456 2019-08-23 15:23:13 STARBEXPLORE
1164920927341039621 2019-08-23 15:22:46 indra_susanto
1164920898475794435 2019-08-23 15:22:40 virfice
1164920877940482048 2019-08-23 15:22:35 KnowledgeNile
1164920647962832897 2019-08-23 15:21:40 mahomes_tech
The created_at column has the timestamp of each tweet.
# Extract tweets on "#camry" using search_tweets()
# Keep the "#camry" search result (up to 18000 tweets, retweets excluded)
# in camry_st so the later steps can build a time series from it.
camry_st <- search_tweets(
  q = "#camry",
  n = 18000,
  include_rts = FALSE
)
created_at screen_name text
<S3: POSIXct> <chr> <chr>
2019-08-23 03:29:58 dromru Toyota Camry 2019 <U+0433><U+043E><U+0434><U+0
2019-08-23 02:59:04 NusTrivia Sportier 2020 Toyota Camry TRD to cost $31,995
2019-08-22 18:09:06 NusTrivia 2020 Toyota Camry TRD Costs $31,995, It’s The
2019-08-23 01:56:51 RaitisRides ALL NEW 2020 Toyota Avalon is coming to R
2019-08-23 01:17:36 jhooie I have to say, when I finally settled down tod
# Plot the frequency of the Camry tweets over time, binned by hour,
# drawn in blue.
ts_plot(
  data = camry_st,
  by = "hours",
  color = "blue"
)
# Aggregate the Camry tweets into hourly counts (columns "time" and "n")
# and preview the first rows.
camry_ts <- ts_data(data = camry_st, by = "hours")
head(camry_ts)
time n
<S3: POSIXct> <int>
2019-08-13 14:00:00 12
2019-08-13 15:00:00 34
2019-08-13 16:00:00 1
2019-08-13 17:00:00 2
# Relabel the two columns so the count column identifies the Camry series
# once it sits next to another series after merging.
camry_ts <- setNames(camry_ts, c("time", "camry_n"))
head(camry_ts)
time camry_n
<S3: POSIXct> <int>
2019-08-13 14:00:00 12
2019-08-13 15:00:00 34
2019-08-13 16:00:00 1
2019-08-13 17:00:00 2
# Repeat the same steps for "#tesla": search, aggregate by hour, and
# relabel the count column so it identifies the Tesla series.
tesla_st <- search_tweets(
  q = "#tesla",
  n = 18000,
  include_rts = FALSE
)
tesla_ts <- ts_data(data = tesla_st, by = "hours")
tesla_ts <- setNames(tesla_ts, c("time", "tesla_n"))
head(tesla_ts)
time tesla_n
<S3: POSIXct> <int>
2019-08-13 13:00:00 17
2019-08-13 14:00:00 58
2019-08-13 15:00:00 38
2019-08-13 16:00:00 32
2019-08-13 17:00:00 38
# Join the two hourly series on "time". With all = TRUE, hours that appear
# in only one series are kept and the other series' count is NA.
merged_df <- merge(
  x = tesla_ts,
  y = camry_ts,
  by = "time",
  all = TRUE
)
head(merged_df)
time tesla_n camry_n
<S3: POSIXct> <int> <int>
2019-08-13 13:00:00 17 NA
2019-08-13 14:00:00 58 12
2019-08-13 15:00:00 38 34
2019-08-13 16:00:00 32 1
# Reshape to long format with melt(): "time" is kept as the id column, the
# series name goes into "variable" and the count into "value". Rows whose
# count is NA are dropped.
library(reshape)
melt_df <- melt(
  data = merged_df,
  id.vars = "time",
  na.rm = TRUE
)
head(melt_df)
time variable value
<S3: POSIXct> <fct> <int>
2019-08-13 13:00:00 tesla_n 17
2019-08-13 14:00:00 tesla_n 58
2019-08-13 15:00:00 tesla_n 38
2019-08-13 16:00:00 tesla_n 32
2019-08-13 17:00:00 tesla_n 38
2019-08-13 18:00:00 tesla_n 34
# Plot the hourly tweet frequency for Camry and Tesla, one coloured line
# per series (the "variable" column distinguishes them).
# ggplot2 is attached here because ggplot(), aes() and geom_line() come from
# it and no visible earlier step in this script loads it.
library(ggplot2)
ggplot(
  data = melt_df,
  mapping = aes(x = time, y = value, colour = variable)
) +
  geom_line(lwd = 0.8)
Analyzing Social Media Data in R