Christmas Tweets and what they tell

 

Introduction

library(plyr)
library(dplyr)
library(stringr)
library(ggplot2)
library(highcharter)
library(tm.plugin.webmining)
library(plotly)
library(tidytext)
library(gridExtra)
library(igraph)
library(widyr)
# Load the raw holiday tweets. quote = "" turns off quote processing so that
# quote characters inside the tweet text are read literally.
hols <- read.csv(
  "HolidayTweets.csv",
  stringsAsFactors = FALSE,
  header = TRUE,
  quote = ""
)

# NOTE(review): this chunk is truncated. Only the assignment targets survive
# (text, data_tweets and its hashtags, num_hashtags, clean_text,
# number_of_words, picture and X columns); the right-hand sides are missing,
# presumably lost when the page was converted to plain text. Restore them
# from the original R Markdown source before running.
text text data_tweets data_tweets$hashtags data_tweets$num_hashtags data_tweets$clean_text data_tweets$number_of_words

data_tweets$picture

data_tweets$X ```

The data set given consists of ID information and the HTML metadata. Here we use the extractHTMLStrip() function provided by the **tm.plugin.webmining**
library. Upon extraction, we use several text-processing tools to derive other information, such as hashtags, the number of words, and whether pictures/links were used.

## Hashtags

### What were the most commonly used Hashtags?
```r
# NOTE(review): the start of this chunk is truncated. The statements that
# build `hash` and `hash$text` have lost their right-hand sides, and the
# pipe feeding group_by() is reduced to a bare `%`. Restore them from the
# original source before running.
# The visible part of the pipeline counts rows per hashtag text, sorts by
# descending count, keeps hashtags seen more than 150 times, and draws a
# column chart of log(n) with the x-axis labels rotated by -90 degrees.
hash hash hash$text % group_by(text) %>% summarise(n=n()) %>% arrange(desc(n)) %>% filter(n>150) %>%
hchart("column",x=text,y=log(n)) %>%
hc_xAxis(labels = list(rotation = -90, step = 1))


As expected, the most popular hashtags are Christmas-related. We also see #iphone, #selfie, #giftcard, #blackfriday and #shoes, among others. Christmas shopping hit an all-time high in 2016, with overall spending in the US crossing a trillion dollars. We also see mentions of other festivals during the season, such as #hanukkah.

Distribution of Number of Hashtags used

# Histogram of the number of hashtags per tweet, using bins of width 1.
ggplot(data = data_tweets, mapping = aes(x = num_hashtags)) +
  geom_histogram(binwidth = 1)

Number of Words Used

Distribution of Number of Words Used

# Histogram of the number of words per tweet, using bins of width 1.
ggplot(data = data_tweets, mapping = aes(x = number_of_words)) +
  geom_histogram(binwidth = 1) +
  labs(title = "Number of Words Used")

Distribution of Number of Words used with and without hashtags

# Overlay the word-count histograms of tweets with and without hashtags.
# position = "identity" draws the two groups on top of each other instead of
# stacking them; alpha = 0.4 keeps both visible where they overlap.
ggplot(
  mutate(data_tweets, if_hashtag = ifelse(num_hashtags > 0, "Yes", "No")),
  aes(x = number_of_words, fill = if_hashtag)
) +
  geom_histogram(position = "identity", alpha = 0.4, binwidth = 0.5)

We see that both distributions are skewed, which leads us to use the median to summarize the number of words in each distribution.

Medians

# Interactive table of the median word count for tweets with and without
# hashtags. The median is reported in column `n`.
DT::datatable(
  data_tweets %>%
    mutate(if_hashtag = ifelse(num_hashtags > 0, "Yes", "No")) %>%
    group_by(if_hashtag) %>%
    summarise(n = median(number_of_words))
)

Based on the medians, we estimate that tweets with hashtags use at least 5 times as many words as tweets without hashtags.

Number of Tweets with(out) pictures and links

Bar Chart Visualization

# Column chart of the number of tweets per `picture` value. The counts are
# plotted as log(n).
data_tweets %>%
  group_by(picture) %>%
  summarise(n = n()) %>%
  hchart("column", x = picture, y = log(n))

We see that most of the tweets did not contain pictures/links. The bars look comparable only because the counts are plotted on a log scale.

Do tweets with links/pictures have more/less word usage?

# Interactive table of the median word count per `picture` value. The median
# is reported in column `n`.
DT::datatable(
  data_tweets %>%
    group_by(picture) %>%
    summarise(n = median(number_of_words))
)

Tweets with pictures/links tend to use more words, based on the median estimate.

Hashtags and Links

# Flag every tweet by whether it uses at least one hashtag.
data_tweets <- data_tweets %>%
  mutate(if_hashtag = ifelse(num_hashtags > 0, "Yes", "No"))

# Count tweets for each (picture, if_hashtag) combination.
temp <- data_tweets %>%
  group_by(picture, if_hashtag) %>%
  summarise(n = n()) %>%
  ungroup()

# Do a group-wise transform(), splitting on "picture", so that the
# percentages add up to 100 within each picture group.
ce <- ddply(temp, "picture", transform, percent_n = n / sum(n) * 100)

# Stacked bars: one bar per picture group, filled by hashtag usage.
ggplot(ce, aes(x = picture, y = percent_n, fill = if_hashtag)) +
  geom_bar(stat = "identity")

A large portion of tweets that have pictures/links use hashtags.

Sentiments

library(tidyr)
library(RSentiment)
tweet tweet = gsub("(f|ht)(tp)(s?)(://)(.*)[.|/](.*)", " ", tweet)
# Strip retweet / "via" markers together with the handles that follow them.
tweet <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", " ", tweet)
# Strip hashtags.
tweet <- gsub("#\\w+", " ", tweet)
# Strip @mentions.
tweet <- gsub("@\\w+", " ", tweet)
# Strip punctuation.
tweet <- gsub("[[:punct:]]", " ", tweet)
# Strip digits.
tweet <- gsub("[[:digit:]]", " ", tweet)
# Replace every character outside [:graph:] with a space; this step is
# intended to drop emojis.
# NOTE(review): confirm that the regex engine used by stringr does not treat
# emoji as [:graph:] characters, otherwise they survive this step.
tweet <- str_replace_all(tweet, "[^[:graph:]]", " ")
# NOTE(review): this chunk is truncated. The assignment targets survive
# (tweet, wordstoremove, data_tweets$clean_text_2, data_tweets$sentiment and
# the per-row data_tweets$sentiment[i] inside the loop), but their
# right-hand sides are missing, presumably lost when the page was converted
# to plain text. Restore them from the original source before running.
tweet tweet wordstoremove tweet # removing non-english characters
#tweet1 data_tweets$clean_text_2

data_tweets$sentiment

# Visits every row of data_tweets by index.
for(i in 1:dim(data_tweets)[1])
{
data_tweets$sentiment[i] }
# Percentage of tweets in each sentiment class. The result is deliberately
# not called `pi`, so the built-in constant stays usable.
sentiment_share <- data_tweets %>%
  group_by(sentiment) %>%
  summarise(n = n()) %>%
  mutate(Percentage = (n / sum(n)) * 100)

# Pie chart of the sentiment percentages; grid lines, zero lines and tick
# labels are switched off on both axes.
plot_ly() %>%
  add_pie(
    data = sentiment_share,
    labels = sentiment_share$sentiment,
    values = sentiment_share$Percentage,
    name = ""
  ) %>%
  layout(
    title = "Percentage Sentiment in Tweets",
    xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
  )


We see that most of the tweets were neutral in nature, followed by positive and negative ones.

Why are there negative tweets?

To answer this question, we look into what words make up negative tweets.

# NOTE(review): this chunk is truncated. The right-hand sides of words_neg,
# temp and words$sentiment[i] are missing, the pipes after `words` and
# inside the loop are reduced to a bare `%`, and the loop's closing brace is
# gone. Restore them from the original source before running.
# The visible steps split clean_text_2 into one word per row, drop stop
# words and tokens that are not made only of lowercase letters and
# apostrophes, keep words labelled "Negative" or "Very Negative", and chart
# the words that occur more than 10 times.
words_neg temp

words %
unnest_tokens(word,clean_text_2)%>%
filter(!word %in% stop_words$word,
str_detect(word,"^[a-z']+$"))

for(i in 1:dim(words)[1])
{
words$sentiment[i] % filter(sentiment=="Negative"| sentiment=="Very Negative") %>% group_by(word) %>% summarise(n=n()) %>% arrange(desc(n)) %>% filter(n>10) %>%
hchart("column",x=word,y=n)%>%
hc_xAxis(labels = list(rotation = -90, step = 1))

Words like "emergency", "shame", "pig", "evil", "bs" and "terrible" come under this category.

Sentiments of Tweets that have/do not have pictures/links

# Axis settings that hide the zero line, axis line, tick labels and grid.
# Passed as both xaxis and yaxis to layout() for the pie charts.
ax <- list(
  zeroline = FALSE,
  showline = FALSE,
  showticklabels = FALSE,
  showgrid = FALSE
)
# NOTE(review): this chunk is truncated. The pipes are reduced to a bare `%`
# and the statements that first create temp_pic and temp_no_pic have lost
# their right-hand sides. Restore them from the original source before
# running.
# The visible steps count tweets per (picture, sentiment) pair and add a
# `percentage` column, n / sum(n) * 100, to temp_pic and temp_no_pic.
temp % group_by(picture,sentiment) %>% summarise(n=n())
temp_pic temp_no_pic temp_pic % mutate(percentage=(n/sum(n))*100)
temp_no_pic % mutate(percentage=(n/sum(n))*100)
# Two sentiment pies in one figure: temp_pic occupies the right part of the
# plotting area (x from 0.52 to 1) and temp_no_pic the left part (x from 0
# to 0.48). Both sit in the upper half (y from 0.5 to 1).
pie_chart_1 <- plot_ly() %>%
  add_pie(
    data = temp_pic,
    labels = temp_pic$sentiment,
    values = temp_pic$percentage,
    name = "By Sentiment",
    domain = list(x = c(0.52, 1), y = c(0.5, 1))
  ) %>%
  add_pie(
    data = temp_no_pic,
    labels = temp_no_pic$sentiment,
    values = temp_no_pic$percentage,
    name = "By Sentiment",
    domain = list(x = c(0, 0.48), y = c(0.5, 1))
  ) %>%
  layout(
    title = "Picture and No Picture",
    xaxis = ax,
    yaxis = ax
  )
pie_chart_1
#pie_chart_2

Most Common Words Used

library(DT)
# NOTE(review): this chunk is truncated. The right-hand side of `temp` is
# missing and the pipe after `words` is reduced to a bare `%`. It was
# presumably `words <- temp %>%`, but confirm against the original source.
# The visible steps split clean_text_2 into one word per row, then drop stop
# words and any token that is not made only of lowercase letters and
# apostrophes.
temp

words %
unnest_tokens(word,clean_text_2)%>%
filter(!word %in% stop_words$word,
str_detect(word,"^[a-z']+$"))

# Column chart of log(n) for words that occur more than 250 times, most
# frequent first, with the x-axis labels rotated by -90 degrees.
words %>%
  group_by(word) %>%
  summarise(n = n()) %>%
  filter(n > 250) %>%
  arrange(desc(n)) %>%
  hchart("column", x = word, y = log(n)) %>%
  hc_xAxis(labels = list(rotation = -90, step = 1))

#DT::datatable(words %>% group_by(word) %>% summarise(n=n()) %>% filter(n>100) %>% arrange(desc(n)))

Correlated Terms

# Words that occur at least 30 times.
words_counts <- words %>%
  count(word) %>%
  filter(n >= 30)

# Pairwise correlation between the frequent words, measured across the
# values of column X (presumably the tweet identifier; confirm against the
# data preparation step). The join key is given explicitly so that only
# `word` is matched.
words_correlations <- words %>%
  semi_join(words_counts, by = "word") %>%
  pairwise_cor(word, X, sort = TRUE, upper = FALSE)
# NOTE(review): this chunk is truncated. Only fragments survive: a 0.7
# threshold applied to words_correlations, and the targets `graph`,
# E(graph)$weight and V(graph)$label.cex without their right-hand sides.
# Restore them from the original source before running.
words_correlations 0.7)
words_correlations graph E(graph)$weight V(graph)$label.cex

# Draw the word network with a Fruchterman-Reingold layout. Edge width
# follows the `weight` edge attribute. layout_with_fr is the current igraph
# name for the layout previously called layout.fruchterman.reingold.
plot(
  graph,
  edge.width = E(graph)$weight,
  layout = layout_with_fr,
  vertex.size = 5
)
library(networkD3)
# NOTE(review): this line is truncated. The right-hand side of `graph` is
# missing, and what remains after it is a commented-out simpleNetwork() call.
# Restore it from the original source before running.
graph #g simpleNetwork(graph,charge = -200 , opacity = 0.6, zoom = T, fontSize = 15)

#DT::datatable(words_correlations)

Source

You can find a more interactive presentation here. This presentation makes use of the RMarkdown format and the HTML widgets made available by the plotly and highcharter packages. Through this project, we were able to understand Tweets related to Christmas a little better. A large portion of the Tweets did not make use of pictures/links. We were surprised to see some negative Tweets. The most correlated terms mostly mention products advertised for the holiday season.

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google+ photo

You are commenting using your Google+ account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s