WordCloud – NoSimpler

Twitter Word Cloud

A word cloud can be built from words occurring in tweets for given search terms or tweet topic. It's a visual representation of the frequency of words - higher the frequency bigger is the size of the word in the word cloud plot .

Programming Logic

Steps for Twitter mining to build wordcloud

Pre-requisite:
Register your application with Twitter for OAuth so that you can read the tweets in to your R code.

Step 1:
Install the required R packages and load them

Step 2:
Set up the environment options, if any

Step 3:
Connect to Twitter using the OAuth credentials

Step 4:
Read the desired number of tweets (max. ~ 1500) for the given topic or search terms

Step 5:
Convert the tweets into text

Step 6:
Clean the tweets for analysis by removing punctuation, numbers, stop-words, special characters; and make text lowercase

Step 7:
Create tweet corpus, it is a data structure where tweets are stored as documents

Step 8:
Create the term document matrix, TDM, for the tweets - it stores the words and their frequency

Step 9:
Build the wordcloud using the TDM

OAuth with Twitter

Save the oauth credentials in a text file with Tab delimited column names as header. Path of file should be same as that of R code.

OAuth your R application with Twitter for reading in tweets

See video on oauth @2:16:00:

Install required R packages

install.packages("twitteR", dependencies=TRUE)
install.packages("RCurl")
install.packages('bitops')
install.packages('base64enc')
install.packages('httpuv')
install.packages('tm')
install.packages('wordcloud')
install.packages("stringr")

Load the R packages

libs = c("twitteR", "RCurl", "tm", "stringr", "wordcloud")
lapply(libs, require, character.only=TRUE)

Set Environment option

To Not convert character variables to factor variables

options(stringsAsFactors = FALSE)

function 1 : doOAuth

Read oauth credentials from file and connect to Twitter

Input - filepath, filename

doOAuth = function(path, filename){

file = paste(path,filename,sep='/')
oauthCreds = read.table(file,header=T)
setup_twitter_oauth(oauthCreds$consumer_key,
oauthCreds$consumer_secret,
oauthCreds$access_token,
oauthCreds$access_secret)

}

function 2 : getTweets

Read recent tweets for search terms or topic
Input - search terms, number of tweets
Output - list containing tweets

getTweets = function(searchTerms, numberOfTweets){

tweets_list = searchTwitter(searchTerms,lang="en",n=numberOfTweets,resultType="recent")
#length(tweets_list)
class(tweets_list)
return(tweets_list)

}

function 3 : getTweets_text

Retrieve the text part from tweets in list

Input - list containing tweets

Output - character vector

getTweets_text = function(tweets_list){

tweets_text = sapply(tweets_list, function(x) x$getText())
#str(tweets_text)
#class(tweets_text)
return (tweets_text)

}

function 4 : getTweets_corpus

Create tweets corpus in which each document is a tweet

Input - character vector of tweets in text format

Output - corpus of tweets

getTweets_corpus = function(tweets_text){

tweets_corpus = Corpus(VectorSource(tweets_text))
tweets_corpus
#see the first tweet in the tweet corpus
inspect(tweets_corpus[1:3])
return(tweets_corpus)

}

function 5 : cleanCorpus

Remove punctuations, whitespaces, numbers, stop-words from tweets and convert to lowercase

Input - tweets corpus

Output - clean tweets corpus

cleanCorpus = function(tweets_corpus){

tweets_corpus_clean = tm_map(tweets_corpus, removePunctuation)
tweets_corpus_clean = tm_map(tweets_corpus_clean, stripWhitespace)
tweets_corpus_clean = tm_map(tweets_corpus_clean, removeNumbers)
tweets_corpus_clean = tm_map(tweets_corpus_clean, removeWords, stopwords("english"))
tweets_corpus_clean = tm_map(tweets_corpus_clean, content_transformer(tolower))
toSpace = content_transformer(function(x, pattern) gsub(pattern,"",x))
tweets_corpus_clean = tm_map(tweets_corpus_clean, toSpace,"https*|youtu*")
#tweets_corpus_clean = tm_map(tweets_corpus_clean, stemDocument)
return(tweets_corpus_clean)

}

function 6 : create_TDM

TDM(term document matrix) contains the words and their frequency. It is a simple_triple_matrix

Input - search terms, clean tweets corpus

Output - term document matrix

createTDM = function(searchTerms,tweets_corpus_clean){

tweets_tdm = TermDocumentMatrix(tweets_corpus_clean)
#class(tweets_tdm)
#[1] "TermDocumentMatrix" "simple_triplet_matrix"
str(tweets_tdm)
###############
# List of 6
# $ i : int [1:3946] 96 295 305 501 510 513 527 532 780 823 ...
# $ j : int [1:3946] 1 1 1 1 1 1 1 1 1 1 ...
# $ v : num [1:3946] 1 1 1 1 1 1 1 1 1 1 ...

# $ nrow : int 1409
# $ ncol : int 400

# $ dimnames:List of 2
# ..$ Terms: chr [1:1409] "—fantastic""| __truncated__ "‘blade""| __truncated__ "‘the""| __truncated__ "“ui""| __truncated__ ...

# ..$ Docs : chr [1:400] "1" "2" "3" "4" ...
# - attr(*, "class")= chr [1:2] "TermDocumentMatrix" "simple_triplet_matrix"
# - attr(*, "weighting")= chr [1:2] "term frequency" "tf"
###################################

return(tweets_tdm)

}

function 7 : createTDM_WordFreq_df

Create a dataframe for words and their frequency in the term document matrix

Input - tweets term document matrix tdm

Output - clean tweets corpus

createTDM_WordFreq_df = function(tdm){
tweets_tdm = as.matrix(tdm)
class(tweets_tdm)
#[1] "matrix"
str(tweets_tdm)
###################
# num [1:1409, 1:400] 0 0 0 0 0 0 0 0 0 0 ...
# - attr(*, "dimnames")=List of 2
# ..$ Terms: chr [1:1409] "-fantastic""| __truncated__ "'blade""| __truncated__ "'the""| __truncated__ ""ui""| __truncated__ ...
# ..$ Docs : chr [1:400] "1" "2" "3" "4" ...
###############################

tdm_term_freq_sort = sort(rowSums(tweets_tdm), decreasing=TRUE)
#class(tdm_term_freq_sort)
#[1] "numeric"
#str(tdm_term_freq_sort)
##############
# Named num [1:1409] 82 78 78 78 78 69 62 53 46 46 ...
# - attr(*, "names")= chr [1:1409] "connect" "carmack" "john" "keynote" ...
#
##################################

tdm_term_freq_sort_inc = sort(rowSums(tweets_tdm), decreasing=FALSE)
#class(tdm_term_freq_sort_inc)
#[1] "numeric"
#str(tdm_term_freq_sort_inc)
##############
# Named num [1:1409] 1 1 1 1 1 1 1 1 1 1 ...
# - attr(*, "names")= chr [1:1409] "-fantastic""| __truncated__ "'blade""| __truncated__ "'the""| __truncated__ ""ui""| __truncated__ ...
##################################

tdm_term_freq_df = data.frame(word = names(tdm_term_freq_sort),
freq = tdm_term_freq_sort)
str(tdm_term_freq_df)
head(tdm_term_freq_df,10)
#set rownames to number
#return( list(name = searchTerms, tdm = tweets_term_freq_df))
return(tdm_term_freq_df)
}

Plot the word-cloud

Call twitter data mining functions per the programming logic and plot a word-cloud for search terms

# oauth with twitter

doOAuth("<C:/...>","<twitter_Oauth_credentials.txt>")

# set seed so that result is same

set.seed(1234)

#specifiy the search terms

searchTerms = c('Donald Trump + Hillary Clinton + President')

# specify number of tweets to read

numberOfTweets = 400

# call data mining functions

tweets_list = getTweets(searchTerms, numberOfTweets)
length(tweets_list)
tweets_text = getTweets_text(tweets_list)
tweets_corpus = getTweets_corpus(tweets_text)
tweets_clean = cleanCorpus(tweets_corpus)
tweets_tdm = createTDM(searchTerms, tweets_clean)
tweets_WordFreq_df = createTDM_WordFreq_df(tweets_tdm)

class(tweets_WordFreq_df)
str(tweets_WordFreq_df)
#'data.frame': 1707 obs. of 2 variables:
# $ word: chr "amp" "virtualreality" "microsoft" "tco…""| __truncated__ ...
# $ freq: num 73 56 39 36 36 35 33 32 31 29 ...

# plot wordcloud

wordcloud(words = tweets_WordFreq_df$word,
freq= tweets_WordFreq_df$freq,
min.freq=5,
max.words=300,
random.order=FALSE,
rot.per=0.35,
colors=brewer.pal(8,'Dark2'),
scale=c(3,0.5))

Analysis Using TDM

Term Document Matrix

#find words that occur at least four times

findFreqTerms(tweets_tdm, lowfreq=1)

# find correlated words
# identify words associated with google

findAssocs(tweets_tdm, terms="trump", corlimit=0.3)
# $trump
# donald hillary clinton huffpost million
# 0.86 0.58 0.47 0.32 0.32
# strong tcobpuanisezn tcorxijbafo
# 0.32 0.32 0.32

# frequency table of words using data frame

head(tweets_WordFreq_df,10)
# word freq
# trump trump 570
# hillary hillary 563
# donald donald 561
# clinton clinton 554
# president president 552
# will will 263
# the the 255
# united united 172
# become become 166
# crush crush 160

# plot word frequencies
# plot the frequency of first 10 frequent words

barplot(tweets_WordFreq_df[1:10,]$freq,
las =2,
names.arg = tweets_WordFreq_df[1:10,]$word,
col="lightblue",
main="Most frequent words",
ylab = "Word frequencies")

Word cloud for tweet search term

'Donald + Hillary + President'

wc-donald-hillary-president

Barplot for most frequent words for tweet

'Donald + Hillary + President'

sa-donald-hillary-freq-words