Load the R packages
libs = c("twitteR", "RCurl", "tm", "stringr", "wordcloud")
lapply(libs, require, character.only=TRUE)
function 1 : doOAuth
Read OAuth credentials from a file and connect to Twitter
Input - path, filename
doOAuth = function(path, filename){
  file = paste(path, filename, sep='/')
  oauthCreds = read.table(file, header=TRUE)
  setup_twitter_oauth(oauthCreds$consumer_key,
                      oauthCreds$consumer_secret,
                      oauthCreds$access_token,
                      oauthCreds$access_secret)
}
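A minimal usage sketch, assuming a whitespace-delimited credentials file with a header row naming the four columns read above; the path and filename are hypothetical:
# hypothetical credentials file ~/twitter/oauth_credentials.txt:
#   consumer_key consumer_secret access_token access_secret
#   <key>        <secret>        <token>      <secret>
doOAuth("~/twitter", "oauth_credentials.txt")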
function 3 : getTweets_text
Extract the text of each tweet from a list of tweets
Input - list of tweets
Output - character vector of tweet text
getTweets_text = function(tweets_list){
  tweets_text = sapply(tweets_list, function(x) x$getText())
  #str(tweets_text)
  #class(tweets_text)
  return(tweets_text)
}
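For example, the input list can come from twitteR's searchTwitter(); the query and count below are placeholders:
# fetch tweets and extract their text (hypothetical query and count)
tweets_list = searchTwitter("#rstats", n=400)
tweets_text = getTweets_text(tweets_list)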
function 5 : cleanCorpus
Convert tweets to lowercase and remove URLs, punctuation, numbers, stop-words and extra whitespace
Input - tweets corpus
Output - clean tweets corpus
cleanCorpus = function(tweets_corpus){
  # lowercase first so stop-word removal also catches capitalized words ("The", "Will")
  tweets_corpus_clean = tm_map(tweets_corpus, content_transformer(tolower))
  # strip URLs before removePunctuation mangles them into fragments like "tco..."
  removeURL = content_transformer(function(x) gsub("http\\S*|youtu\\S*", " ", x))
  tweets_corpus_clean = tm_map(tweets_corpus_clean, removeURL)
  tweets_corpus_clean = tm_map(tweets_corpus_clean, removePunctuation)
  tweets_corpus_clean = tm_map(tweets_corpus_clean, removeNumbers)
  tweets_corpus_clean = tm_map(tweets_corpus_clean, removeWords, stopwords("english"))
  tweets_corpus_clean = tm_map(tweets_corpus_clean, stripWhitespace)
  #tweets_corpus_clean = tm_map(tweets_corpus_clean, stemDocument)
  return(tweets_corpus_clean)
}
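cleanCorpus() expects a tm corpus; a minimal sketch of building one from the tweet text with tm's VectorSource:
# build a corpus from the extracted tweet text, then clean it
tweets_corpus = Corpus(VectorSource(tweets_text))
tweets_corpus_clean = cleanCorpus(tweets_corpus)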
function 7 : createTDM_WordFreq_df
Create a data frame of the words in the term document matrix and their frequencies
Input - tweets term document matrix tdm
Output - data frame of words sorted by decreasing frequency
createTDM_WordFreq_df = function(tdm){
  tweets_tdm = as.matrix(tdm)
  #class(tweets_tdm)
  #[1] "matrix"
  #str(tweets_tdm)
  ###################
  # num [1:1409, 1:400] 0 0 0 0 0 0 0 0 0 0 ...
  # - attr(*, "dimnames")=List of 2
  # ..$ Terms: chr [1:1409] "-fantastic"| __truncated__ "'blade"| __truncated__ "'the"| __truncated__ "\"ui"| __truncated__ ...
  # ..$ Docs : chr [1:400] "1" "2" "3" "4" ...
  ###############################
  tdm_term_freq_sort = sort(rowSums(tweets_tdm), decreasing=TRUE)
  #class(tdm_term_freq_sort)
  #[1] "numeric"
  #str(tdm_term_freq_sort)
  ##############
  # Named num [1:1409] 82 78 78 78 78 69 62 53 46 46 ...
  # - attr(*, "names")= chr [1:1409] "connect" "carmack" "john" "keynote" ...
  ##################################
  # increasing sort - handy for inspecting the rarest terms (not used below)
  tdm_term_freq_sort_inc = sort(rowSums(tweets_tdm), decreasing=FALSE)
  #class(tdm_term_freq_sort_inc)
  #[1] "numeric"
  #str(tdm_term_freq_sort_inc)
  ##############
  # Named num [1:1409] 1 1 1 1 1 1 1 1 1 1 ...
  # - attr(*, "names")= chr [1:1409] "-fantastic"| __truncated__ "'blade"| __truncated__ "'the"| __truncated__ "\"ui"| __truncated__ ...
  ##################################
  tdm_term_freq_df = data.frame(word = names(tdm_term_freq_sort),
                                freq = tdm_term_freq_sort)
  #str(tdm_term_freq_df)
  #head(tdm_term_freq_df,10)
  #TODO: reset rownames to row numbers, e.g. rownames(tdm_term_freq_df) = NULL
  #return(list(name = searchTerms, tdm = tdm_term_freq_df))
  return(tdm_term_freq_df)
}
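Putting the steps together; TermDocumentMatrix() is the standard tm constructor, and the variable names mirror those used in the analysis below:
# build the term document matrix from the clean corpus and tabulate word frequencies
tweets_tdm = TermDocumentMatrix(tweets_corpus_clean)
tweets_WordFreq_df = createTDM_WordFreq_df(tweets_tdm)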
Analysis Using TDM
Term Document Matrix
# find words that occur at least four times
findFreqTerms(tweets_tdm, lowfreq=4)
# find correlated words
# identify words associated with "trump"
findAssocs(tweets_tdm, terms="trump", corlimit=0.3)
# $trump
# donald hillary clinton huffpost million
# 0.86 0.58 0.47 0.32 0.32
# strong tcobpuanisezn tcorxijbafo
# 0.32 0.32 0.32
# frequency table of the top words from the data frame
head(tweets_WordFreq_df,10)
# word freq
# trump trump 570
# hillary hillary 563
# donald donald 561
# clinton clinton 554
# president president 552
# will will 263
# the the 255
# united united 172
# become become 166
# crush crush 160
# plot word frequencies
# bar plot of the ten most frequent words
barplot(tweets_WordFreq_df[1:10,]$freq,
        las = 2,
        names.arg = tweets_WordFreq_df[1:10,]$word,
        col = "lightblue",
        main = "Most frequent words",
        ylab = "Word frequencies")
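The wordcloud package loaded at the top can render the same frequency table; a minimal sketch, where min.freq and max.words are arbitrary choices:
# word cloud of the most frequent words
# wordcloud attaches RColorBrewer, which provides brewer.pal
wordcloud(words = tweets_WordFreq_df$word,
          freq = tweets_WordFreq_df$freq,
          min.freq = 5,
          max.words = 100,
          random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))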