# =========================================================================
# LOAD REQUIRED PACKAGES
# =========================================================================
# If required, install the packages with:
# > install.packages(c("ggplot2", "rJava", "RWeka", "SnowballC", "tm",
#                      "wordcloud"), dependencies = TRUE)
# -------------------------------------------------------------------------
library(ggplot2)
library(rJava)
library(RWeka)
library(SnowballC)   # provides the stemmer behind stemDocument()
library(tm)
library(wordcloud)

# -------------------------------------------------------------------------
# Arrange the plots (one barplot per speech) side by side
par(mfrow = c(1, 2))

# -------------------------------------------------------------------------
# Loop through the text files 1.txt and 2.txt
for (k in 1:2) {

  # =======================================================================
  # LOAD THE TEXT FILE NAMED k.txt AS A CORPUS, THAT IS A COLLECTION OF
  # TEXT DOCUMENTS
  # =======================================================================
  speech <- readLines(paste0(k, ".txt"), warn = FALSE)
  myCorpus <- VCorpus(VectorSource(speech))

  # =======================================================================
  # INSPECT THE TEXT
  # =======================================================================
  # Wrap the text to fit paper width. Iterate over every document in the
  # corpus (the original hard-coded 1:1 showed only the first document).
  # Output goes through cat()/writeLines() because bare expressions are
  # not auto-printed inside a for loop.
  for (i in seq_along(myCorpus)) {
    cat(paste0("[[", i, "]] "))
    writeLines(strwrap(myCorpus[[i]], width = 73))
  }

  # =======================================================================
  # TRANSFORMING THE TEXT
  # =======================================================================
  # Convert to lower case (tm >= 0.6 API; with tm 0.5-10 use
  # tm_map(myCorpus, tolower) instead).
  myCorpus <- tm_map(myCorpus, content_transformer(tolower))
  # Remove anything other than English letters or space. This already
  # strips all punctuation and digits, so separate removePunctuation /
  # removeNumbers passes would be redundant and are omitted.
  removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
  myCorpus <- tm_map(myCorpus, content_transformer(removeNumPunct))
  # Add extra stopwords, depending on context
  myStopwords <- c(stopwords("english"), "applause", "can", "will")
  # Keep "r" and "big" out of the stopword list (domain terms of interest)
  myStopwords <- setdiff(myStopwords, c("r", "big"))
  # Remove stopwords from the corpus
  myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
  # Collapse the extra whitespace left behind by the removals
  myCorpus <- tm_map(myCorpus, stripWhitespace)

  # =======================================================================
  # STEMMING WORDS
  # =======================================================================
  # In many applications, words need to be stemmed to retrieve their
  # radicals, so that various forms derived from a stem would be taken as
  # the same when counting word frequency. For instance, words "update",
  # "updated" and "updating" would all be stemmed to "updat".
  # -----------------------------------------------------------------------
  # Keep a copy of the corpus to use later as a dictionary for stem
  # completion.
  myCorpusCopy <- myCorpus
  myCorpus <- tm_map(myCorpus, stemDocument)

  # =======================================================================
  # COMPLETE THE STEMS AGAINST THE DICTIONARY
  # =======================================================================
  # tm 0.5-10 offered tm_map(myCorpus, stemCompletion); with tm >= 0.6 the
  # completion is done per document by the helper below.
  stemCompletion2 <- function(x, dictionary) {
    x <- unlist(strsplit(as.character(x), " "))
    # Unexpectedly, stemCompletion completes an empty string to a word in
    # the dictionary; drop empty strings first to avoid that.
    x <- x[x != ""]
    x <- stemCompletion(x, dictionary = dictionary)
    x <- paste(x, sep = "", collapse = " ")
    PlainTextDocument(stripWhitespace(x))
  }
  myCorpus <- lapply(myCorpus, stemCompletion2, dictionary = myCorpusCopy)
  myCorpus <- Corpus(VectorSource(myCorpus))

  # =======================================================================
  # BUILDING A TERM-DOCUMENT MATRIX
  # =======================================================================
  tdm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))
  # print() is required here: a bare `tdm` is not auto-printed inside a
  # for loop, so the original showed nothing.
  print(tdm)
  # -----------------------------------------------------------------------
  # If you want to look at selected items of tdm, you can use:
  # idx <- which(dimnames(tdm)$Terms == "fair")
  # inspect(tdm[idx + (0:5), 1])

  # =======================================================================
  # INSPECT FREQUENT WORDS AND PLOT THEM
  # =======================================================================
  # Lower the frequency threshold until more than 20 terms qualify.
  # The `shed > 1` guard prevents an infinite loop when the corpus holds
  # 20 or fewer distinct terms in total (the original decremented forever).
  shed <- 100
  while (shed > 1 && length(findFreqTerms(tdm, lowfreq = shed)) <= 20) {
    shed <- shed - 1
  }
  shed <- shed + 1
  # Explicit print for the same in-loop auto-printing reason as above
  print(findFreqTerms(tdm, lowfreq = shed))
  termFrequency <- rowSums(as.matrix(tdm))
  termFrequency <- subset(termFrequency, termFrequency >= shed)
  df <- data.frame(term = names(termFrequency), freq = termFrequency)
  barplot(termFrequency, las = 2)
  # -----------------------------------------------------------------------
  # Another plotting option (uses the `df` built above):
  # ggplot(df, aes(x = term, y = freq)) + geom_bar(stat = "identity") +
  #   xlab("Terms") + ylab("Count") + coord_flip()
  # -----------------------------------------------------------------------
}