# =========================================================================
# LOAD REQUIRED PACKAGES
# =========================================================================
# If required, install the packages with:
# > install.packages(c("ggplot2", "rJava", "RWeka", "SnowballC", "tm",
#                      "wordcloud"), dependencies = TRUE)
# -------------------------------------------------------------------------
library(ggplot2)
library(rJava)
library(RWeka)
library(SnowballC)   # provides the stemmer behind stemDocument()
library(tm)
library(wordcloud)

# -------------------------------------------------------------------------
# Arrange the plots (one barplot per speech) side by side
par(mfrow = c(1, 2))

# -------------------------------------------------------------------------
# Loop through the text files 1.txt and 2.txt
for (k in 1:2) {

  # =======================================================================
  # LOAD THE TEXT FILE NAMED k.txt AS A CORPUS, THAT IS A COLLECTION OF
  # TEXT DOCUMENTS
  # =======================================================================
  speech <- readLines(paste0(k, ".txt"), warn = FALSE)
  myCorpus <- VCorpus(VectorSource(speech))

  # =======================================================================
  # INSPECT THE TEXT
  # =======================================================================
  # Wrap the text to fit paper width. Iterate over every document in the
  # corpus (the original hard-coded 1:1 showed only the first document).
  # Output goes through cat()/writeLines() because bare expressions are
  # not auto-printed inside a for loop.
  for (i in seq_along(myCorpus)) {
    cat(paste0("[[", i, "]] "))
    writeLines(strwrap(myCorpus[[i]], width = 73))
  }

  # =======================================================================
  # TRANSFORMING THE TEXT
  # =======================================================================
  # Convert to lower case (tm >= 0.6 API; with tm 0.5-10 use
  # tm_map(myCorpus, tolower) instead).
  myCorpus <- tm_map(myCorpus, content_transformer(tolower))
  # Remove anything other than English letters or space. This already
  # strips all punctuation and digits, so separate removePunctuation /
  # removeNumbers passes would be redundant and are omitted.
  removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
  myCorpus <- tm_map(myCorpus, content_transformer(removeNumPunct))
  # Add extra stopwords, depending on context
  myStopwords <- c(stopwords("english"), "applause", "can", "will")
  # Keep "r" and "big" out of the stopword list (domain terms of interest)
  myStopwords <- setdiff(myStopwords, c("r", "big"))
  # Remove stopwords from the corpus
  myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
  # Collapse the extra whitespace left behind by the removals
  myCorpus <- tm_map(myCorpus, stripWhitespace)

  # =======================================================================
  # STEMMING WORDS
  # =======================================================================
  # In many applications, words need to be stemmed to retrieve their
  # radicals, so that various forms derived from a stem would be taken as
  # the same when counting word frequency. For instance, words "update",
  # "updated" and "updating" would all be stemmed to "updat".
  # -----------------------------------------------------------------------
  # Keep a copy of the corpus to use later as a dictionary for stem
  # completion.
  myCorpusCopy <- myCorpus
  myCorpus <- tm_map(myCorpus, stemDocument)

  # =======================================================================
  # COMPLETE THE STEMS AGAINST THE DICTIONARY
  # =======================================================================
  # tm 0.5-10 offered tm_map(myCorpus, stemCompletion); with tm >= 0.6 the
  # completion is done per document by the helper below.
  stemCompletion2 <- function(x, dictionary) {
    x <- unlist(strsplit(as.character(x), " "))
    # Unexpectedly, stemCompletion completes an empty string to a word in
    # the dictionary; drop empty strings first to avoid that.
    x <- x[x != ""]
    x <- stemCompletion(x, dictionary = dictionary)
    x <- paste(x, sep = "", collapse = " ")
    PlainTextDocument(stripWhitespace(x))
  }
  myCorpus <- lapply(myCorpus, stemCompletion2, dictionary = myCorpusCopy)
  myCorpus <- Corpus(VectorSource(myCorpus))

  # =======================================================================
  # BUILDING A TERM-DOCUMENT MATRIX
  # =======================================================================
  tdm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))
  # print() is required here: a bare `tdm` is not auto-printed inside a
  # for loop, so the original showed nothing.
  print(tdm)
  # -----------------------------------------------------------------------
  # If you want to look at selected items of tdm, you can use:
  # idx <- which(dimnames(tdm)$Terms == "fair")
  # inspect(tdm[idx + (0:5), 1])

  # =======================================================================
  # INSPECT FREQUENT WORDS AND PLOT THEM
  # =======================================================================
  # Lower the frequency threshold until more than 20 terms qualify.
  # The `shed > 1` guard prevents an infinite loop when the corpus holds
  # 20 or fewer distinct terms in total (the original decremented forever).
  shed <- 100
  while (shed > 1 && length(findFreqTerms(tdm, lowfreq = shed)) <= 20) {
    shed <- shed - 1
  }
  shed <- shed + 1
  # Explicit print for the same in-loop auto-printing reason as above
  print(findFreqTerms(tdm, lowfreq = shed))
  termFrequency <- rowSums(as.matrix(tdm))
  termFrequency <- subset(termFrequency, termFrequency >= shed)
  df <- data.frame(term = names(termFrequency), freq = termFrequency)
  barplot(termFrequency, las = 2)
  # -----------------------------------------------------------------------
  # Another plotting option (uses the `df` built above):
  # ggplot(df, aes(x = term, y = freq)) + geom_bar(stat = "identity") +
  #   xlab("Terms") + ylab("Count") + coord_flip()
  # -----------------------------------------------------------------------
}