findwords <- function(tf) {
  ## read the words from the file into a character vector:
  txt <- scan(tf, "")
  wl <- list()
  for (i in seq_along(txt)) {      # seq_along() is safe also for empty input
    wrd <- txt[i]                  # i-th word in the input file
    wl[[wrd]] <- c(wl[[wrd]], i)   # append position i to this word's entry
  }
  return(wl)
}
## NOTA BENE: Faster/better version in
## --------->> ../Ch6/findwords.R

if(FALSE) ## use this to choose text1.txt interactively instead:
  datafile <- file.choose()
getwd()
datafile <- "text1.txt"
## for MM:  datafile <- "../../../text1.txt"

w1 <- findwords(datafile) # -> a named list: word |--> positions in the text
head(w1)

tt <- scan(datafile, "")
head(tt)           ## the first few words
head(sort(tt), 12) ## the first *twelve* after alphabetical sorting:
## [1] "(OOP)" "•" "•" "•" "•" "1." "9.1" "9.1.1" "a" "a" "a" "a"

### Need to pre-process:
t.in <- readLines(datafile)
head(t.in)

## Now get rid of all "non-word" characters:
?sub
?regex
## 1) --> can use "[:alnum:]" to get all "alphanumeric" characters,
##        i.e., letters and digits;
## 2) "^" means *not* (in the set that follows).
## Replace all non-alphanumeric characters by ' ' (blank):
head( gsub("[^[:alnum:]]", " ", t.in) )
t.out <- gsub("[^[:alnum:]]", " ", t.in)
head(t.out, 20)

if(FALSE) ## now we could write the changed "good" lines to a file again,
          ## and read that via findwords():
  writeLines(t.out, "text2.txt")

## But R is smarter and can work with *connections* instead of just files:
txtConn <- textConnection(t.out)
txtConn # ... describes itself: "can read" = "yes"

debug(findwords)         # step through findwords() interactively ...
ww <- findwords(txtConn)
undebug(findwords)
close(txtConn)           # connections should be closed after use

## Exercise: write a function that returns *ordered* word counts.
len.w <- sapply(ww, length)            # ... close: the counts, still unordered
srt.w <- sort(len.w, decreasing=TRUE)  # now ordered, most frequent first

## --> a bit better: --> "freqwl.R"
freqwl(ww)  # (requires freqwl() from "freqwl.R")
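
## ---------------------------------------------------------------------------
## "freqwl.R" is not reproduced here.  The following is only a minimal
## *sketch* of such a function -- an assumption based on how it is called
## above (on a findwords()-style list), not necessarily the actual freqwl.R:
## it maps each word to its frequency and orders by decreasing frequency.
freqwl <- function(wl) {
  freqs <- sapply(wl, length)       # frequency = number of positions per word
  sort(freqs, decreasing = TRUE)    # most frequent words first
}
head(freqwl(ww), 10)  # e.g., the ten most frequent words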
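
## ---------------------------------------------------------------------------
## Aside: the "faster/better version" mentioned above (../Ch6/findwords.R)
## is also not reproduced here.  One plausible vectorized rewrite -- again an
## assumption, not necessarily the actual Ch6 code; the name findwords2() is
## made up -- replaces the explicit loop by split(), which groups the indices
## 1..n by word in a single call:
findwords2 <- function(tf) {
  txt <- scan(tf, "")
  split(seq_along(txt), txt)   # named list: word |--> vector of positions
}
## Should agree with findwords() up to the order of the list elements
## (split() returns words in sorted factor-level order):
w2 <- findwords2(datafile)
identical(w1[sort(names(w1))], w2[sort(names(w2))])  # expected: TRUE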