findwords <- function(tf) {
  ## read the words from the file into a character vector:
  txt <- scan(tf, "")
  wl <- list()
  for (i in seq_along(txt)) {      # seq_along() is safe also for empty input
    wrd <- txt[i]                  # i-th word in the input file
    wl[[wrd]] <- c(wl[[wrd]], i)   # append position i to this word's entry
  }
  return(wl)
}
## NOTA BENE: Faster/better version in
## --------->> ../Ch6/findwords.R

if(FALSE) ## use this to choose text1.txt interactively instead:
  datafile <- file.choose()
getwd()
datafile <- "text1.txt"
## for MM:  datafile <- "../../../text1.txt"

w1 <- findwords(datafile) # -> a named list: word |--> positions in the text
head(w1)

tt <- scan(datafile, "")
head(tt)           ## the first few words
head(sort(tt), 12) ## the first *twelve* after alphabetical sorting:
## [1] "(OOP)" "•" "•" "•" "•" "1." "9.1" "9.1.1" "a" "a" "a" "a"

### Need to pre-process:
t.in <- readLines(datafile)
head(t.in)

## Now get rid of all "non-word" characters:
?sub
?regex
## 1) --> can use "[:alnum:]" to get all "alphanumeric" characters,
##        i.e., letters and digits;
## 2) "^" means *not* (in the set that follows).
## Replace all non-alphanumeric characters by ' ' (blank):
head( gsub("[^[:alnum:]]", " ", t.in) )
t.out <- gsub("[^[:alnum:]]", " ", t.in)
head(t.out, 20)

if(FALSE) ## now we could write the changed "good" lines to a file again,
          ## and read that via findwords():
  writeLines(t.out, "text2.txt")

## But R is smarter and can work with *connections* instead of just files:
txtConn <- textConnection(t.out)
txtConn # ... describes itself: "can read" = "yes"

debug(findwords)         # step through findwords() interactively ...
ww <- findwords(txtConn)
undebug(findwords)
close(txtConn)           # connections should be closed after use

## Exercise: write a function that returns *ordered* word counts.
len.w <- sapply(ww, length)            # ... close: the counts, still unordered
srt.w <- sort(len.w, decreasing=TRUE)  # now ordered, most frequent first

## --> a bit better: --> "freqwl.R"
freqwl(ww)  # (requires freqwl() from "freqwl.R")
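
## ---------------------------------------------------------------------------
## "freqwl.R" is not reproduced here.  The following is only a minimal
## *sketch* of such a function -- an assumption based on how it is called
## above (on a findwords()-style list), not necessarily the actual freqwl.R:
## it maps each word to its frequency and orders by decreasing frequency.
freqwl <- function(wl) {
  freqs <- sapply(wl, length)       # frequency = number of positions per word
  sort(freqs, decreasing = TRUE)    # most frequent words first
}
head(freqwl(ww), 10)  # e.g., the ten most frequent words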
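
## ---------------------------------------------------------------------------
## Aside: the "faster/better version" mentioned above (../Ch6/findwords.R)
## is also not reproduced here.  One plausible vectorized rewrite -- again an
## assumption, not necessarily the actual Ch6 code; the name findwords2() is
## made up -- replaces the explicit loop by split(), which groups the indices
## 1..n by word in a single call:
findwords2 <- function(tf) {
  txt <- scan(tf, "")
  split(seq_along(txt), txt)   # named list: word |--> vector of positions
}
## Should agree with findwords() up to the order of the list elements
## (split() returns words in sorted factor-level order):
w2 <- findwords2(datafile)
identical(w1[sort(names(w1))], w2[sort(names(w2))])  # expected: TRUE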