[R] select .txt from .txt in a directory

Sat Nov 9 23:46:54 CET 2013

Hi,
Try:
library(stringr)
##### Created the selected files (98) in a separate working  folder (SubsetFiles1) (refer to my previous mail)
filelst <- list.files()
#Sublst <- filelst[1:2]
res <- lapply(filelst,function(x) {con <- file(x)
     Lines1 <- readLines(con) close(con)
     Lines2 <- Lines1[-1]
     Lines3 <- str_split(Lines2,"-9999.9M")
     Lines4 <- str_trim(unlist(lapply(Lines3,function(x) {x[x==""] <- NA
     paste(x,collapse=" ")})))
     Lines5 <- gsub("(\\d+)[A-Za-z]","\\1",Lines4)
     res1 <- read.table(text=Lines5,sep="",header=FALSE,fill=TRUE)
     res1})

##Created another folder "Modified" to store the "res" files
lapply(seq_along(res),function(i) write.table(res[[i]],paste("/home/arunksa111/Zl/Modified",paste0("Mod_",filelst[i]),sep="/"),row.names=FALSE,quote=FALSE))

 lstf1 <- list.files(path="/home/arunksa111/Zl/Modified")  

lst1 <- lapply(lstf1,function(x) readLines(paste("/home/arunksa111/Zl/Modified",x,sep="/")))
 which(lapply(lst1,function(x) length(grep("\\d+-9999.9",x)))>0 )
 #[1]  7 11 14 15 30 32 39 40 42 45 46 53 60 65 66 68 69 70 73 74 75 78 80 82 83
#[26] 86 87 90 91 93

lst2 <- lapply(lst1,function(x) gsub("(\\d+)(-9999.9)","\\1 \\2",x))
 #lapply(lst2,function(x) x[grep("\\d+-9999.9",x)]) ##checking for the pattern

lst3 <- lapply(lst2,function(x) {x<-gsub("(-9999.9)(-9999.9)","\\1 \\2",x)})#
#lapply(lst3,function(x) x[grep("\\d+-9999.9",x)])  ##checking for the pattern
# lapply(lst3,function(x) x[grep("-9999.9",x)]) ###second check
lst4 <- lapply(lst3,function(x) gsub("(Day) (\\d+)","\\1_\\2", x[-1]))  #removed the additional header "V1", "V2", etc.

#sapply(lst4,function(x) length(strsplit(x[1]," ")[[1]])) #checking the number of columns that should be present
lst5 <- lapply(lst4,function(x) unlist(lapply(x, function(y) word(y,1,33))))
lst6 <- lapply(lst5,function(x) read.table(text=x,header=TRUE,stringsAsFactors=FALSE,sep="",fill=TRUE))
# head(lst6[[94]],3)
lst7 <- lapply(lst6,function(x) x[x$Year >=1961 & x$Year <=2005,])
#head(lst7[[45]],3)
 lst8 <- lapply(lst7,function(x) x[!is.na(x$Year),])

lst9 <- lapply(lst8,function(x) {
    if((min(x$Year)>1961)|(max(x$Year)<2005)){
      n1<- (min(x$Year)-1961)*12
      x1<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n1))
      n2<- (2005-max(x$Year))*12
      x2<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n2))
       colnames(x1) <- colnames(x)
       colnames(x2) <- colnames(x)        
      x3<- rbind(x1,x,x2)
    }
   else if((min(x$Year)==1961) & (max(x$Year)==2005)) {
          if((min(x$Mo[x$Year==1961])>1)|(max(x$Mo[x$Year==2005])<12)){
       n1 <- min(x$Mo[x$Year==1961])-1
       x1 <- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n1))
       n2 <- (12-max(x$Mo[x$Year==2005]))          
       x2 <- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n2))
       colnames(x1) <- colnames(x)
       colnames(x2) <- colnames(x)
       x3 <- rbind(x1,x,x2)
      }
        else {    
        x
    }

    } })

which(sapply(lst9,nrow)!=540)
#[1] 45 46 54 64 65 66 70 75 97
lst10 <- lapply(lst9,function(x) {x1 <- x[!is.na(x$Year),]
             hx1 <- head(x1,1)
             tx1 <- tail(x1,1)
             x2 <- as.data.frame(matrix(NA, ncol=ncol(x), nrow=hx1$Mo-1))
             x3 <- as.data.frame(matrix(NA,ncol=ncol(x),nrow=12-tx1$Mo))
             colnames(x2) <- colnames(x)
             colnames(x3) <- colnames(x)
             if(nrow(x) < 540) rbind(x2,x,x3) else x  })
which(sapply(lst10,nrow)!=540)
#integer(0)

lst11 <-lapply(lst10,function(x) data.frame(col1=unlist(data.frame(t(x)[-c(1:2),]),use.names=FALSE))) 
  lst12<- lapply(seq_along(lst10),function(i){
    x<- lst11[[i]]
    colnames(x)<- lstf1[i]
    row.names(x)<- 1:nrow(x)
    x
  })
res2 <-  do.call(cbind,lst11)
 dim(res2)
#[1] 16740    98

res2[res2==-9999.9]<-NA # change missing value identifier as in your data set
which(res2==-9999.9)
#integer(0)

dates1<-seq.Date(as.Date('1Jan1961',format="%d%b%Y"),as.Date('31Dec2005',format="%d%b%Y"),by="day")
dates2<- as.character(dates1)
sldat<- split(dates2,list(gsub("-.*","",dates2)))
lst12<-lapply(sldat,function(x) lapply(split(x,gsub(".*-(.*)-.*","\\1",x)), function(y){x1<-as.numeric(gsub(".*-.*-(.*)","\\1",y));if((31-max(x1))>0) {x2<-seq(max(x1)+1,31,1);x3<-paste0(unique(gsub("(.*-.*-).*","\\1",y)),x2);c(y,x3)} else y} ))
any(sapply(lst12,function(x) any(lapply(x,length)!=31)))
#[1] FALSE

lst22<-lapply(lst12,function(x) unlist(x,use.names=FALSE))
sapply(lst22,length)
dates3<-unlist(lst22,use.names=FALSE)
length(dates3)
res3 <- data.frame(dates=dates3,res2,stringsAsFactors=FALSE)
str(res3)
res3$dates<-as.Date(res3$dates)
res4 <- res3[!is.na(res3$dates),]
res4[1:3,1:3]
dim(res4)
 #[1] 16436    99

A.K.

On Friday, November 8, 2013 5:54 PM, Zilefac Elvis <zilefacelvis at yahoo.com> wrote:

Hi Ak,

I think I figured out how to do the sub-setting. All I needed was to use column 3 in Temperature_inventory and select matching .txt files in the .zip file. The final result would be a subset of files whose IDs are in column 3 of temp_inventory.
*************************************************************************
I also have this script which you developed for managing precipitation files. Now I want to use the same code for the temperature files I sent to you. I tried doing it with some errors.
Please try these scripts on my temperature data. If you need further information let me know.
Note here that -9999.99M is -9999.9M in the temperature files.

library(stringr)# load it
res<-lapply(temp,function(x) {con <- file(x);
                                 Lines1<- readLines(con);
                                 close(con);
                                 Lines2<-Lines1[-1];# myfiles contain headers in row 2, so I removed the headers
                                 Lines3<- str_split(Lines2,"-9999.99M");
                                 Lines4<- str_trim(unlist(lapply(Lines3,function(x){x[x==""]<-NA;#replace missing identifier with NA
                                                                                    paste(x,collapse=" ")})));
                                 Lines5<- gsub("(\\d+)[A-Za-z]","\\1",Lines4);
                                 res<- read.table(text=Lines5,sep="",header=FALSE,fill=TRUE)})
lapply(res,head,2)# take a look at first two rows of res.
lapply(seq_along(res),function(i) write.table(res[[i]],paste0(gsub(".txt","",temp[i]),".txt"),row.names=FALSE,quote=FALSE))
#********************************************************************************************************
# Then use the following as a continuation from the one above

lstf1<- list.files(pattern=".txt")
length(lstf1)
fun2<- function(lstf){
  lst1<-lapply(lstf,function(x) readLines(x))
  lst2<-lapply(lst1,function(x) {gsub("(\\d+)(-9999.99)","\\1 \\2",x)})#change missing value identifier as in your data set
  lst3<-lapply(lst2,function(x) {x<-gsub("(\\d+)(-9999.99)","\\1 \\2",x)})#change missing value identifier as in your data set
  lst4<- lapply(lst3,function(x) read.table(text=x,header=TRUE,stringsAsFactors=FALSE,sep="",fill=TRUE))
  lst5<- lapply(lst4,function(x) x[x$V1>=1961 & x$V1<=2005,])
  lst6<- lapply(lst5,function(x) x[!is.na(x$V1),])
  lst7<- lapply(lst6,function(x) {
    if((min(x$V1)>1961)|(max(x$V1)<2005)){
      n1<- (min(x$V1)-1961)*12
      x1<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n1))
      n2<- (2005-max(x$V1))*12
      x2<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n2))
      x3<- rbind(x1,x,x2)
    }
    else {
      x
    } })
  lst8<-lapply(lst7,function(x) data.frame(col1=unlist(data.frame(t(x)[-c(1:2),]),use.names=FALSE))) ####changed
  lst9<- lapply(seq_along(lst8),function(i){
    x<- lst8[[i]]
    colnames(x)<- lstf1[i]
    row.names(x)<- 1:nrow(x)
    x
  })
  do.call(cbind,lst9)}
res<-fun2(lstf1)
dim(res)
res[res==-9999.99]<-NA # change missing value identifier as in your data set
which(res==-9999.99)#change missing value identifier as in your data set
dates1<-seq.Date(as.Date('1Jan1961',format="%d%b%Y"),as.Date('31Dec2005',format="%d%b%Y"),by="day")
dates2<- as.character(dates1)
sldat<- split(dates2,list(gsub("-.*","",dates2)))
lst11<-lapply(sldat,function(x) lapply(split(x,gsub(".*-(.*)-.*","\\1",x)), function(y){x1<-as.numeric(gsub(".*-.*-(.*)","\\1",y));if((31-max(x1))>0) {x2<-seq(max(x1)+1,31,1);x3<-paste0(unique(gsub("(.*-.*-).*","\\1",y)),x2);c(y,x3)} else y} ))
any(sapply(lst1,function(x) any(lapply(x,length)!=31)))
lst22<-lapply(lst11,function(x) unlist(x,use.names=FALSE))
sapply(lst22,length)
dates3<-unlist(lst22,use.names=FALSE)
length(dates3)
res1<- data.frame(dates=dates3,res,stringsAsFactors=FALSE)
str(res1)
res1$dates<-as.Date(res1$dates)
res2<-res1[!is.na(res1$dates),]
res2[1:3,1:3]
dim(res2)
write.csv(res2, file = "TemperatureAllstations.csv")#
#***********************************************************************************

Waiting for your useful input.

Thanks so much,
Atem.

On Friday, November 8, 2013 2:18 PM, arun <sm
you wanted to do.  If you want to transfer the subset of files from the main folder to a new location, then you may try: (make sure you create a copy of the original .txt folder before doing this)
I created three sub folders and two files (BTemperature_Stations.txt and Tempearture inventory.csv) in my working directory.

list.files()
#[1] "BTemperature_Stations.txt" "Files1"          ## Files1 folder contains all the .txt files; #SubsetFiles: created to subset the files that match the condition                 
#[3]
"FilesCopy"                 "SubsetFiles1"          #FilesCopy. A copy of the Files1 folder   
#[5] "Tempearture inventory.csv"

list.files(pattern="\\.")
#[1] "BTemperature_Stations.txt" "Tempearture inventory.csv"
fl1 <- list.files(pattern="\\.")
 dat1 <- read.table(fl1[1],header=TRUE,sep="",stringsAsFactors=FALSE,fill=TRUE,check.names=FALSE)
 dat2 <- read.csv(fl1[2],header=TRUE,sep=",",stringsAsFactors=FALSE,check.names=FALSE)
vec1 <- dat1[,3][dat1[,3]%in% dat2[,3]]
vec2 <- list.files(path="/home/arunksa111/Zl/Files1",recursive=TRUE)
 sum(gsub(".txt","",vec2) %in% vec1)
#[1] 98
vec3 <-  vec2[gsub(".txt","",vec2) %in% vec1]
lapply(vec3, function(x) file.rename(paste("/home/arunksa111/Zl/Files1",x,sep="/"), paste("/home/arunksa111/Zl/SubsetFiles1",x,sep="/"))) #change the path accordingly. 
length(list.files(path="/home/arunksa111/Zl/SubsetFiles1"))
#[1] 98

fileDim <- sapply(vec3,function(x) {x1 <-read.delim(paste("/home/arunksa111/Zl/SubsetFiles1",x,sep="/"),header=TRUE,stringsAsFactors=FALSE,sep=",",check.names=FALSE); dim(x1)})
fileDim[,1:3]
#     dn3011120.txt dn3011240.txt dn3011887.txt
#[1,]          1151           791          1054
#[2,]             7             7             7

A.K.

On Friday, November 8, 2013 1:41 PM, Zilefac Elvis <
les from a list of files. All are text files. The index for selection is found in column 3 of both files.

Attached are my data files.
Btemperature_Stations is my
main file.
Temperature inventory is my 'wanted' file and is a subset of Btemperature_Stations.
Using column 3 in both files, select the files in Temperature inventory from Btemperature_Stations.
The .zip file contains the .txt files which you will extract to a folder and do the selection in R.

Thanks,
Atem.