[R] select .txt from .txt in a directory
arun
smartpink111 at yahoo.com
Sat Nov 9 23:46:54 CET 2013
Hi,
Try:
library(stringr)
##### Created the selected files (98) in a separate working folder (SubsetFiles1) (refer to my previous mail)
filelst <- list.files()
#Sublst <- filelst[1:2]
res <- lapply(filelst,function(x) {con <- file(x)
Lines1 <- readLines(con) close(con)
Lines2 <- Lines1[-1]
Lines3 <- str_split(Lines2,"-9999.9M")
Lines4 <- str_trim(unlist(lapply(Lines3,function(x) {x[x==""] <- NA
paste(x,collapse=" ")})))
Lines5 <- gsub("(\\d+)[A-Za-z]","\\1",Lines4)
res1 <- read.table(text=Lines5,sep="",header=FALSE,fill=TRUE)
res1})
##Created another folder "Modified" to store the "res" files
lapply(seq_along(res),function(i) write.table(res[[i]],paste("/home/arunksa111/Zl/Modified",paste0("Mod_",filelst[i]),sep="/"),row.names=FALSE,quote=FALSE))
lstf1 <- list.files(path="/home/arunksa111/Zl/Modified")
lst1 <- lapply(lstf1,function(x) readLines(paste("/home/arunksa111/Zl/Modified",x,sep="/")))
which(lapply(lst1,function(x) length(grep("\\d+-9999.9",x)))>0 )
#[1] 7 11 14 15 30 32 39 40 42 45 46 53 60 65 66 68 69 70 73 74 75 78 80 82 83
#[26] 86 87 90 91 93
lst2 <- lapply(lst1,function(x) gsub("(\\d+)(-9999.9)","\\1 \\2",x))
#lapply(lst2,function(x) x[grep("\\d+-9999.9",x)]) ##checking for the pattern
lst3 <- lapply(lst2,function(x) {x<-gsub("(-9999.9)(-9999.9)","\\1 \\2",x)})#
#lapply(lst3,function(x) x[grep("\\d+-9999.9",x)]) ##checking for the pattern
# lapply(lst3,function(x) x[grep("-9999.9",x)]) ###second check
lst4 <- lapply(lst3,function(x) gsub("(Day) (\\d+)","\\1_\\2", x[-1])) #removed the additional header "V1", "V2", etc.
#sapply(lst4,function(x) length(strsplit(x[1]," ")[[1]])) #checking the number of columns that should be present
lst5 <- lapply(lst4,function(x) unlist(lapply(x, function(y) word(y,1,33))))
lst6 <- lapply(lst5,function(x) read.table(text=x,header=TRUE,stringsAsFactors=FALSE,sep="",fill=TRUE))
# head(lst6[[94]],3)
lst7 <- lapply(lst6,function(x) x[x$Year >=1961 & x$Year <=2005,])
#head(lst7[[45]],3)
lst8 <- lapply(lst7,function(x) x[!is.na(x$Year),])
lst9 <- lapply(lst8,function(x) {
if((min(x$Year)>1961)|(max(x$Year)<2005)){
n1<- (min(x$Year)-1961)*12
x1<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n1))
n2<- (2005-max(x$Year))*12
x2<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n2))
colnames(x1) <- colnames(x)
colnames(x2) <- colnames(x)
x3<- rbind(x1,x,x2)
}
else if((min(x$Year)==1961) & (max(x$Year)==2005)) {
if((min(x$Mo[x$Year==1961])>1)|(max(x$Mo[x$Year==2005])<12)){
n1 <- min(x$Mo[x$Year==1961])-1
x1 <- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n1))
n2 <- (12-max(x$Mo[x$Year==2005]))
x2 <- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n2))
colnames(x1) <- colnames(x)
colnames(x2) <- colnames(x)
x3 <- rbind(x1,x,x2)
}
else {
x
}
} })
which(sapply(lst9,nrow)!=540)
#[1] 45 46 54 64 65 66 70 75 97
lst10 <- lapply(lst9,function(x) {x1 <- x[!is.na(x$Year),]
hx1 <- head(x1,1)
tx1 <- tail(x1,1)
x2 <- as.data.frame(matrix(NA, ncol=ncol(x), nrow=hx1$Mo-1))
x3 <- as.data.frame(matrix(NA,ncol=ncol(x),nrow=12-tx1$Mo))
colnames(x2) <- colnames(x)
colnames(x3) <- colnames(x)
if(nrow(x) < 540) rbind(x2,x,x3) else x })
which(sapply(lst10,nrow)!=540)
#integer(0)
lst11 <-lapply(lst10,function(x) data.frame(col1=unlist(data.frame(t(x)[-c(1:2),]),use.names=FALSE)))
lst12<- lapply(seq_along(lst10),function(i){
x<- lst11[[i]]
colnames(x)<- lstf1[i]
row.names(x)<- 1:nrow(x)
x
})
res2 <- do.call(cbind,lst11)
dim(res2)
#[1] 16740 98
res2[res2==-9999.9]<-NA # change missing value identifier as in your data set
which(res2==-9999.9)
#integer(0)
dates1<-seq.Date(as.Date('1Jan1961',format="%d%b%Y"),as.Date('31Dec2005',format="%d%b%Y"),by="day")
dates2<- as.character(dates1)
sldat<- split(dates2,list(gsub("-.*","",dates2)))
lst12<-lapply(sldat,function(x) lapply(split(x,gsub(".*-(.*)-.*","\\1",x)), function(y){x1<-as.numeric(gsub(".*-.*-(.*)","\\1",y));if((31-max(x1))>0) {x2<-seq(max(x1)+1,31,1);x3<-paste0(unique(gsub("(.*-.*-).*","\\1",y)),x2);c(y,x3)} else y} ))
any(sapply(lst12,function(x) any(lapply(x,length)!=31)))
#[1] FALSE
lst22<-lapply(lst12,function(x) unlist(x,use.names=FALSE))
sapply(lst22,length)
dates3<-unlist(lst22,use.names=FALSE)
length(dates3)
res3 <- data.frame(dates=dates3,res2,stringsAsFactors=FALSE)
str(res3)
res3$dates<-as.Date(res3$dates)
res4 <- res3[!is.na(res3$dates),]
res4[1:3,1:3]
dim(res4)
#[1] 16436 99
A.K.
On Friday, November 8, 2013 5:54 PM, Zilefac Elvis <zilefacelvis at yahoo.com> wrote:
Hi Ak,
I think I figured out how to do the sub-setting. All I needed was to use column 3 in Temperature_inventory and select matching .txt files in the .zip file. The final result would be a subset of files whose IDs are in column 3 of temp_inventory.
*************************************************************************
I also have this script which you developed for managing precipitation files. Now I want to use the same code for the temperature files I sent to you. I tried doing it with some errors.
Please try these scripts on my temperature data. If you need further information let me know.
Note here that -9999.99M is -9999.9M in the temperature files.
library(stringr)# load it
res<-lapply(temp,function(x) {con <- file(x);
Lines1<- readLines(con);
close(con);
Lines2<-Lines1[-1];# myfiles contain headers in row 2, so I removed the headers
Lines3<- str_split(Lines2,"-9999.99M");
Lines4<- str_trim(unlist(lapply(Lines3,function(x){x[x==""]<-NA;#replace missing identifier with NA
paste(x,collapse=" ")})));
Lines5<- gsub("(\\d+)[A-Za-z]","\\1",Lines4);
res<- read.table(text=Lines5,sep="",header=FALSE,fill=TRUE)})
lapply(res,head,2)# take a look at first two rows of res.
lapply(seq_along(res),function(i) write.table(res[[i]],paste0(gsub(".txt","",temp[i]),".txt"),row.names=FALSE,quote=FALSE))
#********************************************************************************************************
# Then use the following as a continuation from the one above
lstf1<- list.files(pattern=".txt")
length(lstf1)
fun2<- function(lstf){
lst1<-lapply(lstf,function(x) readLines(x))
lst2<-lapply(lst1,function(x) {gsub("(\\d+)(-9999.99)","\\1 \\2",x)})#change missing value identifier as in your data set
lst3<-lapply(lst2,function(x) {x<-gsub("(\\d+)(-9999.99)","\\1 \\2",x)})#change missing value identifier as in your data set
lst4<- lapply(lst3,function(x) read.table(text=x,header=TRUE,stringsAsFactors=FALSE,sep="",fill=TRUE))
lst5<- lapply(lst4,function(x) x[x$V1>=1961 & x$V1<=2005,])
lst6<- lapply(lst5,function(x) x[!is.na(x$V1),])
lst7<- lapply(lst6,function(x) {
if((min(x$V1)>1961)|(max(x$V1)<2005)){
n1<- (min(x$V1)-1961)*12
x1<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n1))
n2<- (2005-max(x$V1))*12
x2<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n2))
x3<- rbind(x1,x,x2)
}
else {
x
} })
lst8<-lapply(lst7,function(x) data.frame(col1=unlist(data.frame(t(x)[-c(1:2),]),use.names=FALSE))) ####changed
lst9<- lapply(seq_along(lst8),function(i){
x<- lst8[[i]]
colnames(x)<- lstf1[i]
row.names(x)<- 1:nrow(x)
x
})
do.call(cbind,lst9)}
res<-fun2(lstf1)
dim(res)
res[res==-9999.99]<-NA # change missing value identifier as in your data set
which(res==-9999.99)#change missing value identifier as in your data set
dates1<-seq.Date(as.Date('1Jan1961',format="%d%b%Y"),as.Date('31Dec2005',format="%d%b%Y"),by="day")
dates2<- as.character(dates1)
sldat<- split(dates2,list(gsub("-.*","",dates2)))
lst11<-lapply(sldat,function(x) lapply(split(x,gsub(".*-(.*)-.*","\\1",x)), function(y){x1<-as.numeric(gsub(".*-.*-(.*)","\\1",y));if((31-max(x1))>0) {x2<-seq(max(x1)+1,31,1);x3<-paste0(unique(gsub("(.*-.*-).*","\\1",y)),x2);c(y,x3)} else y} ))
any(sapply(lst1,function(x) any(lapply(x,length)!=31)))
lst22<-lapply(lst11,function(x) unlist(x,use.names=FALSE))
sapply(lst22,length)
dates3<-unlist(lst22,use.names=FALSE)
length(dates3)
res1<- data.frame(dates=dates3,res,stringsAsFactors=FALSE)
str(res1)
res1$dates<-as.Date(res1$dates)
res2<-res1[!is.na(res1$dates),]
res2[1:3,1:3]
dim(res2)
write.csv(res2, file = "TemperatureAllstations.csv")#
#***********************************************************************************
Waiting for your useful input.
Thanks so much,
Atem.
On Friday, November 8, 2013 2:18 PM, arun <sm
you wanted to do. If you want to transfer the subset of files from the main folder to a new location, then you may try: (make sure you create a copy of the original .txt folder before doing this)
I created three sub folders and two files (BTemperature_Stations.txt and Tempearture inventory.csv) in my working directory.
list.files()
#[1] "BTemperature_Stations.txt" "Files1" ## Files1 folder contains all the .txt files; #SubsetFiles: created to subset the files that match the condition
#[3]
"FilesCopy" "SubsetFiles1" #FilesCopy. A copy of the Files1 folder
#[5] "Tempearture inventory.csv"
list.files(pattern="\\.")
#[1] "BTemperature_Stations.txt" "Tempearture inventory.csv"
fl1 <- list.files(pattern="\\.")
dat1 <- read.table(fl1[1],header=TRUE,sep="",stringsAsFactors=FALSE,fill=TRUE,check.names=FALSE)
dat2 <- read.csv(fl1[2],header=TRUE,sep=",",stringsAsFactors=FALSE,check.names=FALSE)
vec1 <- dat1[,3][dat1[,3]%in% dat2[,3]]
vec2 <- list.files(path="/home/arunksa111/Zl/Files1",recursive=TRUE)
sum(gsub(".txt","",vec2) %in% vec1)
#[1] 98
vec3 <- vec2[gsub(".txt","",vec2) %in% vec1]
lapply(vec3, function(x) file.rename(paste("/home/arunksa111/Zl/Files1",x,sep="/"), paste("/home/arunksa111/Zl/SubsetFiles1",x,sep="/"))) #change the path accordingly.
length(list.files(path="/home/arunksa111/Zl/SubsetFiles1"))
#[1] 98
fileDim <- sapply(vec3,function(x) {x1 <-read.delim(paste("/home/arunksa111/Zl/SubsetFiles1",x,sep="/"),header=TRUE,stringsAsFactors=FALSE,sep=",",check.names=FALSE); dim(x1)})
fileDim[,1:3]
# dn3011120.txt dn3011240.txt dn3011887.txt
#[1,] 1151 791 1054
#[2,] 7 7 7
A.K.
On Friday, November 8, 2013 1:41 PM, Zilefac Elvis <
les from a list of files. All are text files. The index for selection is found in column 3 of both files.
Attached are my data files.
Btemperature_Stations is my
main file.
Temperature inventory is my 'wanted' file and is a subset of Btemperature_Stations.
Using column 3 in both files, select the files in Temperature inventory from Btemperature_Stations.
The .zip file contains the .txt files which you will extract to a folder and do the selection in R.
Thanks,
Atem.
More information about the R-help
mailing list