[R] dates and time series management
arun
smartpink111 at yahoo.com
Wed Jun 5 15:44:41 CEST 2013
Hi,
Try this:
# List the input files in the working directory. `pattern` is a regular
# expression, so the dot is escaped and the match is anchored to the end of
# the name; a bare ".txt" would also match names such as "notes_txt.bak".
lstf1 <- list.files(pattern = "\\.txt$")
length(lstf1)
#[1] 119
# Read a set of whitespace-separated station files and combine them into one
# data frame with a single stacked column per file.
#
# Every file must have a header line that names its first column `V1`; that
# column is used as the year. The first two columns are dropped before the
# remaining columns are stacked (the second column is presumably the month --
# confirm against the data).
#
# Args:
#   lstf:       character vector of file paths to read.
#   start_year: first year to keep, inclusive (default 1961).
#   end_year:   last year to keep, inclusive (default 2005).
#
# Returns:
#   A data frame with one column per element of `lstf`, named after that
#   element. Files that start after `start_year` or stop before `end_year`
#   are padded with NA rows, 12 per missing year, so that all columns line up.
#   A file with no rows inside the period yields an all-NA column (with a
#   warning). `NULL` is returned when `lstf` is empty.
#
# The padding assumes each year present in a file has exactly 12 rows; files
# with gaps inside the period end up with a different length and make the
# final `cbind()` fail.
fun2 <- function(lstf, start_year = 1961, end_year = 2005) {
  # A -9999.99 marker written directly after a number, e.g. "0.00-9999.99".
  glued_marker <- "(\\d+)(-9999\\.99)"

  stacked <- lapply(lstf, function(path) {
    lines <- readLines(path)

    # Two passes are needed: in a run of back-to-back markers, the digits in
    # front of every second marker are consumed by the previous match, so a
    # single gsub() only separates every other one.
    lines <- gsub(glued_marker, "\\1 \\2", lines)
    lines <- gsub(glued_marker, "\\1 \\2", lines)

    dat <- read.table(
      text = lines, header = TRUE, stringsAsFactors = FALSE,
      sep = "", fill = TRUE
    )
    if (!"V1" %in% names(dat)) {
      stop("no column named 'V1' in file: ", path, call. = FALSE)
    }

    # Keep only rows with a known year inside the requested period.
    in_period <- !is.na(dat$V1) & dat$V1 >= start_year & dat$V1 <= end_year
    dat <- dat[in_period, , drop = FALSE]

    # Number of NA rows to add before and after, 12 rows per missing year.
    if (nrow(dat) == 0L) {
      warning(
        "no rows between ", start_year, " and ", end_year, " in file: ", path,
        call. = FALSE
      )
      n_before <- (end_year - start_year + 1) * 12
      n_after <- 0
    } else {
      n_before <- (min(dat$V1) - start_year) * 12
      n_after <- (end_year - max(dat$V1)) * 12
    }

    # Indexing rows with NA produces all-NA rows that keep the column names
    # and types of `dat`, so no separately built padding frame is needed.
    row_index <- c(
      rep(NA_integer_, n_before),
      seq_len(nrow(dat)),
      rep(NA_integer_, n_after)
    )
    dat <- dat[row_index, , drop = FALSE]

    # Stack every column after the first two into one vector. unlist() works
    # column by column, so all values of the third column come first, then
    # all values of the fourth, and so on.
    # NOTE(review): if a strictly chronological series is wanted, the values
    # would have to be taken row by row instead -- confirm the intended order.
    values <- unlist(dat[, -(1:2), drop = FALSE], use.names = FALSE)

    column <- data.frame(values)
    colnames(column) <- path
    column
  })

  do.call(cbind, stacked)
}
# Combine every listed file into one table, one column per file.
res <- fun2(lstf1)
dim(res)
#[1] 16740 119
# Show the first five rows of the first three columns.
res[seq_len(5), seq_len(3)]
# dt3011120.txt dt3011240.txt dt3011887.txt
#1 1.67 NA 0.17
#2 0.00 NA 0.28
#3 0.00 NA 0.00
#4 0.00 NA 0.30
#5 0.00 NA 0.00
########################################
There are some formatting issues in your files:
For example, if I run the function line by line:
# Load each file as raw text: one character vector of lines per file.
lst1 <- lapply(lstf1, readLines)
# Per file: does any line have a number running straight into -9999.99?
sapply(lst1, function(lines) any(grepl("\\d+-9999.99", lines)))
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[37] TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
[49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[73] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE
[85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
[97] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE
### This means that some rows in a few of the files look like:
#-9999.99 0 0 0 0.00-9999.99 0 0.00-9999.99 0 0 0 0.00-9999.99 (no space before -9999.99)
# First pass: insert a space between a number and a -9999.99 glued to it.
lst2 <- lapply(lst1, function(lines) {
  gsub("(\\d+)(-9999.99)", "\\1 \\2", lines)
})
# Check again: a few files still had the problem. When two markers sit back to
# back, the digits in front of the second one were already consumed by the
# first match, so one pass cannot separate both.
sapply(lst2, function(lines) any(grepl("\\d+-9999.99", lines)))
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
[85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
[97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Second pass of the same substitution to catch the markers missed above.
lst3 <- lapply(lst2, function(lines) {
  gsub("(\\d+)(-9999.99)", "\\1 \\2", lines)
})
# No file should contain a glued marker any more.
any(sapply(lst3, function(lines) any(grepl("\\d+-9999.99", lines))))
#[1] FALSE
# Parse the repaired text of each file into a data frame.
lst4 <- lapply(lst3, function(lines) {
  read.table(
    text = lines, header = TRUE, stringsAsFactors = FALSE,
    sep = "", fill = TRUE
  )
})
# Check that no column in any file was read as character.
any(sapply(lst4, function(dat) any(sapply(dat, is.character))))
#[1] FALSE
# Keep the rows whose year (column V1) lies between 1961 and 2005; rows with a
# missing year come through as all-NA rows at this point.
lst5 <- lapply(lst4, function(dat) {
  dat[dat$V1 >= 1961 & dat$V1 <= 2005, ]
})
# Drop those all-NA rows.
lst6 <- lapply(lst5, function(dat) {
  dat[!is.na(dat$V1), ]
})
# Row count per file: 540 (45 years x 12) means the whole period is present.
sapply(lst6, nrow)
# [1] 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540
# [19] 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540
# [37] 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540
# [55] 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540
# [73] 540 540 540 540 528 492 528 540 348 540 540 480 540 540 540 540 540 540
# [91] 540 540 540 540 540 540 540 540 540 540 540 540 540 540 528 540 540 540
#[109] 540 540 540 540 540 540 540 540 540 468 540
# Pad files that start after 1961 or end before 2005 with NA rows, 12 per
# missing year, so that every file ends up with the same number of rows.
lst7 <- lapply(lst6, function(dat) {
  # A file that already spans 1961 through 2005 is returned unchanged.
  if ((min(dat$V1) <= 1961) & (max(dat$V1) >= 2005)) {
    return(dat)
  }
  lead_rows <- (min(dat$V1) - 1961) * 12
  lead_pad <- as.data.frame(matrix(NA, ncol = ncol(dat), nrow = lead_rows))
  tail_rows <- (2005 - max(dat$V1)) * 12
  tail_pad <- as.data.frame(matrix(NA, ncol = ncol(dat), nrow = tail_rows))
  rbind(lead_pad, dat, tail_pad)
})
# Every file now has 540 rows.
sapply(lst7, nrow)
# [1] 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540
# [19] 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540
# [37] 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540
# [55] 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540
# [73] 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540
# [91] 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540 540
#[109] 540 540 540 540 540 540 540 540 540 540 540
Hope this helps.
A.K.
________________________________
From: Zilefac Elvis <zilefacelvis at yahoo.com>
To: arun <smartpink111 at yahoo.com>
Sent: Wednesday, June 5, 2013 2:05 AM
Subject: Re: dates and time series management
Hi A.K,
Sorry my internet connection was so bad last evening.
I have attached all the files as .zip.
Below is the output you requested.
As I explained, the start date in 'res' should be 1961 and end date should be 2005 in all 119 files.
Thanks A.K
> lapply(lst1,head,3)
[[1]]
V1.V2.V3.V4.V5.V6.V7.V8.V9.V10.V11.V12.V13.V14.V15.V16.V17.V18.V19.V20.V21.V22.V23.V24.V25.V26.V27.V28.V29.V30.V31.V32.V33
1 1915 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
2 1915 2 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
3 1915 3 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[
More information about the R-help
mailing list