[R] : Quantile and rowMean from multiple files in a folder

Zilefac Elvis zilefacelvis at yahoo.com
Tue Apr 15 22:38:06 CEST 2014


Hi AK,
Thanks very much. I worked great.
Many thanks.
Atem.


On Tuesday, April 15, 2014 9:20 AM, arun <smartpink111 at yahoo.com> wrote:


Hi Atem,
May be this works.
### Q1: working directory: Observed #Only one file per Site.  Assuming this is the
### case for the full dataset, then I guess there is no need to average
dir.create("final")
lst1 <- split(list.files(pattern = ".csv"), gsub("\\_.*", "", list.files(pattern = ".csv")))

lst2 <- lapply(lst1, function(x1) lapply(x1, function(x2) {
    lines1 <- readLines(x2)
    header1 <- lines1[1:2]
    dat1 <- read.table(text = lines1, header = FALSE, sep = ",", stringsAsFactors = FALSE, 
        skip = 2)
    colnames(dat1) <- Reduce(paste, strsplit(header1, ","))
    dat1[-c(nrow(dat1), nrow(dat1) - 1), ]
}))

lst3 <- lst2[sapply(seq_along(lst2),function(i){lstN <- sapply(lst2[[i]],function(x) is.integer(ncol(x)))})]
length(lst2)
#[1] 120
length(lst3)
#[1] 119

library(plyr)
library(stringr)

lst4 <- setNames(lapply(seq_along(lst3), function(i) {
    lapply(lst3[[i]], function(x) {
        names(x)[-1] <- paste(names(x)[-1], names(lst1)[i], sep = "_")
        names(x) <- str_trim(names(x))
        x
    })[[1]]
}), names(lst3))
df1 <- join_all(lst4, by = "Year")
dim(df1)
# [1] 9 27311

dimCol <- sapply(split(names(df1)[-1], gsub(".*\\_", "", names(df1)[-1])), function(x) {
    df2 <- df1[, x]
    df3 <- data.frame(Percentiles = paste0(seq(0, 100, by = 1), "%"), numcolwise(function(y) quantile(y, 
        seq(0, 1, by = 0.01), na.rm = TRUE))(df2), stringsAsFactors = FALSE)
    ncol(df3)
})

lst5 <- split(names(df1)[-1], gsub(".*\\_", "", names(df1)[-1]))

lapply(seq_along(lst5), function(i) {
    df2 <- df1[, lst5[[i]]]
    df3 <- data.frame(Percentiles = paste0(seq(0, 100, by = 1), "%"), numcolwise(function(y) quantile(y, 
        seq(0, 1, by = 0.01), na.rm = TRUE))(df2), stringsAsFactors = FALSE)
     write.csv(df3, paste0(paste(getwd(), "final", paste(names(lst4)[[i]], "Quantile", 
     sep = "_"), sep = "/"), ".csv"), row.names = FALSE, quote = FALSE)
})

ReadOut1 <- lapply(list.files(recursive = TRUE)[grep("Quantile", list.files(recursive = TRUE))], 
    function(x) read.csv(x, header = TRUE, stringsAsFactors = FALSE))
dir.create("Indices")
sapply(ReadOut1, dim)[,1:3]  ##different dimensions
#     [,1] [,2] [,3]
#[1,]  101  101  101
#[2,]  157  258  258

names1 <- unlist(lapply(ReadOut1, function(x) names(x)[-1]))
names2 <- gsub("\\_.*", "", names1)
names3 <- unique(gsub("[.]", " ", names2))

length(names3)
#[1] 264
#lstNew <- simplify2array(ReadOut1)  ###results you got
# nrow(lstNew)
#NULL####

ReadOut2 <-  lapply(seq_along(ReadOut1),function(i) {df2 <- ReadOut1[[i]]; df3 <-as.data.frame(matrix(NA,nrow=101,ncol=length(names3), dimnames=list(NULL, names3))); names(df2) <- gsub("[.]"," ", gsub("\\_.*","", names(df2))); df2 <- df2[,-1]; df3[,match(names(df2), names(df3))] <- df2; df3})

lstNew <- simplify2array(ReadOut2)
nrow(lstNew)
#[1] 264

lapply(1:nrow(lstNew), function(i) { dat1 <- data.frame(Percentiles = paste0(seq(0, 100, by = 1), "%"), do.call(cbind, lstNew[i, ]), stringsAsFactors = FALSE); colnames(dat1) <- c("Percentiles", paste(names(lst3), rep(rownames(lstNew)[i],length(lst3)),sep="_")); write.csv(dat1,paste0(paste(getwd(), "Indices", gsub(" ", "_",rownames(lstNew)[i]), sep="/"),".csv"),row.names=FALSE, quote=FALSE)})


## Output2:
ReadOut2 <- lapply(list.files(recursive = TRUE)[grep("Indices", list.files(recursive = TRUE))], 
    function(x) read.csv(x, header = TRUE, stringsAsFactors = FALSE))
length(ReadOut2)
#[1] 264

ReadOut2[[1]][1:3,1:3]
#  Percentiles G100_pav.ANN G101_pav.ANN
#1          0%     0.766900      0.96240
#2          1%     0.796132      0.96572
#3          2%     0.825364      0.96904


Attached is the file.

A.K.




On Tuesday, April 15, 2014 4:00 AM, Zilefac Elvis <zilefacelvis at yahoo.com> wrote:
Hi AK,
I tried all codes for observations. All others work great except this (probably due to different dimensions.
What I did is that I took the Observed.zip file, deleted the station which had no data and applied the code. However, this section of the code did not work. The problem is that lstNew is NULL. So, nothing is actually written to "Indices".

I will check ReadOut1 when I get up from sleep.

Thanks,
Atem.

dir.create("Indices")
names1 <- lapply(ReadOut1, function(x) names(x))[[1]] 
lstNew <- simplify2array(ReadOut1)
nrow(lstNew) 
#[1] NULL 
lapply(2:nrow(lstNew), function(i) { dat1 <- data.frame(lstNew[1], do.call(cbind, lstNew[i, ]), stringsAsFactors = FALSE) colnames(dat1) <- c(rownames(lstNew)[1], paste(names(lst1), rep(rownames(lstNew)[i],  length(lst1)), sep = "_")) write.csv(dat1, paste0(paste(getwd(), "Indices", rownames(lstNew)[i], sep = "/"),  ".csv"), row.names = FALSE, quote = FALSE)
}) 
===================================================================================================================


On Tuesday, April 15, 2014 12:45 AM, arun <smartpink111 at yahoo.com> wrote:
HI  Atem,

No problem.  Hope it works for Observation files too.  Remember that before you run the same code for sample in Observation, check the dimensions of the files (as I did previously).  If there is change of dimensions, make them the same dimensions using the methods I showed.  Then, I guess it should work.
A.K.






On Tuesday, April 15, 2014 2:21 AM, Zilefac Elvis <zilefacelvis at yahoo.com> wrote:
Hi AK,
All codes for simulation files work great.
I will try the code for observations and let you know.
Thanks very much.
Atem.








On Tuesday, April 15, 2014 12:01 AM, arun <smartpink111 at yahoo.com> wrote:
Yes,
my new solution ignores such cases.







On Monday, April 14, 2014 11:58 PM, Zilefac Elvis <zilefacelvis at yahoo.com> wrote:
Hi AK,
Please ignore any such site.
I will check it and include in the analysis.
Thanks,
Atem.



On Monday, April 14, 2014 9:34 PM, arun <smartpink111 at yahoo.com> wrote:



Hi,

I looked at your Observed.zip.  In that one of the file is without any data:
GG83_Sim.csv.ind.csv
The contents of the file are just:

Year    
Year    
trend    
p    < 


A.K.


On Monday, April 14, 2014 10:41 PM, Zilefac Elvis <zilefacelvis at yahoo.com> wrote:
Hi AK,
Q1) Please try to correct the error using the larger data set (Sample.zip). The issue is that once you write the codes and restrict it to smaller data sets, I find it difficult to generalize it to larger data sets.

Q2) From the Quantilecode2.txt you just sent, you forgot to do the following section using the Observed.zip file. I tried to run the code to section Q1 in Quantilecode2.txt using a larger data set and received the same error :Error in 2:nrow(lstNew) : argument of length 0. I have attached a larger data set too for you to generalize the code to suit the larger data set. Please do not forget to include the code below in the final code of Q2.


Once you fix these two, I should be able to fix the rest following these examples.

Thanks AK. Sorry for overloading you with much work.
Atem.

#==============================================================================================================
dir.create("Indices") 
names1 <- lapply(ReadOut1, function(x) names(x))[[1]]
lstNew <- simplify2array(ReadOut1) lapply(2:nrow(lstNew), function(i) { dat1 <- data.frame(lstNew[1], do.call(cbind, lstNew[i, ]), stringsAsFactors = FALSE) colnames(dat1) <- c(rownames(lstNew)[1], paste(names(lst1), rep(rownames(lstNew)[i],  length(lst1)), sep = "_")) 
write.csv(dat1, paste0(paste(getwd(), "Indices", rownames(lstNew)[i], sep = "/"),  ".csv"), row.names = FALSE, quote = FALSE)
})  
## Output2:
ReadOut2 <- lapply(list.files(recursive = TRUE)[grep("Indices", list.files(recursive = TRUE))],  function(x) read.csv(x, header = TRUE, stringsAsFactors = FALSE))
length(ReadOut2)
# [1] 257
head(ReadOut2[[1]], 2) 

#==============================================================================================================




On Monday, April 14, 2014 8:07 PM, arun <smartpink111 at yahoo.com> wrote:

HI,

Please send your emails in plain text.  If you had looked at the dimensions of `lst2`:
sapply(lst2,function(x) sapply(x,ncol))[1:6,]
     G100 G101 G102 G103 G104 G105 G106 G107 G108 G109 G110 G111 G112 G113 G114
[1,]  258  258  258  258  258  257  258  258  258  258  258  258  258  258  247
[2,]  258  258  258  258  258  258  258  258  258  258  258  258  258  258  258
[3,]  258  258  258  258  258  258  258  258  258  258  258  258  258  258  257
[4,]  258  258  258  258  258  257  258  258  258  258  258  258  258  258  258
[5,]  258  258  258  258  258  258  258  258  258  258  258  258  258  258  258
[6,]  258  258  258  258  258  258  258  258  258  258  258  258  258  258  258
     G115 G116 G117 G118 G119 G120 GG10 GG11 GG12 GG13 GG14 GG15 GG16 GG17 GG18
[1,]  258  247  256  256  258  258  258  258  258  258  258  258  258  257  258
[2,]  258  250  257  258  258  256  258  258  258  258  258  258  258  258  258
[3,]  258  247  256  258  258  256  258  258  258  258  258  258  258  258  256
[4,]  258  258  258  257  258  258  258  258  258  258  258  258  258  257  258
[5,]  258  257  258  258  258  256  258  258  258  258  258  258  258  258  258
[6,]  258  257  249  257  258  258  258  258  258  258  258  258  258  258  258
     GG19 GG20 GG21 GG22 GG23 GG24 GG25 GG26 GG27 GG28
[1,]  258  258  258  258  258  258  258  258  258  258
[2,]  258  258  258  258  258  258  258  258  258  258
[3,]  258  258  257  258  256  257  258  258  258  258
[4,]  258  257  258  258  258  257  258  258  258  258
[5,]  258  258  257  258  257  258  258  258  258  258
[6,]  258  258  258  258  257  258  258  258  258  258 


#the dimensions are not consistent for the Simulations
within each Site.  My codes assumed that all the datasets were having the same number of columns, rows etc.






On Monday, April 14, 2014 6:26 PM, Zilefac Elvis <zilefacelvis at yahoo.com> wrote:

Hi AK,
I have another request for help.
Attached is a larger file (~27MB) for sample.zip. All files are same as previous except that I am using more sites to do the same thing that you did with sample.zip.

When generalizing Quantilecode.R to many sites, I receive an error when I run:

dir.create("Indices")
names1 <- lapply(ReadOut1, function(x) names(x))[[1]]
lstNew <- simplify2array(ReadOut1)

lapply(2:nrow(lstNew), function(i) {
  dat1 <- data.frame(lstNew[1], do.call(cbind, lstNew[i, ]), stringsAsFactors = FALSE)
  colnames(dat1) <- c(rownames(lstNew)[1], paste(names(lst1), rep(rownames(lstNew)[i], 
                                                                  length(lst1)), sep = "_"))
  write.csv(dat1, paste0(paste(getwd(), "Indices", rownames(lstNew)[i], sep = "/"), 
                         ".csv"), row.names = FALSE, quote = FALSE)
})

and I get this:
Error in 2:nrow(lstNew) : argument of length 0


I have tried a few tricks but could not overcome the error message.

Please help!
Atem.

sample (1).zip
Zilefac Elvis shared from Dropbox  
View on www.dropbox.com Preview by Yahoo  

On Monday, April 14, 2014 9:22 AM, arun <smartpink111 at yahoo.com> wrote:

Ok
. I got the results but mynet is down. Will send once it gets fixed

----------
Sent from my Nokia

------Original message------
From: zilefacelvis at yahoo.com <zilefacelvis at yahoo.com>
To: "arun" <smartpink111 at yahoo.com>
Date: Monday, April 14, 2014 3:01:38 PM GMT
Subject: Re: Re: Quantile and rowMean from multiple files in a folder





In the Observed.zip I
have just one file per site while in sample.zip I have 100 files(Sims) per site.





Thanks,


Atem.

------ Original Message ------



From : arun
To : Zilefac Elvis;
Sent : 14-04-2014 00:12
Subject : Re: Quantile and rowMean from multiple files in a folder
One more doubt, do you have more than one files per Site? In the example, it was just one file per Site.      On Monday, April 14, 2014 2:08 AM, arun  wrote: Hi, The problem is in the different dimensions of the Observed datasets.  sapply(seq_along(lst2),function(i){lstN<- lapply(lst2[[i]],function(x) x[,-1]);sapply(lstN,function(x) nrow(x))}) ##after removing the trend and P value rows #[1] 9 9 9 8 2 9    If you want to take the average, is it through filling NAs for those years that are missing in the files?? A.K.       On Monday, April 14, 2014 1:05 AM, Zilefac Elvis 
wrote:  Hi AK,  Q1) Please apply the Quantilecode.R to Observed.zip (attached). I tried but received an error which was self-explanatory but I could not change the dimensions in the code.   Q2) Please apply Quantilecode.R to both sample.zip and observed.zip. Here, instead of doing quantile(y, seq(0, 1, by = 0.01), take colMeans of the indices.    I have tried to solve both Q1 and Q2 but still unable to control the dimensions.  Thanks, Atem. On Sunday, April 13, 2014 9:05 AM, arun  wrote:    Hi Atem,  On my end, the codes are not formatted in the email as seen in the screen of formatR GUI.  I am attaching the .R file in case there is some difficulty for you. Arun    On Sunday, April 13, 2014 10:54 AM, arun  wrote: Hi,  I am formatting the codes using library(formatR). Hopefully, it will not be mangled in the email. dir.create("final") lst1<- split(list.files(pattern
=".csv"), gsub("\\_.*","", list.files(pattern =".csv")))  lst2<- lapply(lst1, function(x1) lapply(x1, function(x2) { lines1<- readLines(x2) header1<- lines1[1:2] dat1<- read.table(text = lines1, header = FALSE, sep =",", stringsAsFactors = FALSE, skip = 2) colnames(dat1)<- Reduce(paste, strsplit(header1,",")) dat1[-c(nrow(dat1), nrow(dat1) - 1), ] }))  library(plyr)   lapply(seq_along(lst2), function(i) { lstN<- lapply(lst2[[i]], function(x) x[, -1]) lstQ1<- lapply(lstN, function(x) numcolwise(function(y) quantile(y, seq(0, 1, by = 0.01), na.rm = TRUE))(x)) arr1<- array(unlist(lstQ1), dim = c(dim(lstQ1[[1]]), length(lstQ1)), dimnames = list(NULL, lapply(lstQ1, names)[[1]])) res<- rowMeans(arr1, dims = 2, na.rm = TRUE) colnames(res)<- gsub("","_", colnames(res)) res1<- data.frame(Percentiles = paste0(seq(0, 100, by = 1),"%"), res, stringsAsFactors = FALSE) write.csv(res1,
paste0(paste(getwd(),"final", paste(names(lst1)[[i]],"Quantile", sep ="_"), sep ="/"),".csv"), row.names = FALSE, quote = FALSE) })  ReadOut1<- lapply(list.files(recursive = TRUE)[grep("Quantile", list.files(recursive = TRUE))], function(x) read.csv(x, header = TRUE, stringsAsFactors = FALSE)) sapply(ReadOut1, dim) #    [,1] [,2] #[1,] 101 101 #[2,] 258 258  lapply(ReadOut1,function(x) x[1:2,1:3]) #[[1]] # Percentiles txav_DJF txav_MAM #1         0% -12.68566 7.09702 #2         1% -12.59062 7.15338 # #[[2]] # Percentiles txav_DJF txav_MAM #1         0% -12.75516 6.841840 #2         1% -12.68244 6.910664    ###Q2:  dir.create("Indices") names1<- lapply(ReadOut1, function(x) names(x))[[1]] lstNew<- simplify2array(ReadOut1) lapply(2:nrow(lstNew),
function(i) { dat1<- data.frame(lstNew[1], do.call(cbind, lstNew[i, ]), stringsAsFactors = FALSE) colnames(dat1)<- c(rownames(lstNew)[1], paste(names(lst1), rep(rownames(lstNew)[i], length(lst1)), sep ="_")) write.csv(dat1, paste0(paste(getwd(),"Indices", rownames(lstNew)[i], sep ="/"), ".csv"), row.names = FALSE, quote = FALSE) }) ## Output2: ReadOut2<- lapply(list.files(recursive = TRUE)[grep("Indices", list.files(recursive = TRUE))], function(x) read.csv(x, header = TRUE, stringsAsFactors = FALSE)) length(ReadOut2) # [1] 257    head(ReadOut2[[1]], 2) # Percentiles G100_pav_ANN G101_pav_ANN #1         0%    1.054380    1.032740 #2         1%  1.069457    1.045689    A.K.             On Sunday, April 13, 2014 2:46 AM, Zilefac Elvis  wrote:  Hi AK, Q1) I need your help again. Using the
previous data (attached) and the previous code below,instead of taking rowMeans, let's do quantile(x,seq(0,1,by=0.01)).   Delete the last 2 rows (Trend and p<) in each file before doing quantile(x,seq(0,1,by=0.01)).  For example, assume that I want to calculate quantile(x,seq(0,1,by=0.01)) for each column of Site G100. I will do so for the 5 sims of site G100 and then take their average. This will be approximately close to the true value than just calculating quantile(x,seq(0,1,by=0.01)) from one sim. Please dothis same thing for all the files.  So, when you do rowMeans, it should be the mean of quantile(x,seq(0,1,by=0.01)) calculated from all sims in that Site.  Output  The number of files in"final" remains the same (2 files). The"Year" column(will be replaced) will contain the names of quantile(x,seq(0,1,by=0.01)) such as 0%          1%          2%   
      3%          4%          5%          6%, ..., 98%       99%        100% . You can give this column any name such as"Percentiles".   Q2) From the folder"final", please go to each file identified by site name, take a column, say col1 of txav from each file, create a dataframe whose colnames are site codes (names of files in"final"). Create a folder called"Indices" and place this dataframe in it. The filename for the dataframe is txav, say. So, in"Indices", you will have one file having 3 columns [, c(Percentiles, G100,G101)]. The idea is that I want to be able to pick any column from files in"final" and form a dataframe from which I will generate my qqplot or boxplot.  Thanks very much AK. Atem This should be the final step of this my drama, at least for now.
#==============================================================================================================  dir.create("final") lst1<- split(list.files(pattern=".csv"),gsub("\\_.*","",list.files(pattern=".csv"))) lst2<- lapply(lst1,function(x1) lapply(x1, function(x2) {lines1<- readLines(x2); header1<- lines1[1:2]; dat1<- read.table(text=lines1,header=FALSE,sep=",",stringsAsFactors=FALSE, skip=2); colnames(dat1)<- Reduce(paste,strsplit(header1,","));dat1}))  lstYear<- lapply(lst2,function(x) lapply(x, function(y) y[,1,drop=FALSE])[[1]])    lapply(seq_along(lst2),function(i) {lstN<-lapply(lst2[[i]],function(x) x[,-1]); arr1<- array(unlist(lstN),dim=c(dim(lstN[[1]]),length(lstN)),dimnames=list(NULL,lapply(lstN,names)[[1]]));res<- cbind(lstYear[[i]],rowMeans(arr1,dims=2,na.rm=TRUE)); names(res)<- gsub("\\_$","",gsub("","_",names(res))); res[,1]<- gsub("<","",res[,1]);
write.csv(res,paste0(paste(getwd(),"final",names(lst1) [[i]],sep="/"),".csv"),row.names=FALSE,quote=FALSE) })     #====================================================================================================  




More information about the R-help mailing list