[R] Quantile and rowMean from multiple files in a folder

Tue Apr 15 02:54:02 CEST 2014

Hi Atem,

I guess this is what you wanted.

###Q1: 
###
###working directory: Observed
 #Only one file per Site.  Assuming this is the case for the full dataset, then I guess there is no need to average

dir.create("final")
lst1 <- split(list.files(pattern = ".csv"), gsub("\\_.*", "", list.files(pattern = ".csv")))

lst2 <-  lapply(lst1,function(x1) lapply(x1, function(x2) {lines1 <- readLines(x2); header1 <- lines1[1:2]; dat1 <- read.table(text=lines1,header=FALSE,sep=",",stringsAsFactors=FALSE, skip=2); colnames(dat1) <- Reduce(paste,strsplit(header1,","));dat1[-c(nrow(dat1),nrow(dat1)-1),]}))

#different number of rows
 sapply(seq_along(lst2),function(i){lstN <- lapply(lst2[[i]],function(x) x[,-1]);sapply(lstN,function(x) nrow(x))})
 #[1] 9 9 9 8 2 9

#difference in number of columns
sapply(seq_along(lst2),function(i) {sapply(lst2[[i]],function(x) ncol(x))})
 #[1] 157 258 258  98 157 258

library(plyr)
library(stringr)

lst3 <- setNames(lapply(seq_along(lst2),function(i) {lapply(lst2[[i]],function(x) {names(x)[-1] <- paste(names(x)[-1], names(lst1)[i],sep="_"); names(x) <- str_trim(names(x)); x})[[1]]}), names(lst1)) 

df1 <- join_all(lst3,by="Year")
dim(df1)
 #[1]    9 1181 

sapply(split(names(df1)[-1] ,gsub(".*\\_","",names(df1)[-1])),function(x) {df2 <- df1[,x];df3 <- data.frame(Percentiles=paste0(seq(0,100, by=1) ,"%"), numcolwise(function(y) quantile(y,seq(0,1,by=0.01),na.rm=TRUE))(df2),stringsAsFactors=FALSE);ncol(df3) })
 #G100 G101 G102 G103 G104 G105 
# 157  258  258   98  157  258 

lst4 <- split(names(df1)[-1] ,gsub(".*\\_","",names(df1)[-1]))

lapply(seq_along(lst4),function(i) {df2 <- df1[,lst4[[i]]]; df3 <- data.frame(Percentiles=paste0(seq(0,100, by=1) ,"%"), numcolwise(function(y) quantile(y,seq(0,1,by=0.01),na.rm=TRUE))(df2),stringsAsFactors=FALSE);df3[1:3,1:3]; write.csv(df3,paste0(paste(getwd(), "final",paste(names(lst1)[[i]],"Quantile",sep="_"),sep="/"),".csv"),row.names=FALSE,quote=FALSE)}) 

ReadOut1 <- lapply(list.files(recursive=TRUE)[grep("Quantile",list.files(recursive=TRUE))],function(x) read.csv(x,header=TRUE,stringsAsFactors=FALSE)) 

sapply(ReadOut1,dim)
#     [,1] [,2] [,3] [,4] [,5] [,6]
 #[1,]  101  101  101  101  101  101 
#[2,]  157  258  258   98  157  258

lapply(ReadOut1,function(x) x[1:2,1:3])[1:3]
 #[[1]] 
#  Percentiles pav.DJF_G100 pav.MAM_G100 
#1          0%            0     0.640500 
#2          1%            0     0.664604 
# 
#[[2]] 
#  Percentiles txav.DJF_G101 txav.MAM_G101
 #1          0%      -13.8756      4.742400 
#2          1%      -13.8140      4.817184
 #
 #[[3]] 
#  Percentiles txav.DJF_G102 txav.MAM_G102
 #1          0%     -15.05000      4.520700
 #2          1%     -14.96833      4.543828 
#####
###Q2: 
###Observed data 

dir.create("Indices")
 names1 <- unlist(lapply(ReadOut1,function(x)
 names(x)[-1])) 
names2 <-  gsub("\\_.*","",names1)
 names3 <- unique(gsub("[.]", " ", names2)) 

res <- do.call(rbind,lapply(seq_along(lst4),function(i) {df2 <- df1[,lst4[[i]]];vec1 <- colMeans(df2,na.rm=TRUE); vec2 <- rep(NA,length(names3));names(vec2) <- paste(names3,names(lst4)[[i]],sep="_"); vec2[names(vec2) %in% names(vec1)] <- vec1; names(vec2) <- gsub("\\_.*","",names(vec2)); vec2  }))

lapply(seq_len(ncol(res)),function(i) {mat1 <- t(res[,i,drop=FALSE]);colnames(mat1) <- names(lst4); write.csv(mat1,paste0(paste(getwd(),"Indices", gsub(" ","_",rownames(mat1)),sep="/"),".csv"),row.names=FALSE,quote=FALSE)})

##Output2:
ReadOut2 <- lapply(list.files(recursive=TRUE)[grep("Indices",list.files(recursive=TRUE))],function(x) read.csv(x,header=TRUE,stringsAsFactors=FALSE)) 

length(ReadOut2) 

#[1] 257

list.files(recursive=TRUE)[grep("Indices",list.files(recursive=TRUE))][1]
#[1] "Indices/pav_ANN.csv" 

res[,"pav ANN",drop=FALSE] 

#      pav ANN
#[1,] 1.298811
#[2,] 7.642922 

#[3,] 6.740011 

#[4,]       NA
#[5,] 1.296650 

#[6,] 6.887622 

ReadOut2[[1]]
#      G100     G101     G102 G103    G104     G105
#1 1.298811 7.642922 6.740011   NA 1.29665 6.887622 

###Sample data 

###Working directory changed to "sample" 

dir.create("Indices_colMeans")

lst1 <- split(list.files(pattern=".csv"),gsub("\\_.*","",list.files(pattern=".csv"))) 

lst2 <-  lapply(lst1,function(x1) lapply(x1, function(x2) {lines1 <- readLines(x2); header1 <- lines1[1:2]; dat1 <- read.table(text=lines1,header=FALSE,sep=",",stringsAsFactors=FALSE, skip=2); colnames(dat1) <- Reduce(paste,strsplit(header1,","));dat1[-c(nrow(dat1),nrow(dat1)-1),]}))
res1 <- do.call(rbind,lapply(seq_along(lst2),function(i) {rowMeans(do.call(cbind,lapply(lst2[[i]],function(x) colMeans(x[,-1],na.rm=TRUE))),na.rm=TRUE) })) 

lapply(seq_len(ncol(res1)),function(i){mat1 <- t(res1[,i,drop=FALSE]); colnames(mat1) <- names(lst2);write.csv(mat1,paste0(paste(getwd(),"Indices_colMeans",gsub(" ","_",rownames(mat1)),sep="/"),".csv"),row.names=FALSE,quote=FALSE)})

##Output2 Sample
ReadOut2S <- lapply(list.files(recursive=TRUE)[grep("Indices",list.files(recursive=TRUE))],function(x) read.csv(x,header=TRUE,stringsAsFactors=FALSE)) 

length(ReadOut2S)
#[1] 257

list.files(recursive=TRUE)[grep("Indices",list.files(recursive=TRUE))][1] 

#[1] "Indices_colMeans/pav_ANN.csv" 

res1[,"pav ANN",drop=FALSE] 

#      pav ANN
#[1,] 1.545620
#[2,] 1.518553 

ReadOut2S[[1]] 

#     G100     G101 

#1 1.54562 1.518553 

A.K.

On Monday, April 14, 2014 1:05 AM, Zilefac Elvis <zilefacelvis at yahoo.com> wrote:

Hi AK,

Q1) Please apply the Quantilecode.R to Observed.zip (attached). I tried but received an error which was self-explanatory but I could not change the dimensions in the code.

Q2) Please apply Quantilecode.R to both sample.zip and observed.zip. Here, instead of doing quantile(y, seq(0, 1, by = 0.01), take colMeans of the indices. 

I have tried to solve both Q1 and Q2 but still unable to control the dimensions.

Thanks,
Atem.
On Sunday, April 13, 2014 9:05 AM, arun <smartpink111 at yahoo.com> wrote:

Hi Atem,

On my end, the codes are not formatted in the email as seen in the screen of formatR GUI.

I am attaching the .R file in case there is some difficulty for you.
Arun

On Sunday, April 13, 2014 10:54 AM, arun <smartpink111 at yahoo.com> wrote:
Hi,

I am formatting the codes using library(formatR).  Hopefully, it will not be mangled in the email.
dir.create("final")
lst1 <- split(list.files(pattern = ".csv"), gsub("\\_.*", "", list.files(pattern = ".csv")))

lst2 <- lapply(lst1, function(x1) lapply(x1, function(x2) { lines1 <- readLines(x2) header1 <- lines1[1:2] dat1 <- read.table(text = lines1, header = FALSE, sep = ",", stringsAsFactors = FALSE,  skip = 2) colnames(dat1) <- Reduce(paste, strsplit(header1, ",")) dat1[-c(nrow(dat1), nrow(dat1) - 1), ]
}))

library(plyr) 

lapply(seq_along(lst2), function(i) { lstN <- lapply(lst2[[i]], function(x) x[, -1]) lstQ1 <- lapply(lstN, function(x) numcolwise(function(y) quantile(y, seq(0, 1,  by = 0.01), na.rm = TRUE))(x)) arr1 <- array(unlist(lstQ1), dim = c(dim(lstQ1[[1]]), length(lstQ1)), dimnames = list(NULL,  lapply(lstQ1, names)[[1]])) res <- rowMeans(arr1, dims = 2, na.rm = TRUE) colnames(res) <- gsub(" ", "_", colnames(res)) res1 <- data.frame(Percentiles = paste0(seq(0, 100, by = 1), "%"), res, stringsAsFactors = FALSE) write.csv(res1, paste0(paste(getwd(), "final", paste(names(lst1)[[i]], "Quantile",  sep = "_"), sep = "/"), ".csv"), row.names = FALSE, quote = FALSE)
})

ReadOut1 <- lapply(list.files(recursive = TRUE)[grep("Quantile", list.files(recursive = TRUE))],  function(x) read.csv(x, header = TRUE, stringsAsFactors = FALSE))
sapply(ReadOut1,
dim)
#     [,1] [,2]
#[1,]  101  101
#[2,]  258  258

lapply(ReadOut1,function(x) x[1:2,1:3])
#[[1]]
#  Percentiles  txav_DJF txav_MAM
#1          0% -12.68566  7.09702
#2          1% -12.59062  7.15338
#
#[[2]]
#  Percentiles  txav_DJF txav_MAM
#1          0% -12.75516 6.841840
#2          1% -12.68244 6.910664 

###Q2:

dir.create("Indices")
names1 <- lapply(ReadOut1, function(x) names(x))[[1]]
lstNew <- simplify2array(ReadOut1)
lapply(2:nrow(lstNew), function(i) { dat1 <- data.frame(lstNew[1], do.call(cbind, lstNew[i, ]), stringsAsFactors = FALSE) colnames(dat1) <- c(rownames(lstNew)[1], paste(names(lst1), rep(rownames(lstNew)[i],  length(lst1)), sep = "_")) write.csv(dat1, paste0(paste(getwd(), "Indices", rownames(lstNew)[i], sep = "/"),  ".csv"), row.names = FALSE, quote = FALSE)
}) ## Output2:
ReadOut2 <- lapply(list.files(recursive = TRUE)[grep("Indices", list.files(recursive = TRUE))],  function(x) read.csv(x, header = TRUE, stringsAsFactors = FALSE))
length(ReadOut2)
# [1] 257 

head(ReadOut2[[1]], 2)
#  Percentiles G100_pav_ANN G101_pav_ANN
#1          0%     1.054380     1.032740
#2          1% 
   1.069457     1.045689 

A.K.

On Sunday, April 13, 2014 2:46 AM, Zilefac Elvis <zilefacelvis at yahoo.com> wrote:

Hi AK,
Q1) I need your help again. Using the previous data (attached) and the previous code below,instead of taking rowMeans, let's do quantile(x,seq(0,1,by=0.01)). 

Delete the last 2 rows (Trend and p<) in each file before doing quantile(x,seq(0,1,by=0.01)).

For example, assume that I want to
calculate quantile(x,seq(0,1,by=0.01)) for each column of Site G100. I will do so for the 5 sims of site G100 and then take their average. This will be approximately close to the true value than just calculating quantile(x,seq(0,1,by=0.01)) from one sim. Please do this same thing for all the files.

So, when you do rowMeans, it should be the mean of quantile(x,seq(0,1,by=0.01)) calculated from all sims in that Site.

Output

The number of files in "final" remains the same (2 files). The "Year" column(will be replaced) will contain  the names of quantile(x,seq(0,1,by=0.01)) such as  0%           1%           2%           3%           4%           5%           6%, ..., 98%    
     99%         100% . You can give this column any name such as "Percentiles".

Q2)  From the folder "final", please go to each file identified by site name, take a column, say col1 of txav  from each file, create a dataframe whose colnames are site codes (names of files in "final"). Create a folder called "Indices" and place this dataframe in it. The filename for the dataframe is txav, say. So, in "Indices", you will have one file having 3 columns [, c(Percentiles, G100,G101)]. The idea is that I want to be able to pick any column from files in "final" and form a dataframe from which I will generate my qqplot or boxplot.

Thanks very much AK.
Atem
This should be the final step of this my drama, at least for now.
#==============================================================================================================

dir.create("final")
lst1 <- split(list.files(pattern=".csv"),gsub("\\_.*","",list.files(pattern=".csv"))) lst2 <-  lapply(lst1,function(x1) lapply(x1, function(x2) {lines1 <- readLines(x2); header1 <- lines1[1:2]; dat1 <- read.table(text=lines1,header=FALSE,sep=",",stringsAsFactors=FALSE, skip=2); colnames(dat1) <- Reduce(paste,strsplit(header1,","));dat1}))

lstYear <- lapply(lst2,function(x) lapply(x, function(y) y[,1,drop=FALSE])[[1]]) 

lapply(seq_along(lst2),function(i) {lstN <-lapply(lst2[[i]],function(x) x[,-1]); arr1 <- array(unlist(lstN),dim=c(dim(lstN[[1]]),length(lstN)),dimnames=list(NULL,lapply(lstN,names)[[1]]));res <-
cbind(lstYear[[i]],rowMeans(arr1,dims=2,na.rm=TRUE)); names(res) <- gsub("\\_$","",gsub(" ", "_",names(res))); res[,1] <- gsub(" <", "",res[,1]); write.csv(res,paste0(paste(getwd(),"final",names(lst1)
[[i]],sep="/"),".csv"),row.names=FALSE,quote=FALSE)  }) 

#====================================================================================================
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: Quantilecode2.txt
URL: <https://stat.ethz.ch/pipermail/r-help/attachments/20140414/14715496/attachment-0002.txt>