[R] Quantile and rowMean from multiple files in a folder
arun
smartpink111 at yahoo.com
Tue Apr 15 02:54:02 CEST 2014
Hi Atem,
I guess this is what you wanted.
###Q1:
###
###working directory: Observed
#Only one file per Site. Assuming this is the case for the full dataset, then I guess there is no need to average
dir.create("final")
lst1 <- split(list.files(pattern = ".csv"), gsub("\\_.*", "", list.files(pattern = ".csv")))
lst2 <- lapply(lst1,function(x1) lapply(x1, function(x2) {lines1 <- readLines(x2); header1 <- lines1[1:2]; dat1 <- read.table(text=lines1,header=FALSE,sep=",",stringsAsFactors=FALSE, skip=2); colnames(dat1) <- Reduce(paste,strsplit(header1,","));dat1[-c(nrow(dat1),nrow(dat1)-1),]}))
#different number of rows
sapply(seq_along(lst2),function(i){lstN <- lapply(lst2[[i]],function(x) x[,-1]);sapply(lstN,function(x) nrow(x))})
#[1] 9 9 9 8 2 9
#difference in number of columns
sapply(seq_along(lst2),function(i) {sapply(lst2[[i]],function(x) ncol(x))})
#[1] 157 258 258 98 157 258
library(plyr)
library(stringr)
lst3 <- setNames(lapply(seq_along(lst2),function(i) {lapply(lst2[[i]],function(x) {names(x)[-1] <- paste(names(x)[-1], names(lst1)[i],sep="_"); names(x) <- str_trim(names(x)); x})[[1]]}), names(lst1))
df1 <- join_all(lst3,by="Year")
dim(df1)
#[1] 9 1181
sapply(split(names(df1)[-1] ,gsub(".*\\_","",names(df1)[-1])),function(x) {df2 <- df1[,x];df3 <- data.frame(Percentiles=paste0(seq(0,100, by=1) ,"%"), numcolwise(function(y) quantile(y,seq(0,1,by=0.01),na.rm=TRUE))(df2),stringsAsFactors=FALSE);ncol(df3) })
#G100 G101 G102 G103 G104 G105
# 157 258 258 98 157 258
lst4 <- split(names(df1)[-1] ,gsub(".*\\_","",names(df1)[-1]))
lapply(seq_along(lst4),function(i) {df2 <- df1[,lst4[[i]]]; df3 <- data.frame(Percentiles=paste0(seq(0,100, by=1) ,"%"), numcolwise(function(y) quantile(y,seq(0,1,by=0.01),na.rm=TRUE))(df2),stringsAsFactors=FALSE);df3[1:3,1:3]; write.csv(df3,paste0(paste(getwd(), "final",paste(names(lst1)[[i]],"Quantile",sep="_"),sep="/"),".csv"),row.names=FALSE,quote=FALSE)})
ReadOut1 <- lapply(list.files(recursive=TRUE)[grep("Quantile",list.files(recursive=TRUE))],function(x) read.csv(x,header=TRUE,stringsAsFactors=FALSE))
sapply(ReadOut1,dim)
# [,1] [,2] [,3] [,4] [,5] [,6]
#[1,] 101 101 101 101 101 101
#[2,] 157 258 258 98 157 258
lapply(ReadOut1,function(x) x[1:2,1:3])[1:3]
#[[1]]
# Percentiles pav.DJF_G100 pav.MAM_G100
#1 0% 0 0.640500
#2 1% 0 0.664604
#
#[[2]]
# Percentiles txav.DJF_G101 txav.MAM_G101
#1 0% -13.8756 4.742400
#2 1% -13.8140 4.817184
#
#[[3]]
# Percentiles txav.DJF_G102 txav.MAM_G102
#1 0% -15.05000 4.520700
#2 1% -14.96833 4.543828
#####
###Q2:
###Observed data
dir.create("Indices")
names1 <- unlist(lapply(ReadOut1,function(x)
names(x)[-1]))
names2 <- gsub("\\_.*","",names1)
names3 <- unique(gsub("[.]", " ", names2))
res <- do.call(rbind,lapply(seq_along(lst4),function(i) {df2 <- df1[,lst4[[i]]];vec1 <- colMeans(df2,na.rm=TRUE); vec2 <- rep(NA,length(names3));names(vec2) <- paste(names3,names(lst4)[[i]],sep="_"); vec2[names(vec2) %in% names(vec1)] <- vec1; names(vec2) <- gsub("\\_.*","",names(vec2)); vec2 }))
lapply(seq_len(ncol(res)),function(i) {mat1 <- t(res[,i,drop=FALSE]);colnames(mat1) <- names(lst4); write.csv(mat1,paste0(paste(getwd(),"Indices", gsub(" ","_",rownames(mat1)),sep="/"),".csv"),row.names=FALSE,quote=FALSE)})
##Output2:
ReadOut2 <- lapply(list.files(recursive=TRUE)[grep("Indices",list.files(recursive=TRUE))],function(x) read.csv(x,header=TRUE,stringsAsFactors=FALSE))
length(ReadOut2)
#[1] 257
list.files(recursive=TRUE)[grep("Indices",list.files(recursive=TRUE))][1]
#[1] "Indices/pav_ANN.csv"
res[,"pav ANN",drop=FALSE]
# pav ANN
#[1,] 1.298811
#[2,] 7.642922
#[3,] 6.740011
#[4,] NA
#[5,] 1.296650
#[6,] 6.887622
ReadOut2[[1]]
# G100 G101 G102 G103 G104 G105
#1 1.298811 7.642922 6.740011 NA 1.29665 6.887622
###Sample data
###Working directory changed to "sample"
dir.create("Indices_colMeans")
lst1 <- split(list.files(pattern=".csv"),gsub("\\_.*","",list.files(pattern=".csv")))
lst2 <- lapply(lst1,function(x1) lapply(x1, function(x2) {lines1 <- readLines(x2); header1 <- lines1[1:2]; dat1 <- read.table(text=lines1,header=FALSE,sep=",",stringsAsFactors=FALSE, skip=2); colnames(dat1) <- Reduce(paste,strsplit(header1,","));dat1[-c(nrow(dat1),nrow(dat1)-1),]}))
res1 <- do.call(rbind,lapply(seq_along(lst2),function(i) {rowMeans(do.call(cbind,lapply(lst2[[i]],function(x) colMeans(x[,-1],na.rm=TRUE))),na.rm=TRUE) }))
lapply(seq_len(ncol(res1)),function(i){mat1 <- t(res1[,i,drop=FALSE]); colnames(mat1) <- names(lst2);write.csv(mat1,paste0(paste(getwd(),"Indices_colMeans",gsub(" ","_",rownames(mat1)),sep="/"),".csv"),row.names=FALSE,quote=FALSE)})
##Output2 Sample
ReadOut2S <- lapply(list.files(recursive=TRUE)[grep("Indices",list.files(recursive=TRUE))],function(x) read.csv(x,header=TRUE,stringsAsFactors=FALSE))
length(ReadOut2S)
#[1] 257
list.files(recursive=TRUE)[grep("Indices",list.files(recursive=TRUE))][1]
#[1] "Indices_colMeans/pav_ANN.csv"
res1[,"pav ANN",drop=FALSE]
# pav ANN
#[1,] 1.545620
#[2,] 1.518553
ReadOut2S[[1]]
# G100 G101
#1 1.54562 1.518553
A.K.
On Monday, April 14, 2014 1:05 AM, Zilefac Elvis <zilefacelvis at yahoo.com> wrote:
Hi AK,
Q1) Please apply the Quantilecode.R to Observed.zip (attached). I tried but received an error which was self-explanatory but I could not change the dimensions in the code.
Q2) Please apply Quantilecode.R to both sample.zip and observed.zip. Here, instead of doing quantile(y, seq(0, 1, by = 0.01), take colMeans of the indices.
I have tried to solve both Q1 and Q2 but still unable to control the dimensions.
Thanks,
Atem.
On Sunday, April 13, 2014 9:05 AM, arun <smartpink111 at yahoo.com> wrote:
Hi Atem,
On my end, the codes are not formatted in the email as seen in the screen of formatR GUI.
I am attaching the .R file in case there is some difficulty for you.
Arun
On Sunday, April 13, 2014 10:54 AM, arun <smartpink111 at yahoo.com> wrote:
Hi,
I am formatting the codes using library(formatR). Hopefully, it will not be mangled in the email.
dir.create("final")
lst1 <- split(list.files(pattern = ".csv"), gsub("\\_.*", "", list.files(pattern = ".csv")))
lst2 <- lapply(lst1, function(x1) lapply(x1, function(x2) { lines1 <- readLines(x2) header1 <- lines1[1:2] dat1 <- read.table(text = lines1, header = FALSE, sep = ",", stringsAsFactors = FALSE, skip = 2) colnames(dat1) <- Reduce(paste, strsplit(header1, ",")) dat1[-c(nrow(dat1), nrow(dat1) - 1), ]
}))
library(plyr)
lapply(seq_along(lst2), function(i) { lstN <- lapply(lst2[[i]], function(x) x[, -1]) lstQ1 <- lapply(lstN, function(x) numcolwise(function(y) quantile(y, seq(0, 1, by = 0.01), na.rm = TRUE))(x)) arr1 <- array(unlist(lstQ1), dim = c(dim(lstQ1[[1]]), length(lstQ1)), dimnames = list(NULL, lapply(lstQ1, names)[[1]])) res <- rowMeans(arr1, dims = 2, na.rm = TRUE) colnames(res) <- gsub(" ", "_", colnames(res)) res1 <- data.frame(Percentiles = paste0(seq(0, 100, by = 1), "%"), res, stringsAsFactors = FALSE) write.csv(res1, paste0(paste(getwd(), "final", paste(names(lst1)[[i]], "Quantile", sep = "_"), sep = "/"), ".csv"), row.names = FALSE, quote = FALSE)
})
ReadOut1 <- lapply(list.files(recursive = TRUE)[grep("Quantile", list.files(recursive = TRUE))], function(x) read.csv(x, header = TRUE, stringsAsFactors = FALSE))
sapply(ReadOut1,
dim)
# [,1] [,2]
#[1,] 101 101
#[2,] 258 258
lapply(ReadOut1,function(x) x[1:2,1:3])
#[[1]]
# Percentiles txav_DJF txav_MAM
#1 0% -12.68566 7.09702
#2 1% -12.59062 7.15338
#
#[[2]]
# Percentiles txav_DJF txav_MAM
#1 0% -12.75516 6.841840
#2 1% -12.68244 6.910664
###Q2:
dir.create("Indices")
names1 <- lapply(ReadOut1, function(x) names(x))[[1]]
lstNew <- simplify2array(ReadOut1)
lapply(2:nrow(lstNew), function(i) { dat1 <- data.frame(lstNew[1], do.call(cbind, lstNew[i, ]), stringsAsFactors = FALSE) colnames(dat1) <- c(rownames(lstNew)[1], paste(names(lst1), rep(rownames(lstNew)[i], length(lst1)), sep = "_")) write.csv(dat1, paste0(paste(getwd(), "Indices", rownames(lstNew)[i], sep = "/"), ".csv"), row.names = FALSE, quote = FALSE)
}) ## Output2:
ReadOut2 <- lapply(list.files(recursive = TRUE)[grep("Indices", list.files(recursive = TRUE))], function(x) read.csv(x, header = TRUE, stringsAsFactors = FALSE))
length(ReadOut2)
# [1] 257
head(ReadOut2[[1]], 2)
# Percentiles G100_pav_ANN G101_pav_ANN
#1 0% 1.054380 1.032740
#2 1%
1.069457 1.045689
A.K.
On Sunday, April 13, 2014 2:46 AM, Zilefac Elvis <zilefacelvis at yahoo.com> wrote:
Hi AK,
Q1) I need your help again. Using the previous data (attached) and the previous code below,instead of taking rowMeans, let's do quantile(x,seq(0,1,by=0.01)).
Delete the last 2 rows (Trend and p<) in each file before doing quantile(x,seq(0,1,by=0.01)).
For example, assume that I want to
calculate quantile(x,seq(0,1,by=0.01)) for each column of Site G100. I will do so for the 5 sims of site G100 and then take their average. This will be approximately close to the true value than just calculating quantile(x,seq(0,1,by=0.01)) from one sim. Please do this same thing for all the files.
So, when you do rowMeans, it should be the mean of quantile(x,seq(0,1,by=0.01)) calculated from all sims in that Site.
Output
The number of files in "final" remains the same (2 files). The "Year" column(will be replaced) will contain the names of quantile(x,seq(0,1,by=0.01)) such as 0% 1% 2% 3% 4% 5% 6%, ..., 98%
99% 100% . You can give this column any name such as "Percentiles".
Q2) From the folder "final", please go to each file identified by site name, take a column, say col1 of txav from each file, create a dataframe whose colnames are site codes (names of files in "final"). Create a folder called "Indices" and place this dataframe in it. The filename for the dataframe is txav, say. So, in "Indices", you will have one file having 3 columns [, c(Percentiles, G100,G101)]. The idea is that I want to be able to pick any column from files in "final" and form a dataframe from which I will generate my qqplot or boxplot.
Thanks very much AK.
Atem
This should be the final step of this my drama, at least for now.
#==============================================================================================================
dir.create("final")
lst1 <- split(list.files(pattern=".csv"),gsub("\\_.*","",list.files(pattern=".csv"))) lst2 <- lapply(lst1,function(x1) lapply(x1, function(x2) {lines1 <- readLines(x2); header1 <- lines1[1:2]; dat1 <- read.table(text=lines1,header=FALSE,sep=",",stringsAsFactors=FALSE, skip=2); colnames(dat1) <- Reduce(paste,strsplit(header1,","));dat1}))
lstYear <- lapply(lst2,function(x) lapply(x, function(y) y[,1,drop=FALSE])[[1]])
lapply(seq_along(lst2),function(i) {lstN <-lapply(lst2[[i]],function(x) x[,-1]); arr1 <- array(unlist(lstN),dim=c(dim(lstN[[1]]),length(lstN)),dimnames=list(NULL,lapply(lstN,names)[[1]]));res <-
cbind(lstYear[[i]],rowMeans(arr1,dims=2,na.rm=TRUE)); names(res) <- gsub("\\_$","",gsub(" ", "_",names(res))); res[,1] <- gsub(" <", "",res[,1]); write.csv(res,paste0(paste(getwd(),"final",names(lst1)
[[i]],sep="/"),".csv"),row.names=FALSE,quote=FALSE) })
#====================================================================================================
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: Quantilecode2.txt
URL: <https://stat.ethz.ch/pipermail/r-help/attachments/20140414/14715496/attachment-0002.txt>
More information about the R-help
mailing list