[R] Quantile and rowMean from multiple files in a folder
zilefacelvis at yahoo.com
zilefacelvis at yahoo.com
Tue Apr 15 03:13:22 CEST 2014
Hi AK,
Thanks very much.
I did send you another email with a larger Sample.zip file. The
Quantilecode.R which you initially developed for a smaller sample.zip did
not complete the task when I used it for a larger data set. Please check to
rectify the error message.
Thanks,
Atem.
------ Original Message ------
From : arun
To : R. Help;
Cc : Zilefac Elvis;
Sent : 14-04-2014 18:57
Subject : Re: Quantile and rowMean from multiple files in a folder
Hi Atem,
I guess this is what you wanted.
###Q1:
###
###working directory: Observed
#Only one file per Site. Assuming this is the case for the full dataset, then
I guess there is no need to average
dir.create("final")
lst1 <- split(list.files(pattern = ".csv"), gsub("\\_.*", "", list.files(patter
n = ".csv")))
lst2 <- lapply(lst1,function(x1) lapply(x1, function(x2) {lines1 <- readLines(
x2); header1 <- lines1[1:2]; dat1 <- read.table(text=lines1,header=FALSE,sep=",
",stringsAsFactors=FALSE, skip=2); colnames(dat1) <- Reduce(paste,strsplit(head
er1,","));dat1[-c(nrow(dat1),nrow(dat1)-1),]}))
#different number of rows
sapply(seq_along(lst2),function(i){lstN <- lapply(lst2[[i]],function(x) x[,-1]
);sapply(lstN,function(x) nrow(x))})
#[1] 9 9 9 8 2 9
#difference in number of columns
sapply(seq_along(lst2),function(i) {sapply(lst2[[i]],function(x) ncol(x))})
#[1] 157 258 258 98 157 258
library(plyr)
library(stringr)
lst3 <- setNames(lapply(seq_along(lst2),function(i) {lapply(lst2[[i]],function(
x) {names(x)[-1] <- paste(names(x)[-1], names(lst1)[i],sep="_"); names(x) <- st
r_trim(names(x)); x})[[1]]}), names(lst1))
df1 <- join_all(lst3,by="Year")
dim(df1)
#[1] 9 1181
sapply(split(names(df1)[-1] ,gsub(".*\\_","",names(df1)[-1])),function(x) {df2
<- df1[,x];df3 <- data.frame(Percentiles=paste0(seq(0,100, by=1) ,"%"), numcolw
ise(function(y) quantile(y,seq(0,1,by=0.01),na.rm=TRUE))(df2),stringsAsFactors=
FALSE);ncol(df3) })
#G100 G101 G102 G103 G104 G105
# 157 258 258 98 157 258
lst4 <- split(names(df1)[-1] ,gsub(".*\\_","",names(df1)[-1]))
lapply(seq_along(lst4),function(i) {df2 <- df1[,lst4[[i]]]; df3 <- data.frame(P
ercentiles=paste0(seq(0,100, by=1) ,"%"), numcolwise(function(y) quantile(y,seq
(0,1,by=0.01),na.rm=TRUE))(df2),stringsAsFactors=FALSE);df3[1:3,1:3]; write.csv
(df3,paste0(paste(getwd(), "final",paste(names(lst1)[[i]],"Quantile",sep="_"),s
ep="/"),".csv"),row.names=FALSE,quote=FALSE)})
ReadOut1 <- lapply(list.files(recursive=TRUE)[grep("Quantile",list.files(recurs
ive=TRUE))],function(x) read.csv(x,header=TRUE,stringsAsFactors=FALSE))
sapply(ReadOut1,dim)
# [,1] [,2] [,3] [,4] [,5] [,6]
#[1,] 101 101 101 101 101 101
#[2,] 157 258 258 98 157 258
lapply(ReadOut1,function(x) x[1:2,1:3])[1:3]
#[[1]]
# Percentiles pav.DJF_G100 pav.MAM_G100
#1 0% 0 0.640500
#2 1% 0 0.664604
#
#[[2]]
# Percentiles txav.DJF_G101 txav.MAM_G101
#1 0% -13.8756 4.742400
#2 1% -13.8140 4.817184
#
#[[3]]
# Percentiles txav.DJF_G102 txav.MAM_G102
#1 0% -15.05000 4.520700
#2 1% -14.96833 4.543828
#####
###Q2:
###Observed data
dir.create("Indices")
names1 <- unlist(lapply(ReadOut1,function(x)
names(x)[-1]))
names2 <- gsub("\\_.*","",names1)
names3 <- unique(gsub("[.]", " ", names2))
res <- do.call(rbind,lapply(seq_along(lst4),function(i) {df2 <- df1[,lst4[[i]]]
;vec1 <- colMeans(df2,na.rm=TRUE); vec2 <- rep(NA,length(names3));names(vec2) <
- paste(names3,names(lst4)[[i]],sep="_"); vec2[names(vec2) %in% names(vec1)] <-
vec1; names(vec2) <- gsub("\\_.*","",names(vec2)); vec2 }))
lapply(seq_len(ncol(res)),function(i) {mat1 <- t(res[,i,drop=FALSE]);colnames(m
at1) <- names(lst4); write.csv(mat1,paste0(paste(getwd(),"Indices", gsub(" ","_
",rownames(mat1)),sep="/"),".csv"),row.names=FALSE,quote=FALSE)})
##Output2:
ReadOut2 <- lapply(list.files(recursive=TRUE)[grep("Indices",list.files(recursi
ve=TRUE))],function(x) read.csv(x,header=TRUE,stringsAsFactors=FALSE))
length(ReadOut2)
#[1] 257
list.files(recursive=TRUE)[grep("Indices",list.files(recursive=TRUE))][1]
#[1] "Indices/pav_ANN.csv"
res[,"pav ANN",drop=FALSE]
# pav ANN
#[1,] 1.298811
#[2,] 7.642922
#[3,] 6.740011
#[4,] NA
#[5,] 1.296650
#[6,] 6.887622
ReadOut2[[1]]
# G100 G101 G102 G103 G104 G105
#1 1.298811 7.642922 6.740011 NA 1.29665 6.887622
###Sample data
###Working directory changed to "sample"
dir.create("Indices_colMeans")
lst1 <- split(list.files(pattern=".csv"),gsub("\\_.*","",list.files(pattern=".c
sv")))
lst2 <- lapply(lst1,function(x1) lapply(x1, function(x2) {lines1 <- readLines(
x2); header1 <- lines1[1:2]; dat1 <- read.table(text=lines1,header=FALSE,sep=",
",stringsAsFactors=FALSE, skip=2); colnames(dat1) <- Reduce(paste,strsplit(head
er1,","));dat1[-c(nrow(dat1),nrow(dat1)-1),]}))
res1 <- do.call(rbind,lapply(seq_along(lst2),function(i) {rowMeans(do.call(cbin
d,lapply(lst2[[i]],function(x) colMeans(x[,-1],na.rm=TRUE))),na.rm=TRUE) }))
lapply(seq_len(ncol(res1)),function(i){mat1 <- t(res1[,i,drop=FALSE]); colnames
(mat1) <- names(lst2);write.csv(mat1,paste0(paste(getwd(),"Indices_colMeans",gs
ub(" ","_",rownames(mat1)),sep="/"),".csv"),row.names=FALSE,quote=FALSE)})
##Output2 Sample
ReadOut2S <- lapply(list.files(recursive=TRUE)[grep("Indices",list.files(recurs
ive=TRUE))],function(x) read.csv(x,header=TRUE,stringsAsFactors=FALSE))
length(ReadOut2S)
#[1] 257
list.files(recursive=TRUE)[grep("Indices",list.files(recursive=TRUE))][1]
#[1] "Indices_colMeans/pav_ANN.csv"
res1[,"pav ANN",drop=FALSE]
# pav ANN
#[1,] 1.545620
#[2,] 1.518553
ReadOut2S[[1]]
# G100 G101
#1 1.54562 1.518553
A.K.
On Monday, April 14, 2014 1:05 AM, Zilefac Elvis wrote:
Hi AK,
Q1) Please apply the Quantilecode.R to Observed.zip (attached). I tried but rec
eived an error which was self-explanatory but I could not change the dimensions
in the code.
Q2) Please apply Quantilecode.R to both sample.zip and observed.zip. Here, inst
ead of doing quantile(y, seq(0, 1, by = 0.01), take colMeans of the indices.
I have tried to solve both Q1 and Q2 but still unable to control the dimensions
.
Thanks,
Atem.
On Sunday, April 13, 2014 9:05 AM, arun wrote:
Hi Atem,
On my end, the codes are not formatted in the email as seen in the screen of fo
rmatR GUI.
I am attaching the .R file in case there is some difficulty for you.
Arun
On Sunday, April 13, 2014 10:54 AM, arun wrote:
Hi,
I am formatting the codes using library(formatR). Hopefully, it will not be ma
ngled in the email.
dir.create("final")
lst1 <- split(list.files(pattern = ".csv"), gsub("\\_.*", "", list.files(patter
n = ".csv")))
lst2 <- lapply(lst1, function(x1) lapply(x1, function(x2) { lines1 <- readLines
(x2) header1 <- lines1[1:2] dat1 <- read.table(text = lines1, header = FALSE, s
ep = ",", stringsAsFactors = FALSE, skip = 2) colnames(dat1) <- Reduce(paste,
strsplit(header1, ",")) dat1[-c(nrow(dat1), nrow(dat1) - 1), ]
}))
library(plyr)
lapply(seq_along(lst2), function(i) { lstN <- lapply(lst2[[i]], function(x) x[,
-1]) lstQ1 <- lapply(lstN, function(x) numcolwise(function(y) quantile(y, seq(
0, 1, by = 0.01), na.rm = TRUE))(x)) arr1 <- array(unlist(lstQ1), dim = c(dim(
lstQ1[[1]]), length(lstQ1)), dimnames = list(NULL, lapply(lstQ1, names)[[1]]))
res <- rowMeans(arr1, dims = 2, na.rm = TRUE) colnames(res) <- gsub(" ", "_",
colnames(res)) res1 <- data.frame(Percentiles = paste0(seq(0, 100, by = 1), "%"
), res, stringsAsFactors = FALSE) write.csv(res1, paste0(paste(getwd(), "final"
, paste(names(lst1)[[i]], "Quantile", sep = "_"), sep = "/"), ".csv"), row.nam
es = FALSE, quote = FALSE)
})
ReadOut1 <- lapply(list.files(recursive = TRUE)[grep("Quantile", list.files(rec
ursive = TRUE))], function(x) read.csv(x, header = TRUE, stringsAsFactors = FA
LSE))
sapply(ReadOut1,
dim)
# [,1] [,2]
#[1,] 101 101
#[2,] 258 258
lapply(ReadOut1,function(x) x[1:2,1:3])
#[[1]]
# Percentiles txav_DJF txav_MAM
#1 0% -12.68566 7.09702
#2 1% -12.59062 7.15338
#
#[[2]]
# Percentiles txav_DJF txav_MAM
#1 0% -12.75516 6.841840
#2 1% -12.68244 6.910664
###Q2:
dir.create("Indices")
names1 <- lapply(ReadOut1, function(x) names(x))[[1]]
lstNew <- simplify2array(ReadOut1)
lapply(2:nrow(lstNew), function(i) { dat1 <- data.frame(lstNew[1], do.call(cbin
d, lstNew[i, ]), stringsAsFactors = FALSE) colnames(dat1) <- c(rownames(lstNew)
[1], paste(names(lst1), rep(rownames(lstNew)[i], length(lst1)), sep = "_")) wr
ite.csv(dat1, paste0(paste(getwd(), "Indices", rownames(lstNew)[i], sep = "/"),
".csv"), row.names = FALSE, quote = FALSE)
}) ## Output2:
ReadOut2 <- lapply(list.files(recursive = TRUE)[grep("Indices", list.files(recu
rsive = TRUE))], function(x) read.csv(x, header = TRUE, stringsAsFactors = FAL
SE))
length(ReadOut2)
# [1] 257
head(ReadOut2[[1]], 2)
# Percentiles G100_pav_ANN G101_pav_ANN
#1 0% 1.054380 1.032740
#2 1%
1.069457 1.045689
A.K.
On Sunday, April 13, 2014 2:46 AM, Zilefac Elvis wrote:
Hi AK,
Q1) I need your help again. Using the previous data (attached) and the previous
code below,instead of taking rowMeans, let's do quantile(x,seq(0,1,by=0.01)).
Delete the last 2 rows (Trend and p<) in each file before doing quantile(x,seq(
0,1,by=0.01)).
For example, assume that I want to
calculate quantile(x,seq(0,1,by=0.01)) for each column of Site G100. I will do
so for the 5 sims of site G100 and then take their average. This will be approx
imately close to the true value than just calculating quantile(x,seq(0,1,by=0.0
1)) from one sim. Please do this same thing for all the files.
So, when you do rowMeans, it should be the mean of quantile(x,seq(0,1,by=0.01))
calculated from all sims in that Site.
Output
The number of files in "final" remains the same (2 files). The "Year" column(wi
ll be replaced) will contain the names of quantile(x,seq(0,1,by=0.01)) such as
0% 1% 2% 3% 4% 5%
6%, ..., 98%
99% 100% . You can give this column any name such as "Percentiles"
.
Q2) From the folder "final", please go to each file identified by site name, t
ake a column, say col1 of txav from each file, create a dataframe whose colnam
es are site codes (names of files in "final"). Create a folder called "Indices"
and place this dataframe in it. The filename for the dataframe is txav, say. S
o, in "Indices", you will have one file having 3 columns [, c(Percentiles, G100
,G101)]. The idea is that I want to be able to pick any column from files in "f
inal" and form a dataframe from which I will generate my qqplot or boxplot.
Thanks very much AK.
Atem
This should be the final step of this my drama, at least for now.
#==============================================================================
================================
dir.create("final")
lst1 <- split(list.files(pattern=".csv"),gsub("\\_.*","",list.files(pattern=".c
sv"))) lst2 <- lapply(lst1,function(x1) lapply(x1, function(x2) {lines1 <- rea
dLines(x2); header1 <- lines1[1:2]; dat1 <- read.table(text=lines1,header=FALSE
,sep=",",stringsAsFactors=FALSE, skip=2); colnames(dat1) <- Reduce(paste,strspl
it(header1,","));dat1}))
lstYear <- lapply(lst2,function(x) lapply(x, function(y) y[,1,drop=FALSE])[[1]]
)
lapply(seq_along(lst2),function(i) {lstN <-lapply(lst2[[i]],function(x) x[,-1])
; arr1 <- array(unlist(lstN),dim=c(dim(lstN[[1]]),length(lstN)),dimnames=list(N
ULL,lapply(lstN,names)[[1]]));res <-
cbind(lstYear[[i]],rowMeans(arr1,dims=2,na.rm=TRUE)); names(res) <- gsub("\\_$"
,"",gsub(" ", "_",names(res))); res[,1] <- gsub(" <", "",res[,1]); write.csv(re
s,paste0(paste(getwd(),"final",names(lst1)
[[i]],sep="/"),".csv"),row.names=FALSE,quote=FALSE) })
#==============================================================================
======================
More information about the R-help
mailing list