[R] Code is too slow: mean-centering variables in a data frame by subgroup

```I would like to thank once more everyone who helped me with this question.
I compared the speed for different approaches. Below are the results
of my comparisons - in case anyone is interested:

### Building an EXAMPLE FRAME with N rows - with groups and a lot of NAs:
# Build the example data: N rows, a 10-level grouping column and seven
# numeric columns (a..g), sorted so that each group forms one contiguous run.
N <- 100000
set.seed(1234)
frame <- data.frame(
  # length.out keeps the column at exactly N rows even if N is not a
  # multiple of 10.
  group = rep(paste("group", 1:10), length.out = N),
  a = rnorm(N),
  b = rnorm(N),
  c = rnorm(N),
  d = rnorm(N),
  e = rnorm(N),
  f = rnorm(N),
  g = rnorm(N)
)
frame <- frame[order(frame$group), ]

## Introducing 60% NAs:
# Every column except the grouping column gets its own random set of NAs.
names.used <- names(frame)[-1]
set.seed(1234)
n.missing <- round(N * 0.6)
for (i in names.used) {
  i.for.NA <- sample(seq_len(N), n.missing)
  frame[[i]][i.for.NA] <- NA
}
# Checking that it worked: number of NAs per value column.
lapply(frame[names.used], function(x) sum(is.na(x)))
ORIGframe <- frame ## placeholder for the unchanged original frame

####### Objective of the code - divide each value by its group mean ####

### METHOD 1 - the FASTEST - using ave():##############################
# Restore the unchanged example frame before running this method.
frame<-ORIGframe
# Divide every non-grouping column of `frame` by its within-group mean.
#
# Args:
#   frame: data frame whose first column is the grouping variable and whose
#          remaining columns are numeric.
# Returns:
#   `frame` with each column after the first replaced by value / group mean.
#   Group means ignore NAs (na.rm = TRUE); NA inputs stay NA.
f2 <- function(frame) {
  # seq_len(...)[-1] is empty for a one-column frame, whereas 2:ncol(frame)
  # would count down (2, 1) and index a column that does not exist.
  for (i in seq_len(ncol(frame))[-1]) {
    frame[, i] <- ave(
      frame[, i], frame[, 1],
      FUN = function(x) x / mean(x, na.rm = TRUE)
    )
  }
  frame
}
# Time one call of f2 on the example frame; the result is kept in new.frame.
system.time({new.frame<-f2(frame)})
# Took me 0.23-0.27 sec
#######################################

### METHOD 2 - fast, just a bit slower - using data.table:
##############################

# If you don't have it - install the package - NOT from CRAN:
# NOTE(review): this installs from R-Forge over plain http every time the
# script runs; data.table is presumably available from CRAN nowadays - confirm
# and drop this line if so.
install.packages("data.table",repos="http://R-Forge.R-project.org")
library(data.table)
# Restore the unchanged example frame before running this method.
frame<-ORIGframe
system.time({
# NOTE(review): the name `table` shadows base::table() for the rest of the
# session.
table<-data.table(frame)
# colMeanFunction: drop column `key` from `data`, then divide the matrix form
# of `data` element-wise by a same-sized matrix in which every row holds the
# column means (computed with NAs removed).
#   data: the per-group subset (.SD) passed in by the grouped call below
#   key:  name of the column to drop before dividing
# Returns the resulting matrix.
colMeanFunction<-function(data,key){
data[[key]]=NULL
# rep(colMeans, nrow) filled byrow=T repeats the column means on every row.
ret=as.matrix(data)/matrix(rep(as.numeric(colMeans(as.data.frame(data),na.rm=T)),nrow(data)),nrow=nrow(data),ncol=ncol(data),byrow=T)
return(ret)
}
# Apply colMeanFunction separately to the rows of each group.
# NOTE(review): `j` returns a matrix here; how data.table lays that out in the
# result presumably depends on the data.table version - verify the shape of
# groupedMeans before relying on the post-processing below.
groupedMeans = table[,colMeanFunction(.SD, "group"), by="group"]
# Remember the names, convert every element to a data.frame, bind them
# column-wise into one data.frame, then restore the names.
names.to.use<-names(groupedMeans)
for(i in 1:length(groupedMeans)){groupedMeans[[i]]<-as.data.frame(groupedMeans[[i]])}
groupedMeans<-do.call(cbind, groupedMeans)
names(groupedMeans)<-names.to.use
})
# Took me 0.37-.45 sec
#######################################

### METHOD 3 - fast, a tad slower (using model.matrix & matrix multiplication): ##############################
# Group-mean scaling via indicator-matrix cross-products: per-group sums and
# per-group counts of non-missing values are each obtained with one
# crossprod() call, and their ratio gives the group means for every column.
frame <- ORIGframe
system.time({
  mat <- as.matrix(frame[, -1])
  # One indicator column per group level (no intercept).
  mm <- model.matrix(~ 0 + group, frame)
  # Use this line if you don't want NAs to count towards the means:
  col.grp.N <- crossprod(!is.na(mat), mm)
  # Use this line instead if you don't want zeros to count towards the means:
  # col.grp.N <- crossprod(mat != 0, mm)
  # NAs must not contribute to the sums, so zero them out temporarily.
  mat[is.na(mat)] <- 0.0
  col.grp.sum <- crossprod(mat, mm)
  # Row lookup by integer level code. Indexing with frame$group directly only
  # works when it is a factor; a character column would be matched against the
  # row names ("groupgroup 1", ...) and fail.
  grp.idx <- as.integer(as.factor(frame$group))
  mat <- mat / t(col.grp.sum / col.grp.N)[grp.idx, , drop = FALSE]
  # Put the NAs back where the original data had them.
  is.na(mat) <- is.na(frame[, -1])
  mat <- as.data.frame(mat)
})
# Took me 0.44-0.50 sec
#######################################

### METHOD 5 - much slower - it's the one I started with: ##############################
# For every value column, split the frame by group with by(), divide the
# column by its group mean (NAs ignored) and flatten the per-group pieces back
# into one vector; the vectors are then bound column-wise.
# Note: `frame` is overwritten with the resulting matrix, which has no group
# column. by() returns the pieces in group-level order, so rows line up with
# the input only because the example frame was sorted by group beforehand.
frame <- ORIGframe
system.time({
  frame <- do.call(cbind, lapply(names.used, function(x) {
    unlist(by(frame, frame$group, function(y) {
      y[, x] / mean(y[, x], na.rm = TRUE)
    }))
  }))
})
# Took me 1.25-1.32 min
#######################################

### METHOD 6 - the slowest; using "plyr" and "ddply": ##############################
# Group-mean scaling with plyr: ddply() splits the frame by the grouping
# column and colwise() applies function3 to each selected value column.
frame <- ORIGframe
library(plyr)
# Divide a vector by its own mean, ignoring NAs when computing the mean.
function3 <- function(x) x / mean(x, na.rm = TRUE)
system.time({
  grouping.factor <- "group"
  # Every column except the grouping column, instead of a hard-coded 2:8 range.
  myvariables <- setdiff(names(frame), grouping.factor)
  frame3 <- ddply(frame, grouping.factor, colwise(function3, myvariables))
})
# Took me 1.36-1.47 min
#######################################

Thanks again!
Dimitri

```