# [R] looping over lapply calls

Ramon Diaz-Uriarte rdiazuri at students.wisc.edu
Sat Jun 3 15:39:37 CEST 2000

Dear All,

I am writing a function to analyze simulated data. For each subset of data
(those with the same simulation counter), I have to fit a linear model and
output the coefficients and the F's from drop1 (marginal F tests). I have
tried three approaches: using lapply, using a for loop, and looping over
blocks, where within each block I use lapply (following the suggestion in "S
programming", pp. 156 and 174). The later is often the fastest method
(execution time can be less than half of the other methods).  I am wondering:

a) why exactly is that the case? (Is it related to the "split" in lapply or
the "matrix(unlist(etc))" in my function)

b) is there some rule of thumb to choose the size of the block over
which to use lapply?

Thanks,

Ramon

P.S. For completeness, I include below the core of the function I am using;

lmx.0 <- function(formula,data,max.num=0,lapply.size=100){ # looping over lapply
# remeber that sim.counter starts at 0
if(max.num) data<-data[data\$sim.counter<max.num+1,]  else max.num<-max(data\$sim.counter)
names.vars<-drop.scope(formula)

terms.in.model<-names(lm(formula=formula,data=data,subset=data\$sim.counter==0)[[1]])
# there must be a simpler way...

loop.counter<-(max.num+1)%/%lapply.size
rest.of.data<-(max.num+1)%%lapply.size

tmp<-matrix(nrow=max.num+1,ncol=length(names.vars)+length(terms.in.model))
i<-0
if (loop.counter) { #only enter in the loop if needed
for(i in 1:loop.counter){
datai<-data[data\$sim.counter<=((i*lapply.size)-1) & data\$sim.counter>=((i-1)*lapply.size),]
# obtain output of interest ---fm[[1]],my.drop(fm)--- by applying that function
# over the subset of data within loop; as result is list, unlist and turn into a matrix
# it is a little bit hard to read, but I am trying to minimize creating large
# intermediate objects.

tmp[(((i-1)*lapply.size)+1):(i*lapply.size),]<-matrix(unlist(lapply(split(datai,datai\$sim.counter),
function(datos,formula){
fm<-lm(formula=formula,data=datos);
c(fm[[1]],my.drop(fm))},
formula=formula)),
nrow=lapply.size,byrow=T)
}}
# here we deal with the other cases
datai<-data[data\$sim.counter>=(loop.counter*lapply.size),]
tmp[(((i*lapply.size)+1):(max.num+1)),] <-
matrix(unlist(lapply(split(datai,datai\$sim.counter),
function(datos,formula){fm<-lm(formula=formula,data=datos); c(fm[[1]],my.drop(fm))}, formula=formula)),
nrow=rest.of.data,byrow=T)

tmp
}

my.drop<- function (object)
{
# This is from drop1 (by BDR) after eliminating things I didn't need;
# this function is severely crippled, so use only here
# components are NOT named
x <- model.matrix(object)     iswt <-
!is.null(wt <- object\$weights)     n <- nrow(x)
asgn <- attr(x, "assign")
tl <- attr(object\$terms, "term.labels")
scope <- drop.scope(object)
ndrop <- match(scope, tl)
ns <- length(scope)
rdf <- object\$df.resid
chisq <- deviance.lm(object)
dfs <- numeric(ns)
y <- object\$residuals + predict(object)
rank <- object\$rank
for (i in 1:ns) {
ii <- seq(along = asgn)[asgn == ndrop[i]]
jj <- setdiff(seq(ncol(x)), ii)
z <- if (iswt)
lm.wfit(x[, jj, drop = FALSE], y, wt)
else lm.fit(x[, jj, drop = FALSE], y)
dfs[i] <- z\$rank
}
dfs <- c(object\$rank, dfs)
dfs <- (dfs[1] - dfs)[-1]
rdf <- object\$df.resid