[R] simplified multicore by() function
ivo welch
ivo.welch at gmail.com
Sat Oct 22 23:02:45 CEST 2011
dear R readers---I thought I would post the following snippet of R
code that makes by() like operations easier and faster on multicore
machines for R novices and amateurs. I hope it helps some. YMMV.
feel free to ignore.
PS: I wish R had a POD-like documentation system for end users that
are not writing full libraries. because it does not, I did not
provide documentation ala '?mc.by'.
/iaw
set.seed(0)
library(multicore)
################################################################
###
### these R functions are very type-limited wrappers for
### by()-like operations, using the multicore library. this
### means effort-less multi-CPU calculations.
###
### the user functions MUST return a numeric scalar or a vector.
###
### to enhance speed, internally the user function is wrapped, too.
###
### the output is ONE matrix, whose row-names are the categories.
###
################################################################
add.by.names <- function( mc.rv ) {
for (i in 1:length(mc.rv)) {
stopifnot( is.matrix( mc.rv[[i]] ) )
row.names(mc.rv[[i]]) <- rep( names(mc.rv)[i], nrow(mc.rv[[i]]) )
}
mc.rv
}
mc.by <- function(data, INDICES, FUN, ...) {
FUN.i <- function(.index, ...) { cbind(FUN(data[.index,], ...)) }
## we cbind, so we always get an array back
si <- split(1:nrow(data), INDICES)
soln <- mclapply( si, FUN.i, ... )
rv <- do.call("rbind", add.by.names(soln))
if (ncol(rv)==1) {
nm <- rownames(rv)
rv <- as.vector(rv)
names(rv) <- nm
}
rv
}
mc.byallrows <- function(data, INDICES, FUN, ...) {
FUN.i <- function(.index, ...) { cbind(FUN(data[.index,], ...)) }
si <- as.list(1:nrow(data)) ## a little faster than the split for
large data sets
soln <- mclapply( si, FUN.i, ... )
rv <- do.call("rbind", soln) ## omits naming.
if (ncol(rv)==1) rv <- as.vector(rv)
rv
}
function.sample <- function(d) cbind(d$x+d$y, d$x, d$y)
function.sample.simpler <- function(d) (d$x+d$y)
d <- data.frame( i=c( rep(1,2), rep(2,3), rep(3,4) ), x=rnorm(9), y=rnorm(9) )
report <- function( text2print, f.output ) {
cat("\n\n", text2print, ":\n"); print(f.output); cat("\n\n")
}
report( "the original R by() function", by( d, d$i, function.sample ))
report( "wrappled multicore by mc.by with user function returning
scalar", mc.by( d, d$i, function.sample.simpler ))
report( "wrappled multicore by mc.by with user function returning
vector", mc.by( d, d$i, function.sample ))
report( "wrappled multicore by mc.byallrows ", mc.byallrows( d, d$i,
function.sample ))
More information about the R-help
mailing list