[R] Speeding up R code - Apply a function to each row of a matrix using the dplyr package

Jeff Newmiller jdnewm|| @end|ng |rom dcn@d@v|@@c@@u@
Fri Nov 2 00:06:23 CET 2018


As Don suggests, looking for ways to do the whole calculation at once is a 
big efficiency booster. Also, avoiding unnecessary calculations (e.g. mean 
of 1:n is (n+1)/2 and mean(x+a) where a is a constant is mean(x)+a.

Reproducible example:

####################
#library(tictoc)
library(microbenchmark)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 
'package:stats':
#>
#>     filter, lag
#> The following objects are masked from 
'package:base':
#>
#>     intersect, setdiff, setequal, union
library(purrr)

func1 <- function( coord, A, B, C ) {

   X1 <- as.vector( coord[ 1 ] )
   Y1 <- as.vector( coord[ 2 ] )
   X2 <- as.vector( coord[ 3 ] )
   Y2 <- as.vector( coord[ 4 ] )

   if( C == 0 ) {
     res1 <- mean( c( ( X1 - A ) : ( X1 - 1 )
                    , ( Y1 + 1 ) : ( Y1 + 40 )
                    )
                 )
     res2 <- mean( c( ( X2 - A ) : ( X2 - 1 )
                    , ( Y2 + 1 ) : ( Y2 + 40 )
                    )
                 )
     res <- matrix( c( res1, res2 )
                  , ncol=2
                  , nrow=1
                  )

   } else {

     res1 <- mean( c( ( X1 - A ) : ( X1 - 1 )
                    , ( Y1 + 1 ) : ( Y1 + 40 )
                    )
                 )*B
     res2 <- mean( c( ( X2 - A ) : ( X2 - 1 )
                    , ( Y2 + 1 ) : ( Y2 + 40 )
                    )
                 )*B
     res <- matrix( c( res1, res2 )
                  , ncol=2
                  , nrow=1
                  )

   }

   res
}

#' @param coord is a one-row data frame
func2 <- function( coord, A, B, C ) {
   X1 <- coord[[ 1 ]]
   Y1 <- coord[[ 2 ]]
   X2 <- coord[[ 3 ]]
   Y2 <- coord[[ 4 ]]

   res <- matrix( c( mean( c( X1, Y1 ) )
                   , mean( c( X2, Y2 ) )
                   )
                , ncol=2
                , nrow=1
                ) + ( 40 - A ) / 2

   if ( C != 0 ) {
     res <- res * B
   }

   setNames( as.data.frame( res ), c( "V1", "V2" ) )
}

#' @param coord is a numeric vector of length 4
#' @return Numeric vector of length 2
func3 <- function( coord, A, B, C ) {
   res <- ( c( ( coord[ 1 ] + coord[ 2 ] )
             , ( coord[ 3 ] + coord[ 4 ] )
             )
          + ( 40 - A )
          ) / 2

   if ( C != 0 ) {
     res <- res * B
   }

   res
}

#' @param coord is a matrix with four columns
func4 <- function( coord, A, B, C ) {
   res <- ( cbind( ( coord[ , 1 ] + coord[ , 2 ] )
                 , ( coord[ , 3 ] + coord[ , 4 ] )
                 )
          + ( 40 - A )
          ) / 2

   if ( length( C ) == nrow( coord ) || length( C ) == 1 ) {
     idx <- C == 1
     res[ idx, ] <- res[ idx, ] * B
   }

   res
}

## Apply the function
set.seed( 1 )
n <- 1000
N <- 100
Nseq <- seq.int( N )
# Using T instead of TRUE is asking to get an 
unexpected result someday
tabDF <- data.frame( x1 = sample( Nseq, n, replace = TRUE )
                    , y1 = sample( Nseq, n, replace = TRUE )
                    , x2 = sample( Nseq, n, replace = TRUE )
                    , y2 = sample( Nseq, n, replace = TRUE )
                    )
tab <- as.matrix( tabDF )

fTest1 <- function() {
   test <- tab %>%
     split( 1:nrow(tab) ) %>%
     map(~ func1(.x, 40, 5, 1) ) %>%
     do.call( "rbind", . )
}

fTest2 <- function() {
   # conventional dplyr approach
   test <- tabDF %>%
     rowwise %>%
     do({
       func2( ., 40, 5, 1 )
     }) %>%
     ungroup
}

fTest3 <- function() {
   t( apply( tab, 1, func3, A=40, B=5, C=1 ) )
}

fTest4 <- function() {
   func4( tabDF, A=40, B=5, C=1 )
}

microbenchmark( result1 <- fTest1()
               , result2 <- fTest2()
               , result3 <- fTest3()
               , result4 <- fTest4()
               )
#> Unit: microseconds
#>                 expr        min         lq        mean      median
#>  result1 <- fTest1()  20305.562  23384.359  26939.6559  26262.8495
#>  result2 <- fTest2() 255441.229 276794.201 290628.3221 286046.6385
#>  result3 <- fTest3()   4869.288   5772.462   7242.2194   6615.7900
#>  result4 <- fTest4()     52.862     94.962    216.3508    105.7235
#>           uq        max neval
#>   29324.2775  46207.632   100
#>  294248.0795 473898.379   100
#>    7874.6455  21288.783   100
#>     127.0565   9253.006   100

stopifnot( result1[ , 1 ] == result2[[ 1 ]] )
stopifnot( result1[ , 2 ] == result2[[ 2 ]] )
stopifnot( result1 == result3 )
stopifnot( result1 == result4 )
####################

On Thu, 1 Nov 2018, MacQueen, Don via R-help wrote:

> Without more study, I can only give some general pointers.
>
> The as.vector() in X1 <- as.vector(coord[1]) is almost certainly not needed. It will add a little bit to your execution time.
> Converting the output of func() to a one row matrix is almost certainly not needed. Just return c(res1, res2).
>
> Your data frame appears to be entirely numeric, in which case you don't need to ever use a data frame.
>
> Try
>  apply( tab, 1, func, a=40, b=5, c=1 )
> instead of all that dplyr stuff.
>
>
> Your function can be redefined as
>
> func <- function(coord, a, b, c){
>
>          X1 <- as.vector(coord[1])
>          Y1 <- as.vector(coord[2])
>          X2 <- as.vector(coord[3])
>          Y2 <- as.vector(coord[4])
>
>           res1 <- mean(c((X1 - a) : (X1 - 1), (Y1 + 1) : (Y1 + 40)))
>           res2 <- mean(c((X2 - a) : (X2 - 1), (Y2 + 1) : (Y2 + 40)))
>
>            if (c==0) c(res1, res2) else c(res1, res2)*b
>        }
>
> I suspect you can operate on the entire matrix, without looping (which both the apply() method, and the split/rbind method do, in effect), and if so it will be much faster. But I can't say for sure without more study.
>
> --
> Don MacQueen
> Lawrence Livermore National Laboratory
> 7000 East Ave., L-627
> Livermore, CA 94550
> 925-423-1062
> Lab cell 925-724-7509
>
>
>
> On 11/1/18, 12:35 PM, "R-help on behalf of Nelly Reduan" <r-help-bounces using r-project.org on behalf of nell.redu using hotmail.fr> wrote:
>
>    Hello,
>
>    I have a input data frame with multiple rows. For each row, I want to apply a function. The input data frame has 1,000,000+ rows. How can I speed up my code ? I would like to keep the function "func".
>
>    Here is a reproducible example with a simple function:
>
>        library(tictoc)
>        library(dplyr)
>
>    func <- function(coord, a, b, c){
>
>          X1 <- as.vector(coord[1])
>          Y1 <- as.vector(coord[2])
>          X2 <- as.vector(coord[3])
>          Y2 <- as.vector(coord[4])
>
>          if(c == 0) {
>
>            res1 <- mean(c((X1 - a) : (X1 - 1), (Y1 + 1) : (Y1 + 40)))
>            res2 <- mean(c((X2 - a) : (X2 - 1), (Y2 + 1) : (Y2 + 40)))
>            res <- matrix(c(res1, res2), ncol=2, nrow=1)
>
>          } else {
>
>            res1 <- mean(c((X1 - a) : (X1 - 1), (Y1 + 1) : (Y1 + 40)))*b
>            res2 <- mean(c((X2 - a) : (X2 - 1), (Y2 + 1) : (Y2 + 40)))*b
>            res <- matrix(c(res1, res2), ncol=2, nrow=1)
>
>          }
>
>          return(res)
>        }
>
>        ## Apply the function
>        set.seed(1)
>        n = 10000000
>        tab <- as.matrix(data.frame(x1 = sample(1:100, n, replace = T), y1 = sample(1:100, n, replace = T), x2 = sample(1:100, n, replace = T), y2 = sample(1:100, n, replace = T)))
>
>
>      tic("test 1")
>      test <- tab %>%
>        split(1:nrow(tab)) %>%
>        map(~ func(.x, 40, 5, 1)) %>%
>        do.call("rbind", .)
>      toc()
>
>    test 1: 599.2 sec elapsed
>
>    Thanks very much for your time
>    Have a nice day
>    Nell
>
>    	[[alternative HTML version deleted]]
>
>    ______________________________________________
>    R-help using r-project.org mailing list -- To UNSUBSCRIBE and more, see
>    https://stat.ethz.ch/mailman/listinfo/r-help
>    PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
>    and provide commented, minimal, self-contained, reproducible code.
>
>
> ______________________________________________
> R-help using r-project.org mailing list -- To UNSUBSCRIBE and more, see
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.

---------------------------------------------------------------------------
Jeff Newmiller                        The     .....       .....  Go Live...
DCN:<jdnewmil using dcn.davis.ca.us>        Basics: ##.#.       ##.#.  Live Go...
                                       Live:   OO#.. Dead: OO#..  Playing
Research Engineer (Solar/Batteries            O.O#.       #.O#.  with
/Software/Embedded Controllers)               .OO#.       .OO#.  rocks...1k
---------------------------------------------------------------------------


More information about the R-help mailing list