[R] Possible memory leak with R v.2.5.0

Peter Waltman waltman at cs.nyu.edu
Thu Aug 16 07:21:22 CEST 2007


   I'm  working  with  a  very  large matrix ( 22k rows x 2k cols) of RNA
   expression  data with R v.2.5.0 on a RedHat Enterprise machine, x86_64
   architecture.
   The relevant code is below, but I call a function that takes a cluster
   of  this data ( a list structure that contains a $rows elt which lists
   the rows (genes ) in the cluster by ID, but not the actual data itself
   ).
   The function creates two copies of the matrix, one containing the rows
   in the cluster, and one with the rest of the rows in the matrix.
   After doing some statistical massaging, the function returns a
   statistical score for each row (gene) in the matrix, producing a
   vector of 22k elements.
   When I run 'top', I see that the memory footprint of R after loading
   the matrix is ~750 MB.  However, after calling this function on 10
   clusters, this jumps to > 3.7 GB (at least by 'top's measurement), and
   this is not reduced by any subsequent calls to gc().
   Output from gc() is:

     > gc()
                used  (Mb) gc trigger   (Mb) max used  (Mb)
     Ncells   377925  20.2    6819934  364.3   604878  32.4
     Vcells 88857341 678.0  240204174 1832.7 90689707 692.0
     >

   output from top is:

        PID  USER       PR   NI   VIRT   RES   SHR  S %CPU %MEM    TIME+  COMMAND
      1199 waltman   17   0 3844m 3.7g 3216 S  0.0 23.6  29:58.74 R

   Note, the relevant call that invoked my function is:

     ## Score clusters 1 through 10 in "rows" mode and collect the results
     ## with sapply().
     test <- sapply( 1:10, function( idx ) {
       get.vars.for.cluster( clusterStack[[ idx ]], opt="rows" )
     } )

   Finally,  for  fun,  I  rm()'d  all variables with the rm( list=ls() )
   command, and then called gc().  The memory of this "empty" instance of
   R is still 3.4 gig, i.e.
   R.console:

     > rm( list=ls() )
     > ls()
     character(0)
     > gc()
                used  (Mb) gc trigger   (Mb) max used  (Mb)
     Ncells   363023  19.4    5455947  291.4   604878  32.4
     Vcells 44434871 339.1  192163339 1466.1 90689707 692.0
     >

   Subsequent output from top is:

        PID  USER       PR   NI   VIRT   RES   SHR  S %CPU %MEM    TIME+  COMMAND
      1199 waltman   16   0 3507m 3.4g 3216 S  0.0 21.5  29:58.92 R

   Thanks for any help or suggestions,
   Peter Waltman
   p.s.  code snippet follows.  Note, that I've added extra rm() and gc()
   calls w/in the function to try to reduce the memory stamp to no avail.

     ## Score every gene (opt="rows") or every condition (opt="cols") against
     ## one cluster.
     ##
     ## Arguments:
     ##   cluster        list; its $rows and $cols elements are used to index
     ##                  `ratios`.
     ##   genes          row ids of `ratios` that are scored in "rows" mode.
     ##   opt            "rows" (default) or "cols"; resolved via match.arg().
     ##   ratios         matrix, indexed as ratios[ row.ids, col.ids ].
     ##   var.norm       if TRUE, centre the scores on their mean over the
     ##                  cluster's own members and, when the sd over those
     ##                  members is neither NA nor 0, divide by (sd + r.sig).
     ##   r.sig          constant added to that sd before dividing.
     ##   allow.anticor  "rows" mode only: also score each gene against the
     ##                  negated mean profile and keep the smaller score.
     ##
     ## Value:
     ##   "rows": log10 of each gene's variance of deviations from the cluster's
     ##           per-column mean profile (one entry per element of `genes`).
     ##   "cols": log10( variance / |mean| ) per column of `ratios`, computed
     ##           over the cluster's rows and named by colnames( ratios ).
     ##
     ## Notes:
     ##   * log10 is applied exactly once to the variances.  An earlier
     ##     debugging edit applied it twice, which produced NaN for every
     ##     variance below 1 and put `vars` on a different scale from the
     ##     anti-correlation scores it is compared with.
     ##   * The explicit rm()/gc() calls were added while chasing a memory
     ##     problem; they do not change the returned value.
     ##   * vars[ rows ] presumably relies on `vars` carrying the row names of
     ##     `ratios` -- TODO confirm that cluster$rows holds names, not positions.
     get.vars.for.cluster = function( cluster, genes=get.global( "gene.ids" ),
                                      opt=c("rows","cols"),
                                      ratios=get.global("ratios"), var.norm=TRUE,
                                      r.sig=get.global( "r.sig" ),
                                      allow.anticor=get.global( "allow.anticor" ) ) {
       cat( "phw dbg msg\n")
       ## NOTE(review): debugging leftover -- copies `cluster` into the
       ## enclosing (normally global) environment on every call.  Kept so
       ## existing behaviour is unchanged; remove once nothing relies on it.
       cluster <<- cluster
       opt <- match.arg( opt )
       rows <- cluster$rows
       cols <- cluster$cols

       ## Centre `scores` on the mean over `members` and scale by the sd over
       ## `members` (plus r.sig) when that sd is usable.
       normalize.scores <- function( scores, members ) {
         scores <- scores - mean( scores[ members ], na.rm=TRUE )
         tmp.sd <- sd( scores[ members ], na.rm=TRUE )
         if ( ! is.na( tmp.sd ) && tmp.sd != 0 ) {
           scores <- scores / ( tmp.sd + r.sig )
         }
         scores
       }

       if ( opt == "rows" ) {
         cat( "phw dbg msg: if opt == rows\n" )
         r <- ratios[ rows, cols ]
         r.all <- ratios[ genes, cols ]
         ## Mean profile of the cluster: one value per column.
         avg.rows <- apply( r, 2, mean, na.rm=TRUE ) ##median )
         rm( r )
         gc( reset=TRUE )
         ## apply() over rows returns its result transposed, so `devs` has
         ## one column per gene.
         devs <- apply( r.all, 1, "-", avg.rows )
         cat( "phw dbg msg: finished calc'ing avg.rows & devs\n" )
         ## This is what we'd use from the deHoon paper
         ## (bioinformatics/bth927)
         ##sd.rows <- apply( r, 2, sd )
         ##devs <- devs * devs
         ##sd.rows <- sd.rows * sd.rows
         ##sds <- apply( devs, 2, "/", sd.rows )
         ##sds <- apply( sds, 2, sum )
         ##return( log10( sds ) )
         ## This is faster and nearly equivalent
         vars <- log10( apply( devs, 2, var, na.rm=TRUE ) )
         rm( devs )
         gc( reset=TRUE )
         cat( "phw dbg msg: finished calc'ing vars (\n" )
         ## HOW TO ALLOW FOR ANTICOR??? Here's how:
         if ( allow.anticor ) {
           cat( "phw dbg msg: allow.anticor==T\n" )
           ## Get variance against the inverse of the mean profile
           devs.2 <- apply( r.all, 1, "-", -avg.rows )
           vars.2 <- log10( apply( devs.2, 2, var, na.rm=TRUE ) )
           rm( devs.2 )
           gc( reset=TRUE )
           ## For each gene take the min of variance or anti-cor variance
           vars <- pmin( vars, vars.2 )
           rm( vars.2 )
         }
         ## Normalize the values by the variance over the rows in the cluster
         if ( var.norm ) {
           cat( "phw dbg msg: var.norm == T \n")
           vars <- normalize.scores( vars, rows )
         }
         gc( reset=TRUE )
         return( vars )
       } else {
         cat( "phw dbg msg: else\n" )
         r.all <- ratios[ rows, ]
         ## Mean-normalized variance
         col.vars <- apply( r.all, 2, var, na.rm=TRUE )
         col.means <- apply( r.all, 2, mean, na.rm=TRUE )
         vars <- log10( col.vars / abs( col.means ) )
         names( vars ) <- colnames( ratios )
         ## Normalize the values by the variance over the columns in the
         ## cluster
         if ( var.norm ) {
           vars <- normalize.scores( vars, cols )
         }
         return( vars )
       }
     },


More information about the R-help mailing list