[Rd] applying duplicated, unique and match to lists?

Fri Nov 2 15:53:27 CET 2007

Dear R developers,

While improving duplicated.array() and friends and developing equivalents for the new ff package for large datasets I came across two questions:

1) is it safe to use duplicated.default(), unique.default() and match() on arbitrary lists? If so, we can speed up duplicated.array and friends considerably by using list() instead of paste(collapse="\r")

2) while duplicated.default() is very fast even on lists, match() is very slow on lists. Why is the internal conversion to character necessary? If the hashtable behind duplicated() in unique.c work for lists, why can't we use them for match()? If conversion to character is unavoidable, a better scaling alternative could be serializing and compressing to md5: even with final identity check agains unlikely collisions this is much faster in many cases (break even seems to be for quite small list elements like 2 doubles).

1) the new versions should also work for lists with a dim attribute (old versions has as.vector() which does not work for lists)
Factor 10 speedup for row duplicates (here atomic matrices)
>   system.time(duplicated(x, hashFUN=function(x)paste(x, collapse="\r")))
   user  system elapsed 
   2.37    0.02    2.45 
>   system.time(duplicated(x, hashFUN=md5))
   user  system elapsed 
   0.51    0.00    0.51 
>   system.time(duplicated(x, hashFUN=list))
   user  system elapsed 
   0.17    0.00    0.17

2) Speedup potential for list matching (md5 results below)
> x <- as.list(runif(100000))
> system.time(duplicated(x))
   user  system elapsed 
   0.01    0.00    0.02 
> system.time(match(x,x))
   user  system elapsed 
   2.01    0.00    2.03

Please find below more comments and tests, new code for duplicated.array() and friends, suggestions for new classes 'hash' (requiring digest) and 'id' (and if you are curious: first code drafts for the respective ff methods).

Best regards

Jens Oehlschlägel

----

# Hashing of large objects in ff
# (c) 2007 Jens Oehlschägel
# Licence: GPL2
# Created: 2007-10-30
# Last changed: 2007-10-30

require(digest) # digest maintainer: Dirk Eddelbuettel <edd at debian.org>

# { --- available hash functions ---

# perfect projection: list
# NOTE that the 'easiest hash function' is 'list'
# it is faster than everything else when calculating duplicated or unique, but it is extremely slow for 'match' (currently, R-2.6.0)
# thus for matching list elements, it is faster converting the list elements with md5

# no projection for vectors only
none <- function(x)x

# concatenation of as.character as currently (R-2.6.1) in duplicated.array, match.array (pairs of projections may erroneously apear as identical when the vectors are very similar, RAM expensive)
pasteid <- function(x)paste(x, collapse="\r")

# perfectly identity preserving projection (but even more RAM expensive)
id1 <- function(x)paste(.Call("R_serialize", x, NULL, FALSE, NULL, PACKAGE = "base")[-(1:14)], collapse="")

# 32 byte projection
md5 <- function(x).Call("digest", .Call("R_serialize", x, NULL, FALSE, NULL, PACKAGE = "base"), 1L, -1L, 14L, PACKAGE = "digest")

# 40 byte projection
sha1 <- function(x).Call("digest", .Call("R_serialize", x, NULL, FALSE, NULL, PACKAGE = "base"), 2L, -1L, 14L, PACKAGE = "digest")

# 8 byte projection: more collisions
crc32 <- function(x).Call("digest", .Call("R_serialize", x, NULL, FALSE, NULL, PACKAGE = "base"), 3L, -1L, 14L, PACKAGE = "digest")

#! \name{md5}
#! \alias{md5}
#! \title{ faster shortcut functions for in-memory digest }
#! \description{
#!   These functions project (serialize or hash) their input object and return a string. Because they avoid any R overhead they are better suitable for sapply() than using the more general function 'digest'
#! }
#! \usage{
#! md5(x)
#! sha1(x)
#! crc32(x)
#! id1(x)
#! }
#! %- maybe also 'usage' for other objects documented here.
#! \arguments{
#!   \item{x}{ a fully serializable R object }
#! }
#! \value{
#!   character scalar
#! }
#! \seealso{ \code{\link{digest}}, \code{\link[base]{serialize}} }
#! \examples{
#!   md5(pi)
#!   sha1(pi)
#!   crc32(pi)
#!   id1(pi)
#!
#!   dontshow{
#!     if (!identical(paste(serialize(list(str="a string", double=pi), connection=NULL)[-(1:14)], collapse=""), id1(list(str="a string", double=pi))))
#!       stop("something has changed in serialization, please fix the internal .Calls in function 'id1', 'md5, 'sha1', 'crc32'")
#!
#!     if (!identical(digest(list(str="a string", double=pi), algo="md5"), md5(list(str="a string", double=pi))))
#!       stop("something has changed in package 'digest' or in serialization, please fix the internal .Calls in function 'md5'")
#!
#!     if (!identical(digest(list(str="a string", double=pi), algo="sha1"), sha1(list(str="a string", double=pi))))
#!       stop("something has changed in package 'digest' or in serialization, please fix the internal .Calls in function 'sha1'")
#!
#!     if (!identical(digest(list(str="a string", double=pi), algo="crc32"), crc32(list(str="a string", double=pi))))
#!       stop("something has changed in package 'digest' or in serialization, please fix the internal .Calls in function 'crc32'")
#!   }
#! }
#! \keyword{misc}

# } --- available hash functions ---

if (FALSE){

  # current (R-2.6.0) versions of duplicated.* and unique.* can fail for very similar rows
  x <- matrix(1, 2, 2)
  x[1,1] <- 1 + 1e-15
  x[2,1] <- 1 + 2e-15
  x[1,1]==x[2,1]
  duplicated(x)
  apply(x, 1, paste, collapse="\r")
  # is using md5 is safer?
  apply(x, 1, md5)

  # atomic data
  n <- 10000
  x <- matrix(runif(n*20),n,20)
  x <- x[rep(1:nrow(x),rep(2,nrow(x))),]

  # using list or md5 is faster than pasteid or digest or even serialize via id1
  system.time(apply(x, 1, pasteid))
  system.time(apply(x, 1, digest))
  system.time(apply(x, 1, id1))
  system.time(apply(x, 1, md5))
  system.time(apply(x, 1, list))

  # using md5 takes less RAM for strings
  object.size(x)
  object.size(apply(x, 1, pasteid))
  object.size(apply(x, 1, md5))
  object.size(apply(x, 1, id1))
  object.size(apply(x, 1, list))

  # atomic matrix performance
  system.time(duplicated(x, hashFUN=pasteid))
  system.time(duplicated(x, hashFUN=id1))
  system.time(duplicated(x, hashFUN=md5))
  system.time(duplicated(x, hashFUN=list))

  # list data
  n <- 1000
  x <- matrix(as.list(runif(n*20)),n,20)
  x <- x[rep(1:nrow(x),rep(2,nrow(x))),]

  # list matrix performance
  system.time(duplicated(x, hashFUN=pasteid))
  system.time(duplicated(x, hashFUN=id1))
  system.time(duplicated(x, hashFUN=md5))
  system.time(duplicated(x, hashFUN=list))

  n <- 100000
  # match works fine for atomic and list character data
  x <- as.character(runif(n))
  system.time(duplicated(x))
  system.time(match(x,x))
  y <- as.list(x)
  system.time(duplicated(y))
  system.time(match(y,y))

  # but is very slow for numeric (double and integer) lists (although duplicated on numeric lists is fast, doesn't use match the same hashtable?)
  x <- runif(n)
  system.time(duplicated(x))
  system.time(match(x,x))
  y <- as.list(x)
  system.time(duplicated(y))
  system.time(match(y,y))
  # try some alternatives
  system.time({z <- sapply(y, id1); zt <- sapply(y, id1); match(z,zt)})
  system.time({z <- sapply(y, md5); zt <- sapply(y, md5); match(z,zt)})
  system.time({z <- sapply(y, md5); zt <- sapply(y, md5); pos<-match(z,zt); all(sapply(seq(along=pos), function(i)identical(z[i],zt[pos[i]])))})

  n <- 100000
  m <- 50
  # even worse: lists with vectors of numeric
  x <- matrix(runif(n), m)
  y <- lapply(1:ncol(x), function(i)x[,i])
  system.time(duplicated(y))
  system.time(match(y,y))
  # is so slow that md5 converting can speed up match considerably, even with final identity check (break-even is at m=2, for longer vectors md5 is faster)
  system.time({z <- sapply(y, md5); zt <- sapply(y, md5); match(z,z)})
  system.time({z <- sapply(y, md5); zt <- sapply(y, md5); pos<-match(z,zt); all(sapply(seq(along=pos), function(i)identical(z[i],zt[pos[i]])))})

  # less impressive but still so for strings
  x <- matrix(as.character(runif(n)), m)
  y <- lapply(1:ncol(x), function(i)x[,i])
  system.time(duplicated(y))
  system.time(match(y,y))
  # is so slow that md5 converting speeds up match
  system.time({z <- lapply(y, md5); match(z,zt)})
  system.time({z <- sapply(y, md5); zt <- sapply(y, md5); pos<-match(z,zt); all(sapply(seq(along=pos), function(i)identical(z[i],zt[pos[i]])))})

  rm(x)

}

duplicated.matrix <- duplicated.array <-
function (x, incomparables = FALSE, fromLast = FALSE, MARGIN = 1, hashFUN=list, ...)
{
    if (!is.logical(incomparables) || incomparables)
        .NotYetUsed("incomparables != FALSE")
    ndim <- length(dim(x))
    if (length(MARGIN) > ndim || any(MARGIN > ndim))
        stop("MARGIN = ", MARGIN, " is invalid for dim = ", dim(x))
    temp <- apply(x, MARGIN, hashFUN)
    d <- dim(temp)
    dn <- dimnames(temp)
    dim(temp) <- NULL   # we had as.vector here, but that fails for lists with dim attributes
    res <- duplicated(temp, fromLast = fromLast)
    dim(res) <- d
    dimnames(res) <- dn
    res
}

duplicated.data.frame <-
function (x, incomparables = FALSE, fromLast = FALSE, hashFUN=list, ...)
{
    if (!is.logical(incomparables) || incomparables)
        .NotYetUsed("incomparables != FALSE")
    ndim <- length(dim(x))
    temp <- apply(x, 1, hashFUN)
    d <- dim(temp)
    dn <- dimnames(temp)
    dim(temp) <- NULL  # we had as.vector here, but that fails for lists with dim attributes
    res <- duplicated(temp, fromLast = fromLast)
    dim(res) <- d
    dimnames(res) <- dn
    res
}

unique.matrix <- unique.array <-
function (x, incomparables = FALSE, fromLast = FALSE, MARGIN = 1, hashFUN=list, ...)
{
    if (!is.logical(incomparables) || incomparables)
        .NotYetUsed("incomparables != FALSE")
    ndim <- length(dim(x))
    if (length(MARGIN) > 1 || any(MARGIN > ndim))
        stop("MARGIN = ", MARGIN, " is invalid for dim = ", dim(x))
    temp <- apply(x, MARGIN, hashFUN)
    args <- rep(alist(a = ), ndim)
    names(args) <- NULL
    dim(temp) <- NULL  # we had as.vector here, but that fails for lists with dim attributes
    args[[MARGIN]] <- !duplicated(temp, fromLast = fromLast)
    do.call("[", c(list(x = x), args, list(drop = FALSE)))
}

unique.data.frame <-
function (x, incomparables = FALSE, fromLast = FALSE, hashFUN=list, ...)
{
    if (!is.logical(incomparables) || incomparables)
        .NotYetUsed("incomparables != FALSE")
    x[!duplicated(x, fromLast = fromLast, hashFUN = hashFUN), , drop = FALSE]
}

# like duplicated but return hash value instead of logical
hash <- function(x, ...)
    UseMethod("hash")

hash.default <- function(x, hashFUN=md5, ...)
    sapply(x, hashFUN)

hash.matrix <- hash.array <-
function (x, MARGIN = 1, hashFUN=md5, ...)
{
    ndim <- length(dim(x))
    if (length(MARGIN) > ndim || any(MARGIN > ndim))
        stop("MARGIN = ", MARGIN, " is invalid for dim = ", dim(x))
    apply(x, MARGIN, hashFUN)
}

hash.data.frame <-
function (x, hashFUN=md5, ...)
{
    ndim <- length(dim(x))
    apply(x, 1, hashFUN)
}

# like duplicated but return the position of first occurence instead of logical
id <- function(x, ...)
    UseMethod("id")

id.default <- function(x, fromLast = FALSE, hashFUN=NULL, ...){
  if (is.null(hashFUN)){
    if (is.list(x)) x <- sapply(x, md5)  # fix the slow performance of match on lists (R-2.6.0) by converting to md5
    d <- (1:length(x))[!duplicated(x, fromLast = fromLast)]
    d[match(x,x[d])]
  }else{
    x <- lapply(x, hashFUN)
    d <- (1:length(x))[!duplicated(x, fromLast = fromLast)]
    d[match(x,x[d])]
  }
}

id.matrix <- id.array <-
function (x, fromLast = FALSE, MARGIN = 1, hashFUN=md5, ...)
{
    ndim <- length(dim(x))
    nmarg <- length(MARGIN)
    if (nmarg > ndim || any(MARGIN > ndim))
        stop("MARGIN = ", MARGIN, " is invalid for dim = ", dim(x))
    h <- apply(x, MARGIN, hashFUN)
    d <- (1:length(h))[!duplicated(h, fromLast = fromLast)]
    i <- d[match(h,h[d])]
    dim(i) <- dim(h)
    dimnames(i) <- dimnames(h)
    i
}

id.data.frame <-
function (x, fromLast = FALSE, hashFUN=md5, ...)
{
    h <- apply(x, 1, hashFUN)
    d <- (1:length(h))[!duplicated(h, fromLast = fromLast)]
    d[match(h,h[d])]
}

if (FALSE){
  n <- 10000

  # test duplicated
  x <- matrix(runif(n*20),n,20)
  x <- x[rep(1:nrow(x),rep(2,nrow(x))),]

  d1 <- duplicated.matrix(x, hashFUN=list)
  d2 <- duplicated.matrix(x, hashFUN=md5)
  all.equal(d1,d2)
  table(d1)
  rm(d1,d2)

  system.time(duplicated.matrix(x, hashFUN=list))
  system.time(duplicated.matrix(x, hashFUN=md5))

  x <- matrix(as.list(runif(n*20)),n,20)
  x <- x[rep(1:nrow(x),rep(2,nrow(x))),]

  d1 <- duplicated.matrix(x, hashFUN=list)
  d2 <- duplicated.matrix(x, hashFUN=md5)
  all.equal(d1,d2)
  table(d1)
  rm(d1,d2)

  system.time(duplicated.matrix(x, hashFUN=list))
  system.time(duplicated.matrix(x, hashFUN=md5))

  # test unique
  x <- matrix(runif(n*20),n,20)
  x <- x[rep(1:nrow(x),rep(2,nrow(x))),]

  d1 <- unique.matrix(x, hashFUN=list)
  d2 <- unique.matrix(x, hashFUN=md5)
  all.equal(d1,d2)
  dim(d1)
  rm(d1,d2)

  system.time(unique.matrix(x, hashFUN=list))
  system.time(unique.matrix(x, hashFUN=md5))

  x <- matrix(as.list(runif(n*20)),n,20)
  x <- x[rep(1:nrow(x),rep(2,nrow(x))),]

  d1 <- unique.matrix(x, hashFUN=list)
  d2 <- unique.matrix(x, hashFUN=md5)
  all.equal(d1,d2)
  all.equal(md5(d1),md5(d2))
  dim(d1)
  rm(d1,d2)

  system.time(unique.matrix(x, hashFUN=list))
  system.time(unique.matrix(x, hashFUN=md5))

  # test id
  x <- matrix(runif(n*20),n,20)
  x <- x[rep(1:nrow(x),rep(2,nrow(x))),]

  d1 <- id.matrix(x, hashFUN=md5)
  d2 <- id.matrix(x, hashFUN=list)
  all.equal(d1,d2)
  length(unique(d1))
  rm(d1,d2)

  system.time(id.matrix(x, hashFUN=md5))
  system.time(id.matrix(x, hashFUN=list))

  x <- matrix(as.list(runif(n*20)),n,20)
  x <- x[rep(1:nrow(x),rep(2,nrow(x))),]

  d1 <- id.matrix(x, hashFUN=md5)
  d2 <- id.matrix(x, hashFUN=list)
  all.equal(d1,d2)
  length(unique(d1))
  rm(d1,d2)

  system.time(id.matrix(x, hashFUN=md5))
  system.time(id.matrix(x, hashFUN=list))

}

# BTW, the following are methods for the upcoming package/class 'ff'
# they limit the chunk size of RAM needed for reading the data from disk
# yet they assume that the result fits into RAM (and is returned as such)
# they rely on ffapply which helps with chunked indexing

hash.ff <- function (x, MARGIN = NULL, hashFUN=md5
, return.ff = FALSE
, ...   # passed to ffapply
)
{
    if (!is.logical(return.ff) || return.ff)
        .NotYetUsed("return.ff = TRUE")
    d <- dim(x)
    if (is.null(MARGIN))
      MARGIN <- if (is.null(d)) integer() else 1L
    nmarg <- length(MARGIN)
    if (nmarg){
      ndim <- length(d)
      if (nmarg > ndim || any(MARGIN > ndim))
          stop("MARGIN = ", MARGIN, " is invalid for dim = ", d)
      if (nmarg==ndim){
        ret <- apply(x[], MARGIN, hashFUN)
      }else{
        args <- rep(alist(a = ), ndim)
        names(args) <- NULL
        ret <- ffapply(x, {
          args[MARGIN] <- lapply(seq(along=MARGIN), function(i)i1[i]:i2[i])
          temp <- do.call("[", c(list(x = x), args, list(drop = FALSE)))
          apply(temp, MARGIN, hashFUN)
        }, margin=MARGIN, return="unlist", ...)
        if (nmarg>1){
          dim(ret) <- d[MARGIN]
          dimnames(ret) <- dimnames(x)[MARGIN]
        }
      }
    }else{
      ret <- ffapply(x, sapply(x[i1:i2], hashFUN), return="unlist", ...)
    }
    ret
}

# xx this is yet without a final identity check agains md5 collisions
duplicated.ff <-
function (x, incomparables = FALSE, fromLast = FALSE, MARGIN = 1, hashFUN=md5
, return.ff = FALSE
, ...   # passed to ffapply
)
{
    if (!is.logical(incomparables) || incomparables)
        .NotYetUsed("incomparables != FALSE")
    if (!is.logical(return.ff) || return.ff)
        .NotYetUsed("return.ff = TRUE")
    d <- dim(x)
    if (is.null(MARGIN))
      MARGIN <- if (is.null(d)) integer() else 1L
    ndim <- length(d)
    nmarg <- length(MARGIN)
    if (nmarg > ndim || any(MARGIN > ndim))
        stop("MARGIN = ", MARGIN, " is invalid for dim = ", dim(x))
    if (nmarg){
      if (nmarg==ndim){ # no hashFUN needed
        h <- aperm(x[], MARGIN)
      }else{
        h <- hash(x, MARGIN=MARGIN, return.ff=return.ff, hashFUN=hashFUN, ...)
      }
      d <- dim(h)
      dn <- dimnames(h)
      dim(h) <- NULL
      dup <- duplicated(h, fromLast = fromLast)
      dim(dup) <- d
      dimnames(dup) <- dn
    }else{
      # yet no RAM savings in this case
      n <- length(x)                                  # 1:length(n) WOULD be expanded
      dup <- duplicated(x[1:n], fromLast = fromLast)  # 1:n is NOT expanded and returns a simple vector (faster than as.vector(x[]))
    }
    dup
}

# xx this is yet without a final identity check agains md5 collisions
unique.ff <-
function (x, incomparables = FALSE, fromLast = FALSE, MARGIN = NULL, hashFUN=md5
, return.ff = FALSE
, ...   # passed to ffapply
)
{
    if (!is.logical(incomparables) || incomparables)
        .NotYetUsed("incomparables != FALSE")
    if (!is.logical(return.ff) || return.ff)
        .NotYetUsed("return.ff = TRUE")
    d <- dim(x)
    if (is.null(MARGIN))
      MARGIN <- if (is.null(d)) integer() else 1L
    ndim <- length(d)
    nmarg <- length(MARGIN)
    if (nmarg > 1 || any(MARGIN > ndim))
        stop("MARGIN = ", MARGIN, " is invalid for dim = ", dim(x))
    if (nmarg){
      if (nmarg==ndim){ # no hashFUN needed
        h <- aperm(x[], MARGIN)
      }else{
        h <- hash(x, MARGIN=MARGIN, return.ff=return.ff, hashFUN=hashFUN, ...)
      }
      d <- dim(h)
      dn <- dimnames(h)
      dim(h) <- NULL
      args <- rep(alist(a = ), ndim)
      names(args) <- NULL
      args[[MARGIN]] <- !duplicated(h, fromLast = fromLast)
      do.call("[", c(list(x = x), args, list(drop = FALSE)))
    }else{
      # yet no RAM savings in this case
      n <- length(x)
      unique(x[1:n], fromLast = fromLast) # 1:n is NOT expanded and returns a simple vector (faster than as.vector(x[]))
    }
}

# xx this is yet without a final identity check agains md5 collisions
id.ff <- function (x, fromLast = FALSE, MARGIN = NULL, hashFUN=NULL
, return.ff = FALSE
, ...   # passed to ffapply
)
{
    if (!is.logical(return.ff) || return.ff)
        .NotYetUsed("return.ff = TRUE")
    d <- dim(x)
    if (is.null(MARGIN))
      MARGIN <- if (is.null(d)) integer() else 1L
    nmarg <- length(MARGIN)
    if (nmarg){
      if (is.null(hashFUN))
        hashFUN <- md5
      ndim <- length(d)
      if (nmarg > ndim || any(MARGIN > ndim))
          stop("MARGIN = ", MARGIN, " is invalid for dim = ", d)
      if (nmarg==ndim){
        if (is.list(x[1]))
          h <- apply(x[], MARGIN, hashFUN)
        else
          h <- aperm(x[], MARGIN)
        dim(h) <- NULL
      }else{
        args <- rep(alist(a = ), ndim)
        names(args) <- NULL
        h <- ffapply(x, {
          args[MARGIN] <- lapply(seq(along=MARGIN), function(i)i1[i]:i2[i])
          temp <- do.call("[", c(list(x = x), args, list(drop = FALSE)))
          apply(temp, MARGIN, hashFUN)
        }, margin=MARGIN, return="unlist", ...)
      }
      nd <- (1:length(h))[!duplicated(h, fromLast = fromLast)]
      i <- nd[match(h,h[nd])]
      if (nmarg>1){
        # yet no RAM savings in this case
        dim(i) <- d[MARGIN]
        dimnames(i) <- dimnames(x)[MARGIN]
      }
    }else{
      i <- id(x, fromLast = fromLast, hashFUN=hashFUN)
    }
    i
}

# xx this is yet without a final identity check agains md5 collisions
# row identity for ff and R matrices (less overhead compared to id.ff via ffapply )
ffrowid <- function(x, ...){
  id(ffrowapply(x, apply(x[i1:i2,,drop=FALSE], 1, md5), return="unlist", use.names=FALSE, ...))
}

if (FALSE){
  a <- ff(0, dim=c(100000,10),dimorder=2:1)
  ffapply(a, a[i1:i2]<-runif(i2-i1+1))
  r <- ffrowhash(a)
}

if (FALSE){
  n <- 100000
  m <- 10
  x <- ff(0, dim=c(n,m))
  x[,dimorder=2:1] <- 1:(m*n/2)

  hash(x, MARGIN=integer()) # hash single cells, no RAM optimization
  hash(x)                   # hash rows, RAM savings because rows are read and md5ed i chunks
  duplicated(x)             #
  id(x)                     # positions of first occurences
  ffrowid(x)                # faster positions
  unique(x)                 # unique rows

}

--