[Rd] a fast table() for the 1D case

Fri Aug 9 10:19:42 CEST 2013

Hi,

table1D() below can be up to 60x faster than base::table() for the 1D
case. Here are the detailed speedups compared to base::table().

   o With a logical vector of length 5M:     11x faster
                                     (or more if 'useNA="always"')

   o With factor/integer/numeric/character of length 1M and 9 levels
     (or 9 distinct values for non-factors):
      - factor:                              60x faster
      - integer/numeric vector:              12x faster
      - character vector:                   2.4x faster

   o With factor/integer/numeric/character of length 1M and no
     duplicates:
       - factor:                              5x faster
       - integer vector:                      2x faster
       - numeric vector:                    1.7x faster
       - character vector:       no significant speedup

Would be great if this improvement could make it into base::table().

Thanks,
H.

   ## A fast table() implementation for the 1D case (replacing the '...'
   ## arg with 'x' and omitting the 'dnn' and 'deparse.level' arguments
   ## which are unrelated to performance).

   table1D <- function(x, exclude = if (useNA == "no") c(NA, NaN),
                       useNA = c("no", "ifany", "always"))
   {
     if (!missing(exclude) && is.null(exclude)) {
         useNA <- "always"
     } else {
         useNA <- match.arg(useNA)
     }
     if (useNA == "always" && !missing(exclude))
         exclude <- setdiff(exclude, NA)
     if (is.factor(x)) {
         x2 <- levels(x)
         append_NA <- (useNA == "always" ||
                       useNA == "ifany" && any(is.na(x))) &&
                      !any(is.na(x2))
         if (append_NA) {
             x2 <- c(x2, NA)
             x <- factor(x, levels=x2, exclude=NULL)
         }
         t2 <- tabulate(x, nbins=length(x2))
         if (!is.null(exclude)) {
             keep_idx <- which(!(x2 %in% exclude))
             x2 <- x2[keep_idx]
             t2 <- t2[keep_idx]
         }
     } else {
         xx <- match(x, x)
         t <- tabulate(xx, nbins=length(xx))
         keep_idx <- which(t != 0L)
         x2 <- x[keep_idx]
         t2 <- t[keep_idx]
         if (!is.null(exclude)) {
             exclude <- as.vector(exclude, typeof(x))
             keep_idx <- which(!(x2 %in% exclude))
             x2 <- x2[keep_idx]
             t2 <- t2[keep_idx]
         }
         oo <- order(x2)
         x2 <- x2[oo]
         t2 <- t2[oo]
         append_NA <- useNA == "always" && !any(is.na(x2))
         if (append_NA) {
             x2 <- c(x2, NA)
             t2 <- c(t2, 0L)
         }
     }
     ans <- array(t2)
     dimnames(ans) <- list(as.character(x2))
     names(dimnames(ans)) <- "x"  # always set to 'x'
     class(ans) <- "table"
     ans
   }

table1D() also fixes some issues with base::table() that can be exposed
by running the tests below.

   test_table <- function(FUN_NAME)
   {
     FUN <- match.fun(FUN_NAME)

     .make_target <- function(target_names, target_data)
     {
         ans <- array(target_data)
         dimnames(ans) <- list(as.character(target_names))
         names(dimnames(ans)) <- "x"
         class(ans) <- "table"
         ans
     }

     .check_identical <- function(target, current, varname, extra_args)
     {
         if (identical(target, current))
             return()
         if (extra_args != "")
             extra_args <- paste0(", ", extra_args)
         cat("unexpected result for '", FUN_NAME,
             "(x=", varname, extra_args, ")'\n", sep="")
     }

     .test_exclude <- function(x, varname, target_names0, target_data0, 
exclude)
     {
         extra_args <- paste0("exclude=", deparse(exclude))
         current <- FUN(x=x, exclude=exclude)
         target_names <- target_names0
         target_data <- target_data0
         if (is.null(exclude)) {
             if (!any(is.na(target_names))) {
                 target_names <- c(target_names, NA)
                 target_data <- c(target_data, 0L)
             }
         } else {
             if (!is.factor(x)) {
                 exclude <- as.vector(exclude, typeof(x))
             } else if (!any(is.na(levels(x)))) {
                 exclude <- union(exclude, NA)
             }
             exclude_idx <- match(exclude, target_names, nomatch=0L)
             if (any(exclude_idx != 0L)) {
                 target_names <- target_names[-exclude_idx]
                 target_data <- target_data[-exclude_idx]
             }
         }
         target <- .make_target(target_names, target_data)
         .check_identical(target, current, varname, extra_args)
     }

     .do_exclude_tests <- function(x, varname, target_names0, target_data0,
                                   more_excludes=NULL)
     {
         .BASIC_EXCLUDES <- list(c(NA, NaN), NULL, numeric(0), NA, NaN)
         excludes <- c(.BASIC_EXCLUDES, more_excludes)
         for (exclude in excludes)
             .test_exclude(x, varname, target_names0, target_data0, exclude)
     }

     ## Test on a numeric vector.
     x0 <- numeric(0)
     .do_exclude_tests(x0, "x0", character(0), integer(0), list(5.3))

     x1_target_names0 <- c(-9, 4, 5.3, NaN, NA)
     x1_target_data0 <- c(1L, 2L, 1L, 2L, 3L)
     x1 <- c(5.3, 4, NaN, 4, NA, NA, NaN, -9, NA)
     excludes <- list(c(5.3, -9),
                      c(5.3, NA, -9),
                      c(5.3, NaN, -9),
                      c(5.3, 80, -9),
                      x1_target_names0)
     .do_exclude_tests(x1, "x1", x1_target_names0, x1_target_data0, 
excludes)

     x2_target_names0 <- c(-9, 4, 5.3, NA, NaN)
     x2_target_data0 <- c(1L, 2L, 1L, 3L, 2L)
     x2 <- rev(x1)
     .do_exclude_tests(x2, "x2", x2_target_names0, x2_target_data0, 
excludes)

     x3_target_names0 <- c(-9, 4, 5.3)
     x3_target_data0 <- c(1L, 2L, 1L)
     x3 <- c(5.3, 4, 4, -9)
     .do_exclude_tests(x3, "x3", x3_target_names0, x3_target_data0, 
excludes)

     ## Test on a factor.
     f0 <- factor()
     .do_exclude_tests(f0, "f0", character(0), integer(0), list(5.3))

     f1 <- factor(x1)
     .do_exclude_tests(f1, "f1", x1_target_names0, x1_target_data0, 
excludes)

     f2 <- factor(x1, exclude=NULL)
     .do_exclude_tests(f2, "f2", x1_target_names0, x1_target_data0, 
excludes)

     f3_target_names0 <- c(6.82, x1_target_names0, -7.66)
     f3_target_data0 <- c(0L, 1L, 2L, 1L, 0L, 0L, 0L)
     f3 <- factor(x3, levels=f3_target_names0, exclude=NULL)
     .do_exclude_tests(f3, "f3", f3_target_names0, f3_target_data0, 
excludes)

     x4_target_names0 <- c(6.82, -9, 5.3, 4, -7.66)
     x4_target_data0 <- c(0L, 1L, 1L, 2L, 0L)
     f4 <- factor(x3, levels=x4_target_names0, exclude=NULL)
     .do_exclude_tests(f4, "f4", x4_target_names0, x4_target_data0, 
excludes)

     ## Test on a character vector.
     c0 <- character(0)
     .do_exclude_tests(c0, "c0", character(0), integer(0), list("Aa"))

     c1 <- c("b", "AA", "", "a", "ab", "NaN", "4", "Aa", NA, "NaN", 
"ab", NA)
     c1_target_names0 <- sort(unique(c1), na.last=TRUE)
     c1_target_data0 <- c(1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L)
     excludes <- list(c("Aa", 4, ""),
                      c("Aa", NA, 4, "", "Z"),
                      c("Aa", NaN, 4, "", "Z"),
                      c("Aa", 4, "", "Z"))
     .do_exclude_tests(c1, "c1", c1_target_names0, c1_target_data0, 
excludes)

     c2 <- c("b", "AA", "", "a", "ab", "", "", "4", "Aa", "ab")
     c2_target_names0 <- sort(unique(c2), na.last=TRUE)
     c2_target_data0 <- c(3L, 1L, 1L, 1L, 1L, 2L, 1L)
     .do_exclude_tests(c2, "c2", c2_target_names0, c2_target_data0, 
excludes)

     ## Test on a logical vector.
     l0 <- logical(0)
     .do_exclude_tests(l0, "l0", character(0), integer(0), list(c("Aa", 
TRUE)))

     l1 <- c(FALSE, FALSE, NA, TRUE, FALSE, FALSE, NA, NA, TRUE)
     l1_target_names0 <- c(FALSE, TRUE, NA)
     l1_target_data0 <- c(4L, 2L, 3L)
     excludes <- list(c(TRUE, FALSE),
                      c("Aa", NA, TRUE),
                      c("Aa", NaN, TRUE),
                      l1_target_names0)
     .do_exclude_tests(l1, "l1", l1_target_names0, l1_target_data0, 
excludes)

     l2 <- c(FALSE, FALSE, TRUE, FALSE, FALSE, TRUE)
     l2_target_names0 <- c(FALSE, TRUE)
     l2_target_data0 <- c(4L, 2L)
     .do_exclude_tests(l2, "l2", l2_target_names0, l2_target_data0, 
excludes)
   }

   test_table("table")    # will display some issues
   test_table("table1D")  # should not display anything

> sessionInfo()
R version 3.0.1 (2013-05-16)
Platform: x86_64-unknown-linux-gnu (64-bit)

locale:
  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C
  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8
  [7] LC_PAPER=C                 LC_NAME=C
  [9] LC_ADDRESS=C               LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base

loaded via a namespace (and not attached):
[1] tools_3.0.1

-- 
Hervé Pagès

Program in Computational Biology
Division of Public Health Sciences
Fred Hutchinson Cancer Research Center
1100 Fairview Ave. N, M1-B514
P.O. Box 19024
Seattle, WA 98109-1024

E-mail: hpages at fhcrc.org
Phone:  (206) 667-5791
Fax:    (206) 667-1319