[Rd] a fast table() for the 1D case
Hervé Pagès
hpages at fhcrc.org
Mon Sep 16 21:55:23 CEST 2013
Any chance some improvements can be made on table()?
table() is probably one of the most used R functions when working
interactively. Unfortunately it can be incredibly slow, especially
on a logical vector where a simple sum() is hundred times faster
(I actually got into the habit of using sum() instead of table()).
The table1D() proposal below doesn't go as far as using sum() on a
logical vector but it already provides significant speedups for most
use cases.
Thanks,
H.
On 08/09/2013 01:19 AM, Hervé Pagès wrote:
> Hi,
>
> table1D() below can be up to 60x faster than base::table() for the 1D
> case. Here are the detailed speedups compared to base::table().
>
> o With a logical vector of length 5M: 11x faster
> (or more if 'useNA="always"')
>
> o With factor/integer/numeric/character of length 1M and 9 levels
> (or 9 distinct values for non-factors):
> - factor: 60x faster
> - integer/numeric vector: 12x faster
> - character vector: 2.4x faster
>
> o With factor/integer/numeric/character of length 1M and no
> duplicates:
> - factor: 5x faster
> - integer vector: 2x faster
> - numeric vector: 1.7x faster
> - character vector: no significant speedup
>
> Would be great if this improvement could make it into base::table().
>
> Thanks,
> H.
>
> ## A fast table() implementation for the 1D case (replacing the '...'
> ## arg with 'x' and omitting the 'dnn' and 'deparse.level' arguments
> ## which are unrelated to performance).
>
> table1D <- function(x, exclude = if (useNA == "no") c(NA, NaN),
> useNA = c("no", "ifany", "always"))
> {
> if (!missing(exclude) && is.null(exclude)) {
> useNA <- "always"
> } else {
> useNA <- match.arg(useNA)
> }
> if (useNA == "always" && !missing(exclude))
> exclude <- setdiff(exclude, NA)
> if (is.factor(x)) {
> x2 <- levels(x)
> append_NA <- (useNA == "always" ||
> useNA == "ifany" && any(is.na(x))) &&
> !any(is.na(x2))
> if (append_NA) {
> x2 <- c(x2, NA)
> x <- factor(x, levels=x2, exclude=NULL)
> }
> t2 <- tabulate(x, nbins=length(x2))
> if (!is.null(exclude)) {
> keep_idx <- which(!(x2 %in% exclude))
> x2 <- x2[keep_idx]
> t2 <- t2[keep_idx]
> }
> } else {
> xx <- match(x, x)
> t <- tabulate(xx, nbins=length(xx))
> keep_idx <- which(t != 0L)
> x2 <- x[keep_idx]
> t2 <- t[keep_idx]
> if (!is.null(exclude)) {
> exclude <- as.vector(exclude, typeof(x))
> keep_idx <- which(!(x2 %in% exclude))
> x2 <- x2[keep_idx]
> t2 <- t2[keep_idx]
> }
> oo <- order(x2)
> x2 <- x2[oo]
> t2 <- t2[oo]
> append_NA <- useNA == "always" && !any(is.na(x2))
> if (append_NA) {
> x2 <- c(x2, NA)
> t2 <- c(t2, 0L)
> }
> }
> ans <- array(t2)
> dimnames(ans) <- list(as.character(x2))
> names(dimnames(ans)) <- "x" # always set to 'x'
> class(ans) <- "table"
> ans
> }
>
> table1D() also fixes some issues with base::table() that can be exposed
> by running the tests below.
>
> test_table <- function(FUN_NAME)
> {
> FUN <- match.fun(FUN_NAME)
>
> .make_target <- function(target_names, target_data)
> {
> ans <- array(target_data)
> dimnames(ans) <- list(as.character(target_names))
> names(dimnames(ans)) <- "x"
> class(ans) <- "table"
> ans
> }
>
> .check_identical <- function(target, current, varname, extra_args)
> {
> if (identical(target, current))
> return()
> if (extra_args != "")
> extra_args <- paste0(", ", extra_args)
> cat("unexpected result for '", FUN_NAME,
> "(x=", varname, extra_args, ")'\n", sep="")
> }
>
> .test_exclude <- function(x, varname, target_names0, target_data0,
> exclude)
> {
> extra_args <- paste0("exclude=", deparse(exclude))
> current <- FUN(x=x, exclude=exclude)
> target_names <- target_names0
> target_data <- target_data0
> if (is.null(exclude)) {
> if (!any(is.na(target_names))) {
> target_names <- c(target_names, NA)
> target_data <- c(target_data, 0L)
> }
> } else {
> if (!is.factor(x)) {
> exclude <- as.vector(exclude, typeof(x))
> } else if (!any(is.na(levels(x)))) {
> exclude <- union(exclude, NA)
> }
> exclude_idx <- match(exclude, target_names, nomatch=0L)
> if (any(exclude_idx != 0L)) {
> target_names <- target_names[-exclude_idx]
> target_data <- target_data[-exclude_idx]
> }
> }
> target <- .make_target(target_names, target_data)
> .check_identical(target, current, varname, extra_args)
> }
>
> .do_exclude_tests <- function(x, varname, target_names0, target_data0,
> more_excludes=NULL)
> {
> .BASIC_EXCLUDES <- list(c(NA, NaN), NULL, numeric(0), NA, NaN)
> excludes <- c(.BASIC_EXCLUDES, more_excludes)
> for (exclude in excludes)
> .test_exclude(x, varname, target_names0, target_data0,
> exclude)
> }
>
> ## Test on a numeric vector.
> x0 <- numeric(0)
> .do_exclude_tests(x0, "x0", character(0), integer(0), list(5.3))
>
> x1_target_names0 <- c(-9, 4, 5.3, NaN, NA)
> x1_target_data0 <- c(1L, 2L, 1L, 2L, 3L)
> x1 <- c(5.3, 4, NaN, 4, NA, NA, NaN, -9, NA)
> excludes <- list(c(5.3, -9),
> c(5.3, NA, -9),
> c(5.3, NaN, -9),
> c(5.3, 80, -9),
> x1_target_names0)
> .do_exclude_tests(x1, "x1", x1_target_names0, x1_target_data0,
> excludes)
>
> x2_target_names0 <- c(-9, 4, 5.3, NA, NaN)
> x2_target_data0 <- c(1L, 2L, 1L, 3L, 2L)
> x2 <- rev(x1)
> .do_exclude_tests(x2, "x2", x2_target_names0, x2_target_data0,
> excludes)
>
> x3_target_names0 <- c(-9, 4, 5.3)
> x3_target_data0 <- c(1L, 2L, 1L)
> x3 <- c(5.3, 4, 4, -9)
> .do_exclude_tests(x3, "x3", x3_target_names0, x3_target_data0,
> excludes)
>
> ## Test on a factor.
> f0 <- factor()
> .do_exclude_tests(f0, "f0", character(0), integer(0), list(5.3))
>
> f1 <- factor(x1)
> .do_exclude_tests(f1, "f1", x1_target_names0, x1_target_data0,
> excludes)
>
> f2 <- factor(x1, exclude=NULL)
> .do_exclude_tests(f2, "f2", x1_target_names0, x1_target_data0,
> excludes)
>
> f3_target_names0 <- c(6.82, x1_target_names0, -7.66)
> f3_target_data0 <- c(0L, 1L, 2L, 1L, 0L, 0L, 0L)
> f3 <- factor(x3, levels=f3_target_names0, exclude=NULL)
> .do_exclude_tests(f3, "f3", f3_target_names0, f3_target_data0,
> excludes)
>
> x4_target_names0 <- c(6.82, -9, 5.3, 4, -7.66)
> x4_target_data0 <- c(0L, 1L, 1L, 2L, 0L)
> f4 <- factor(x3, levels=x4_target_names0, exclude=NULL)
> .do_exclude_tests(f4, "f4", x4_target_names0, x4_target_data0,
> excludes)
>
> ## Test on a character vector.
> c0 <- character(0)
> .do_exclude_tests(c0, "c0", character(0), integer(0), list("Aa"))
>
> c1 <- c("b", "AA", "", "a", "ab", "NaN", "4", "Aa", NA, "NaN",
> "ab", NA)
> c1_target_names0 <- sort(unique(c1), na.last=TRUE)
> c1_target_data0 <- c(1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L)
> excludes <- list(c("Aa", 4, ""),
> c("Aa", NA, 4, "", "Z"),
> c("Aa", NaN, 4, "", "Z"),
> c("Aa", 4, "", "Z"))
> .do_exclude_tests(c1, "c1", c1_target_names0, c1_target_data0,
> excludes)
>
> c2 <- c("b", "AA", "", "a", "ab", "", "", "4", "Aa", "ab")
> c2_target_names0 <- sort(unique(c2), na.last=TRUE)
> c2_target_data0 <- c(3L, 1L, 1L, 1L, 1L, 2L, 1L)
> .do_exclude_tests(c2, "c2", c2_target_names0, c2_target_data0,
> excludes)
>
> ## Test on a logical vector.
> l0 <- logical(0)
> .do_exclude_tests(l0, "l0", character(0), integer(0), list(c("Aa",
> TRUE)))
>
> l1 <- c(FALSE, FALSE, NA, TRUE, FALSE, FALSE, NA, NA, TRUE)
> l1_target_names0 <- c(FALSE, TRUE, NA)
> l1_target_data0 <- c(4L, 2L, 3L)
> excludes <- list(c(TRUE, FALSE),
> c("Aa", NA, TRUE),
> c("Aa", NaN, TRUE),
> l1_target_names0)
> .do_exclude_tests(l1, "l1", l1_target_names0, l1_target_data0,
> excludes)
>
> l2 <- c(FALSE, FALSE, TRUE, FALSE, FALSE, TRUE)
> l2_target_names0 <- c(FALSE, TRUE)
> l2_target_data0 <- c(4L, 2L)
> .do_exclude_tests(l2, "l2", l2_target_names0, l2_target_data0,
> excludes)
> }
>
> test_table("table") # will display some issues
> test_table("table1D") # should not display anything
>
>
>> sessionInfo()
> R version 3.0.1 (2013-05-16)
> Platform: x86_64-unknown-linux-gnu (64-bit)
>
> locale:
> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
> [7] LC_PAPER=C LC_NAME=C
> [9] LC_ADDRESS=C LC_TELEPHONE=C
> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
>
> attached base packages:
> [1] stats graphics grDevices utils datasets methods base
>
> loaded via a namespace (and not attached):
> [1] tools_3.0.1
>
--
Hervé Pagès
Program in Computational Biology
Division of Public Health Sciences
Fred Hutchinson Cancer Research Center
1100 Fairview Ave. N, M1-B514
P.O. Box 19024
Seattle, WA 98109-1024
E-mail: hpages at fhcrc.org
Phone: (206) 667-5791
Fax: (206) 667-1319
More information about the R-devel
mailing list