[Rd] a fast table() for the 1D case
Hervé Pagès
hpages at fhcrc.org
Fri Aug 9 10:19:42 CEST 2013
Hi,
table1D() below can be up to 60x faster than base::table() for the 1D
case. Here are the detailed speedups compared to base::table().
o With a logical vector of length 5M: 11x faster
(or more if 'useNA="always"')
o With factor/integer/numeric/character of length 1M and 9 levels
(or 9 distinct values for non-factors):
- factor: 60x faster
- integer/numeric vector: 12x faster
- character vector: 2.4x faster
o With factor/integer/numeric/character of length 1M and no
duplicates:
- factor: 5x faster
- integer vector: 2x faster
- numeric vector: 1.7x faster
- character vector: no significant speedup
Would be great if this improvement could make it into base::table().
Thanks,
H.
## A fast table() implementation for the 1D case (replacing the '...'
## arg with 'x' and omitting the 'dnn' and 'deparse.level' arguments
## which are unrelated to performance).
table1D <- function(x, exclude = if (useNA == "no") c(NA, NaN),
useNA = c("no", "ifany", "always"))
{
if (!missing(exclude) && is.null(exclude)) {
useNA <- "always"
} else {
useNA <- match.arg(useNA)
}
if (useNA == "always" && !missing(exclude))
exclude <- setdiff(exclude, NA)
if (is.factor(x)) {
x2 <- levels(x)
append_NA <- (useNA == "always" ||
useNA == "ifany" && any(is.na(x))) &&
!any(is.na(x2))
if (append_NA) {
x2 <- c(x2, NA)
x <- factor(x, levels=x2, exclude=NULL)
}
t2 <- tabulate(x, nbins=length(x2))
if (!is.null(exclude)) {
keep_idx <- which(!(x2 %in% exclude))
x2 <- x2[keep_idx]
t2 <- t2[keep_idx]
}
} else {
xx <- match(x, x)
t <- tabulate(xx, nbins=length(xx))
keep_idx <- which(t != 0L)
x2 <- x[keep_idx]
t2 <- t[keep_idx]
if (!is.null(exclude)) {
exclude <- as.vector(exclude, typeof(x))
keep_idx <- which(!(x2 %in% exclude))
x2 <- x2[keep_idx]
t2 <- t2[keep_idx]
}
oo <- order(x2)
x2 <- x2[oo]
t2 <- t2[oo]
append_NA <- useNA == "always" && !any(is.na(x2))
if (append_NA) {
x2 <- c(x2, NA)
t2 <- c(t2, 0L)
}
}
ans <- array(t2)
dimnames(ans) <- list(as.character(x2))
names(dimnames(ans)) <- "x" # always set to 'x'
class(ans) <- "table"
ans
}
table1D() also fixes some issues with base::table() that can be exposed
by running the tests below.
test_table <- function(FUN_NAME)
{
FUN <- match.fun(FUN_NAME)
.make_target <- function(target_names, target_data)
{
ans <- array(target_data)
dimnames(ans) <- list(as.character(target_names))
names(dimnames(ans)) <- "x"
class(ans) <- "table"
ans
}
.check_identical <- function(target, current, varname, extra_args)
{
if (identical(target, current))
return()
if (extra_args != "")
extra_args <- paste0(", ", extra_args)
cat("unexpected result for '", FUN_NAME,
"(x=", varname, extra_args, ")'\n", sep="")
}
.test_exclude <- function(x, varname, target_names0, target_data0,
exclude)
{
extra_args <- paste0("exclude=", deparse(exclude))
current <- FUN(x=x, exclude=exclude)
target_names <- target_names0
target_data <- target_data0
if (is.null(exclude)) {
if (!any(is.na(target_names))) {
target_names <- c(target_names, NA)
target_data <- c(target_data, 0L)
}
} else {
if (!is.factor(x)) {
exclude <- as.vector(exclude, typeof(x))
} else if (!any(is.na(levels(x)))) {
exclude <- union(exclude, NA)
}
exclude_idx <- match(exclude, target_names, nomatch=0L)
if (any(exclude_idx != 0L)) {
target_names <- target_names[-exclude_idx]
target_data <- target_data[-exclude_idx]
}
}
target <- .make_target(target_names, target_data)
.check_identical(target, current, varname, extra_args)
}
.do_exclude_tests <- function(x, varname, target_names0, target_data0,
more_excludes=NULL)
{
.BASIC_EXCLUDES <- list(c(NA, NaN), NULL, numeric(0), NA, NaN)
excludes <- c(.BASIC_EXCLUDES, more_excludes)
for (exclude in excludes)
.test_exclude(x, varname, target_names0, target_data0, exclude)
}
## Test on a numeric vector.
x0 <- numeric(0)
.do_exclude_tests(x0, "x0", character(0), integer(0), list(5.3))
x1_target_names0 <- c(-9, 4, 5.3, NaN, NA)
x1_target_data0 <- c(1L, 2L, 1L, 2L, 3L)
x1 <- c(5.3, 4, NaN, 4, NA, NA, NaN, -9, NA)
excludes <- list(c(5.3, -9),
c(5.3, NA, -9),
c(5.3, NaN, -9),
c(5.3, 80, -9),
x1_target_names0)
.do_exclude_tests(x1, "x1", x1_target_names0, x1_target_data0,
excludes)
x2_target_names0 <- c(-9, 4, 5.3, NA, NaN)
x2_target_data0 <- c(1L, 2L, 1L, 3L, 2L)
x2 <- rev(x1)
.do_exclude_tests(x2, "x2", x2_target_names0, x2_target_data0,
excludes)
x3_target_names0 <- c(-9, 4, 5.3)
x3_target_data0 <- c(1L, 2L, 1L)
x3 <- c(5.3, 4, 4, -9)
.do_exclude_tests(x3, "x3", x3_target_names0, x3_target_data0,
excludes)
## Test on a factor.
f0 <- factor()
.do_exclude_tests(f0, "f0", character(0), integer(0), list(5.3))
f1 <- factor(x1)
.do_exclude_tests(f1, "f1", x1_target_names0, x1_target_data0,
excludes)
f2 <- factor(x1, exclude=NULL)
.do_exclude_tests(f2, "f2", x1_target_names0, x1_target_data0,
excludes)
f3_target_names0 <- c(6.82, x1_target_names0, -7.66)
f3_target_data0 <- c(0L, 1L, 2L, 1L, 0L, 0L, 0L)
f3 <- factor(x3, levels=f3_target_names0, exclude=NULL)
.do_exclude_tests(f3, "f3", f3_target_names0, f3_target_data0,
excludes)
x4_target_names0 <- c(6.82, -9, 5.3, 4, -7.66)
x4_target_data0 <- c(0L, 1L, 1L, 2L, 0L)
f4 <- factor(x3, levels=x4_target_names0, exclude=NULL)
.do_exclude_tests(f4, "f4", x4_target_names0, x4_target_data0,
excludes)
## Test on a character vector.
c0 <- character(0)
.do_exclude_tests(c0, "c0", character(0), integer(0), list("Aa"))
c1 <- c("b", "AA", "", "a", "ab", "NaN", "4", "Aa", NA, "NaN",
"ab", NA)
c1_target_names0 <- sort(unique(c1), na.last=TRUE)
c1_target_data0 <- c(1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L)
excludes <- list(c("Aa", 4, ""),
c("Aa", NA, 4, "", "Z"),
c("Aa", NaN, 4, "", "Z"),
c("Aa", 4, "", "Z"))
.do_exclude_tests(c1, "c1", c1_target_names0, c1_target_data0,
excludes)
c2 <- c("b", "AA", "", "a", "ab", "", "", "4", "Aa", "ab")
c2_target_names0 <- sort(unique(c2), na.last=TRUE)
c2_target_data0 <- c(3L, 1L, 1L, 1L, 1L, 2L, 1L)
.do_exclude_tests(c2, "c2", c2_target_names0, c2_target_data0,
excludes)
## Test on a logical vector.
l0 <- logical(0)
.do_exclude_tests(l0, "l0", character(0), integer(0), list(c("Aa",
TRUE)))
l1 <- c(FALSE, FALSE, NA, TRUE, FALSE, FALSE, NA, NA, TRUE)
l1_target_names0 <- c(FALSE, TRUE, NA)
l1_target_data0 <- c(4L, 2L, 3L)
excludes <- list(c(TRUE, FALSE),
c("Aa", NA, TRUE),
c("Aa", NaN, TRUE),
l1_target_names0)
.do_exclude_tests(l1, "l1", l1_target_names0, l1_target_data0,
excludes)
l2 <- c(FALSE, FALSE, TRUE, FALSE, FALSE, TRUE)
l2_target_names0 <- c(FALSE, TRUE)
l2_target_data0 <- c(4L, 2L)
.do_exclude_tests(l2, "l2", l2_target_names0, l2_target_data0,
excludes)
}
test_table("table") # will display some issues
test_table("table1D") # should not display anything
> sessionInfo()
R version 3.0.1 (2013-05-16)
Platform: x86_64-unknown-linux-gnu (64-bit)
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=C LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
loaded via a namespace (and not attached):
[1] tools_3.0.1
--
Hervé Pagès
Program in Computational Biology
Division of Public Health Sciences
Fred Hutchinson Cancer Research Center
1100 Fairview Ave. N, M1-B514
P.O. Box 19024
Seattle, WA 98109-1024
E-mail: hpages at fhcrc.org
Phone: (206) 667-5791
Fax: (206) 667-1319
More information about the R-devel
mailing list