[Rd] Minor logical bug in rbind.data.frame ?
Steven McKinney
smckinney at bccrc.ca
Fri Jan 12 23:18:10 CET 2007
When attempting to merge 3 data frames, one of which has fewer columns
than the others, rbind.data.frame correctly refuses to perform the bind.
However, the error message given is a bit obscure due to a logical
bug in the match.names() internal function to rbind.data.frame.
Illustration:
## Three data frames with same column variable names:
> foo <- data.frame(v1 = c('a', 'b'), v2 = c(1, 2), v3 = ordered(c('x', 'y')))
> bar <- data.frame(v2 = c(3, 4), v3 = ordered(c('x', 'y')), v1 = c('c', 'd'))
> baz <- data.frame(v1 = c('a', 'e'), v2 = c(5, 6), v3 = ordered(c('x', 'z')))
> rbind(foo, bar, baz)
v1 v2 v3
1 a 1 x
2 b 2 y
3 c 3 x
4 d 4 y
5 a 5 x
6 e 6 z
> ## All is fine.
## Third data frame has a different third column variable
> fifi <- data.frame(v1 = c('a', 'e'), v2 = c(9, 10), v4 = ordered(c('y', 'z')))
> rbind(foo, bar, fifi)
Error in match.names(clabs, names(xi)) : names do not match previous names:
v4
> ## Output looks fine.
## Third data frame is missing a column.
> gaga <- data.frame(v1 = c('a', 'e'), v2 = c(7, 8))
> rbind(foo, bar, gaga)
Error in paste(nmi[nii == 0], collapse = ", ") :
object "nii" not found
In addition: Warning message:
longer object length
is not a multiple of shorter object length in: clabs == nmi
>
This somewhat cryptic error message results because the second portion
of the second if() test clause in the match.names() function is not
evaluated, as it is the second argument to function '&&' and the first
argument (length(nmi) == length(clabs)) evaluates to FALSE. Thus variable
nii is not defined and the stop() command at the end of match.names()
throws an (unintended?) error.
If I switch the order of these two arguments to '&&' (see my
test function sm.rbind.data.frame below) I get what
appears to be the intended output given an input data frame with a
missing column (relative to the prior data frames):
> sm.rbind.data.frame(foo, bar, gaga)
Error in match.names(clabs, names(xi)) : names do not match previous names:
In addition: Warning message:
longer object length
is not a multiple of shorter object length in: clabs == nmi
Is this a possible fix for rbind.data.frame, or would the changed order
break something else?
## Modified function handles all the above test cases properly
> sm.rbind.data.frame(foo, bar, baz)
v1 v2 v3
1 a 1 x
2 b 2 y
3 c 3 x
4 d 4 y
5 a 5 x
6 e 6 z
> sm.rbind.data.frame(foo, bar, fifi)
Error in match.names(clabs, names(xi)) : names do not match previous names:
v4
> sm.rbind.data.frame(foo, bar, gaga)
Error in match.names(clabs, names(xi)) : names do not match previous names:
In addition: Warning message:
longer object length
is not a multiple of shorter object length in: clabs == nmi
>
### rbind.data.frame with modified match.names()
sm.rbind.data.frame <-
function (..., deparse.level = 1)
{
match.names <- function(clabs, nmi) {
if (all(clabs == nmi))
NULL
### Switched order of args to '&&' in following if() test clause
else if (all(nii <- match(nmi,
clabs, 0)) && length(nmi) == length(clabs)) {
m <- pmatch(nmi, clabs, 0)
if (any(m == 0))
stop("names do not match previous names")
m
}
else stop("names do not match previous names:\n\t", paste(nmi[nii ==
0], collapse = ", "))
}
Make.row.names <- function(nmi, ri, ni, nrow) {
if (nchar(nmi) > 0) {
if (ni == 0)
character(0)
else if (ni > 1)
paste(nmi, ri, sep = ".")
else nmi
}
else if (nrow > 0 && identical(ri, 1:ni))
as.integer(seq.int(from = nrow + 1, length = ni))
else ri
}
allargs <- list(...)
allargs <- allargs[sapply(allargs, length) > 0]
n <- length(allargs)
if (n == 0)
return(structure(list(), class = "data.frame", row.names = integer()))
nms <- names(allargs)
if (is.null(nms))
nms <- character(length(allargs))
cl <- NULL
perm <- rows <- rlabs <- vector("list", n)
nrow <- 0
value <- clabs <- NULL
all.levs <- list()
for (i in 1:n) {
xi <- allargs[[i]]
nmi <- nms[i]
if (is.matrix(xi))
allargs[[i]] <- xi <- as.data.frame(xi)
if (inherits(xi, "data.frame")) {
if (is.null(cl))
cl <- oldClass(xi)
ri <- attr(xi, "row.names")
ni <- length(ri)
if (is.null(clabs))
clabs <- names(xi)
else {
pi <- match.names(clabs, names(xi))
if (!is.null(pi))
perm[[i]] <- pi
}
rows[[i]] <- seq.int(from = nrow + 1, length = ni)
rlabs[[i]] <- Make.row.names(nmi, ri, ni, nrow)
nrow <- nrow + ni
if (is.null(value)) {
value <- unclass(xi)
nvar <- length(value)
all.levs <- vector("list", nvar)
has.dim <- logical(nvar)
facCol <- logical(nvar)
ordCol <- logical(nvar)
for (j in 1:nvar) {
xj <- value[[j]]
if (!is.null(levels(xj))) {
all.levs[[j]] <- levels(xj)
facCol[j] <- TRUE
}
else facCol[j] <- is.factor(xj)
ordCol[j] <- is.ordered(xj)
has.dim[j] <- length(dim(xj)) == 2
}
}
else for (j in 1:nvar) {
xij <- xi[[j]]
if (is.null(pi) || is.na(jj <- pi[[j]]))
jj <- j
if (facCol[jj]) {
if (length(lij <- levels(xij)) > 0) {
all.levs[[jj]] <- unique(c(all.levs[[jj]],
lij))
ordCol[jj] <- ordCol[jj] & is.ordered(xij)
}
else if (is.character(xij))
all.levs[[jj]] <- unique(c(all.levs[[jj]],
xij))
}
}
}
else if (is.list(xi)) {
ni <- range(sapply(xi, length))
if (ni[1] == ni[2])
ni <- ni[1]
else stop("invalid list argument: all variables should have the same length")
rows[[i]] <- ri <- as.integer(seq.int(from = nrow +
1, length = ni))
nrow <- nrow + ni
rlabs[[i]] <- Make.row.names(nmi, ri, ni, nrow)
if (length(nmi <- names(xi)) > 0) {
if (is.null(clabs))
clabs <- nmi
else {
tmp <- match.names(clabs, nmi)
if (!is.null(tmp))
perm[[i]] <- tmp
}
}
}
else if (length(xi) > 0) {
rows[[i]] <- nrow <- nrow + 1
rlabs[[i]] <- if (nchar(nmi) > 0)
nmi
else as.integer(nrow)
}
}
nvar <- length(clabs)
if (nvar == 0)
nvar <- max(sapply(allargs, length))
if (nvar == 0)
return(structure(list(), class = "data.frame", row.names = integer()))
pseq <- 1:nvar
if (is.null(value)) {
value <- list()
value[pseq] <- list(logical(nrow))
}
names(value) <- clabs
for (j in 1:nvar) if (length(lij <- all.levs[[j]]) > 0)
value[[j]] <- factor(as.vector(value[[j]]), lij, ordered = ordCol[j])
if (any(has.dim)) {
rmax <- max(unlist(rows))
for (i in (1:nvar)[has.dim]) if (!inherits(xi <- value[[i]],
"data.frame")) {
dn <- dimnames(xi)
rn <- dn[[1]]
if (length(rn) > 0)
length(rn) <- rmax
pi <- dim(xi)[2]
length(xi) <- rmax * pi
value[[i]] <- array(xi, c(rmax, pi), list(rn, dn[[2]]))
}
}
for (i in 1:n) {
xi <- unclass(allargs[[i]])
if (!is.list(xi))
if (length(xi) != nvar)
xi <- rep(xi, length.out = nvar)
ri <- rows[[i]]
pi <- perm[[i]]
if (is.null(pi))
pi <- pseq
for (j in 1:nvar) {
jj <- pi[j]
xij <- xi[[j]]
if (has.dim[jj]) {
value[[jj]][ri, ] <- xij
rownames(value[[jj]])[ri] <- rownames(xij)
}
else {
value[[jj]][ri] <- if (is.factor(xij))
as.vector(xij)
else xij
if (!is.null(nm <- names(xij)))
names(value[[jj]])[ri] <- nm
}
}
}
rlabs <- unlist(rlabs)
if (any(duplicated(rlabs)))
rlabs <- make.unique(as.character(unlist(rlabs)), sep = "")
if (is.null(cl)) {
as.data.frame(value, row.names = rlabs)
}
else {
class(value) <- cl
attr(value, "row.names") <- rlabs
value
}
}
Steven McKinney
Statistician
Molecular Oncology and Breast Cancer Program
British Columbia Cancer Research Centre
email: smckinney at bccrc.ca
tel: 604-675-8000 x7561
BCCRC
Molecular Oncology
675 West 10th Ave, Floor 4
Vancouver B.C.
V5Z 1L3
Canada
More information about the R-devel
mailing list