[Rd] Minor logical bug in rbind.data.frame ?

Steven McKinney smckinney at bccrc.ca
Fri Jan 12 23:18:10 CET 2007


When attempting to merge 3 data frames, one of which has fewer columns
than the others, rbind.data.frame correctly refuses to perform the bind.
However, the error message given is a bit obscure due to a logical
bug in the match.names() internal function to rbind.data.frame.

Illustration:

## Three data frames with same column variable names:
> foo <- data.frame(v1 = c('a', 'b'), v2 = c(1, 2), v3 = ordered(c('x', 'y')))
> bar <- data.frame(v2 = c(3, 4), v3 = ordered(c('x', 'y')), v1 = c('c', 'd'))
> baz <- data.frame(v1 = c('a', 'e'), v2 = c(5, 6), v3 = ordered(c('x', 'z')))
> rbind(foo, bar, baz)
  v1 v2 v3
1  a  1  x
2  b  2  y
3  c  3  x
4  d  4  y
5  a  5  x
6  e  6  z
> ## All is fine.

## Third data frame has a different third column variable
> fifi <- data.frame(v1 = c('a', 'e'), v2 = c(9, 10), v4 = ordered(c('y', 'z')))
> rbind(foo, bar, fifi)
Error in match.names(clabs, names(xi)) : names do not match previous names:
	v4
> ## Output looks fine.

## Third data frame is missing a column.
> gaga <- data.frame(v1 = c('a', 'e'), v2 = c(7, 8))
> rbind(foo, bar, gaga)
Error in paste(nmi[nii == 0], collapse = ", ") : 
	object "nii" not found
In addition: Warning message:
longer object length
	is not a multiple of shorter object length in: clabs == nmi 
>


This somewhat cryptic error message results because the second portion
of the second if() test clause in the match.names() function is not
evaluated, as it is the second argument to function '&&' and the first
argument (length(nmi) == length(clabs)) evaluates to FALSE.  Thus variable
nii is not defined and the stop() command at the end of match.names()
throws an (unintended?) error.

If I switch the order of these two arguments to '&&' (see my
test function sm.rbind.data.frame below)  I get what
appears to be the intended output given an input data frame with a
missing column (relative to the prior data frames):

> sm.rbind.data.frame(foo, bar, gaga)
Error in match.names(clabs, names(xi)) : names do not match previous names:
	
In addition: Warning message:
longer object length
	is not a multiple of shorter object length in: clabs == nmi 



Is this a possible fix for rbind.data.frame, or would the changed order
break something else?


## Modified function handles all the above test cases properly
> sm.rbind.data.frame(foo, bar, baz)
  v1 v2 v3
1  a  1  x
2  b  2  y
3  c  3  x
4  d  4  y
5  a  5  x
6  e  6  z
> sm.rbind.data.frame(foo, bar, fifi)
Error in match.names(clabs, names(xi)) : names do not match previous names:
	v4
> sm.rbind.data.frame(foo, bar, gaga)
Error in match.names(clabs, names(xi)) : names do not match previous names:
	
In addition: Warning message:
longer object length
	is not a multiple of shorter object length in: clabs == nmi 
> 


### rbind.data.frame with modified match.names()
sm.rbind.data.frame <- 
function (..., deparse.level = 1) 
{
    match.names <- function(clabs, nmi) {
        if (all(clabs == nmi)) 
            NULL
### Switched order of args to '&&' in following if() test clause
        else if (all(nii <- match(nmi, 
            clabs, 0)) && length(nmi) == length(clabs)) {
            m <- pmatch(nmi, clabs, 0)
            if (any(m == 0)) 
                stop("names do not match previous names")
            m
        }
        else stop("names do not match previous names:\n\t", paste(nmi[nii == 
            0], collapse = ", "))
    }
    Make.row.names <- function(nmi, ri, ni, nrow) {
        if (nchar(nmi) > 0) {
            if (ni == 0) 
                character(0)
            else if (ni > 1) 
                paste(nmi, ri, sep = ".")
            else nmi
        }
        else if (nrow > 0 && identical(ri, 1:ni)) 
            as.integer(seq.int(from = nrow + 1, length = ni))
        else ri
    }
    allargs <- list(...)
    allargs <- allargs[sapply(allargs, length) > 0]
    n <- length(allargs)
    if (n == 0) 
        return(structure(list(), class = "data.frame", row.names = integer()))
    nms <- names(allargs)
    if (is.null(nms)) 
        nms <- character(length(allargs))
    cl <- NULL
    perm <- rows <- rlabs <- vector("list", n)
    nrow <- 0
    value <- clabs <- NULL
    all.levs <- list()
    for (i in 1:n) {
        xi <- allargs[[i]]
        nmi <- nms[i]
        if (is.matrix(xi)) 
            allargs[[i]] <- xi <- as.data.frame(xi)
        if (inherits(xi, "data.frame")) {
            if (is.null(cl)) 
                cl <- oldClass(xi)
            ri <- attr(xi, "row.names")
            ni <- length(ri)
            if (is.null(clabs)) 
                clabs <- names(xi)
            else {
                pi <- match.names(clabs, names(xi))
                if (!is.null(pi)) 
                  perm[[i]] <- pi
            }
            rows[[i]] <- seq.int(from = nrow + 1, length = ni)
            rlabs[[i]] <- Make.row.names(nmi, ri, ni, nrow)
            nrow <- nrow + ni
            if (is.null(value)) {
                value <- unclass(xi)
                nvar <- length(value)
                all.levs <- vector("list", nvar)
                has.dim <- logical(nvar)
                facCol <- logical(nvar)
                ordCol <- logical(nvar)
                for (j in 1:nvar) {
                  xj <- value[[j]]
                  if (!is.null(levels(xj))) {
                    all.levs[[j]] <- levels(xj)
                    facCol[j] <- TRUE
                  }
                  else facCol[j] <- is.factor(xj)
                  ordCol[j] <- is.ordered(xj)
                  has.dim[j] <- length(dim(xj)) == 2
                }
            }
            else for (j in 1:nvar) {
                xij <- xi[[j]]
                if (is.null(pi) || is.na(jj <- pi[[j]])) 
                  jj <- j
                if (facCol[jj]) {
                  if (length(lij <- levels(xij)) > 0) {
                    all.levs[[jj]] <- unique(c(all.levs[[jj]], 
                      lij))
                    ordCol[jj] <- ordCol[jj] & is.ordered(xij)
                  }
                  else if (is.character(xij)) 
                    all.levs[[jj]] <- unique(c(all.levs[[jj]], 
                      xij))
                }
            }
        }
        else if (is.list(xi)) {
            ni <- range(sapply(xi, length))
            if (ni[1] == ni[2]) 
                ni <- ni[1]
            else stop("invalid list argument: all variables should have the same length")
            rows[[i]] <- ri <- as.integer(seq.int(from = nrow + 
                1, length = ni))
            nrow <- nrow + ni
            rlabs[[i]] <- Make.row.names(nmi, ri, ni, nrow)
            if (length(nmi <- names(xi)) > 0) {
                if (is.null(clabs)) 
                  clabs <- nmi
                else {
                  tmp <- match.names(clabs, nmi)
                  if (!is.null(tmp)) 
                    perm[[i]] <- tmp
                }
            }
        }
        else if (length(xi) > 0) {
            rows[[i]] <- nrow <- nrow + 1
            rlabs[[i]] <- if (nchar(nmi) > 0) 
                nmi
            else as.integer(nrow)
        }
    }
    nvar <- length(clabs)
    if (nvar == 0) 
        nvar <- max(sapply(allargs, length))
    if (nvar == 0) 
        return(structure(list(), class = "data.frame", row.names = integer()))
    pseq <- 1:nvar
    if (is.null(value)) {
        value <- list()
        value[pseq] <- list(logical(nrow))
    }
    names(value) <- clabs
    for (j in 1:nvar) if (length(lij <- all.levs[[j]]) > 0) 
        value[[j]] <- factor(as.vector(value[[j]]), lij, ordered = ordCol[j])
    if (any(has.dim)) {
        rmax <- max(unlist(rows))
        for (i in (1:nvar)[has.dim]) if (!inherits(xi <- value[[i]], 
            "data.frame")) {
            dn <- dimnames(xi)
            rn <- dn[[1]]
            if (length(rn) > 0) 
                length(rn) <- rmax
            pi <- dim(xi)[2]
            length(xi) <- rmax * pi
            value[[i]] <- array(xi, c(rmax, pi), list(rn, dn[[2]]))
        }
    }
    for (i in 1:n) {
        xi <- unclass(allargs[[i]])
        if (!is.list(xi)) 
            if (length(xi) != nvar) 
                xi <- rep(xi, length.out = nvar)
        ri <- rows[[i]]
        pi <- perm[[i]]
        if (is.null(pi)) 
            pi <- pseq
        for (j in 1:nvar) {
            jj <- pi[j]
            xij <- xi[[j]]
            if (has.dim[jj]) {
                value[[jj]][ri, ] <- xij
                rownames(value[[jj]])[ri] <- rownames(xij)
            }
            else {
                value[[jj]][ri] <- if (is.factor(xij)) 
                  as.vector(xij)
                else xij
                if (!is.null(nm <- names(xij))) 
                  names(value[[jj]])[ri] <- nm
            }
        }
    }
    rlabs <- unlist(rlabs)
    if (any(duplicated(rlabs))) 
        rlabs <- make.unique(as.character(unlist(rlabs)), sep = "")
    if (is.null(cl)) {
        as.data.frame(value, row.names = rlabs)
    }
    else {
        class(value) <- cl
        attr(value, "row.names") <- rlabs
        value
    }
}






Steven McKinney

Statistician
Molecular Oncology and Breast Cancer Program
British Columbia Cancer Research Centre

email: smckinney at bccrc.ca

tel: 604-675-8000 x7561

BCCRC
Molecular Oncology
675 West 10th Ave, Floor 4
Vancouver B.C. 
V5Z 1L3
Canada



More information about the R-devel mailing list