[Rd] Adjusted p-values with TukeyHSD (patch)

Fri Jul 15 08:49:43 CEST 2005

Dear R-developeRs,

        Attached follows a patch against svn 34959 that adds the
printing of p-values to the TukeyHSD.aov function in stats package. I
also updated the corresponding documentation file and added a 'see also'
reference to the simint function of the multcomp package.

        As it was already brought up in a previous thread [1] in R-help,
one can obtain the adjusted p-values using the multcomp package and its
simint function. The problem is that currently the simint function
scales very badly for a large number of contrasts (> 15). While the output
of TukeyHSD is almost instantaneous, simint may take more than half an
hour to process 19 contrasts.

        As a toy example, try:

        y <- rnorm(500)
        A <- gl(5,100)
        system.time(h1 <- TukeyHSD((aov(y ~ A))))
        system.time(h2 <- simint(y ~ A,type="Tukey"))

        Here I got:
        [1] 0.09 0.01 0.10 0.00 0.00
        [1] 26.87  0.03 27.10  0.00  0.00

        For a small number of contrasts they're equivalent, for example:

        data(warpbreaks)
        fm1 <- aov(breaks ~ wool + tension, data = warpbreaks)
        tHSD <- TukeyHSD(fm1, "tension", ordered = FALSE)
        print(tHSD)
        mcHSD <- simint(breaks ~ wool + tension, data = warpbreaks,
                     whichf="tension", type="Tukey")
        summary(mcHSD)

        I also attached the complete function (mTukeyHSD.R) to this
message, in case the patch is not accepted and someone else needs to do
the same thing. In any case, I think the reference to the multcomp
package in the TukeyHSD help page should be considered even if the patch
to the function is not accepted. 

       Thank you, 

References
[1] http://tolstoy.newcastle.edu.au/R/help/05/05/4599.html

--
Fernando Henrique Ferraz P. da Rosa
http://www.ime.usp.br/~feferraz
-------------- next part --------------
diff -ru r-devel/R/ r-local/R
diff -ru r-devel/R/src/library/stats/R/TukeyHSD.R r-local/R/src/library/stats/R/TukeyHSD.R

--- r-devel/R/src/library/stats/R/TukeyHSD.R	2005-07-14 23:13:49.000000000 -0300
+++ r-local/R/src/library/stats/R/TukeyHSD.R	2005-07-15 02:43:25.055207448 -0300
@@ -66,10 +66,12 @@
         center <- center[keep]
         width <- qtukey(conf.level, length(means), x$df.residual) *
             sqrt((MSE/2) * outer(1/n, 1/n, "+"))[keep]
-        dnames <- list(NULL, c("diff", "lwr", "upr"))
+        est <- center/(sqrt((MSE/2) * outer(1/n, 1/n, "+"))[keep])
+        pvals <- ptukey(abs(est),length(means),x$df.residual,lower.tail=FALSE)
+        dnames <- list(NULL, c("diff", "lwr", "upr","p adj"))
         if (!is.null(nms)) dnames[[1]] <- outer(nms, nms, paste, sep = "-")[keep]
-        out[[nm]] <- array(c(center, center - width, center + width),
-                           c(length(width), 3), dnames)
+        out[[nm]] <- array(c(center, center - width, center + width,pvals),
+                           c(length(width), 4), dnames)
     }
     class(out) <- c("multicomp", "TukeyHSD")
     attr(out, "orig.call") <- x$call
diff -ru r-devel/R/src/library/stats/man/TukeyHSD.Rd r-local/R/src/library/stats/man/TukeyHSD.Rd
--- r-devel/R/src/library/stats/man/TukeyHSD.Rd	2005-07-14 23:14:39.380653872 -0300
+++ r-local/R/src/library/stats/man/TukeyHSD.Rd	2005-07-15 03:14:36.520277744 -0300
@@ -55,7 +55,8 @@
   A list with one component for each term requested in \code{which}.
   Each component is a matrix with columns \code{diff} giving the
   difference in the observed means, \code{lwr} giving the lower
-  end point of the interval, and \code{upr} giving the upper end point.
+  end point of the interval, \code{upr} giving the upper end point
+  and \code{p adj} giving the p-value. 
 }
 \references{
   Miller, R. G. (1981)
@@ -69,7 +70,8 @@
   Douglas Bates
 }
 \seealso{
-  \code{\link{aov}}, \code{\link{qtukey}}, \code{\link{model.tables}}
+  \code{\link{aov}}, \code{\link{qtukey}}, \code{\link{model.tables}},
+  \code{\link[multcomp]{simint}}
 }
 \examples{
 summary(fm1 <- aov(breaks ~ wool + tension, data = warpbreaks))
-------------- next part --------------
###
###               Tukey multiple comparisons for R
###
### Copyright 2000-2001  Douglas M. Bates <bates at stat.wisc.edu>
### Modified for base R 2002 B. D. Ripley
###
### This file is made available under the terms of the GNU General
### Public License, version 2, or at your option, any later version,
### incorporated herein by reference.
###
### This program is distributed in the hope that it will be
### useful, but WITHOUT ANY WARRANTY; without even the implied
### warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
### PURPOSE.  See the GNU General Public License for more
### details.
###
### You should have received a copy of the GNU General Public
### License along with this program; if not, write to the Free
### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
### MA 02111-1307, USA

TukeyHSD <-
    function(x, which, ordered = FALSE, conf.level = 0.95, ...)
    UseMethod("TukeyHSD")

TukeyHSD.aov <-
    function(x, which = seq(along = tabs), ordered = FALSE,
             conf.level = 0.95, ...)
{
    mm <- model.tables(x, "means")
    if(is.null(mm$n))
        stop("no factors in the fitted model")
    tabs <- mm$tables[-1]
    tabs <- tabs[which]
    ## mm$n need not be complete -- factors only -- so index by names
    nn <- mm$n[names(tabs)]
    nn_na <- is.na(nn)
    if(all(nn_na))
        stop("'which' specified no factors")
    if(any(nn_na)) {
        warning("'which' specified some non-factors which will be dropped")
        tabs <- tabs[!nn_na]
        nn <- nn[!nn_na]
    }
    out <- vector("list", length(tabs))
    names(out) <- names(tabs)
    MSE <- sum(resid(x)^2)/x$df.residual
    for (nm in names(tabs)) {
        tab <- tabs[[nm]]
        means <- as.vector(tab)
        nms <- if(length(d <- dim(tab)) > 1) {
            dn <- dimnames(tab)
            apply(do.call("expand.grid", dn), 1, paste, collapse=":")
        } else names(tab)
        n <- nn[[nm]]
        ## expand n to the correct length if necessary
        if (length(n) < length(means)) n <- rep.int(n, length(means))
        if (as.logical(ordered)) {
            ord <- order(means)
            means <- means[ord]
            n <- n[ord]
            if (!is.null(nms)) nms <- nms[ord]
        }
        center <- outer(means, means, "-")
        keep <- lower.tri(center)
        center <- center[keep]
        width <- qtukey(conf.level, length(means), x$df.residual) *
            sqrt((MSE/2) * outer(1/n, 1/n, "+"))[keep]
        est <- center/(sqrt((MSE/2) * outer(1/n, 1/n, "+"))[keep])
        pvals <- ptukey(abs(est),length(means),x$df.residual,lower.tail=FALSE)
        dnames <- list(NULL, c("diff", "lwr", "upr","p adj"))
        if (!is.null(nms)) dnames[[1]] <- outer(nms, nms, paste, sep = "-")[keep]
        out[[nm]] <- array(c(center, center - width, center + width,pvals),
                           c(length(width), 4), dnames)
    }
    class(out) <- c("multicomp", "TukeyHSD")
    attr(out, "orig.call") <- x$call
    attr(out, "conf.level") <- conf.level
    attr(out, "ordered") <- ordered
    out
}

print.TukeyHSD <- function(x, ...)
{
    cat("  Tukey multiple comparisons of means\n")
    cat("    ", format(100*attr(x, "conf.level"), 2),
        "% family-wise confidence level\n", sep="")
    if (attr(x, "ordered"))
        cat("    factor levels have been ordered\n")
    cat("\nFit: ", deparse(attr(x, "orig.call"), 500), "\n\n", sep="")
    attr(x, "orig.call") <- attr(x, "conf.level") <- attr(x, "ordered") <- NULL
    print.default(unclass(x), ...)
}

plot.TukeyHSD <- function (x, ...)
{
    for (i in seq(along = x)) {
        xi <- x[[i]]
        yvals <- nrow(xi):1
        plot(c(xi[, "lwr"], xi[, "upr"]), rep.int(yvals, 2), type = "n",
             axes = FALSE, xlab = "", ylab = "", ...)
        axis(1, ...)
        axis(2, at = nrow(xi):1, labels = dimnames(xi)[[1]],
             srt = 0, ...)
        abline(h = yvals, lty = 1, lwd = 0, col = "lightgray")
        abline(v = 0, lty = 2, lwd = 0, ...)
        segments(xi[, "lwr"], yvals, xi[, "upr"], yvals, ...)
        segments(as.vector(xi), rep.int(yvals - 0.1, 3), as.vector(xi),
                 rep.int(yvals + 0.1, 3), ...)
        title(main = paste(format(100 * attr(x, "conf.level"),
              2), "% family-wise confidence level\n", sep = ""),
              xlab = paste("Differences in mean levels of", names(x)[i]))
        box()
    }
}