Ansari-Bradley test (PR#2252)

Wed, 6 Nov 2002 08:55:16 +0100 (MET)

> Full_Name: Wei Xu
> Version: 1.5.1
> OS: WindowsME
> Submission from: (NULL) (63.215.238.92)
> 
> 
> The P-value for a two.sided test is not consistent with the confidence
> interval.
> For example, P-value=0.1372, but the 95% CI doesn't include the H0 value(1).
> 
> > x
>  [1] 0.80 0.83 1.89 1.04 1.45 1.38 1.91 1.64 0.73 1.46
> > y
> [1] 1.15 0.88 0.90 0.74 1.21
> > ansari.test(x,y,alternative="two.sided",conf.int=TRUE,conf.level=0.95)
> 
>         Ansari-Bradley test
> 
> data:  x and y 
> AB = 36, p-value = 0.1372
> alternative hypothesis: true ratio of scales is not equal to 1 
> 95 percent confidence interval:
>  1.081081 2.581081 
> sample estimates:
> ratio of scales 
>        1.972973 
> 

First: sorry for the delay but I was not able to solve this until now. 

Second: well, it is both a bug and a statistical problem, 
`ansari.test' should know about (a version that does is
attached). 

The truth is that x and y are assumed to be "appropriately centered" as
Bauer (1972) states (see the man page for the reference). The model is Y ~
G and X ~ F with

G(x) = F(x / s) (or the other way round ...)

without modelling the "shift". And your example reads

R> x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46)
R> y <- c(1.15, 0.88, 0.90, 0.74, 1.21)
R> summary(x)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.7300  0.8825  1.4150  1.3130  1.5950  1.9100 
R> summary(y)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.740   0.880   0.900   0.976   1.150   1.210 

so x seems to be larger on average. I think this corresponds to the
Behrens-Fisher problem in some way but I'm not aware of any literature...
The confidence set cannot be computed by inverting the test for your
(raw) data but currently `ansari.test' forces the computation be using 
the extreme values of the possible ratios (which should at least give a
warning).

Now (with ansari.test as attached)

R> print(ansari.test(x-mean(x),y-mean(y), conf.int=TRUE))

        Ansari-Bradley test

data:  x - mean(x) and y - mean(y) 
AB = 37, p-value = 0.2171
alternative hypothesis: true ratio of scales is not equal to 1 
95 percent confidence interval:
 0.6282051 6.0729167 
sample estimates:
ratio of scales 
       2.468075 

is consistent (using the median can cause problems with ties). Using the
mean seems to be ok (I just simulated some situations, I can give you
details if you like).

best,

Torsten

*************************************************************************

ansari.test <- function(x, ...) UseMethod("ansari.test")

# altered with respect to bug report 2252
# and some additional improvements

ansari.test.default <-
function(x, y, alternative = c("two.sided", "less", "greater"),
         exact = NULL, conf.int = FALSE, conf.level = 0.95, ...) 
{
    alternative <- match.arg(alternative)
    if(conf.int) {
        if(!((length(conf.level) == 1)
             && is.finite(conf.level)
             && (conf.level > 0)
             && (conf.level < 1)))
            stop("conf.level must be a single number between 0 and 1")
    }
    DNAME <- paste(deparse(substitute(x)), "and", deparse(substitute(y)))

    x <- x[complete.cases(x)]
    y <- y[complete.cases(y)]
    m <- length(x)
    if(m < 1)
        stop("not enough x observations")
    n <- length(y)
    if(n < 1)
        stop("not enough y observations")
    N <- m + n

    r <- rank(c(x, y))
    STATISTIC <- sum(pmin(r, N - r + 1)[seq(along = x)])
    TIES <- (length(r) != length(unique(r)))

    if(is.null(exact))
        exact <- ((m < 50) && (n < 50))

    if(exact && !TIES) {
        pansari <- function(q, m, n) {
            .C("pansari",
               as.integer(length(q)),
               p = as.double(q),
               as.integer(m),
               as.integer(n),
               PACKAGE = "ctest")$p
        }
        PVAL <-
            switch(alternative,
                   two.sided = {
                       if (STATISTIC > ((m + 1)^2 %/% 4
                                        + ((m * n) %/% 2) / 2))
                           p <- 1 - pansari(STATISTIC - 1, m, n)
                       else
                           p <- pansari(STATISTIC, m, n)
                       min(2 * p, 1)
                   },
                   less = 1 - pansari(STATISTIC - 1, m, n),
                   greater = pansari(STATISTIC, m, n))
        if (conf.int) {
            qansari <- function(p, m, n) {
                .C("qansari",
                   as.integer(length(p)),
                   q = as.double(p), 
                   as.integer(m),
                   as.integer(n),
                   PACKAGE = "ctest")$q
            }
            alpha <- 1 - conf.level
            x <- sort(x)
            y <- sort(y)
            ab <- function(sig) {
                rab <- rank(c(x/sig, y))
                sum(pmin(rab, N - rab + 1)[seq(along = x)])
            }
            ratio <- outer(x,y,"/")
            aratio <- ratio[ratio >= 0]
            sigma <- sort(aratio)

            cci <- function(alpha) {
              u <- absigma - qansari(alpha/2,  m, n) 
              l <- absigma - qansari(1 - alpha/2, m, n) 
              # check if the statistic exceeds both quantiles first
              uci <- NULL
              lci <- NULL                    
              if(length(u[u >= 0]) == 0) {
                  uci <- sigma[1]
                  warning("samples differ in location: confidence set maybe wrong")
              } 
              if (length(l[l > 0]) == 0) {
                  lci <- sigma[length(sigma)]
                  warning("samples differ in location: confidence set maybe wrong")
              }
              if (is.null(uci)) {
                  u[u < 0] <- NA
                  uci <- min(sigma[which(u == min(u, na.rm=TRUE))])
              }
              if (is.null(lci)) {
                  l[l <= 0] <- NA
                  lci <- max(sigma[which(l == min(l, na.rm=TRUE))])
              }
              # the process of the statistics does not need to be monotone
              # in sigma: check this and interchange quantiles.
              if (uci > lci) {
                  l <- absigma - qansari(alpha/2,  m, n)
                  u <- absigma - qansari(1 - alpha/2, m, n)
                  u[u < 0] <- NA
                  uci <- min(sigma[which(u == min(u, na.rm=TRUE))])
                  l[l <= 0] <- NA
                  lci <- max(sigma[which(l == min(l, na.rm=TRUE))])
               }
               c(uci, lci)
            }

            cint <- if(length(sigma) < 1) {
                warning("Cannot compute confidence interval")
                c(NA, NA)
            }
            else {
                # compute statistics directly: computing the steps is not
                # faster
                absigma <- sapply(sigma + c(diff(sigma)/2, sigma[length(sigma)]*1.01), ab)
                switch(alternative, two.sided = {
                    cci(alpha)
                }, greater= {
                    c(cci(alpha*2)[1], Inf)
                }, less= {
                    c(0, cci(alpha*2)[2])
                })
            }
            attr(cint, "conf.level") <- conf.level
            u <- absigma - qansari(0.5, m, n)
            sgr <- sigma[u <= 0]
            if (length(sgr) == 0) sgr <- NA
            else sgr <- max(sgr)
            sle <- sigma[u > 0]
            if (length(sle) == 0) sle <- NA
            else sle <- min(sle)
            ESTIMATE <- mean(c(sle, sgr))
        }
    }
    else {
        EVEN <- ((N %% 2) == 0)
        normalize <- function(s, r, TIES, m=length(x), n=length(y)) {
            z <- if(EVEN)
                s - m * (N + 2)/4
            else
                s - m * (N + 1)^2/(4 * N)
            if (!TIES) {
                SIGMA <- if(EVEN) 
                    sqrt((m * n * (N + 2) * (N - 2))/(48 * (N - 1)))
                else
                    sqrt((m * n * (N + 1) * (3 + N^2))/(48 * N^2))
            }
            else {
                r <- rle(sort(pmin(r, N - r + 1)))
                SIGMA <- if(EVEN) 
                    sqrt(m * n
                         * (16 * sum(r$l * r$v^2) - N * (N + 2)^2)
                         / (16 * N * (N - 1)))
                else
                    sqrt(m * n
                         * (16 * N * sum(r$l * r$v^2) - (N + 1)^4)
                         / (16 * N^2 * (N - 1)))
            }
            z / SIGMA
        }
        p <- pnorm(normalize(STATISTIC, r, TIES))
        PVAL <- switch(alternative,
                       two.sided = 2 * min(p, 1 - p),
                       less = 1 - p,
                       greater = p)

        if(conf.int && !exact) {
            alpha <- 1 - conf.level
            ab <- function(sig, zq) {
                r <- rank(c(x / sig, y))
                s <- sum(pmin(r, N -r + 1)[seq(along = x)])
                TIES <- (length(r) != length(unique(r)))
                normalize(s, r, TIES, length(x), length(y)) - zq
            }
            ## use uniroot here
            ## compute the range of sigma first 
            srangepos <- NULL
            srangeneg <- NULL
            if (any(x[x > 0]) && any(y[y > 0]))
                srangepos <- c(min(x[x>0], na.rm=TRUE)/max(y[y>0], na.rm=TRUE), 
                            max(x[x>0], na.rm=TRUE)/min(y[y>0], na.rm=TRUE))
            if (any(x[x <= 0]) && any(y[y < 0]))
                srangeneg <- c(min(x[x<=0], na.rm=TRUE)/max(y[y<0], na.rm=TRUE), 
                            max(x[x<=0], na.rm=TRUE)/min(y[y<0], na.rm=TRUE))
            if (any(is.infinite(c(srangepos, srangeneg)))) {
                warning("cannot compute asymptotic confidence intervals or estimator")
                conf.int <- FALSE
            } else {
                ccia <- function(alpha) {
                    # check if the statistic exceeds both quantiles first
                    statu <- ab(srange[1], zq=qnorm(alpha/2))
                    statl <- ab(srange[2], zq=qnorm(alpha/2, lower=FALSE))
                    if (statu > 0) {
                        u <- srange[1]
                        warning("samples differ in location: confidence set maybe wrong")
                    } else
                        u <- uniroot(ab, srange, tol=1e-4, zq=qnorm(alpha/2))$root
                    if (statl < 0) {
                        l <- srange[2]
                        warning("samples differ in location: confidence set maybe wrong")
                    } else 
                        l <- uniroot(ab, srange, tol=1e-4, zq=qnorm(alpha/2, lower=FALSE))$root
                    # the process of the statistics does not need to be
                    # monotone: sort is ok here
                    sort(c(u, l))
                }
                srange <- range(c(srangepos, srangeneg), na.rm=FALSE)
                cint <- switch(alternative, two.sided = {
                    ccia(alpha)
                }, greater= {
                    c(ccia(alpha*2)[1], Inf)
                }, less= {
                    c(0, ccia(alpha*2)[2])
                })
                attr(cint, "conf.level") <- conf.level
                # check if the statistic exceeds both quantiles first
                statu <- ab(srange[1], zq=0)
                statl <- ab(srange[2], zq=0)
                if (statu > 0 || statl < 0) {
                    ESTIMATE <- NA
                    warning("cannot compute estimate")
                } else
                    ESTIMATE <- uniroot(ab, srange, tol=1e-4, zq=0)$root
            }
        }
        if(exact && TIES) {
            warning("Cannot compute exact p-value with ties")
            if(conf.int)
                warning(paste("Cannot compute exact confidence",
                              "intervals with ties"))
        }
    }

    names(STATISTIC) <- "AB"
    RVAL <- list(statistic = STATISTIC,
                 p.value = PVAL,
                 null.value = c("ratio of scales" = 1),
                 alternative = alternative,
                 method = "Ansari-Bradley test",
                 data.name = DNAME)
    if(conf.int)
        RVAL <- c(RVAL,
                  list(conf.int = cint,
                       estimate = c("ratio of scales" = ESTIMATE)))
    class(RVAL) <- "htest"
    return(RVAL)
}

ansari.test.formula <-
function(formula, data, subset, na.action, ...)
{
    if(missing(formula)
       || (length(formula) != 3)
       || (length(attr(terms(formula[-2]), "term.labels")) != 1)
       || (length(attr(terms(formula[-3]), "term.labels")) != 1))
        stop("formula missing or incorrect")
    if(missing(na.action))
        na.action <- getOption("na.action")
    m <- match.call(expand.dots = FALSE)
    if(is.matrix(eval(m$data, parent.frame())))
        m$data <- as.data.frame(data)
    m[[1]] <- as.name("model.frame")
    m$... <- NULL
    mf <- eval(m, parent.frame())
    DNAME <- paste(names(mf), collapse = " by ")
    names(mf) <- NULL
    response <- attr(attr(mf, "terms"), "response")
    g <- factor(mf[[-response]])
    if(nlevels(g) != 2)
        stop("grouping factor must have exactly 2 levels")
    DATA <- split(mf[[response]], g)
    names(DATA) <- c("x", "y")
    y <- do.call("ansari.test", c(DATA, list(...)))
    y$data.name <- DNAME
    y
}

-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-
r-devel mailing list -- Read http://www.ci.tuwien.ac.at/~hornik/R/R-FAQ.html
Send "info", "help", or "[un]subscribe"
(in the "body", not the subject !)  To: r-devel-request@stat.math.ethz.ch
_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._