[Rd] the computation of exact p-value for the nonparametric cor-test with ties
Antonietta di Salvatore
adisalvatore at hotmail.it
Wed May 24 10:26:01 CEST 2006
Hello,
I wuold like to propose my modifications of the original cor.test to you : I
tried to calcolate the correct p-value for Spearman and Kendall's test with
ties.
Let me know what you think.
Thanks you for your time.
Antonietta di Salvatore
test <- function(x, ...) UseMethod("test")
test.default <-
function(x, y, alternative = c("two.sided", "less", "greater"),
method = c("pearson", "newkendall", "newspearman"), exact = NULL,
conf.level = 0.95, ...)
{
alternative <- match.arg(alternative)
method <- match.arg(method)
DNAME <- paste(deparse(substitute(x)), "and", deparse(substitute(y)))
if(length(x) != length(y))
stop("x and y must have the same length")
OK <- complete.cases(x, y)
x <- x[OK]
y <- y[OK]
n <- length(x)
PVAL <- NULL
NVAL <- 0
conf.int <- FALSE
if(method == "pearson") {
if(n < 3)
stop("not enough finite observations")
method <- "Pearson's product-moment correlation"
names(NVAL) <- "correlation"
r <- cor(x, y)
df <- n - 2
ESTIMATE <- c(cor = r)
PARAMETER <- c(df = df)
STATISTIC <- c(t = sqrt(df) * r / sqrt(1 - r^2))
p <- pt(STATISTIC, df)
if(n > 3) { ## confidence int.
if(!missing(conf.level) &&
(length(conf.level) != 1 || !is.finite(conf.level) ||
conf.level < 0 || conf.level > 1))
stop(paste("conf.level must be a single number",
"between 0 and 1"))
conf.int <- TRUE
z <- atanh(r)
sigma <- 1 / sqrt(n - 3)
cint <-
switch(alternative,
less = c(-Inf, z + sigma * qnorm(conf.level)),
greater = c(z - sigma * qnorm(conf.level), Inf),
two.sided = z +
c(-1, 1) * sigma * qnorm((1 + conf.level) / 2))
cint <- tanh(cint)
attr(cint, "conf.level") <- conf.level
}
}
else {
if(n < 2)
stop("not enough finite observations")
PARAMETER <- NULL
if(method == "newkendall") {
method <- "Kendall's rank correlation tau"
names(NVAL) <- "tau"
r <- cor(x,y, method = "kendall")
ESTIMATE <- c(tau = r)
if(!is.finite(ESTIMATE)) { # all x or all y the same
ESTIMATE[] <- NA
STATISTIC <- c(T = NA)
PVAL <- NA
}
else {
if(is.null(exact))
exact <- (n < 50)
if(exact) {
xr<-rank(x)
yr<-rank(y)
Gx<-table(xr)
Gx<-Gx[Gx>1]
Gx<-sum(Gx^2-Gx)
#Gx is null when x variable is without ties
Gy<-table(yr)
Gy<-Gy[Gy>1]
Gy<-sum(Gy^2-Gy)
Gy is null when y variable is without ties
q <- round((r + 1)*sqrt(n^2-n-Gx)*sqrt(n^2-n-Gy)/4)
pkendall <- function(q, n) {
.C("pkendall",
length(q),
p = as.double(q),
as.integer(n),
PACKAGE = "stats")$p
}
PVAL <-
switch(alternative,
"two.sided" = {
if(q > sqrt(n^2-n-Gx)*sqrt(n^2-n-Gy)/4)
p <- 1 - pkendall(q - 1, n)
else
p <- pkendall(q, n)
min(2 * p, 1)
},
"greater" = 1 - pkendall(q - 1, n),
"less" = pkendall(q, n))
STATISTIC <- c(T = q)
} else {
STATISTIC <- c(z = r / sqrt((4 * n + 10) / (9 *
n*(n-1))))
p <- pnorm(STATISTIC)
}
}
} else {
method<- "Spearman's rank correlation rho"
names(NVAL) <- "rho"
r <- cor(x,y,method="spearman")
ESTIMATE <- c(rho = r)
if(!is.finite(ESTIMATE)) { # all x or all y the same
ESTIMATE[] <- NA
STATISTIC <- c(S = NA)
PVAL <- NA
}
else {
## Use the test statistic S = sum(rank(x) - rank(y))^2
## and AS 89 for obtaining better p-values than via the
## simple normal approximation.
## In the case of no ties, S = (1-rho) * (n^3-n)/6.
pspearman <- function(q, n, lower.tail = TRUE) {
if(n <= 1290) # n*(n^2 - 1) does not overflow
.C("prho",
as.integer(n),
as.double(q + 1),
p = double(1),
integer(1),
as.logical(lower.tail),
PACKAGE = "stats")$p
else { # for large n: aymptotic t_{n-2}
r <- 1 - 6 * q / (n*(n*n - 1))
pt(r / sqrt((1 - r^2)/(n-2)), df = n-2,
lower.tail= !lower.tail)
}
}
xr<-rank(x)
yr<-rank(y)
Gx<-table(xr)
Gx<-Gx[Gx>1]
Gx<-sum(Gx^3-Gx)
Gy<-table(yr)
Gy<-Gy[Gy>1]
Gy<-sum(Gy^3-Gy)
q <-round(1/6*((n^3-n)-(Gx+Gy)/2-r*sqrt((n^3-n)^2-(Gx+Gy)*(n^3-n)+Gx*Gy)))
STATISTIC <- c(S = q)
PVAL <-
switch(alternative,
"two.sided" = {
p <- if(q > (n^3 - n) / 6)
pspearman(q - 1, n, lower.tail = FALSE)
else
pspearman(q, n, lower.tail = TRUE)
min(2 * p, 1)
},
"greater" = pspearman(q, n, lower.tail = TRUE),
"less" = pspearman(q - 1, n, lower.tail = FALSE))
}
}
}
if(is.null(PVAL)) # for "pearson" only, currently
PVAL <- switch(alternative,
"less" = p,
"greater" = 1 - p,
"two.sided" = 2 * min(p, 1 - p))
RVAL <- list(statistic = STATISTIC,
parameter = PARAMETER,
p.value = as.numeric(PVAL),
estimate = ESTIMATE,
null.value = NVAL,
alternative = alternative,
method = method,
data.name = DNAME)
if(conf.int)
RVAL <- c(RVAL, list(conf.int = cint))
class(RVAL) <- "htest"
RVAL
}
test.formula <-
function(formula, data, subset, na.action, ...)
{
if(missing(formula)
|| !inherits(formula, "formula")
|| length(formula) != 2)
stop("formula missing or invalid")
m <- match.call(expand.dots = FALSE)
if(is.matrix(eval(m$data, parent.frame())))
m$data <- as.data.frame(data)
m[[1]] <- as.name("model.frame")
m$... <- NULL
mf <- eval(m, environment(formula))
if(length(mf) != 2)
stop("invalid formula")
DNAME <- paste(names(mf), collapse = " and ")
names(mf) <- c("x", "y")
y <- do.call("test", c(mf, list(...)))
y$data.name <- DNAME
y
}
More information about the R-devel
mailing list