[Rd] 2 bugs in max.col() (PR#9542)
oehl_list at gmx.de
oehl_list at gmx.de
Sat Mar 3 18:48:59 CET 2007
Dear R-Developers,
I think I found two bugs in max.col(). Ties between zeros are not broken, which might affect simulations. -Inf and Zero can be treated the same, which can give completely wrong results, e.g. when the second max is sought by replacing all maxs by -Inf.
To fix max.col I do offer the C-code behind my function rowMax(), which also handles NAs and seems to be faster. However, since max.col() is widely used and since I only occasionally program C, I'd appreciate if someone would do code-review and (again) thorough testing before publishing it. Please note that the code was developed under R 1.6.2 and might need porting to R 2.4.1 .
Best regards
Jens Oehlschlägel
# -- output of max.col under 2.4.1. ------------------------
> x <- rbind(c(1,1), c(0,0), c(-Inf, 0), c(0, Inf), c(0, NA), c(NA, 0))
> rownames(x) <- paste("c(", apply(x, 1, paste, collapse=","), ")")
> x
[,1] [,2]
c( 1,1 ) 1 1
c( 0,0 ) 0 0
c( -Inf,0 ) -Inf 0
c( 0,Inf ) 0 Inf
c( 0,NA ) 0 NA
c( NA,0 ) NA 0
>
> cat("ties not broken for c(0,0)\n")
ties not broken for c(0,0)
> cat("erroneously ties broken for c(-Inf, 0)\n")
erroneously ties broken for c(-Inf, 0)
> tmp <- sapply(1:10, function(i)max.col(x, ties.method="random"));rownames(tmp)<-rownames(x);tmp
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
c( 1,1 ) 2 1 2 2 1 1 1 2 1 1
c( 0,0 ) 2 2 2 2 2 2 2 2 2 2
c( -Inf,0 ) 2 1 1 2 2 1 2 1 1 1
c( 0,Inf ) 2 2 2 2 2 2 2 2 2 2
c( 0,NA ) NA NA NA NA NA NA NA NA NA NA
c( NA,0 ) NA NA NA NA NA NA NA NA NA NA
> cat("The following look good\n")
The following look good
> tmp <- sapply(1:10, function(i)max.col(x, ties.method="first"));rownames(tmp)<-rownames(x);tmp
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
c( 1,1 ) 1 1 1 1 1 1 1 1 1 1
c( 0,0 ) 1 1 1 1 1 1 1 1 1 1
c( -Inf,0 ) 2 2 2 2 2 2 2 2 2 2
c( 0,Inf ) 2 2 2 2 2 2 2 2 2 2
c( 0,NA ) NA NA NA NA NA NA NA NA NA NA
c( NA,0 ) NA NA NA NA NA NA NA NA NA NA
> tmp <- sapply(1:10, function(i)max.col(x, ties.method="last"));rownames(tmp)<-rownames(x);tmp
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
c( 1,1 ) 2 2 2 2 2 2 2 2 2 2
c( 0,0 ) 2 2 2 2 2 2 2 2 2 2
c( -Inf,0 ) 2 2 2 2 2 2 2 2 2 2
c( 0,Inf ) 2 2 2 2 2 2 2 2 2 2
c( 0,NA ) NA NA NA NA NA NA NA NA NA NA
c( NA,0 ) NA NA NA NA NA NA NA NA NA NA
> version
_
platform i386-pc-mingw32
arch i386
os mingw32
system i386, mingw32
status
major 2
minor 4.1
year 2006
month 12
day 18
svn rev 40228
language R
version.string R version 2.4.1 (2006-12-18)
# -- output of rowMax under 1.6.2 --------
> tmp <- sapply(1:10, function(i)rowMax(x, value=FALSE, na.rm=TRUE, ties.method="random"));rownames(tmp)<-rownames(x);tmp
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
c( 1,1 ) 2 2 2 2 1 2 2 1 1 1
c( 0,0 ) 1 1 2 2 2 2 1 1 1 2
c( -Inf,0 ) 2 2 2 2 2 2 2 2 2 2
c( 0,Inf ) 2 2 2 2 2 2 2 2 2 2
c( 0,NA ) 1 1 1 1 1 1 1 1 1 1
c( NA,0 ) 2 2 2 2 2 2 2 2 2 2
> tmp <- sapply(1:10, function(i)rowMax(x, value=FALSE, na.rm=TRUE, ties.method="first"));rownames(tmp)<-rownames(x);tmp
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
c( 1,1 ) 1 1 1 1 1 1 1 1 1 1
c( 0,0 ) 1 1 1 1 1 1 1 1 1 1
c( -Inf,0 ) 2 2 2 2 2 2 2 2 2 2
c( 0,Inf ) 2 2 2 2 2 2 2 2 2 2
c( 0,NA ) 1 1 1 1 1 1 1 1 1 1
c( NA,0 ) 2 2 2 2 2 2 2 2 2 2
> tmp <- sapply(1:10, function(i)rowMax(x, value=FALSE, na.rm=TRUE, ties.method="last"));rownames(tmp)<-rownames(x);tmp
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
c( 1,1 ) 2 2 2 2 2 2 2 2 2 2
c( 0,0 ) 2 2 2 2 2 2 2 2 2 2
c( -Inf,0 ) 2 2 2 2 2 2 2 2 2 2
c( 0,Inf ) 2 2 2 2 2 2 2 2 2 2
c( 0,NA ) 1 1 1 1 1 1 1 1 1 1
c( NA,0 ) 2 2 2 2 2 2 2 2 2 2
# -- replication ----------
x <- rbind(c(1,1), c(0,0), c(-Inf, 0), c(0, Inf), c(0, NA), c(NA, 0))
rownames(x) <- paste("c(", apply(x, 1, paste, collapse=","), ")")
x
cat("ties not broken for c(0,0)\n")
cat("erroneously ties broken for c(-Inf, 0)\n")
tmp <- sapply(1:10, function(i)max.col(x, ties.method="random"));rownames(tmp)<-rownames(x);tmp
cat("The following look good\n")
tmp <- sapply(1:10, function(i)max.col(x, ties.method="first"));rownames(tmp)<-rownames(x);tmp
tmp <- sapply(1:10, function(i)max.col(x, ties.method="last"));rownames(tmp)<-rownames(x);tmp
# rowMax under R 1.6.2
tmp <- sapply(1:10, function(i)rowMax(x, value=FALSE, na.rm=TRUE, ties.method="random"));rownames(tmp)<-rownames(x);tmp
tmp <- sapply(1:10, function(i)rowMax(x, value=FALSE, na.rm=TRUE, ties.method="first"));rownames(tmp)<-rownames(x);tmp
tmp <- sapply(1:10, function(i)rowMax(x, value=FALSE, na.rm=TRUE, ties.method="last"));rownames(tmp)<-rownames(x);tmp
# -- R 1.6.2 -----------
## Copyright 2003-2007 Jens Oehlschlägel
## May be used (possibly modified) with name "max.col" under GPL
## Provided "as is"
## Use at own risk
rowMax <- function(x, value=TRUE, index=NULL, na.rm=FALSE, ties.method = c("random", "first", "last"), use.names=FALSE){
if (!length(x))
stop("x has no length")
d <- dim(x)
if (!is.null(index) && !identical(d, dim(index)))
stop("index has wrong dimension")
if (length(d)!=2)
stop("x must be a matrix")
if (is.na(value) || !value){
ties.method <- pmatch(ties.method, c("random", "first", "last"))
if (is.na(ties.method))
stop("illegal ties method")
ret <- .C("R_row_maxwithindex_double"
, x=as.double(x)
, nr=d[1]
, nc=d[2]
, narm = as.integer(na.rm)
, index=integer(d[1])
, tiesmethod=as.integer(ties.method)
, NAOK = TRUE, DUP = FALSE, PACKAGE = "base")[c("index")]
i <- cbind(seq(length=d[1]), ret$index)
if (is.na(value))
ret$value <- x[i]
if (!is.null(index))
ret$index <- index[i]
if (is.na(value)){
if (use.names){
names(ret$index) <- dimnames(x)[1]
names(ret$values) <- dimnames(x)[1]
}
return(ret)
}else{
if (use.names){
names(ret$index) <- dimnames(x)[1]
}
return(ret$index)
}
}else{
ret <- .C("R_row_max_double"
, x=as.double(x)
, nr=d[1]
, nc=d[2]
, narm = as.integer(na.rm)
, values=double(d[1])
, NAOK = TRUE, DUP = FALSE, PACKAGE = "base")["values"]
if (use.names){
names(ret$values) <- dimnames(x)[1]
}
return(ret$values)
}
}
# -- C --------------
// Author: Jens Oehlschlägel
// Date: 3.3.2007
// Provided 'as is' under GPL
// Use at own risk
#include <R_ext/Arith.h> /* NA handling */
#include <Rmath.h> /* probably not needed */
#include <R_ext/Random.h> /* ..RNGstate */
#include <R_ext/Applic.h> /* NA handling */
#include <R_ext/Utils.h> /* probably not needed */
#define RELTOL 1e-5
void R_row_maxwithindex_double(
double *x
, int *nr
, int *nc
, int *narm
, int *index
, int *tiesmethod
)
{
int r, c, m, ntie, n_r = *nr, n_c = *nc, na_rm = *narm, ties_method = *tiesmethod;
double a = 0; // to avoid uninitialized compiler warning
double b, tol, small, absa;
Rboolean isna;
if (ties_method==1)
GetRNGstate();
for (r = 0; r < n_r; r++) {
/* first check row for any NAs and find the smallst entry */
small = R_PosInf;
if (na_rm){
isna = TRUE;
for (c = 0; c < n_c; c++){
a = x[r + c * n_r];
if (!ISNAN(a)) {
isna = FALSE;
absa = fabs(a);
if (absa<small && absa!=0)
small = absa;
}
}
}else{
isna = FALSE;
for (c = 0; c < n_c; c++) {
a = x[r + c * n_r];
if (ISNAN(a)) {
isna = TRUE;
break;
}else{
absa = fabs(a);
if (absa<small && absa!=0)
small = absa;
}
}
}
if (isna) {
index[r] = NA_INTEGER;
}else{
if (R_FINITE(small))
tol = RELTOL * small;
else
tol = 0;
//Rprintf("small=%g tol=%g\n", small, tol);
// the following loop has two parts
// 1) find non-NA
// 2) find extreme (and don't ask ISNAN anymore)
m = NA_INTEGER;
ntie = 1;
for (c = 0; c < n_c; c++) {
a = x[r + c * n_r];
if (!ISNAN(a)){
m = c;
break;
}
}
for (c++; c < n_c; c++) {
b = x[r + c * n_r];
//Rprintf("a-tol=%g a=%g a+tol=%g b=%g\n", a-tol, a, a+tol, b);
if (b > a + tol){ // MASS had >= here, which does not tie zeros and Inf
ntie = 1;
a = b;
m = c;
} else if (b >= a - tol) {
if (ties_method==1){
ntie++;
if (ntie * unif_rand() < 1.) m = c;
}else if (ties_method==3){
m=c;
}
}
}
index[r] = m + 1;
}
}
if (ties_method==1)
PutRNGstate();
}
void R_row_max_double(double *x, int *nr, int *nc, int *narm, double *values)
{
int r, c, n_r = *nr, n_c = *nc, na_rm = *narm;
double e, extreme;
Rboolean isna;
for (r = 0; r < n_r; r++) {
/* first check row for any NAs and find the largest entry */
extreme = R_NegInf;
isna = FALSE;
for (c = 0; c < n_c; c++) {
e = x[r + c * n_r];
if (ISNAN(e)){
if (!na_rm){
isna = TRUE;
break;
}
}else if (e>extreme){
extreme = e;
}
}
if (isna)
values[r] = NA_REAL;
else
values[r] = extreme;
}
}
--
"Feel free" - 10 GB Mailbox, 100 FreeSMS/Monat ...
Jetzt GMX TopMail testen: www.gmx.net/de/go/mailfooter/topmail-out
More information about the R-devel
mailing list