[Rd] overriding "summary.default" or "summary.data.frame". How?
Paul Johnson
pauljohn32 at gmail.com
Tue Mar 20 16:24:30 CET 2012
I suppose everybody who makes a package for the first time thinks "I
can change anything!" and then runs into this same question. Has
anybody written out information on how a package can override
functions in R base in the R 2.14 (mandatory NAMESPACE era)?
Suppose I want to alphabetize variables in a summary.data.frame, or
return the standard deviation with the mean in summary output. I'm
pasting in a working example below. It has new "summary.factor"
method. It also has a function summarize that I might like to use in
place of summary.data.frame.
How would my new methods "drop on top" of R base functions? It
appears my functions (summarizeFactors) can find my summary.factor,
but R's own summary uses its own summary.factor.
## summarizeNumerics takes a data frame or matrix, scans the columns
## to select only the numeric variables. By default it alphabetizes
## the columns (use alphaSort = FALSE to stop that). It then
## calculates the quantiles for each variable, as well as the mean,
## standard deviation, and variance, and then packs those results into
## a matrix. The main benefits from this compared to R's default
## summary are 1) more summary information is returned for each
## variable, and the results are returned in a matrix that is easy to
## use in further analysis.
summarizeNumerics <- function(dat, alphaSort = TRUE, digits = max(3,
getOption("digits") - 3)){
if (!is.data.frame(dat)) dat <- as.data.frame(dat)
nums <- sapply(dat, is.numeric)
datn <- dat[ , nums, drop = FALSE]
if (alphaSort) datn <- datn[ , sort(colnames(datn)), drop = FALSE]
sumdat <- apply(datn, 2, stats::quantile, na.rm=TRUE)
sumdat <- rbind(sumdat, mean= apply(datn, 2, mean, na.rm=TRUE))
sumdat <- rbind(sumdat, sd= apply(datn, 2, sd, na.rm=TRUE))
sumdat <- rbind(sumdat, var= apply(datn, 2, var, na.rm=TRUE))
sumdat <- rbind(sumdat, "NA's"=apply(datn, 2, function(x) sum(is.na(x))))
signif(sumdat, digits)
}
summary.factor <- function(y, numLevels) {
## 5 nested functions to be used later
divr <- function(p=0){
ifelse ( p>0 & p < 1, -p * log2(p), 0)
}
entropy <- function(p){
sum ( divr(p) )
}
maximumEntropy <- function(N) -log2(1/N)
normedEntropy <- function(x) entropy(x)/ maximumEntropy(length(x))
nas <- is.na(y)
y <- factor(y)
ll <- levels(y)
tbl <- table(y)
tt <- c(tbl)
names(tt) <- dimnames(tbl)[[1L]]
o <- sort.list(tt, decreasing = TRUE)
if (length(ll) > numLevels){
toExclude <- numLevels:length(ll)
tt <- c(tt[o[-toExclude]], `(All Others)` = sum(tt[o[toExclude]]),
`NA's`=sum(nas))
}else{
tt <- c(tt[o], `NA's`=sum(nas))
}
props <- prop.table(tbl);
tt <- c(tt, "Entropy"=entropy(props), "NormedEntropy"= normedEntropy(props))
}
## Takes a data frame or matrix, scans the columns to find the
## variables that are not numeric and keeps them. By default it
## alphabetizes them (alphaSort = FALSE to stop that). It then treats
## all non-numeric variables as if they were factors, and summarizes
## each in a say that I find useful. In particular, for each factor,
## it provides a table of the most frequently occurring values (the
## top "numLevels" values are represented). As a diversity indictor,
## it calculates the Entropy and NormedEntropy values for each
## variable. Note not all of this is original. It combines my code
## and R code from base/summary.R
summarizeFactors <- function(dat = NULL, numLevels = 10, alphaSort =
TRUE, digits = max(3, getOption("digits") - 3))
{
##copies from R base::summary.R summary.data.frame
ncw <- function(x) {
z <- nchar(x, type="w")
if (any(na <- is.na(z))) {
# FIXME: can we do better
z[na] <- nchar(encodeString(z[na]), "b")
}
z
}
if (!is.data.frame(dat)) dat <- as.data.frame(dat)
##treat any nonnumeric as a factor
factors <- sapply(dat, function(x) {!is.numeric(x) })
##If only one factor, need drop=FALSE.
datf <- dat[ , factors, drop = FALSE]
if (alphaSort) datf <- datf[ , sort(colnames(datf)), drop = FALSE]
z <- lapply(datf, summary.factor, numLevels=numLevels)
nv <- length(datf)
nm <- names(datf)
lw <- numeric(nv)
nr <- max(unlist(lapply(z, NROW)))
for(i in 1L:nv) {
sms <- z[[i]]
lbs <- format(names(sms))
sms <- paste(lbs, ":", format(sms, digits = digits), " ",
sep = "")
lw[i] <- ncw(lbs[1L])
length(sms) <- nr
z[[i]] <- sms
}
z <- unlist(z, use.names=TRUE)
dim(z) <- c(nr, nv)
if(any(is.na(lw)))
warning("probably wrong encoding in names(.) of column ",
paste(which(is.na(lw)), collapse = ", "))
blanks <- paste(character(max(lw, na.rm=TRUE) + 2L), collapse = " ")
pad <- floor(lw - ncw(nm)/2)
nm <- paste(substring(blanks, 1, pad), nm, sep = "")
dimnames(z) <- list(rep.int("", nr), nm)
attr(z, "class") <- c("table")
z
}
##
## want to override summary.data.frame, but confusing. When
## will R find my summary.data.frame, when will it find the one in base.
## use ... for numLevels, digits, alphaSort
summarize <- function(dat, ...)
{
dots <- list(...)
dotnames <- names(dots)
## next should give c("digits", "alphaSort")
nnames <- names(formals(summarizeNumerics))[-1L]
## names that need keeping if in dots:
keepnames <- dotnames %in% nnames
if( sum(keepnames) > 0 ) {
argList <- modifyList( list("dat"=quote(dat)), dots[keepnames] )
datn <- do.call("summarizeNumerics", argList)
} else {
datn <- do.call("summarizeNumerics", args=list("dat"= quote(dat)))
}
## all ... can go to summarizeFactors
datf <- summarizeFactors(dat, ...)
value <- list(numerics = datn, factors = datf)
value
}
set.seed(23452345)
x1 <- gl(12,2, labels=LETTERS[1:12])
x2 <- gl(8,3, labels=LETTERS[12:24])
z1 <- rnorm(24)
a1 <- rnorm(24, mean=1.2, sd = 1.7)
a2 <- rpois(24, lambda=10 + a1)
a3 <- rgamma(24, 0.5, 4)
b1 <- rnorm(24, mean=1.3, sd = 1.4)
dat <- data.frame(z1, a1, x2, a2, x1, a3, b1)
summary(dat)
summarize(dat)
summarizeNumerics(dat)
summarizeFactors(dat, numLevels=5)
summarize(dat, alphaSort=FALSE)
summarize(dat, digits=6, alphaSort=FALSE)
summarize(dat, digits=22, alphaSort=FALSE)
summarize(dat, numLevels= 2)
datsumm <- summarize(dat)
datsumm$numerics
datsumm[[1]] ## same: gets numerics
datsumm$factors
datsumm[[2]]
## Use numerics output to make plots. First,
## transpose gives varnames x summary stat matrix
datsummNT <- t(datsumm$numerics)
datsummNT <- as.data.frame(datsummNT)
plot(datsummNT$mean, datsummNT$var, xlab="The Means", ylab="The Variances")
plot(datsummNT$mean, datsummNT$var, xlab="The Means", ylab="The
Variances", type="n")
text(datsummNT$mean, datsummNT$var, labels=rownames(datsummNT))
## Here's a little plot wrinkle. Note variable names are "out to the
## edge" of the plot. If names are longer they don't stay inside
## figure. See?
## Make the variable names longer
rownames(datsummNT)
rownames(datsummNT) <- c("boring var","var with long name", "tedious
name var", "stupid varname", "buffoon not baboon")
plot(datsummNT$mean, datsummNT$var, xlab="The Means", ylab="The
Variances", type="n")
text(datsummNT$mean, datsummNT$var, labels=rownames(datsummNT), cex=0.8)
## That's no good. Names across the edges
## We could brute force the names outside the edges like this
par(xpd=T)
text(datsummNT$mean, datsummNT$var, labels=rownames(datsummNT), cex=0.8)
## but that is not much better
par(xpd=F)
## Here is one fix. Make the unused space inside the plot larger by
## making xlim and ylim bigger. I use the magRange function from
## rockchalk to easily expand range to 1.2 times its current size.
## otherwise, long variable names do not fit inside plot. magRange
## could be asymmetric if we want, but this use is symmetric.
library(rockchalk)
rownames(datsummNT)
rownames(datsummNT) <- c("boring var","var with long name", "tedious
name var", "stupid varname", "buffoon not baboon")
plot(datsummNT$mean, datsummNT$var, xlab="The Means", ylab="The
Variances", type="n", xlim=magRange(datsummNT$mean, 1.2),
ylim=magRange(datsummNT$var, 1.2))
text(datsummNT$mean, datsummNT$var, labels=rownames(datsummNT), cex=0.8)
## Here's another little plot wrinkle.
## If we don't do that to keep the names in bounds, we need some
## fancy footwork. Note when a point is near the edge, I make sure
## the text prints toward the center of the graph.
plot(datsummNT$mean, datsummNT$var, xlab="The Means", ylab="The Variances")
## calculate label positions. This is not as fancy as it could be.
## If there were lots of variables, we'd have to get smarter about
## positioning labels on above, below, left, or right.
labelPos <- ifelse(datsummNT$mean - mean(datsummNT$mean, na.rm=T) > 0, 2, 4)
text(datsummNT$mean, datsummNT$var, labels=rownames(datsummNT),
cex=0.8, pos=labelPos)
x <- data.frame(x=rnorm(100), y = gl(50,2), z = rep(1:4, 25), ab = gl(2,50))
summarize(x)
summarize(x, numLevels=15)
sumry <- summarize(x)
sumry[[1]] ##another way to get the numerics output
sumry[[2]] ##another way to get the factors output
dat <- data.frame(x=rnorm(100), y = gl(50,2), z = factor(rep(1:4, 25),
labels=c("A","B","C","D")), animal=factor(ifelse(runif(100)< 0.2,
"cow", ifelse(runif(100) < 0.5,"pig","duck"))))
summarize(dat)
dat <- read.table(url("http://pj.freefaculty.org/guides/stat/DataSets/USNewsCollege/USNewsCollege.csv"),
sep=",")
colnames(dat) <- c("fice", "name", "state", "private", "avemath", "aveverb",
"avecomb", "aveact", "fstmath", "trdmath", "fstverb", "trdverb",
"fstact", "trdact", "numapps", "numacc", "numenr", "pctten",
"pctquart", "numfull", "numpart", "instate", "outstate",
"rmbrdcst", "roomcst", "brdcst", "addfees", "bookcst", "prsnl",
"pctphd", "pctterm", "stdtofac", "pctdonat", "instcst", "gradrate")
dat$private <- factor(dat$private, labels=c("public","private"))
sumry <- summarize(dat, digits=2)
sumry
sumry[[1]]
sumry[[2]]
summarize(dat[ , c("fice","name","private","fstverb","avemath")], digits=4)
--
Paul E. Johnson
Professor, Political Science Assoc. Director
1541 Lilac Lane, Room 504 Center for Research Methods
University of Kansas University of Kansas
http://pj.freefaculty.org http://quant.ku.edu
More information about the R-devel
mailing list