# Variance of the trimmed mean (example of Hertzsprung) alpha <- seq(0.001,0.499,0.001) a <- qnorm(1-alpha) sigma2 <- 2*(0.5+alpha*(a^2-1) - a*dnorm(a))/(1-2*alpha)^2 #asymptotic variance plot(alpha,sigma2,type="l") # comparing with Hertzsprung's simulation results points((1:11)/24,c(1.013,1.037,1.069,1.095,1.139,1.184,1.232,1.283, 1.345,1.407,1.489),pch=2) # New simulations N <- 1000 x <- matrix(rnorm(24*N),nrow=N,ncol=24) erg <- matrix(0,N,12) for (k in (0:11)) erg[,k+1] <- apply(x,1,mean,trim=k/24) var.trim <- 24*apply(erg^2,2,mean) points((0:11)/24,var.trim,col=3) # standard estimator points((0:11)/24,var.trim/var.trim[1],col=4) # ratio estimator hist(erg[,3]) # composition of Hertzsprung's urn x <- seq(-4,4,by=0.01) n <- length(x)-1 freq <- 12534*c(pnorm(x[1]+0.005),pnorm(x[2:n]+0.005)-pnorm(x[2:n]-0.005), 1-pnorm(x[n]+0.005)) plot(x,round(freq),type="h") sum(round(freq)) urn <- rep(x,freq) # reproducing Hertzsprung's simulation x <- matrix(sample(urn,size=24*N,replace=TRUE),nrow=N,ncol=24) erg <- matrix(0,N,12) for (k in (0:11)) erg[,k+1] <- apply(x,1,mean,trim=k/24) var.trim <- 24*apply(erg^2,2,mean) points((0:11)/24,var.trim,col=3) # standard estimator points((0:11)/24,var.trim/var.trim[1],col=4) # ratio estimator #Bootstrap # Data set used: Sublimation heat of platinum, from Rice, Chap. 10.4 plat <- c(136.3,136.6,135.8,135.4,134.7,135.0,134.1,143.3,147.8,148.8,134.8, 135.2,134.9,146.5,141.2,135.4,134.8,135.8,135.0,133.7,134.4,134.9, 134.8,134.5,134.3,135.2) hist(plat) n <- length(plat) # direct implementation of the bootstrap m <- floor((n-1)/2) N <- 1000 x <- matrix(sample(plat,size=n*N,replace=TRUE),nrow=N,ncol=n) erg <- matrix(0,N,m+1) for (k in (0:m)) erg[,k+1] <- apply(x,1,mean,trim=k/n) var.trim <- apply(erg,2,var) plot((0:m)/n,var.trim/var.trim[1],ylab="variance ratio", xlab="trimming proportion") hist(erg[,13]) round(sqrt(var.trim),3) # Comparing the real data with bootstrap samples #Using histograms par(mfrow=c(2,2)) hist(plat,breaks <- seq(132,150,2)) plat.b <- plat[ceiling(n*runif(n))] # Bootstrap sample plat.b <- sample(plat,size=n,replace=T) # same thing hist(plat.b,breaks <- seq(132,150,2)) plat.f <- factor(plat) # values which occur, sorted, without ties table(plat.f) # frequencies in the original sample table(factor(plat.b,levels=levels(plat.f))) # frequencies in the bootstrap sample #Using the normal plot qqnorm(plat) plat.b <- plat[ceiling(n*runif(n))] qqnorm(plat.b) #Normalplot of a bootstrap sample #Using the distribution function par(mfrow=c(1,1)) plot.stepfun(plat,verticals=T,do.points=F) # empirical distribution function plat.b <- plat[ceiling(n*runif(n))] plot.stepfun(plat.b,verticals=T,do.points=F,add=T,col.hor=3,col.vert=3) # empirical distribution function of the bootstrap sample # Using functions of Davison and Hinkley library("boot") help(boot) t.mean <- function(d,ind) mean(d[ind],0.2) plat.boot <- boot(plat, t.mean, R=999, stype="i") hist(plat.boot$t) sqrt(var(plat.boot$t)) qqnorm(plat.boot$t) boot.ci(plat.boot, conf=c(0.80,0.95),type=c("norm","basic","perc","bca")) # confidence intervals using the bootstrap. All methods except norm # correct for skewness of the trimmed mean. data("bigcity") # population size of American cities 1920 and 1930 plot(bigcity$u,bigcity$x) ratio <- function(d, ind) sum(d$x[ind])/sum(d$u[ind]) # Estimate of increase city.boot <- boot(bigcity, ratio, R=999, stype="i") hist(city.boot$t) sqrt(var(city.boot$t)) qqnorm(city.boot$t) boot.ci(city.boot, conf=c(0.90,0.95),type=c("norm","basic","perc","bca"))