### Ch. 5 Bootstrap ### ================= ## Running example using correlation coefficient ## --------------------------------------------- set.seed(1) x <- rnorm(50) y <- 1+x + rnorm(50,0,1) plot(x,y, main = "n=50 -- \"The Data\" ") (th.n <- cor(x,y)) ## 0.6339331 ## In order to skip the "long" computations below, you can get the saved results: saveFile <- "~/Vorl/comput-statist/R/cor_boot.rda" saveURL <- "http://stat.ethz.ch/Teaching/maechler/CompStat/cor_boot.rda" try(print(load(url(saveURL)))) # "B" "res" "res.true" "r.bseqvar" ## if that failed: if(file.exists(saveFile)) load(saveFile) ## -------------- ## --> and you can skip to the " jump " (text) below B <- 100000 res <- numeric(B) set.seed(22) for(i in 1:B) { if(i %% 10 == 0) cat(".", if(i %% 200 == 0) sprintf(" %6d\n", i)) ind <- sample.int(length(x), replace = TRUE) xb <- x[ind] yb <- y[ind] res[i] <- cor(xb,yb) ## ---------- } ## The "true" distribution (also by simulation): use 'N ~= oo (Infinity)' ## (slightly slower than bootstrap above - can you see why?) ## 2015-lynne: needs 13 secs (>> 10 secs w/ R 3.3.0) system.time({ N <- 250000 res.true <- numeric(N) set.seed(22) for(i in 1:N) { if(i %% 25 == 0) cat(".", if(i %% 500 == 0) sprintf(" %6d\n", i)) x <- rnorm(50) y <- 1+x + rnorm(50) res.true[i] <- cor(x,y) } }) ## <--- jump to here th.lab <- function(th, label = expression(hat(theta)[n]), col = "skyblue3", lwd = 3, tck = 0.2, padj = if(grepl("\\",format(label))) -.1 else -.4) axis(1, at = th, labels = label, tck=tck, padj=padj, lwd=lwd, col=col, col.axis=col) fformat <- function(N, ...) format(N, scientific=FALSE, big.mark = "'", ...) require(sfsmisc)# mult.fig() etc ## Compare Bootstrap world with Real World: pdf.do(file = "hist-2.pdf", height = 7, width = 7) rx <- range(res, res.true) mult.fig(mfrow = c(2,1)) hist(res, breaks = "FD", xlim = rx, main = paste0("Bootstrap, B=", fformat(B))) th.lab(th.n) hist(res.true, breaks = "FD", xlim = rx, main = paste("true distribution, simulated", fformat(N), "times")) -> ht th.lab(sqrt(1/2), expression(theta == sqrt(2)/2)) ## density (blown up to match hist() counts:) dens.true <- density(res.true) f <- with(ht, (counts/density)[counts > 0])[1] lines(with(dens.true, list(x=x, y = y * f)), col = adjustcolor("seagreen", 1/2), lwd = 3) pdf.end() options(digits = 4) # show *less* digits than usual mean(res) ## 0.6241 var(res) ## 0.008216 mean(res.true)## 0.7033 [ N -> oo : Cov(X,Y) = 1/sqrt(2) = 0.7071 ] var(res.true) ## 0.005386 quantile(res,c(0.025,0.975)) #-- see confint etc, below ## 2.5% 97.5% ## 0.4171 0.7713 ## Compute Var(T*_1 .. T*_B) for increasing B (B = 10,20, .... 100'000) : B. <- B / 10 r.bseqvar <- numeric(B.) for(i in 1:B.) { if(i %% 10 == 0) cat(".", if(i %% 200 == 0) sprintf(" %6d\n", i)) r.bseqvar[i] <- var(res[1:(10*i)]) } if(FALSE) ## and if you are MM: ## Save the results for fast reloading save(B, res, res.true, r.bseqvar, file = saveFile) pdf.do(file = "var-boot-conv.pdf", width = 6, height = 6) require("sfsmisc") # for mult.fig(), pdf.do(), .. mult.fig(mfrow = c(1,2), main = expression("Bootstrap variance " * var(T^"*"[1],...,T^"*"[B]))) plot(10*(1:100),r.bseqvar[1:100], type = "l", xlab = "B", ylab = "bootstrap variance", main = "B = 10 .. 1000") abline(h = var(res), col = 2) abline(h = var(res.true), col = 4,lty = 2) ## show the "zoom region" of the next plot px <- par("usr")[1:2]; bgcol <- rgb(.7,.8,.9, alpha=.3) rect(px[1], 0.007, px[2], 0.01, col = bgcol, border = "gray70", lwd=2) plot(10*(1:10000),r.bseqvar,type = "l", ylim = c(0.007,0.01), xaxt="n",# <- better x-labels xlab = "B", ylab = "bootstrap variance", main = "B = 10 .. 100'000") axis(1, at = (x0 <- c(10000, 100000)), labels = sprintf("%d", x0)) abline(h = var(res), col = 2) if(FALSE)# show the same shaded area as on the left [FALSE: it's ugly] rect(px[1], 0.007, px[2], 0.01, col = bgcol, border = "gray70", lwd=.5) pdf.end() ## Quantiles: (q95 <- quantile(res, pr = c(0.025, 0.975))) ## 2.5% 97.5% ## 0.4171 0.7713 pdf.do(file = "hist-CI.pdf", width = 11.6, height = 8.2)# A4 quer par(mfrow = c(1,1)) hist(res, breaks = "FD", # <- better than default ("Sturges") algorithm main = "histogram of B=100'000 bootstrapped correlation estimators", prob = TRUE, ylim = c(0, max(dens.true$y)), col = "gray88", border = "gray60", xlab = "") th.lab(th.n, padj=0) axis(1, at = q95, tck = 0.3, col = 2, labels=FALSE) mtext("2.5% and 97.5% quantiles", 1, at = mean(q95), line = -1.1, col = 2) ### Bootstrap confidence intervals -- now do the confidence interval correctly (q95. <- quantile(res - th.n, pr = c(0.025, 0.975))) ## 2.5% 97.5% ## -0.2169 0.1374 (ci95 <- th.n - q95.[2:1]) ## 97.5% 2.5% ## 0.4965 0.8508 axis(1, at = ci95, tck = 0.1, col="seagreen", col.axis="seagreen", lwd = 2, labels=FALSE) mtext("correct 95% - C.I.", 1, at = mean(ci95), line = -2.2, col = "seagreen") lines(dens.true, col="seagreen") pdf.end()