#### 4. Cross-validation
#### ====================

## Small modifications (by student E. Oldenhof, 1 August 2017):
## sort() *before* feeding data into ksmooth(): see comments with '(EOL)'

trySetwd <- function(dir) if(file.exists(dir)) setwd(dir)
saveFolder <- "/u/maechler/Vorl/comput-statist/R" ## <<-- adapt for your computer !!

## ff <- function(x) 3*x + sin(6*x)
ff <- function(x) x - ((x-1)/2)^2 + sin(6*x/(1 + x^2/20))
## --------------------------------------------------------

N <- 128 ## sample size
set.seed(0)
## our 'real data':
x <- sort(runif(N, -3,3)) ## --- X ~ U[-3,3]
y <- ff(x) + rnorm(N, 0, 1/2)
plot(x,y) ## the data -- we cannot easily "see" the underlying model, right?
curve(ff(x), add = TRUE, col=2, lty = 2, lwd = 2)
lines(ksmooth(x,y, b = 0.1, kernel = "normal", n.p = 999))
lines(ksmooth(x,y, b = 0.3, kernel = "normal", n.p = 999), col=3)
## ^^^ certainly better than this one :
lines(ksmooth(x,y, b = 1.0, kernel = "normal", n.p = 999), col=4)
legend("bottomright", bty="n", ncol=2,
       c("true f", paste("bw =", format(c(0.1, 0.3, 1.0)))),
       col=c(2,1,3,4), lwd=c(2,1,1,1), lty=c(2,1,1,1))

## Set of bandwidths from which to choose:
## Till 2014:  b.widths <- seq(1/20, 1, by = 1/20) # bw_k = k/20, k = 1,..,20
## New: take a smaller, more interesting range:
b.widths <- seq(2/64, 0.5, by = 1/64) # bw_k = k/64, k = 2,3,..,32
(n.b <- length(b.widths)) ## 31 [was 20]

simCV  <- 200 # number of complete simulations (incl. data !!)
simGSE <- 500
dn <- list(NULL, bandwidths = paste("h=", b.widths, sep=""))
res.gse    <- matrix(ncol = n.b, nrow = simGSE, dimnames = dn)
res.1cv    <- matrix(ncol = n.b, nrow = simCV , dimnames = dn)
res.10x.cv <- matrix(ncol = n.b, nrow = simCV , dimnames = dn)
res.10cv   <- matrix(ncol = n.b, nrow = simCV , dimnames = dn)
res.2cv    <- matrix(ncol = n.b, nrow = simCV , dimnames = dn)

### Simulation for the true GSE -- possible here, since we know the true ff()
### ----------------------------
## Note: one trick (mainly a time-saver) is to use the *same*
## ----  samples for the different values of the smoothing parameter.
l.New <- 400
set.seed(22)
oneNew <- FALSE # more 'variation'
oneNew <- TRUE  # one Y.new for all 'h'  [==> less noise ==> smoother]
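## A minimal sketch (added for illustration; not part of the original
## simulation) of what a single entry res.gse[i,k] below computes: the
## empirical version of the generalization error
##    GSE(h) = E[ (Y.new - m^_h(X.new))^2 ].
## 'gseOne' is a hypothetical helper name; it assumes 'xnew' is sorted,
## since ksmooth() sorts its x.points (see the illustration further below):
gseOne <- function(h, x, y, xnew, ynew)
    mean((ynew - ksmooth(x, y, bandwidth = h, kernel = "normal",
                         x.points = xnew)$y)^2)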
cat("Gen. (squared) error simulations --- B =", simGSE, "\n")
system.time( # << measure time used -- ~ 2 min. on "nb-mm" (P III, 700 MHz)
for(i in 1:simGSE) {
    if(i %% 10 == 0) { cat(i,""); if(i %% 100 == 0) cat("\n") }
    ## Generate data according to the known law (X_i, Y_i) ~ P(.) :
    x <- sort(runif(N))
    y <- ff(x) + rnorm(N, 0, 1)
    ## now generate "new" data (X.new_i, Y.new_i), i = 1,..,l.New :
    xnew  <- sort(runif(l.New))
    f.new <- ff(xnew)
    if(oneNew)
        ynew <- f.new + rnorm(l.New, 0, 1) # the same for each 'h'
    ## for each bandwidth:
    for(k in 1:n.b) {
        y.hat <- ksmooth(x,y, bandwidth = b.widths[k], kernel = "normal",
                         x.points = xnew)$y   ## == m^_h(X.new)  [vector]
        if(!oneNew)
            ynew <- f.new + rnorm(l.New, 0, 1) # different for each 'h'
        res.gse[i,k] <- mean((ynew - y.hat)^2)
    }
}); cat("\n")
##                   elapsed | n.b | l.New
## nb-mm3 [~ 2011]:  50 sec  |  20 |  200
## lynne  [2014]  :  25 sec  |  20 |  200
##   "      "     :  30 sec  |  31 |  200
##   "      "     :  52 sec  |  31 |  400
## new N = 128 :
## nb-mmm5 [2021] :  8.5 sec |  31 |  400

boxplot(res.gse, cex = .5, ## at = b.widths, xlim = 0:1,
        main = paste("Empirical Generalization Error,\n B = ", simGSE,
                     "samples for each h;  l.New = ", l.New))
## zoom in (vertically) and add boxplot "notches":
boxplot(res.gse, cex = .5, ylim = c(0.85, 1.45), notch=TRUE,
        main = paste("Empirical Generalization Error,\n B = ", simGSE,
                     "samples for each h;  l.New = ", l.New))
## add all B = 500 curves (w/ some transparency):
matlines(t(res.gse), type="l", col=adjustcolor(1, 1/4), lty=3)

## The following takes time --- jump down if you just want to see the results
## ========================
set.seed(22)
cat("CV simulations --- B =", simCV, "\n")
system.time( # << measure time used
for(i in 1:simCV) {
    cat("", i)
    ## Newly generate the data (!) :
    x <- sort(runif(N))
    y <- ff(x) + rnorm(N, 0, 1)
    ##
    ##------- leave-one-out CV ----------------
    h <- numeric(N)
    for(k in 1:n.b) {
        for(j in 1:N) {
            ## leave out (x[j], y[j]) and predict at x[j]:
            y.hat <- ksmooth(x[-j], y[-j], bandwidth = b.widths[k],
                             kernel = "normal", x.points = x[j])$y
            h[j] <- (y[j] - y.hat)^2
        }
        res.1cv[i,k] <- mean(h)
    }
    cat(".")
    ##
    ##------- 10-fold CV -------------------
    h <- numeric(10)
    ## randomly permute the data once, then use 10 consecutive, (almost)
    ## equally sized blocks as folds.  NB: the former code assumed N = 100
    ## (10 folds of exactly 10); with N = 128, cut() gives folds of size
    ## 12-13, and every observation lands in exactly one fold:
    ii <- sample(1:N, size = N, replace = FALSE)
    xx <- x[ii]
    yy <- y[ii]
    fold <- cut(seq_len(N), breaks = 10, labels = FALSE)
    for(k in 1:n.b) {
        for(j in 1:10) {
            ind <- which(fold == j)
            ## NB: ksmooth() "needs" x-ordering of indices (EOL);
            ## see the small illustration after this simulation block:
            ind <- ind[order(xx[ind])]
            y.h <- ksmooth(xx[-ind], yy[-ind], bandwidth = b.widths[k],
                           kernel = "normal", x.points = xx[ind])$y
            h[j] <- mean((yy[ind] - y.h)^2)
        }
        res.10x.cv[i,k] <- mean(h)
    }
    cat(",")
    ##
    ##------- leave-2-out and leave-10-out CV --------------
    B <- 500
    h <- h2 <- matrix(NA, B, n.b)
    for(j in 1:B) {
        ## Use the *same* subset (of size 'd') C*_j for all bandwidths
        ## ==> smooth in h.
        ## NB: 'sort(*)' as ksmooth() "needs" x-ordering of indices (EOL)
        ind <- sort(sample(1:N, size = 10, replace = FALSE))
        in2 <- sort(sample(1:N, size =  2, replace = FALSE))
        for(k in 1:n.b) {
            y.h <- ksmooth(x[-ind], y[-ind], bandwidth = b.widths[k],
                           kernel = "normal", x.points = x[ind])$y
            h [j,k] <- mean((y[ind] - y.h)^2)
            y.h <- ksmooth(x[-in2], y[-in2], bandwidth = b.widths[k],
                           kernel = "normal", x.points = x[in2])$y
            h2[j,k] <- mean((y[in2] - y.h)^2)
        }
    }
    res.10cv[i,] <- colMeans(h)
    res.2cv [i,] <- colMeans(h2)
    ##
    cat(";"); if(i %% 10 == 0) cat("\n")
    ##
}); cat("\n")
##     user   system  elapsed
##  5127.83     0.37  7065.44  [lynne, 2005 Pentium III, 700 MHz] -- ~ 2 hours !
##   833.480    0.160  834.164 [lynne 2010-04, Sys.MIPS() = 5600] -- 14 minutes
##   487.105    0.046  488.224 [lynne 2010-04, Sys.MIPS() = 5600] -- 8.1 min.
##   282.073    0.009  282.841 [lynne 2014-03, Intel i7-4765T @ 2.0 GHz]
## n.b == 31  +  leave-2-out ==> much more work:
##   685.824    0.009  687.792 [lynne 2014-03 ...]
##   473.097    0.455  475.821 [lynne 2021-03, Intel i7-7700T @ 2.9 GHz] -- 7 minutes
## new N = 128:
##   498.922    0.112  502.875 [v-lynne 2022 (Intel Xeon ..)] -- 8.3 min.
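## Added illustration (throw-away data, not used further): *why* the '(EOL)'
## re-ordering above is needed.  ksmooth() evaluates its fit at *sorted*
## x.points, so with unsorted x.points the returned $y would be misaligned
## with the y-values one compares against:
set.seed(1); x0 <- runif(50); y0 <- sin(2*pi*x0) + rnorm(50, 0, 0.1)
xp <- c(0.9, 0.1, 0.5)
ksmooth(x0, y0, bandwidth = 0.2, kernel = "normal", x.points = xp)$x
## --> 0.1 0.5 0.9 : sorted, i.e., *not* in the order of 'xp'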
## colMeans(m) is faster than apply(m, 2, mean), but apply() is more
## general (e.g., it passes 'na.rm = TRUE' along):
mean.cv <- cbind("GSE"    = apply(res.gse,    2, mean, na.rm=TRUE),
                 "rss1cv" = apply(res.1cv,    2, mean, na.rm=TRUE),
                 "rss10x" = apply(res.10x.cv, 2, mean, na.rm=TRUE),
                 "rss10cv"= apply(res.10cv,   2, mean, na.rm=TRUE),
                 "rss2cv" = apply(res.2cv,    2, mean, na.rm=TRUE))
Sfil <- {
    if(FALSE) "CV-sim_res4.rda"     ## <<- do not accidentally overwrite the "true" file
    else      "CV-sim_res4_scr.rda" ## <<- use as "scratch"
}
## save the simulation results
trySetwd(saveFolder)
save(b.widths, n.b, ff, N, mean.cv,
     res.gse, res.1cv, res.2cv, res.10cv, res.10x.cv, file = Sfil)
## -----------

### Start here, if you do not want to redo the computations ------------------
trySetwd(saveFolder)
if(!exists("Sfil"))    Sfil <- "CV-sim_res4.rda"
if(!file.exists(Sfil)) Sfil <- "CV-sim_res4_scr.rda"
## if *still* not found: try to get it from the *old* web page
if(!file.exists(Sfil)) {
    webDir <- "http://stat.ethz.ch/Teaching/maechler/CompStat/"
    message("Trying to get simulation results from CompStat website @ stat.ethz.ch")
    (Url <- paste(webDir, Sfil, sep = ""))
    download.file(Url, Sfil)
}
(load(Sfil)) # (..) : prints the names of the loaded objects

##-------
### This shows that leave-1-out is fine, but the others don't seem so ...

require(sfsmisc) # for pdf.do() / pdf.end(), mult.fig(), etc

pdf.do(file = "cv1.pdf", height = 7, width = 10)
if(FALSE) ## nicer colors:
    dput(RColorBrewer::brewer.pal(5, "Set1"))
## gives these :
cols <- c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00")
n.meth <- ncol(mean.cv)
lwds <- c(2, rep(1, n.meth - 1))
matplot(b.widths, mean.cv, type = "b", lwd=lwds, lty=1, col=cols,
        xlab = "bandwidth", ylab = "",
        main = paste0("Gen.Err. & cross-validation: mean(.) values (B = ",
                      nrow(res.1cv), ")"))
lines(b.widths, mean.cv[,"GSE"], lwd=2, col=adjustcolor(cols[1], 0.6))
legend("topright", inset=.01,
       c("true Gen.Err.", "leave-1-out CV", "10-fold CV",
         "leave-10-out CV", "leave-2-out CV"),
       lwd=lwds, pch=paste(1:n.meth), lty = 1, col = cols)
if(FALSE) # not by default:
    ## for leave-1-out, "all the values" (in addition to the mean):
    boxplot(res.1cv, at = b.widths, add=TRUE, col="#7777771A", xaxt='n',
            boxwex = mean(diff(b.widths))/2)
pdf.end()

pdf.do(file = "cv2.pdf", height = 7, width = 10)
if(FALSE) # not ideal for 31:
    op <- mult.fig(n.b, marP= -c(2.5, 2, 0, 0.7), mgp=c(1, 0.4, 0))$old.par
op <- mult.fig(mfrow=c(4,8), marP= -c(2.5, 2, 0, 0.7), mgp=c(1, 0.4, 0))$old.par
for(i in 1:n.b) {
    boxplot(res.1cv[,i], res.2cv[,i], res.10cv[,i], res.10x.cv[,i],
            main = sprintf("bw= %4g", b.widths[i]),
            names = c("-1", "-2", "-10", "10x"), ylim = c(0.7, 2.2))
    abline(h = mean.cv[i,"GSE"], col = cols[1])
}
par(op)
pdf.end()

## res.1cv sometimes has NA (for very small 'h', m.hat(x) is not always defined!)
sum (is.na(res.1cv)) # 8 out of 200 x 31
mean(is.na(res.1cv)) # 0.0013
which(is.na(res.1cv), arr.ind = TRUE)
## 7 for the first, and one for the 2nd bandwidth

## No: not ok for other computations below :
## res.1cv[is.na(res.1cv)] <- Inf

## which are the optimal bandwidths 'h' :
## =====================================
table((opt.h <- b.widths[apply(res.1cv, 1, which.min)]))
## (for the above set.seed(), and the original n.b = 20, ..):
##  0.05  0.1 0.15  0.2 0.25  0.3 0.35  0.4    1
##     9   24   43   66   42   12    2    1    1
plot(table(opt.h), main = "LOO-CV optimal bandwidths")
mtext(paste("for", length(opt.h), "different simulations"))
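## Added sketch (not in the original script): refit the data from the *last*
## simulation run with the modal CV-optimal bandwidth, to see what a
## "typical" chosen h gives; assumes 'x' and 'y' from the CV loop are still
## in the workspace, and 'h.mode' is a name introduced here only:
h.mode <- as.numeric(names(which.max(table(opt.h))))
plot(x, y, main = paste("ksmooth() with modal CV-optimal h =", h.mode))
lines(ksmooth(x, y, bandwidth = h.mode, kernel = "normal", n.points = 999),
      col = 4, lwd = 2)
curve(ff(x), add = TRUE, col = 2, lty = 2, lwd = 2)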
CV") if(interactive()) for(i in 1:simCV) { plot(b.widths, res.1cv[i,], type='o', lty=1, main = paste0(i,"-th l.o.o. CV")) Sys.sleep(0.25) } ## Range of all CV scores: (rngCV <- range(res.1cv, res.2cv, res.10cv, res.10x.cv, finite=TRUE)) # [0.69 2.10] -- or a bit smaller for plotting : rngCV <- c(1.0,1.5) mult.fig(4)## show all CV curves for all 4 CV kinds ## All the CV-1 curves: {a mess} matplot(b.widths, t(res.1cv), type='l', lty=1, col="gray", ylim=rngCV, main = "Leave 1 out ('l.o.o') CV") lines(b.widths, mean.cv[,"GSE"], col=cols[1], lwd=3) lines(b.widths, mean.cv[,"rss1cv"], col="gray10", lwd=2) ## the mean curve all.equal(colMeans(res.1cv, na.rm=TRUE), mean.cv[,"rss1cv"]) ## TRUE ## Leave 2 out: matplot(b.widths, t(res.2cv), type='l', lty=1, col="gray", ylim=rngCV, main = "Leave 2 out [B = 500] CV") lines(b.widths, mean.cv[,"GSE"], col=cols[1], lwd=3) lines(b.widths, mean.cv[,"rss2cv"], col="gray10", lwd=2) ## the mean curve all.equal(colMeans(res.2cv, na.rm=TRUE), mean.cv[,"rss2cv"]) #- TRUE ## For 10-fold: matplot(b.widths, t(res.10x.cv), type='l', lty=1, col="gray", ylim=rngCV, main = "10-fold CV") lines(b.widths, mean.cv[,"GSE"], col=cols[1], lwd=3) lines(b.widths, mean.cv[,"rss10x"], col="gray10", lwd=2) ## the mean curve ##-> huge bias all.equal(colMeans(res.10x.cv, na.rm=TRUE), mean.cv[,"rss10x"]) #- TRUE ## Leave 10 out: matplot(b.widths, t(res.10cv), type='l', lty=1, col="gray", ylim=rngCV, main = "Leave 10 out [B = 500] CV") lines(b.widths, mean.cv[,"GSE"], col=cols[1], lwd=3) lines(b.widths, mean.cv[,"rss10cv"], col="gray10", lwd=2) ## the mean curve ##-> "huge" bias ( the same as 10-fold: n-d = 100-10 = 90 = n - |B_j| ) all.equal(colMeans(res.10cv, na.rm=TRUE), mean.cv[,"rss10cv"]) #- TRUE Sys.info() sessionInfo()