#### 4. Crossvalidation
#### ==================

## Small modifications (by student E.Oldenhof, 1.August 2017):
## sort() *before* feeding data into ksmooth():  see comments with '(EOL)'

trySetwd <- function(dir) if(file.exists(dir)) setwd(dir)
saveFolder <- "/u/maechler/Vorl/comput-statist/R" ## <<-- adapt for your Comp !!

## ff <- function(x) 3*x + sin(6*x)
ff <- function(x) x-((x-1)/2)^2 + sin(6*x/(1+x^2/20))
##                --------------  -------------------
N <- 128 ## sample size

set.seed(0) ## our 'real data':
x <- sort(runif(N, -3,3)) ## --- X ~ U[-3,3]
y <- ff(x) + rnorm(N,0,1/2)
plot(x,y) ## the data -- we can not easily "see" the underlying model, right?
curve(ff(x), add = TRUE, col=2, lty = 2, lwd = 2)

lines(ksmooth(x,y, b = 0.1, kernel = "normal", n.p = 999))
lines(ksmooth(x,y, b = 0.3, kernel = "normal", n.p = 999), col=3)
##                     ^^^  certainly better than this one :
lines(ksmooth(x,y, b = 1.0, kernel = "normal", n.p = 999), col=4)
legend("bottomright", bty="n", ncol=2, c("true f", paste("bw =",format(c(0.1, 0.3, 1.0)))),
       col=c(2,1,3,4), lwd=c(2,1,1,1), lty=c(2,1,1,1))

## Set of bandwidths from which to choose:
## Till 2014:
b.widths <- seq(1/20, 1, by = 1/20)#  bw_k =  k / 20,  k = 1,..,20
## New: take smaller more interesting range:
b.widths <- seq(2/64, 0.5, by = 1/64)#  bw_k =  k / 64,  k = 2,3,..,32
(n.b <- length(b.widths)) ## 31  [was 20]

simCV  <- 200 # Number of complete simulations (incl. data !!)
##
simGSE <- 500

dn <- list(NULL, bandwidths = paste("h=", b.widths, sep=""))
res.gse <-    matrix(ncol = n.b, nrow = simGSE, dimnames= dn)
res.1cv <-    matrix(ncol = n.b, nrow = simCV , dimnames= dn)
res.10x.cv <- matrix(ncol = n.b, nrow = simCV , dimnames= dn)
res.10cv <-   matrix(ncol = n.b, nrow = simCV , dimnames= dn)
res.2cv  <-   matrix(ncol = n.b, nrow = simCV , dimnames= dn)


### Simulation for True GSE -- can do this, since we know the true ff()
##  -----------------------
## Note: one trick (time-saver, mainly) is to use the *same*
## ----  samples for the different values of the smoothing parameter

l.New <- 400

set.seed(22)
oneNew <- FALSE # more 'variation'
oneNew <- TRUE  # one Y.new for all 'h' [==> less noise ==> smoother)
cat("Gen.(Squared) error simulations --- B =", simGSE, "\n")
system.time( # << measure time used -- ~ 2 min. on "nb-mm" (P III, 700 MHz)
for(i in 1:simGSE) {
    if(i %% 10 == 0) { cat(i,""); if(i %% 100 == 0) cat("\n") }
    ## Generate data according to known Law   (X_i, Y_i)  ~  P(.)
    x <- sort(runif(N))
    y <- ff(x) + rnorm(N,0,1)
    ## now generate "New" Data    (X.new_i, Y.new_i), i=1,..,l.New
    xnew <- sort(runif(l.New))
    f.new <- ff(xnew)
    if(oneNew)
        ynew <- f.new + rnorm(l.New, 0,1)# the same for each 'h'
    ## for each bandwidth:
    for(k in 1:n.b) {
        y.hat <- ksmooth(x,y, bandwidth = b.widths[k],
                         kernel = "normal", x.points = xnew)$y
        if(!oneNew)  ##                     .........  ====   m^(X.new) [vector]
            ynew <- f.new + rnorm(l.New, 0,1)# different for each 'h'
        res.gse[i,k] <- mean((ynew - y.hat)^2)
    }
}); cat("\n")##     elapsed | n.b | l.New
## nb-mm3 [~ 2011]: 50 sec  |  20 |   200
## lynne  [2014]  : 25 sec  |  20 |   200
##   "       "    : 30 sec  |  31 |   200
##   "       "    : 52 sec  |  31 |   400
## new N = 128
## nb-mmm5 [2021] :  8.5 sec|  31 |   400

boxplot(res.gse, cex = .5, ## at = b.widths, xlim = 0:1,
        main = paste("Empirical Generalization Error,\n B = ",simGSE,
        "samples for each h;    l.New = ", l.New))

## zoom in (vertically) and add boxplot "notches":
boxplot(res.gse, cex = .5, ylim = c(0.85, 1.45), notch=TRUE,
        main = paste("Empirical Generalization Error,\n B = ",simGSE,
        "samples for each h;    l.New = ", l.New))
## add all B=500 curves (w/ some transparency)
matlines(t(res.gse), type="l", col=adjustcolor(1, 1/4), lty=3)

## The following takes time --- jump down if you want to just see the results
##               ==========     =========
set.seed(22)
cat("CV simulations --- B =", simCV, "\n")
system.time( # << measure time used --
for(i in 1:simCV) {
    cat("", i)
    ##  Newly generate the data  (!)
    x <- sort(runif(N))
    y <- ff(x) + rnorm(N,0,1)
    ##
    ##-------  leave-one-out CV  ----------------
    h <- numeric(N)
    for(k in 1:n.b) {
        for(j in 1:N) { ## leave out  (x[j], y[j]) and predict at x[j]:
            y.hat <- ksmooth(x[-j], y[-j],
                             bandwidth = b.widths[k], kernel = "normal",
                             x.points = x[j])$y
            h[j] <- (y[j] - y.hat)^2
        }
        res.1cv[i,k] <- mean(h)
    }
    cat(".")
    ##
    ##------- 10-fold CV -------------------
    h <- numeric(10) ## N.B: 10*10 = 100
    ## randomly permute data once,
    ## and choose 10th-sized 'regular' subsets subsequently
    ii <- sample(1:N, size = N, replace = FALSE)
    xx <- x[ii]
    yy <- y[ii]
    for(k in 1:n.b) {
        for(j in 1:10)
        {
            ind <- ((j-1)*10 + 1):(j*10)
            ## NB: ksmooth() "needs" x-ordering of indices (EOL)
            ind_order <- order(xx[ind])
            ind <- ind[ind_order]
            y.h <- ksmooth(xx[-ind], yy[-ind],
                           bandwidth = b.widths[k], kernel = "normal",
                           x.points = xx[ind])$y
            h[j] <- mean((yy[ind] - y.h)^2)
        }
        res.10x.cv[i,k] <- mean(h)
    }
    cat(",")
    ##
    ##------- leave-2-out and leave-10-out CV --------------
    B <- 500
    h <- h2 <- matrix(NA, B, n.b)
    #in2 <- 1
    for(j in 1:B) {
        ## Use *same* set subset (of size 'd') C*_j for all bandwidths ==> smooth in h
        ## NB: 'sort(*)' as ksmooth() "needs" x-ordering of indices (EOL)
        ind <- sort(sample(1:N, size = 10, replace = FALSE))
        in2 <- sort(sample(1:N, size =  2, replace = FALSE))
        for(k in 1:n.b) {
            y.h <- ksmooth(x[-ind], y[-ind],
                           bandwidth = b.widths[k], kernel = "normal",
                           x.points = x[ind])$y
            h[j,k] <- mean((y[ind] - y.h)^2)
            y.h <- ksmooth(x[-in2], y[-in2],
                           bandwidth = b.widths[k], kernel = "normal",
                           x.points = x[in2])$y
            h2[j,k] <- mean((y[in2] - y.h)^2)
        }
    }
    res.10cv[i,] <- colMeans(h)
    res.2cv [i,] <- colMeans(h2)
    ##
    cat(";"); if(i %% 10 == 0) cat("\n")
    ##
}); cat("\n")
##       user  system   elapsed
##    5127.83    0.37   7065.44  [lynne, 2005 Pentium III, 700 MHz] -- ~ 2 hours !
##    833.480   0.160   834.164  [lynne 2010-04 [Sys.MIPS() = 5600]] -- 14   minutes
##    487.105   0.046   488.224  [lynne 2010-04 [Sys.MIPS() = 5600]] --  8.1 min.
##    282.073   0.009   282.841  [lynne 2014-03 [Intel i7-4765T @ 2.0 GHz]
## n.b == 31 + leave-2-out ==> much more work
##    685.824   0.009   687.792  [lynne 2014-03 ...]
##    473.097   0.455   475.821  [lynne 2021-03 Intel i7-7700T @ 2.9 GHz ] -- 7 minutes 
## new N = 128:
##    498.922   0.112   502.875  [lynne 2022 v-lynne (Intel Xeon..)   -- 8.3 min

## colMeans(m) is faster than apply(m , 2, mean), but that is more general:
mean.cv <- cbind("GSE"   = apply(res.gse,    2, mean, na.rm=TRUE),
                 "rss1cv"= apply(res.1cv,    2, mean, na.rm=TRUE),
                 "rss10x"= apply(res.10x.cv, 2, mean, na.rm=TRUE),
                 "rss10cv"=apply(res.10cv,   2, mean, na.rm=TRUE),
                 "rss2cv"= apply(res.2cv,    2, mean, na.rm=TRUE))

Sfil <- {
    if(FALSE)
         "CV-sim_res4.rda"  ## <<- do not accidentally overwrite "true file"
    else "CV-sim_res4_scr.rda" ## <<- use as "scratch"
}

## save the simulation results
trySetwd(saveFolder)
save(b.widths,n.b, ff,N, mean.cv, res.gse, res.1cv, res.2cv, res.10cv, res.10x.cv,
     file = Sfil)
##   -----------


### Start here, if you do not want to redo the computations ------------------

trySetwd(saveFolder)
if(!exists("Sfil"))    Sfil <- "CV-sim_res4.rda"
if(!file.exists(Sfil)) Sfil <- "CV-sim_res4_scr.rda"
## if *still* not:
if(!file.exists(Sfil)) { ## try to get it from *old* web page
    webDir <- "http://stat.ethz.ch/Teaching/maechler/CompStat/"
    message("Trying to get simulation results from CompStat website @ stat.ethz.ch")
    (Url <- paste(webDir, Sfil, sep = ""))
    download.file(Url, Sfil)
}

(load(Sfil)) # (.): shows the object names
##-------

### This shows that  leave-1-out is fine, but the others don't seem so ...

require(sfsmisc)# for pdf.do() / pdf.end() etc

pdf.do(file = "cv1.pdf", height = 7, width = 10)

if(FALSE)## nicer colors:
    dput(RColorBrewer::brewer.pal(5,"Set1")) ## gives these :
cols <- c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00")
n.meth <- ncol(mean.cv)
lwds <- c(2, rep(1,n.meth-1))
matplot(b.widths, mean.cv, type = "b", lwd=lwds, lty=1, col=cols,
        xlab = "bandwidth",
        main = paste0("Gen.Err. & cross-validation: mean(.) values (B = ",
            nrow(res.1cv),")"), ylab = "")
lines(b.widths, mean.cv[,"GSE"], lwd=2, col=adjustcolor(cols[1], 0.6))
legend("topright", inset=.01,
       c("true Gen.Err.","leave-1-out CV", "10-fold CV",
         "leave-10-out CV", "leave-2-out CV"),
       lwd=lwds, pch=paste(1:n.meth), lty = 1, col = cols)

if(FALSE) # not by default
## For leave-1-out, "all the values" (in addition to mean):
boxplot(res.1cv, at = b.widths, add=TRUE, col="#7777771A", xaxt='n',
        boxwex = mean(diff(b.widths))/2)

pdf.end()


pdf.do(file = "cv2.pdf", height = 7, width = 10)

if(FALSE) # not ideal for 31
op <- mult.fig(n.b, marP= -c(2.5, 2, 0, 0.7), mgp=c(1, 0.4, 0))$old.par
op <- mult.fig(mfrow=c(4,8), marP= -c(2.5, 2, 0, 0.7), mgp=c(1, 0.4, 0))$old.par
for(i in 1:n.b) {
    boxplot(res.1cv[,i],res.2cv[,i], res.10cv[,i], res.10x.cv[,i],
            main= sprintf("bw= %4g", b.widths[i]),
            names = c("-1", "-2","-10","10x"), ylim = c(0.7,2.2))
    abline(mean.cv[i,"GSE"], 0, col = cols[1])
}
par(op)


pdf.end()

## res.1cv sometimes has NA (for very small 'h', m.hat(x) is not always defined!)
sum (is.na(res.1cv)) # 8 out of 200 x 31
mean(is.na(res.1cv)) # 0.0013
which(is.na(res.1cv), arr.ind = TRUE) ## 7 for the first, and one for the 2nd bandwidth

## No: not ok, for other stuff:
## res.1cv[is.na(res.1cv)] <- Inf


## which are the optimal bandwidths  'h' :
## =====================================
table((opt.h <- b.widths[apply(res.1cv, 1, which.min)]))
## (for the above set.seed(), and the original n.b=20, ..)
## 0.05  0.1 0.15  0.2 0.25  0.3 0.35  0.4    1
##    9   24   43   66   42   12    2    1    1
plot(table(opt.h), main="LOO-CV optimal bandwidths")
mtext(paste("for ", length(opt.h), "different simulations"))

## Show  l.o.o CV curves  ... one after the other ----------------
plot(b.widths, res.1cv[1,], type='o', lty=1, main = "l.o.o. CV")
plot(b.widths, res.1cv[2,], type='o', lty=1, main = "l.o.o. CV")
if(interactive()) for(i in 1:simCV) {
    plot(b.widths, res.1cv[i,], type='o', lty=1,
         main = paste0(i,"-th  l.o.o. CV"))
    Sys.sleep(0.25)
}


## Range of all CV scores:
(rngCV <- range(res.1cv, res.2cv, res.10cv, res.10x.cv, finite=TRUE))
# [0.69 2.10] -- or a bit smaller for plotting :
rngCV <- c(1.0,1.5)
mult.fig(4)## show all CV curves for all 4 CV kinds

## All the CV-1  curves: {a mess}
matplot(b.widths, t(res.1cv), type='l', lty=1, col="gray", ylim=rngCV,
        main = "Leave 1 out ('l.o.o')   CV")
lines(b.widths, mean.cv[,"GSE"], col=cols[1], lwd=3)
lines(b.widths, mean.cv[,"rss1cv"], col="gray10", lwd=2) ## the mean curve

all.equal(colMeans(res.1cv, na.rm=TRUE), mean.cv[,"rss1cv"])
## TRUE

## Leave 2 out:
matplot(b.widths, t(res.2cv), type='l', lty=1, col="gray", ylim=rngCV,
        main = "Leave 2 out  [B = 500] CV")
lines(b.widths, mean.cv[,"GSE"], col=cols[1], lwd=3)
lines(b.widths, mean.cv[,"rss2cv"], col="gray10", lwd=2) ## the mean curve
all.equal(colMeans(res.2cv, na.rm=TRUE), mean.cv[,"rss2cv"]) #- TRUE

## For  10-fold:
matplot(b.widths, t(res.10x.cv), type='l', lty=1, col="gray", ylim=rngCV,
        main = "10-fold   CV")
lines(b.widths, mean.cv[,"GSE"], col=cols[1], lwd=3)
lines(b.widths, mean.cv[,"rss10x"], col="gray10", lwd=2) ## the mean curve
##-> huge bias
all.equal(colMeans(res.10x.cv, na.rm=TRUE), mean.cv[,"rss10x"]) #- TRUE

## Leave 10 out:
matplot(b.widths, t(res.10cv), type='l', lty=1, col="gray", ylim=rngCV,
        main = "Leave 10 out  [B = 500] CV")
lines(b.widths, mean.cv[,"GSE"], col=cols[1], lwd=3)
lines(b.widths, mean.cv[,"rss10cv"], col="gray10", lwd=2) ## the mean curve
##-> "huge" bias ( the same as 10-fold:  n-d = 100-10 = 90 = n - |B_j| )
all.equal(colMeans(res.10cv, na.rm=TRUE), mean.cv[,"rss10cv"]) #- TRUE

Sys.info()
sessionInfo()