#### Funktionen geschrieben von Valentin Rousson
#### Minimal changes/adaptions, Martin Maechler, Sep.2012; Sep. 2020

clickText <- function(txt) title(sub = txt, col.sub = "firebrick", font.sub = 4)

f.compare.ise <- function(n = 50, sd = 0.1, nof = 1, b = 7, dpol = 3, N = 10,
                          rand.x = FALSE,
			  waitClick = (N <= 10), delay = .1)
{
    stopifnot( require("lokern") )
    ## Compare ISE (Integrated Squared Error) of linear, quadratic, polynomial and nonparametric fit

    ## n :    sample size aka "number of observations"
    ## sd:    standard deviation of noise
    ## nof=1: bimodal regression function on [0,b];
    ## nof=2: exponential regression function on [0,b];
    ## nof=3: logistic regression function on [-b,b]
    ## b:     right bound of the regression domain
    ## dpol:  degree of the polynomial model used
    ## N:     number of samples simulated
    ## rand.x if TRUE, use 'random' x in [a, b] otherwise (default) *equidistant* x
    stopifnot(nof %in% 1:3, dpol >= 1)
    switch(nof, { ## 1:
	f <- function(x) { dnorm(x,3*b/7,0.5)+dnorm(x,5*b/7,0.2) }
	a <- 0
	legx <- a
	labf <- "Bimodal function"
    }, { ## 2 :
	f <- function(x) { exp(-x) }
	a <- 0
	legx <- (2/3)*b
	labf <- "Exponential function"
    }, { ## 3 :
	f <- function(x) { exp(x)/(1+exp(x)) }
	a <- -b
	legx <- a
	labf <- "Logistic function"
    })

    xg <- seq(a,b, 0.01) # "g": values on regular [g]rid
    scale <- (b-a)/(length(xg)-1)
    ztrue <- f(xg)
    ylim  <- c(min(ztrue)- 4*sd,
	       max(ztrue)+ 4*sd)
    x <- if(rand.x) runif(n, a,b) else seq(a,b,len = n)
    X2 <- cbind(x,x^2)
    Xd <- matrix( , n, dpol)
    for(j in 1:dpol) Xd[,j] <- x^j

    ltyest <- c(1,1,1, 1, 2)
    colest <- c(1,2,3, 4, 1)
    labest <- c("linear","quadratic",paste("polynom",dpol),"nonpar","true")

    isels1 <- isels2 <- iselsd <- isenp <- numeric(N)

    op <- par(mfrow = c(1,1)); on.exit(par(op))

    for(I in 1:N) {
	y <- f(x) + rnorm(n,sd = sd)
        ##   ----------------------- the I-th data set
	ls1 <- lsfit( x,y)$coef
	ls2 <- lsfit(X2,y)$coef
	lsd <- lsfit(Xd,y)$coef
	np <- glkerns(x,y,x.out = xg)

        ## Compute fitted values z*(xg)  for the 4 models
	zls1 <- ls1[1]+xg*ls1[2]
	zls2 <- ls2[1]+xg*ls2[2] + xg^2*ls2[3]

	## zlsd <- lsd[1]
	## for(j in 1:dpol) zlsd <- zlsd + xg^j * lsd[j+1]
        ## Faster, closed form:
        zlsd <- outer(xg, 0:dpol, `^`) %*% lsd

	znp <- np$est

	isels1[I] <- sum((zls1-ztrue)^2)*scale
	isels2[I] <- sum((zls2-ztrue)^2)*scale
	iselsd[I] <- sum((zlsd-ztrue)^2)*scale
	isenp [I] <- sum((znp -ztrue)^2)*scale

	plot(x, y, ylim = ylim)
	lines(xg, zls1, lty = ltyest[1], col = colest[1])
	lines(xg, zls2, lty = ltyest[2], col = colest[2])
	lines(xg, zlsd, lty = ltyest[3], col = colest[3])
	lines(xg, znp,	lty = ltyest[4], col = colest[4])
	lines(xg, ztrue,lty = ltyest[5], col = colest[5])

	legtxt <- paste(labest,c(rep("	ISE=",4),""),
			c(round(c(isels1[I],isels2[I],iselsd[I],isenp[I]),4),""))
	legend(legx, ylim[2], legtxt, lty = ltyest, col = colest)
	title(paste("Sample",I,":",labf,"n=",n,"sd=",round(sd,2)))
	if(waitClick) {
	    clickText("Click on the plot to get the next sample!")
	    locator(1)
	} else Sys.sleep(delay)
    }

    boxplot(isels1,isels2,iselsd,isenp,
	    names = labest[1:4])
    title(paste("Summary of ISE of",N,"samples:",labf,"n=",n,"sd=",round(sd,2)))

    list(isels1=isels1, isels2=isels2, iselsd=iselsd, isenp=isenp,
	 N = N, labFunc = labf, labest=labest)
}


f.compare.bdw <- function(h = c(.2,.5,1), n = 50, sd = 0.2,
                          rand.x = FALSE,
			  nof = 1, b = 7, N = 10,
			  oneplot = (N > 10), click = !oneplot)
{
    ## compare ise of nonparametric fits with three different bandwidth given
    ## by the user, as well as with the "optimal" bandwidth calculated by glkerns

    ## h: vector of 3 different bandwidths to be compared (or also length 1 or 2)
    ## n: sample size
    ## sd: standard deviation of noise

    ## nof=1: bimodal regression function     on  [0,b];
    ## nof=2: exponential regression function on  [0,b];
    ## nof=3: logistic regression function    on [-b,b]

    ## b: right bound of the regression domain
    ## N: number of samples simulated
    ## oneplot: the estimates for different bandwidths are shown
    ##	  on the same plot (oneplot = TRUE) or different plots (oneplot=FALSE)

    stopifnot(require("lokern"),
	      nof %in% 1:3, length(h) >= 1, h > 0, length(h) >= 2)
    switch(nof, { ## 1:
	f <- function(x) { dnorm(x,3*b/7,0.5)+dnorm(x,5*b/7,0.2) }
	a <- 0
	legx <- a
	labf <- "Bimodal function"
    }, { ## 2 :
	f <- function(x) { exp(-x) }
	a <- 0
	legx <- (1/3)*b
	labf <- "Exponential function"
    }, { ## 3 :
	f <- function(x) { exp(x)/(1+exp(x)) }
	a <- -b
	legx <- a
	labf <- "Logistic function"
    })

    if(length(h) == 1) h <- c(.5*h,h,2*h)
    else if(length(h) == 2) h <- sort(h[1],h[2],max(h)*2)
    else if(length(h) >= 3) h <- sort(h[1:3])

    xg <- seq(a,b, 0.01) # "g": values on regular [g]rid
    scale <- (b-a)/(length(xg)-1)
    ztrue <- f(xg)
    ylim  <- c(min(ztrue)- 4*sd,
	       max(ztrue)+ 4*sd)
    if(!rand.x) # fixed equidistant x  (default):
        x <- seq(a,b,len = n)
    ltyest <- c(1,1,1,1, 2)
    colest <- c(1,2,3,4, 1)
    lab.h <- c(paste("h=", round(h,2)), "hopt")
    iseh <- matrix(0, nrow = N, ncol = 4, dimnames=list(NULL, lab.h))
    ## to contain the fitted values z^("zhat"): {N samples} x {4 models} x {at xgrid = {x_i}}
    zh <- array(dim = c(N,4,length(xg)))
    hOpt <- numeric(N)
    op <- par(mfrow = if(oneplot) c(1,1) else c(2,2))
    on.exit(par(op))

    for(I in 1:N) {
        if(rand.x) # random design
            x <- runif(n, a,b)
	y <- f(x) + rnorm(n,sd = sd)
        ##   ----------------------- the I-th data set  'sample'  (x, y)
	for(i in 1:4) {
	    zz <-
                if(i <= 3)
                    glkerns(x,y, x.out = xg, bandwidth = h[i])$est
	        else { #rgnp.R## last one: take automatic h = "hopt"
                    tmp <- glkerns(x,y, x.out = xg)
                    hOpt[I] <- hopt <- tmp$bandwidth
                    tmp$est
                }
	    iseh[I,i]  <- sum((zz - ztrue)^2)*scale
            zh  [I,i,] <- zz
	}
	legtxt <- c(paste(lab.h, " ISE=", signif(iseh[I,], digits=5)),
                    "true")
	if(oneplot) {
	    plot(x,y,ylim = ylim)
	    for(i in 1:4)
		lines(xg,zh[I,i,],lty = ltyest[i],col = colest[i])
	    lines(xg,ztrue,lty = ltyest[5],col = colest[5])
	    legend(legx,ylim[2],legtxt,lty = ltyest,col = colest)
	    title(paste("Sample",I,":",labf,"n=",n,
			"sd=",round(sd,2),"hopt=",round(hopt,2)))
	    if(click) {
		clickText("Click on the plot to get the next sample!")
		locator(1)
	    }
	} else { ## (!oneplot)
	    for(i in 1:4) {
		plot(x,y,ylim = ylim)
		lines(xg, zh[I,i,],lty = ltyest[i], col = colest[i])
		lines(xg, ztrue,   lty = ltyest[5], col = colest[5])
		legend(legx,ylim[2], legtxt[c(i,5)],
                       lty = ltyest[c(i,5)], col = colest[c(i,5)])
		title(paste("Sample",I,":",labf,"n=",n,"sd=",round(sd,2),
			    if(i < 4) paste("h=",round(h[i],2))
			    else paste("hopt=",round(hopt,2))))
		if(click)
		    clickText("Click on the plot to get the next sample!")
	    }
	    if(click) locator(1)
	}
    }## for( 1:N )

    meanzh <- apply(zh,2:3,"mean")
    biash <- varh <- mseh <- numeric(4)
    for(i in 1:4) {
	biash[i] <- sum((meanzh[i,]-ztrue)^2)*scale # Bias^2 (squared!)
	mseh [i] <- mean(iseh[,i])
	varh [i] <- mseh[i]-biash[i]  ## MSE = Var + Bias^2
    }
    par(mfrow = c(1,1))
    boxplot(iseh, ylim = c(0,max(iseh)))
    abline(h=0, col="gray", lty=3)
    text(1:4,biash,"Bias^2",col = 2)
    text(1:4,varh, "Var", col = 3)
    text(1:4,mseh, "MSE", col = 4)
    title(paste("Summary of ISE of",N,"samples:",labf,"n=",n,"sd=",round(sd,2),
                if(rand.x) "- random x"))

    list(hOpt=hOpt, iseh=iseh, biash=biash, varh=varh, mseh=mseh)
}


##' This function plots distance curves and velocity curves of nboys boys and
##' ngirls girls (randomly selected from a population of 120 boys and 112 girls)
##' It also allows to extract features characterizing the
##' puberty from the velocity curves
##' @param nboys
##' @param ngirls
##' @author Valentin Rousson & Martin Maechler
f.compare.veloc <- function(nboys = 12, ngirls = nboys,
                            maxvel = 18, age.grid = seq(0, 20, 1/64),
                            notch=TRUE, ...)
{
    stopifnot( require("lokern"), nboys >= 1, ngirls >= 1)
    growthLab <- "growth velocity [cm / yr]"
    draw.and.select <- function(dat, nSel, col.velo)
    {
	name <- deparse(substitute(dat))
	stopifnot(is.list(dat), is.list(dat[[1]]),
		  nSel >= 1, nSel == as.integer(nSel))
	if(grepl("s$", name)) ## drop plural "s":
	    name <- sub("s$", "", name)
	N <- length(dat)
	nSel <- min(N, nSel)
	i. <- sample(1:N, nSel) # randomly selected children
	tpub <- peak <- numeric(nSel)
	for(i in 1:nSel) { ## for each child [i]
	    x  <- dat[[i.[i]]]$age
	    y  <- dat[[i.[i]]]$height
	    no <- dat[[i.[i]]]$ID
	    dist <- glkerns(x,y,x.out = age.grid)$est
	    dx <- x[-1]-diff(x)/2
            rx <- range(x, dx, age.grid)
            ## (originally had  ylab = 'distance')
	    plot(x,y, xlim=rx, ylim= range(y,dist), xlab = "age", ylab = "height")
	    lines(age.grid, dist)
	    title(paste("Distance curve for",name,"no",no))
            mtext(sprintf("%3d of %d",i, nSel), adj=1)

            ## divided differences  dy/dx  as poor man's estimate -- need to be smoothed
	    dy <- diff(y)/diff(x)
	    veloc0 <- glkerns(dx,dy, x.out = age.grid)$est
	    hopt <- glkerns(x,y, x.out = age.grid, deriv = 1)$bandwidth
	    tmp <- interpol.xy(x,y)
	    newx <- tmp$x
	    newy <- tmp$y
	    veloc1 <- glkerns(newx,newy, x.out = age.grid, deriv = 1, bandwidth = hopt)$est
	    plot(dx, pmin(dy, maxvel),
                 xlim=rx, ylim = c(0,maxvel), xlab = "age", ylab = growthLab)
	    lines(age.grid, veloc1, col = col.velo, lwd=2)
	    lines(age.grid, veloc0, col = 1)
	    legend("top", c("Estimate of derivative", "Smoothing the differences"),
		   lwd = c(2,1), lty = c(1,1), col = c(col.velo, 1), bty="n")
	    title(paste("Velocity curve for",name,"no",no))
	    clickText("Click on the puberty peak")
	    tmp <- locator(1)
	    tpub[i] <- tmp$x
	    peak[i] <- tmp$y
	}
	list(tpub = tpub, peak = peak)
    } ## draw.and.select()

    r <- getGrData()
    boys  <- r$boys
    girls <- r$girls

    op <- par(mfrow = c(2,1), mgp = c(1.5, 0.6, 0), mar = .1+c(7,5,4,1)/2)
    on.exit(par(op))

    L.boys <- draw.and.select(boys , nboys,  col.velo = "skyblue3")
    L.girl <- draw.and.select(girls, ngirls, col.velo = "pink4")

    par(mfrow = c(1,2))
    boxplot(L.boys$tpub, L.girl$tpub, names = c("boys","girls"), notch=notch, ...)
    title(paste("\"High\" puberty age for", nboys,
		"boys and", ngirls,"girls"), cex.main = 0.8,
          ylab = "age [years]")
    boxplot(L.boys$peak, L.girl$peak, names = c("boys","girls"), notch=notch, ...)
    title(paste("\"High\" puberty peak for", nboys,
		"boys and", ngirls,"girls"), cex.main = 0.8, ylab = growthLab)
    invisible(list(boys = L.boys, girls = L.girl))
}
##----------------------------------------------------------------------------

##' Get Growth Data  [ called only from f.compare.veloc() ]
getGrData <- function(dir = "http://stat.ethz.ch/Teaching/Datasets/WBL")
{
    age	   <- matrix(scan(paste(dir,"zeiten.dat", sep="/")),
		     byrow = TRUE, ncol = 38)
    height <- matrix(scan(paste(dir,"laengen.dat", sep="/")),
		     byrow = TRUE, ncol = 38)
    stopifnot(dim(age) == dim(height))
    ID  <- age[,1]
    sex <- age[,2]
    stopifnot(height[,1] == ID, height[,2] == sex) # consistency check
    get.hgt.age <- function(sel) {
        stopifnot(is.logical(sel), length(sel) == length(ID))
        nn <- length(ind <- which(sel))
        r <- vector("list", nn)
        for(i in seq_len(nn)) {
            h.i <- abs(height[ind[i], -(1:2)])
            a.i <-        age[ind[i], -(1:2)]
            r[[i]] <-
                list(ID = ID[ind[i]],
                     height = h.i[a.i > 0],
                     age    = a.i[a.i > 0])
        }
        r
    }
    boys  <- get.hgt.age(sex == 9)
    girls <- get.hgt.age(sex == 8)

    list(boys=boys, girls=girls)
}

##' growthData() : returns one data frame {more useful/modern}
growthData <- function(dir = "http://stat.ethz.ch/Teaching/Datasets/WBL")
{
    age	   <- matrix(scan(paste(dir,"zeiten.dat", sep="/")),
		     byrow = TRUE, ncol = 38)
    height <- matrix(scan(paste(dir,"laengen.dat", sep="/")),
		     byrow = TRUE, ncol = 38)
    stopifnot(dim(age) == dim(height))
    ID  <- age[,1]
    sex <- age[,2]
    iNA <- age [,-(1:2)] == 0
    stopifnot(height[,1] == ID, height[,2] == sex, # consistency checks
              ## the '0's are non recorded data
              identical(iNA, height[,-(1:2)] == 0))
    nObs <- rowSums(!iNA) # number of observations per child
    ## in 29 .. 36, most in 32,33,34;  some negative 'heights' ==> take(abs)
    data.frame(ID = factor(rep(ID,                nObs)),
               sex= factor(rep(c("f","m")[sex-7], nObs)),
               age   = {   age <-     t(age   [,-(1:2)]) ; age   [   age != 0]},
               height= {height <- abs(t(height[,-(1:2)])); height[height != 0]})
}

if(FALSE) {
    dGrowth <- growthData()
    require(lattice)
    xyplot(height ~ age | sex, group=~ID, dGrowth, type ="l") # Girls vs Boys
    xyplot(height ~ age | sex:ID, dGrowth, type ="l") # individuals (all 'f' | all 'm')
    xyplot(height ~ age | ID,     dGrowth, type ="l") # individuals
}

## called from  draw.and.select() above
## MM: This almost surely just a not so efficient version of  approx()
interpol.xy <- function(x,y, f = 1.25) {
    n <- length(x)
    stopifnot(n >= 2, length(y) == n,
	      is.numeric(f), length(f)==1, f >= 1)
    m <- round(f*n)
    newx <- seq(min(x),max(x),len = m)
    newy <- c(y[1],rep(0,m-2),y[n])
    for(i in 2:(m-1)) {
	i0 <- max(which(x < newx[i]))
	i1 <- min(which(x > newx[i]))
	b <- (y[i1]-y[i0])/(x[i1]-x[i0])
	a <- y[i1] - b*x[i1]
	newy[i] <- a + b*newx[i]
    }
    list(x=newx, y=newy)
}