#### Funktionen geschrieben von Valentin Rousson #### Minimal changes/adaptions, Martin Maechler, Sep.2012; Sep. 2020 clickText <- function(txt) title(sub = txt, col.sub = "firebrick", font.sub = 4) f.compare.ise <- function(n = 50, sd = 0.1, nof = 1, b = 7, dpol = 3, N = 10, rand.x = FALSE, waitClick = (N <= 10), delay = .1) { stopifnot( require("lokern") ) ## Compare ISE (Integrated Squared Error) of linear, quadratic, polynomial and nonparametric fit ## n : sample size aka "number of observations" ## sd: standard deviation of noise ## nof=1: bimodal regression function on [0,b]; ## nof=2: exponential regression function on [0,b]; ## nof=3: logistic regression function on [-b,b] ## b: right bound of the regression domain ## dpol: degree of the polynomial model used ## N: number of samples simulated ## rand.x if TRUE, use 'random' x in [a, b] otherwise (default) *equidistant* x stopifnot(nof %in% 1:3, dpol >= 1) switch(nof, { ## 1: f <- function(x) { dnorm(x,3*b/7,0.5)+dnorm(x,5*b/7,0.2) } a <- 0 legx <- a labf <- "Bimodal function" }, { ## 2 : f <- function(x) { exp(-x) } a <- 0 legx <- (2/3)*b labf <- "Exponential function" }, { ## 3 : f <- function(x) { exp(x)/(1+exp(x)) } a <- -b legx <- a labf <- "Logistic function" }) xg <- seq(a,b, 0.01) # "g": values on regular [g]rid scale <- (b-a)/(length(xg)-1) ztrue <- f(xg) ylim <- c(min(ztrue)- 4*sd, max(ztrue)+ 4*sd) x <- if(rand.x) runif(n, a,b) else seq(a,b,len = n) X2 <- cbind(x,x^2) Xd <- matrix( , n, dpol) for(j in 1:dpol) Xd[,j] <- x^j ltyest <- c(1,1,1, 1, 2) colest <- c(1,2,3, 4, 1) labest <- c("linear","quadratic",paste("polynom",dpol),"nonpar","true") isels1 <- isels2 <- iselsd <- isenp <- numeric(N) op <- par(mfrow = c(1,1)); on.exit(par(op)) for(I in 1:N) { y <- f(x) + rnorm(n,sd = sd) ## ----------------------- the I-th data set ls1 <- lsfit( x,y)$coef ls2 <- lsfit(X2,y)$coef lsd <- lsfit(Xd,y)$coef np <- glkerns(x,y,x.out = xg) ## Compute fitted values z*(xg) for the 4 models zls1 <- ls1[1]+xg*ls1[2] zls2 <- ls2[1]+xg*ls2[2] + xg^2*ls2[3] ## zlsd <- lsd[1] ## for(j in 1:dpol) zlsd <- zlsd + xg^j * lsd[j+1] ## Faster, closed form: zlsd <- outer(xg, 0:dpol, `^`) %*% lsd znp <- np$est isels1[I] <- sum((zls1-ztrue)^2)*scale isels2[I] <- sum((zls2-ztrue)^2)*scale iselsd[I] <- sum((zlsd-ztrue)^2)*scale isenp [I] <- sum((znp -ztrue)^2)*scale plot(x, y, ylim = ylim) lines(xg, zls1, lty = ltyest[1], col = colest[1]) lines(xg, zls2, lty = ltyest[2], col = colest[2]) lines(xg, zlsd, lty = ltyest[3], col = colest[3]) lines(xg, znp, lty = ltyest[4], col = colest[4]) lines(xg, ztrue,lty = ltyest[5], col = colest[5]) legtxt <- paste(labest,c(rep(" ISE=",4),""), c(round(c(isels1[I],isels2[I],iselsd[I],isenp[I]),4),"")) legend(legx, ylim[2], legtxt, lty = ltyest, col = colest) title(paste("Sample",I,":",labf,"n=",n,"sd=",round(sd,2))) if(waitClick) { clickText("Click on the plot to get the next sample!") locator(1) } else Sys.sleep(delay) } boxplot(isels1,isels2,iselsd,isenp, names = labest[1:4]) title(paste("Summary of ISE of",N,"samples:",labf,"n=",n,"sd=",round(sd,2))) list(isels1=isels1, isels2=isels2, iselsd=iselsd, isenp=isenp, N = N, labFunc = labf, labest=labest) } f.compare.bdw <- function(h = c(.2,.5,1), n = 50, sd = 0.2, rand.x = FALSE, nof = 1, b = 7, N = 10, oneplot = (N > 10), click = !oneplot) { ## compare ise of nonparametric fits with three different bandwidth given ## by the user, as well as with the "optimal" bandwidth calculated by glkerns ## h: vector of 3 different bandwidths to be compared (or also length 1 or 2) ## n: sample size ## sd: standard deviation of noise ## nof=1: bimodal regression function on [0,b]; ## nof=2: exponential regression function on [0,b]; ## nof=3: logistic regression function on [-b,b] ## b: right bound of the regression domain ## N: number of samples simulated ## oneplot: the estimates for different bandwidths are shown ## on the same plot (oneplot = TRUE) or different plots (oneplot=FALSE) stopifnot(require("lokern"), nof %in% 1:3, length(h) >= 1, h > 0, length(h) >= 2) switch(nof, { ## 1: f <- function(x) { dnorm(x,3*b/7,0.5)+dnorm(x,5*b/7,0.2) } a <- 0 legx <- a labf <- "Bimodal function" }, { ## 2 : f <- function(x) { exp(-x) } a <- 0 legx <- (1/3)*b labf <- "Exponential function" }, { ## 3 : f <- function(x) { exp(x)/(1+exp(x)) } a <- -b legx <- a labf <- "Logistic function" }) if(length(h) == 1) h <- c(.5*h,h,2*h) else if(length(h) == 2) h <- sort(h[1],h[2],max(h)*2) else if(length(h) >= 3) h <- sort(h[1:3]) xg <- seq(a,b, 0.01) # "g": values on regular [g]rid scale <- (b-a)/(length(xg)-1) ztrue <- f(xg) ylim <- c(min(ztrue)- 4*sd, max(ztrue)+ 4*sd) if(!rand.x) # fixed equidistant x (default): x <- seq(a,b,len = n) ltyest <- c(1,1,1,1, 2) colest <- c(1,2,3,4, 1) lab.h <- c(paste("h=", round(h,2)), "hopt") iseh <- matrix(0, nrow = N, ncol = 4, dimnames=list(NULL, lab.h)) ## to contain the fitted values z^("zhat"): {N samples} x {4 models} x {at xgrid = {x_i}} zh <- array(dim = c(N,4,length(xg))) hOpt <- numeric(N) op <- par(mfrow = if(oneplot) c(1,1) else c(2,2)) on.exit(par(op)) for(I in 1:N) { if(rand.x) # random design x <- runif(n, a,b) y <- f(x) + rnorm(n,sd = sd) ## ----------------------- the I-th data set 'sample' (x, y) for(i in 1:4) { zz <- if(i <= 3) glkerns(x,y, x.out = xg, bandwidth = h[i])$est else { #rgnp.R## last one: take automatic h = "hopt" tmp <- glkerns(x,y, x.out = xg) hOpt[I] <- hopt <- tmp$bandwidth tmp$est } iseh[I,i] <- sum((zz - ztrue)^2)*scale zh [I,i,] <- zz } legtxt <- c(paste(lab.h, " ISE=", signif(iseh[I,], digits=5)), "true") if(oneplot) { plot(x,y,ylim = ylim) for(i in 1:4) lines(xg,zh[I,i,],lty = ltyest[i],col = colest[i]) lines(xg,ztrue,lty = ltyest[5],col = colest[5]) legend(legx,ylim[2],legtxt,lty = ltyest,col = colest) title(paste("Sample",I,":",labf,"n=",n, "sd=",round(sd,2),"hopt=",round(hopt,2))) if(click) { clickText("Click on the plot to get the next sample!") locator(1) } } else { ## (!oneplot) for(i in 1:4) { plot(x,y,ylim = ylim) lines(xg, zh[I,i,],lty = ltyest[i], col = colest[i]) lines(xg, ztrue, lty = ltyest[5], col = colest[5]) legend(legx,ylim[2], legtxt[c(i,5)], lty = ltyest[c(i,5)], col = colest[c(i,5)]) title(paste("Sample",I,":",labf,"n=",n,"sd=",round(sd,2), if(i < 4) paste("h=",round(h[i],2)) else paste("hopt=",round(hopt,2)))) if(click) clickText("Click on the plot to get the next sample!") } if(click) locator(1) } }## for( 1:N ) meanzh <- apply(zh,2:3,"mean") biash <- varh <- mseh <- numeric(4) for(i in 1:4) { biash[i] <- sum((meanzh[i,]-ztrue)^2)*scale # Bias^2 (squared!) mseh [i] <- mean(iseh[,i]) varh [i] <- mseh[i]-biash[i] ## MSE = Var + Bias^2 } par(mfrow = c(1,1)) boxplot(iseh, ylim = c(0,max(iseh))) abline(h=0, col="gray", lty=3) text(1:4,biash,"Bias^2",col = 2) text(1:4,varh, "Var", col = 3) text(1:4,mseh, "MSE", col = 4) title(paste("Summary of ISE of",N,"samples:",labf,"n=",n,"sd=",round(sd,2), if(rand.x) "- random x")) list(hOpt=hOpt, iseh=iseh, biash=biash, varh=varh, mseh=mseh) } ##' This function plots distance curves and velocity curves of nboys boys and ##' ngirls girls (randomly selected from a population of 120 boys and 112 girls) ##' It also allows to extract features characterizing the ##' puberty from the velocity curves ##' @param nboys ##' @param ngirls ##' @author Valentin Rousson & Martin Maechler f.compare.veloc <- function(nboys = 12, ngirls = nboys, maxvel = 18, age.grid = seq(0, 20, 1/64), notch=TRUE, ...) { stopifnot( require("lokern"), nboys >= 1, ngirls >= 1) growthLab <- "growth velocity [cm / yr]" draw.and.select <- function(dat, nSel, col.velo) { name <- deparse(substitute(dat)) stopifnot(is.list(dat), is.list(dat[[1]]), nSel >= 1, nSel == as.integer(nSel)) if(grepl("s$", name)) ## drop plural "s": name <- sub("s$", "", name) N <- length(dat) nSel <- min(N, nSel) i. <- sample(1:N, nSel) # randomly selected children tpub <- peak <- numeric(nSel) for(i in 1:nSel) { ## for each child [i] x <- dat[[i.[i]]]$age y <- dat[[i.[i]]]$height no <- dat[[i.[i]]]$ID dist <- glkerns(x,y,x.out = age.grid)$est dx <- x[-1]-diff(x)/2 rx <- range(x, dx, age.grid) ## (originally had ylab = 'distance') plot(x,y, xlim=rx, ylim= range(y,dist), xlab = "age", ylab = "height") lines(age.grid, dist) title(paste("Distance curve for",name,"no",no)) mtext(sprintf("%3d of %d",i, nSel), adj=1) ## divided differences dy/dx as poor man's estimate -- need to be smoothed dy <- diff(y)/diff(x) veloc0 <- glkerns(dx,dy, x.out = age.grid)$est hopt <- glkerns(x,y, x.out = age.grid, deriv = 1)$bandwidth tmp <- interpol.xy(x,y) newx <- tmp$x newy <- tmp$y veloc1 <- glkerns(newx,newy, x.out = age.grid, deriv = 1, bandwidth = hopt)$est plot(dx, pmin(dy, maxvel), xlim=rx, ylim = c(0,maxvel), xlab = "age", ylab = growthLab) lines(age.grid, veloc1, col = col.velo, lwd=2) lines(age.grid, veloc0, col = 1) legend("top", c("Estimate of derivative", "Smoothing the differences"), lwd = c(2,1), lty = c(1,1), col = c(col.velo, 1), bty="n") title(paste("Velocity curve for",name,"no",no)) clickText("Click on the puberty peak") tmp <- locator(1) tpub[i] <- tmp$x peak[i] <- tmp$y } list(tpub = tpub, peak = peak) } ## draw.and.select() r <- getGrData() boys <- r$boys girls <- r$girls op <- par(mfrow = c(2,1), mgp = c(1.5, 0.6, 0), mar = .1+c(7,5,4,1)/2) on.exit(par(op)) L.boys <- draw.and.select(boys , nboys, col.velo = "skyblue3") L.girl <- draw.and.select(girls, ngirls, col.velo = "pink4") par(mfrow = c(1,2)) boxplot(L.boys$tpub, L.girl$tpub, names = c("boys","girls"), notch=notch, ...) title(paste("\"High\" puberty age for", nboys, "boys and", ngirls,"girls"), cex.main = 0.8, ylab = "age [years]") boxplot(L.boys$peak, L.girl$peak, names = c("boys","girls"), notch=notch, ...) title(paste("\"High\" puberty peak for", nboys, "boys and", ngirls,"girls"), cex.main = 0.8, ylab = growthLab) invisible(list(boys = L.boys, girls = L.girl)) } ##---------------------------------------------------------------------------- ##' Get Growth Data [ called only from f.compare.veloc() ] getGrData <- function(dir = "http://stat.ethz.ch/Teaching/Datasets/WBL") { age <- matrix(scan(paste(dir,"zeiten.dat", sep="/")), byrow = TRUE, ncol = 38) height <- matrix(scan(paste(dir,"laengen.dat", sep="/")), byrow = TRUE, ncol = 38) stopifnot(dim(age) == dim(height)) ID <- age[,1] sex <- age[,2] stopifnot(height[,1] == ID, height[,2] == sex) # consistency check get.hgt.age <- function(sel) { stopifnot(is.logical(sel), length(sel) == length(ID)) nn <- length(ind <- which(sel)) r <- vector("list", nn) for(i in seq_len(nn)) { h.i <- abs(height[ind[i], -(1:2)]) a.i <- age[ind[i], -(1:2)] r[[i]] <- list(ID = ID[ind[i]], height = h.i[a.i > 0], age = a.i[a.i > 0]) } r } boys <- get.hgt.age(sex == 9) girls <- get.hgt.age(sex == 8) list(boys=boys, girls=girls) } ##' growthData() : returns one data frame {more useful/modern} growthData <- function(dir = "http://stat.ethz.ch/Teaching/Datasets/WBL") { age <- matrix(scan(paste(dir,"zeiten.dat", sep="/")), byrow = TRUE, ncol = 38) height <- matrix(scan(paste(dir,"laengen.dat", sep="/")), byrow = TRUE, ncol = 38) stopifnot(dim(age) == dim(height)) ID <- age[,1] sex <- age[,2] iNA <- age [,-(1:2)] == 0 stopifnot(height[,1] == ID, height[,2] == sex, # consistency checks ## the '0's are non recorded data identical(iNA, height[,-(1:2)] == 0)) nObs <- rowSums(!iNA) # number of observations per child ## in 29 .. 36, most in 32,33,34; some negative 'heights' ==> take(abs) data.frame(ID = factor(rep(ID, nObs)), sex= factor(rep(c("f","m")[sex-7], nObs)), age = { age <- t(age [,-(1:2)]) ; age [ age != 0]}, height= {height <- abs(t(height[,-(1:2)])); height[height != 0]}) } if(FALSE) { dGrowth <- growthData() require(lattice) xyplot(height ~ age | sex, group=~ID, dGrowth, type ="l") # Girls vs Boys xyplot(height ~ age | sex:ID, dGrowth, type ="l") # individuals (all 'f' | all 'm') xyplot(height ~ age | ID, dGrowth, type ="l") # individuals } ## called from draw.and.select() above ## MM: This almost surely just a not so efficient version of approx() interpol.xy <- function(x,y, f = 1.25) { n <- length(x) stopifnot(n >= 2, length(y) == n, is.numeric(f), length(f)==1, f >= 1) m <- round(f*n) newx <- seq(min(x),max(x),len = m) newy <- c(y[1],rep(0,m-2),y[n]) for(i in 2:(m-1)) { i0 <- max(which(x < newx[i])) i1 <- min(which(x > newx[i])) b <- (y[i1]-y[i0])/(x[i1]-x[i0]) a <- y[i1] - b*x[i1] newy[i] <- a + b*newx[i] } list(x=newx, y=newy) }