## typing in the data, the target y is the fraction of UK pupils admitted into Oxf/Cam ## the explanatory variable x is school_month birth_month <- c(9:12,1:8) school_month <- 0:11 number_admissions <- c(470,515,470,457,473,381,466,457,437,396,384,394) number_pupils <- c(66776,64167,59603,62877,62543,57597,65319,62651,65454,64429,64612,62738) y <- 100* number_admissions / number_pupils ## fit a linear model with y as response variable and school_month as explanatory variable linmod <- lm( y ~ school_month ) ## estimate the noise level (is also stored in linmod object, see for example summary(linmod) n <- length(y) p <- 2 sigmahat <- sqrt( sum(residuals(linmod)^2) / (n-p)) ########################################################################################################## ##################################################### generate (parametric) bootstrap versions of the linear model fit ## generate a plot with regression line in red and 2000 parametric bootstrap simulations as black lines pdf(file="admission3_parametric_bootstrap2.pdf",width=6,height=6) plot(school_month, fitted(linmod),xlab="MONTHS AFTER SCHOOL YEAR STARTS",ylab="ADMISSION RATE INTO OXFORD/CAMBRIDGE (%)",type="l",col=2,lwd=4,ylim=c(0.58,0.83)) ## number of bootstrap simulations nsim <- 2000 ## record the coefficients of each linear model in this matrix coefmatrix <- matrix(nrow=nsim,ncol=p) for (i in 1:nsim){ ## create new observations (fitted + plus Gaussian noise) newy <- fitted(linmod) + sigmahat*rnorm(n) ## fit the linear model with the new observations ll <- lm(newy ~ school_month) ## store intercept and slope in the matrix coefmatrix[i,] <- coef(ll) ## plot the lines on top of original fit lines( school_month, fitted(ll), col=rgb(0.2,0.2,0.2,0.1)) } dev.off() ########################################################################################################## #################################################### show scatterplot of coefficients ## scatterplot of intercept versus slope pdf(file="admission3_coefficients.pdf",width=6.5,height=6) par(mar=c(4,5,2,2)) plot(coefmatrix,xlab=expression(hat(alpha)),ylab=expression(hat(beta)),cex.lab=1.3) dev.off() ########################################################################################################## ########################################################## compute simultaneous confidence bands ### construct confidence band at the locations in the vector x ### (note we extend into a non-sensical range) x <- seq(-2,12,length=200) ## fitted values at these locations fittedval <- as.numeric(coef(linmod)[1] + coef(linmod)[2]*x) ## 95% quantile of F-distribution (FQ = 4.102) FQ <- qf(0.95, p, n-p) ## initialise lower- and upper- bound of simultaneous confidence interval upperbound <- lowerbound <- numeric(length(x)) ## get (X^t X)^{-1} X <- cbind(rep(1,n),school_month) colnames(X) <- c("intercept", "school_month") SI <- solve( t(X) %*% X) ## loop over all points of prediction for (i in 1:length(x)){ ## get difference between fitted values and upper- and lower-bound difference <- (FQ * sigmahat^2 * p) * ( t(c(1,x[i])) %*% SI %*% (c(1,x[i]))) ## get lower- and upper-bound upperbound[i] <- fittedval[i] + sqrt(difference) lowerbound[i] <- fittedval[i] - sqrt(difference) } ## combine lower-, upper-bounds and fitted values into a matrix allbounds <- cbind(lowerbound,fittedval,upperbound) ## plot the 95% confidence bands pdf(file="admission3_confband.pdf",width=6,height=6) matplot(x,allbounds,type="l", col=c("blue","red","blue"),lwd=3,lty=c(3,1,3),ylab="CONFIDENCE BANDS",xlab="MONTHS AFTER SCHOOL YEAR STARTS") dev.off()