# Confidence intervals and bands: library(ISwR) # "Corresponding to Introductory Statistics with R" # by Peter Dalgaard # Load data data(thuesen) ?thuesen summary(thuesen) # Remove observation with missing y-value: thuesen <- thuesen[!is.na(thuesen[,2]),] # Check result summary(thuesen) attach(thuesen) # Create first plot: plot(blood.glucose, short.velocity) lines(loess.smooth(blood.glucose, short.velocity), col="red") # Fit simple linear regression fit <- lm(short.velocity~blood.glucose) summary(fit) # Recall how to get confidence intervals for the # parameters alpha and beta: confint(fit) # In this script we focus on confidence intervals # for E(y_0) and y_0 instead. # Get fitted values by hand: fit$coef fit$coef[1] + fit$coef[2]*blood.glucose # Or automatically: predict(fit) # Get confidence interval for E(y_1) by hand: # Sample size: (n <- length(blood.glucose)) # x-value at which we want to create confidence interval blood.glucose[1] # Predicted value (midpoint of confidence interval): (fitted <- fit$coef[1] + fit$coef[2]*15.3) # Quantile of t-distribution that we need: (quant <- qt(.975,n-2)) # Sigma.hat value: (sigma.hat <- sqrt((n-1)/(n-2)*var(fit$resid))) # Some explanation: # Note that sigma.hat almost equals sd(fit$resid) # However, sd uses n-1 in the denominator, while # \hat sigma^2 uses n-2 in the denominator. # Therefore we correct for this. # Note that the result equals the residual standard error # in summary(fit), as it should. # Now compute estimate for sd(\hat y_1): (sd.haty1 <- sigma.hat * sqrt(1/n + (15.3-mean(blood.glucose))^2 / sum( (blood.glucose-mean(blood.glucose))^2) )) # Now we can construct confidence interval: (lower <- fitted - quantile*sd.haty1) (upper <- fitted + quantile*sd.haty1) # Compare to automatic result (much shorter!): predict(fit, int="c") # Yes, our values correspond to the first line. # Get confidence interval for y_1 # (assuming a *new* observations with x-value x_1) (sd.haty1.minus.y1 <- sigma.hat * sqrt(1 + 1/n + (15.3-mean(blood.glucose))^2 / sum( (blood.glucose-mean(blood.glucose))^2))) # Note this is almost the same formula as before, # but we added a 1 within the square root # Construct confidence interval: (lower <- fitted - quant*sd.haty1.minus.y1) (upper <- fitted + quant*sd.haty1.minus.y1) # Compare to automatic result: predict(fit, int="p") # Yes, our results match to the first line again! # Note we use int="p" because a CI for y_0 is # also called a *p*rediction interval. # Finally, let's consider the confidence *band* # for the regression line # Use x-values 4,5,...,20 x <- c(4:20) (pred.frame <- data.frame(blood.glucose=x)) # Compute predicted values: (fitted <- predict(fit,newdata=pred.frame)) (temp <- sqrt(1/n + (x-mean(blood.glucose))^2 / sum( (blood.glucose-mean(blood.glucose))^2) )) lower <- fitted - sqrt(2*qf(.95,2,n-2)*sigma.hat*temp) upper <- fitted + sqrt(2*qf(.95,2,n-2)*sigma.hat*temp) # Now create a plot of everything # We use a special command for ylim to make sure that # the confidence intervals fit in the plot pp <- predict(fit, int="p", newdata=pred.frame) pc <- predict(fit, int="c", newdata=pred.frame) plot(blood.glucose, short.velocity, ylim=range(short.velocity,pp,lower,upper), main="black: fitted values, red: CI for E(y_0), blue: CI for y_0, green: confidence band") pred.gluc <- pred.frame$blood.glucose matlines(x, pc, lty=c(1,2,2), col="red") matlines(x, pp, lty=c(1,3,3), col="blue") lines(pred.gluc,pc[,1], col="black") matlines(x, cbind(lower,upper), lty=c(4,4), col="green")