# Confidence intervals and bands:

library(ISwR) 
# "Corresponding to Introductory Statistics with R" 
# by Peter Dalgaard

# Load data
data(thuesen)
?thuesen
summary(thuesen)
# Remove observation with missing y-value:
thuesen <- thuesen[!is.na(thuesen[,2]),]
# Check result
summary(thuesen)
attach(thuesen)

# Create first plot:
plot(blood.glucose, short.velocity)
lines(loess.smooth(blood.glucose, short.velocity), col="red")

# Fit simple linear regression
fit <- lm(short.velocity~blood.glucose)
summary(fit)
# Recall how to get confidence intervals for the 
# parameters alpha and beta:
confint(fit)
# In this script we focus on confidence intervals 
# for E(y_0) and y_0 instead.

# Get fitted values by hand:
fit$coef
fit$coef[1] + fit$coef[2]*blood.glucose
# Or automatically:
predict(fit)
 
# Get confidence interval for E(y_1) by hand:
# Sample size:
(n <- length(blood.glucose))
# x-value at which we want to create confidence interval
blood.glucose[1]
# Predicted value (midpoint of confidence interval):
(fitted <- fit$coef[1] + fit$coef[2]*15.3)
# Quantile of t-distribution that we need:
(quant <- qt(.975,n-2))
# Sigma.hat value:
(sigma.hat <- sqrt((n-1)/(n-2)*var(fit$resid)))
# Some explanation: 
# Note that sigma.hat almost equals sd(fit$resid)
# However, sd uses n-1 in the denominator, while
# \hat sigma^2 uses n-2 in the denominator. 
# Therefore we correct for this. 
# Note that the result equals the residual standard error 
# in summary(fit), as it should.
# Now compute estimate for sd(\hat y_1):
(sd.haty1 <- sigma.hat * sqrt(1/n + 
        (15.3-mean(blood.glucose))^2 / 
         sum( (blood.glucose-mean(blood.glucose))^2) ))
# Now we can construct confidence interval:
(lower <- fitted - quantile*sd.haty1)
(upper <- fitted + quantile*sd.haty1)
# Compare to automatic result (much shorter!):
predict(fit, int="c")
# Yes, our values correspond to the first line.

# Get confidence interval for y_1 
# (assuming a *new* observations with x-value x_1)
(sd.haty1.minus.y1 <- sigma.hat * sqrt(1 + 1/n + 
                 (15.3-mean(blood.glucose))^2 / 
                 sum( (blood.glucose-mean(blood.glucose))^2)))
# Note this is almost the same formula as before, 
# but we added a 1 within the square root
# Construct confidence interval:
(lower <- fitted - quant*sd.haty1.minus.y1)
(upper <- fitted + quant*sd.haty1.minus.y1)
# Compare to automatic result:
predict(fit, int="p")
# Yes, our results match to the first line again!
# Note we use int="p" because a CI for y_0 is 
# also called a *p*rediction interval.

# Finally, let's consider the confidence *band* 
# for the regression line
# Use x-values 4,5,...,20
x <- c(4:20)
(pred.frame <- data.frame(blood.glucose=x))
# Compute predicted values:
(fitted <- predict(fit,newdata=pred.frame))
(temp <- sqrt(1/n + (x-mean(blood.glucose))^2 / 
         sum( (blood.glucose-mean(blood.glucose))^2) ))
lower <- fitted - sqrt(2*qf(.95,2,n-2)*sigma.hat*temp)
upper <- fitted + sqrt(2*qf(.95,2,n-2)*sigma.hat*temp)

# Now create a plot of everything
# We use a special command for ylim to make sure that 
# the confidence intervals fit in the plot
pp <- predict(fit, int="p", newdata=pred.frame)
pc <- predict(fit, int="c", newdata=pred.frame)
plot(blood.glucose, short.velocity, 
    ylim=range(short.velocity,pp,lower,upper),
    main="black: fitted values, red: CI for E(y_0), blue: CI for y_0, green: confidence band") 
pred.gluc <- pred.frame$blood.glucose
matlines(x, pc, lty=c(1,2,2), col="red")
matlines(x, pp, lty=c(1,3,3), col="blue")
lines(pred.gluc,pc[,1], col="black")
matlines(x, cbind(lower,upper), lty=c(4,4), col="green")