## Rscript for Chapter 5 library(car) ############################# # Eyeball a regression line # ############################# postscript("EyeballRegression.ps",horizontal=FALSE) # Generate random data x <- rnorm(250,0,1) y <- 2*x + rnorm(250,0,2) plot(x,y,main="Which line is the regression line?") # Fit linear regression fit <- lm(y~x) abline(fit,col="blue",lwd=2) # Generate alternative line (sometimes called SD line), that also # goes through (mean(x),mean(y)) b <- sd(y)/sd(x) a <- mean(y)-b*mean(x) abline(a,b,col="black",lwd=2) # Check if both lines go through (mean(x),mean(y)) points(mean(x),mean(y),pch=3) # It may look as if the black line fits the data better. # However, its sum of squared errors is larger: sum((y-fit$fitted)^2) # blue line, fit$fitted contains the fitted values sum((y - (a+b*x))^2) # black line dev.off() #################################### # Example simple linear regression # #################################### library(car) data(Davis) women <- Davis[Davis$sex=="F",] plot(women$repwt, women$weight) identify(women$repwt, women$weight) # to stop: click Esc ## correct outlier: women[4,] women[4,c(2,3)]<-c(57,166) #length and weight were probably interchanged attach(women) plot(repwt, weight) # better: plot(jitter(repwt),jitter(weight)) ## remove women with missing values in weight or repwt summary(women) women <- women[!is.na(repwt),] attach(women) ## check if we now have 101 women left: (n <- length(weight)) # fit linear regression fit <- lm(weight ~ repwt) summary(fit) abline(fit) plot(fit$resid) abline(h=0) abline(h=c(-4,-2,2,4),lty=2) ################# # Ozone example # ################# ozone <- read.table("ozone.txt",header=TRUE) attach(ozone) # make scatterplot matrix pairs(ozone[1:3]) # fit multiple linear regression fit <- lm(sf ~ year + rain) summary(fit) ######################### # Added variable plots: # ######################### par(mfrow=c(2,2)) # regression of SF on YEAR: plot(year,sf,main="sf~year") fit1 <- lm(sf~year) abline(fit1) summary(fit1) # Is it worth adding RAIN as an extra explanatory variable? # We must model the variation in SF that is not explained by YEAR (fit1$resid) # with the variation in RAIN that is not explained by YEAR (fit2$resid) # regression of RAIN on YEAR: plot(year,rain,main="rain~year") fit2 <- lm(rain~year) abline(fit2) summary(fit2) # added variable plot for RAIN: regression of fit1$resid on fit2$resid: plot(fit2$resid,fit1$resid,main="added variable plot") fit3 <- lm(fit1$resid~fit2$resid) abline(fit3) summary(fit3) # note that the coefficient of RAIN in fit3 is exactly the same as the # coefficient of RAIN in the full model. # Now we fit RAIN first, and wonder if we should add YEAR as an extra # explantory variable. We want to model that part of SF that is not # explained by RAIN (fit5$resid) with the part of YEAR that is not # explained by RAIN (fit6$resid) par(mfrow=c(2,2)) # regression of SF on RAIN: fit5 <- lm(sf ~ rain) plot(rain,sf,main="sf~rain") abline(fit5) summary(fit5) # regression of YEAR on RAIN: fit6 <- lm(year ~ rain) plot(rain,year,main="year~rain") abline(fit6) summary(fit6) # added variable plot for YEAR: regression of fit5$resid on fit6$resid: fit7 <- lm(fit5$resid ~ fit6$resid) plot(fit6$resid,fit5$resid,main="added variable plot") abline(fit7) summary(fit7) # Note that the coefficient for YEAR in fit7 is exactly the same as the # coefficient of YEAR in the full model.