## Rscript for Chapter 5 library(car) ############################# # Eyeball a regression line # ############################# postscript("EyeballRegression.ps",horizontal=FALSE) # Generate random data x <- rnorm(250,0,1) y <- 2*x + rnorm(250,0,2) plot(x,y,main="Which line is the regression line?") # Fit linear regression fit <- lm(y~x) abline(fit,col="blue",lwd=2) # Generate alternative line (sometimes called SD line), that also # goes through (mean(x),mean(y)) b <- sd(y)/sd(x) a <- mean(y)-b*mean(x) abline(a,b,col="black",lwd=2) # Check if both lines go through (mean(x),mean(y)) points(mean(x),mean(y),pch=3) # It may look as if the black line fits the data better. # However, its sum of squared errors is larger: sum((y-fit$fitted)^2) # blue line, fit$fitted contains the fitted values sum((y - (a+b*x))^2) # black line dev.off() #################################### # Example simple linear regression # #################################### library(car) data(Davis) women <- Davis[Davis$sex=="F",] plot(women$repwt, women$weight) ## correct outlier: length and weight are interchanged women[4,c(2,3)]<-c(57,166) attach(women) plot(repwt, weight) ## remove women with missing values in weight or repwt women <- women[!is.na(repwt) & !is.na(weight),] attach(women) ## check if we now have 101 women left: n <- length(weight) n # fit linear regression fit <- lm(weight ~ repwt) summary(fit) plot(repwt,weight) abline(fit) # better: plot(jitter(repwt),jitter(weight)) abline(fit) ################# # Ozone example # ################# ozone <- read.table("ozone.txt",header=TRUE) attach(ozone) # make scatterplot matrix pairs(ozone[1:3]) # fit multiple linear regression fit <- lm(sf ~ year + rain) summary(fit) ######################### # Added variable plots: # ######################### par(mfrow=c(2,2)) fit1 <- lm(sf~year) plot(year,sf,main="sf~year") abline(fit1) summary(fit1) # Is it worth adding 'rain' as an extra explanatory variable? # We want to model that part of sf that is not explained by year (fit1$resid) # with the part of rain that is not explained by year (fit2$resid) fit2 <- lm(rain~year) plot(year,rain,main="rain~year") abline(fit2) summary(fit2) fit3 <- lm(fit1$resid~fit2$resid) plot(fit2$resid,fit1$resid,main="added variable plot") abline(fit3) summary(fit3) # Now we fit rain first, and wonder if we should add 'year' as an extra # explantory variable. We want to model that part of 'sf' that is not # explained by 'rain' (fit5$resid) with the part of 'year' that is not # explained by 'rain' (fit6$resid) par(mfrow=c(2,2)) fit5 <- lm(sf ~ rain) plot(rain,sf,main="sf~rain") abline(fit5) summary(fit5) fit6 <- lm(year ~ rain) plot(rain,year,main="year~rain") abline(fit6) summary(fit6) fit7 <- lm(fit5$resid ~ fit6$resid) plot(fit6$resid,fit5$resid,main="added variable plot") abline(fit7) summary(fit7)