## Rscript for Chapter 5

library(car)

#############################
# Eyeball a regression line #
#############################

postscript("EyeballRegression.ps",horizontal=FALSE)

# Generate random data
x <- rnorm(250,0,1)
y <- 2*x + rnorm(250,0,2)
plot(x,y,main="Which line is the regression line?")

# Fit linear regression
fit <- lm(y~x)
abline(fit,col="blue",lwd=2)

# Generate alternative line (sometimes called SD line), that also
# goes through (mean(x),mean(y))
b <- sd(y)/sd(x)
a <- mean(y)-b*mean(x)
abline(a,b,col="black",lwd=2)

# Check if both lines go through (mean(x),mean(y))
points(mean(x),mean(y),pch=3)

# It may look as if the black line fits the data better.
# However, its sum of squared errors is larger:
sum((y-fit$fitted)^2)  	# blue line, fit$fitted contains the fitted values
sum((y - (a+b*x))^2)    # black line

dev.off()


####################################
# Example simple linear regression #
####################################

library(car)
data(Davis)
women <- Davis[Davis$sex=="F",]
plot(women$repwt, women$weight)

## correct outlier: length and weight are interchanged
women[4,c(2,3)]<-c(57,166)
attach(women)
plot(repwt, weight)

## remove women with missing values in weight or repwt
women <- women[!is.na(repwt) & !is.na(weight),]
attach(women)

## check if we now have 101 women left:
n <- length(weight)
n

# fit linear regression
fit <- lm(weight ~ repwt)
summary(fit)

plot(repwt,weight)
abline(fit)

# better:
plot(jitter(repwt),jitter(weight))
abline(fit)


#################
# Ozone example #
#################

ozone <- read.table("ozone.txt",header=TRUE)
attach(ozone)

# make scatterplot matrix
pairs(ozone[1:3])

# fit multiple linear regression
fit <- lm(sf ~ year + rain)
summary(fit)


#########################
# Added variable plots: #
#########################

par(mfrow=c(2,2))

fit1 <- lm(sf~year)
plot(year,sf,main="sf~year")
abline(fit1)
summary(fit1)

# Is it worth adding 'rain' as an extra explanatory variable?
# We want to model that part of sf that is not explained by year (fit1$resid)
# with the part of rain that is not explained by year (fit2$resid)

fit2 <- lm(rain~year)
plot(year,rain,main="rain~year")
abline(fit2)
summary(fit2)

fit3 <- lm(fit1$resid~fit2$resid)
plot(fit2$resid,fit1$resid,main="added variable plot")
abline(fit3)
summary(fit3)


# Now we fit rain first, and wonder if we should add 'year' as an extra
# explantory variable. We want to model that part of 'sf' that is not 
# explained by 'rain' (fit5$resid) with the part of 'year' that is not 
# explained by 'rain' (fit6$resid)

par(mfrow=c(2,2))

fit5 <- lm(sf ~ rain)
plot(rain,sf,main="sf~rain")
abline(fit5)
summary(fit5)

fit6 <- lm(year ~ rain)
plot(rain,year,main="year~rain")
abline(fit6)
summary(fit6)

fit7 <- lm(fit5$resid ~ fit6$resid)
plot(fit6$resid,fit5$resid,main="added variable plot")
abline(fit7)
summary(fit7)