################
# MISSING VALUES
################
# R script to reproduce results in Faraway, Ch. 14

#############
# read data #
#############
library(faraway)
data(chmiss)
summary(chmiss)
nrow(chmiss)
names(chmiss)
pairs(chmiss)

#########################################################
# by default, R removes all cases that are not complete #
# this can lead to a huge loss of data                  #
#########################################################
# note that we can write a regression like this:
# the 'dot' means that we regress on all other variables
# by using 'data=chmiss', we do not first need to attach the data
m1 <- lm(involact ~ ., data=chmiss)
summary(m1)

#########################################################
# remove independent variables with many missing values #
#########################################################
# do not regress on age, because it has 5 missing observations:
m2 <- lm(involact ~ race+fire+theft+income, data=chmiss)
summary(m2)

#########################
# impute missing values #
#########################
# compute the means of the independent variables,
# using the apply() command
# this does not work as intended: every column with missing values gets mean NA
cmeans <- apply(chmiss,2,mean)
cmeans
# but this does work, because na.rm=T drops the missing values first:
cmeans <- apply(chmiss,2,mean,na.rm=T)
cmeans
# and then impute these means for the missing values:
imputed.chmiss <- chmiss
for(i in c(1,2,3,4,6)){
  imputed.chmiss[is.na(chmiss[,i]),i] <- cmeans[i]
}
# note that we do not impute missing values in the 5th column,
# because the 5th column (involact) is the dependent variable
# check the result:
summary(imputed.chmiss)
# refit the full regression on the imputed data (this overwrites the earlier m2):
m2 <- lm(involact ~ ., data=imputed.chmiss)
summary(m2)

##############
# COLLINEARITY
##############
# we can compute the correlation between all variables at
# once, so we don't have to do this pairwise!
# the following is not useful, because the missing values produce NA entries:
cor(chmiss)
# this version only uses cases with complete data:
# (note we use the command round, to limit the number
# of digits in the output, and store the result in cm
# rather than c, so we do not mask the built-in c() function)
cm <- round(cor(chmiss, use="complete.obs"),digits=2)
cm
# this version computes the correlation between each pair of variables
# using all complete pairs of observations on those variables. This
# can result in covariance or correlation matrices which are not
# positive semidefinite.
cm <- round(cor(chmiss, use="pairwise.complete.obs"),digits=2)
cm
# if there are many variables, it is easier to make a plot of the
# correlation matrix, where values that are high in absolute value
# are dark, and values that are low in absolute value are light grey
n <- nrow(cm)
image(1:n,1:n,abs(cm),col=gray(seq(1,0,by=-.05)),xlab="Predictors",ylab="Predictors")

#################
# MODEL SELECTION
#################
# R code to reproduce results in Faraway, Ch. 10

###########################
# read libraries and data #
###########################
library(faraway)
library(leaps)
data(state)
statedata <- data.frame(state.x77,row.names=state.abb,check.names=T)

######################
# backward selection #
######################
m1 <- lm(Life.Exp ~ ., data=statedata)
summary(m1)
m2 <- update(m1, . ~ . - Area)
summary(m2)
m3 <- update(m2, . ~ . - Illiteracy)
summary(m3)
m4 <- update(m3, . ~ . - Income)
summary(m4)
m5 <- update(m4, . ~ . - Population)
summary(m5)
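
###################################
# backward selection, automated   #
###################################
# A sketch, not part of Faraway's script: the elimination above can be
# automated with drop1(), which gives the F-test for dropping each term
# in turn. The 0.05 cutoff and the name m.back are choices made here
# purely for illustration.
m.back <- lm(Life.Exp ~ ., data=statedata)
repeat{
  dr <- drop1(m.back, test="F")
  pvals <- dr[["Pr(>F)"]][-1]                    # first row is '<none>'
  if(max(pvals) < 0.05) break                    # all remaining terms significant
  worst <- rownames(dr)[-1][which.max(pvals)]    # least significant predictor
  m.back <- update(m.back, as.formula(paste(". ~ . -", worst)))
}
summary(m.back)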
#######
# AIC #
#######
m6 <- lm(Life.Exp ~ ., data=statedata)
step(m6)

################
# adjusted R^2 #
################
x <- statedata[,-4]
y <- statedata[,4]
m7 <- leaps(x,y,method="adjr2")
maxadjr(m7,10)
# order the models by adjusted R^2 and plot which predictors they contain
# (store the indicator matrix in wh rather than which, so we do not mask
# the built-in which() function)
ind <- order(m7$adjr2, decreasing=T)
wh <- m7$which[ind,]
nr <- nrow(wh)
nc <- ncol(wh)
par(mfrow=c(2,1))
image(1:nr,1:nc,wh,col=c("white","black"),xlab="Models, ordered by adjusted R^2",ylab="Predictors")
image(1:10,1:nc,wh[1:10,],col=c("white","black"),xlab="Best 10 models, ordered by adjusted R^2",ylab="Predictors")

####################################
# add transformations of variables #
####################################
# this is one easy way to make boxplots, with automatic titles:
dimnames(state.x77)
par(mfrow=c(3,3))
for (i in 1:8){
  boxplot(state.x77[,i], main=dimnames(state.x77)[[2]][i])
}
# log-transform Population, Illiteracy and Area (columns 1, 3 and 7 of x),
# which look right-skewed in the boxplots, and give the new columns names:
xtransf <- cbind(log(x[,1]),log(x[,3]),log(x[,7]))
colnames(xtransf) <- c("log.Population","log.Illiteracy","log.Area")
# this is another easy way to make boxplots, but without titles:
par(mfrow=c(2,2))
apply(xtransf,2,boxplot)
# we can just add the transformed variables to the original
# variables, and let R decide which ones are best
x <- cbind(x,xtransf)
m8 <- leaps(x,y,method="adjr2")
maxadjr(m8,10)
ind <- order(m8$adjr2, decreasing=T)
wh <- m8$which[ind,]
nr <- nrow(wh)
nc <- ncol(wh)
par(mfrow=c(2,1))
image(1:nr,1:nc,wh,col=c("white","black"),xlab="Models, ordered by adjusted R^2",ylab="Predictors")
image(1:10,1:nc,wh[1:10,],col=c("white","black"),xlab="Best 10 models, ordered by adjusted R^2",ylab="Predictors")
# note that collinearity is not a big problem.
# for example, Population and log(Population) are highly correlated,
# but just one of these two ends up in the model.
round(cor(x),2)
image(1:nc,1:nc,abs(cor(x)),col=gray(seq(1,0,by=-.05)),xlab="Predictors",ylab="Predictors")

#########################
# some quick diagnostics #
#########################
m.final <- lm(Life.Exp ~ log(Population) + Murder + HS.Grad + Frost, data=statedata)
par(mfrow=c(2,3))   # room for all five diagnostic plots
plot(m.final, which=1:5)

# removing Hawaii (state 11 in alphabetical order) leads to the same best models
# with the leaps() function (x already contains the transformed variables):
keep <- (1:50)[-11]
m8 <- leaps(x[keep,],y[keep],method="adjr2")
maxadjr(m8,10)
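
#################################################
# extra: checking influence more systematically #
#################################################
# A sketch, not part of Faraway's script: instead of dropping Hawaii by hand,
# we can rank the states by Cook's distance for the final model and refit
# without whichever state is most influential. The names cd and m.noinfl
# are chosen here purely for illustration.
cd <- cooks.distance(m.final)
sort(cd, decreasing=TRUE)[1:3]    # the three most influential states
m.noinfl <- lm(Life.Exp ~ log(Population) + Murder + HS.Grad + Frost,
               data=statedata, subset=(cd < max(cd)))
summary(m.noinfl)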