################
# MISSING VALUES
################

# R script to reproduce results in Faraway, Ch 14

#############
# read data #
#############

library(faraway)

data(chmiss)
?chmiss
summary(chmiss)
nrow(chmiss)
names(chmiss)
pairs(chmiss)

###########################################
# By default, R removes all cases that
# are not complete. This can lead to a
# huge loss of data.
###########################################

# Note that we can write a regression as
# below. The 'dot' means that we regress on
# all other variables. By using 'data=chmiss',
# we do not first need to attach the data.
m1 <- lm(involact ~ ., data=chmiss)
summary(m1)

###########################################
# remove independent variables with many
# missing values
###########################################

# do not regress on age, because it has
# 5 missing observations:
m2 <- lm(involact ~ race+fire+theft+income, data=chmiss)
summary(m2)

#########################
# impute missing values #
#########################

# compute the means of the independent
# variables, using the command apply.
# this does not work: columns with missing
# values give NA:
cmeans <- apply(chmiss,2,mean)
cmeans
# but this does work:
cmeans <- apply(chmiss,2,mean,na.rm=TRUE)
cmeans

# and then impute these means for the
# missing values:
imputed.chmiss <- chmiss
for(i in c(1,2,3,4,6)){
  imputed.chmiss[is.na(chmiss[,i]),i] <- cmeans[i]
}
# note that we do not impute missing values
# in the 5th column, because the 5th column
# (involact) is the dependent variable.

# check the result:
summary(imputed.chmiss)
m3 <- lm(involact ~ ., data=imputed.chmiss)
summary(m3)

# of course we have now changed the data.
# a better way to deal with missing values
# is to use multiple imputation, for
# example using the R package mice.
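# a minimal sketch of that approach, assuming the mice
# package is installed. mice's defaults are used: m=5
# imputed data sets, created by predictive mean matching
# for every variable with missing values. The seed is
# chosen arbitrarily, only to make the sketch reproducible.
library(mice)
imp <- mice(chmiss, m=5, seed=123, printFlag=FALSE)
# fit the regression on each of the 5 imputed data sets:
fits <- with(imp, lm(involact ~ race + fire + theft + age + income))
# pool the 5 fits (Rubin's rules), so the standard errors
# reflect the uncertainty due to the imputation:
summary(pool(fits))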
##############
# COLLINEARITY
##############

# we can compute the correlation between
# all variables at once, so we don't have
# to do this pairwise!

# the following does not work, because of
# the missing values:
cor(chmiss)

# this version only uses cases with complete
# data. (note we use the command round to
# limit the number of digits in the output;
# we also avoid the name 'c', which would
# mask the built-in function c())
cmat <- round(cor(chmiss, use="complete.obs"), digits=2)
cmat

# this version computes the correlation between
# each pair of variables, using all complete
# pairs of observations on those variables.
# This can result in covariance or correlation
# matrices which are not positive semidefinite.
cmat <- round(cor(chmiss, use="pairwise.complete.obs"), digits=2)
cmat

# if there are many variables, it is easier
# to make a plot of the correlation matrix,
# where values that are high in absolute value
# are dark, and values that are low in
# absolute value are light grey:
n <- nrow(cmat)
par(mfrow=c(1,1))
image(1:n, 1:n, abs(cmat), col=gray(seq(1,0,by=-.05)),
      xlab="Predictors", ylab="Predictors")

#################
# MODEL SELECTION
#################

# R code to reproduce results in Faraway, Ch 10

###########################
# read libraries and data #
###########################

library(faraway)
library(leaps)

data(state)
statedata <- data.frame(state.x77, row.names=state.abb, check.names=TRUE)
?state.x77

######################
# backward selection #
######################

m1 <- lm(Life.Exp ~ ., data=statedata)
par(mfrow=c(3,2))
plot(m1, which=1:5)
# we may want to rerun the analysis at the
# end without Hawaii and Arkansas
summary(m1)

# backward selection with alpha_drop=0.05: at each
# step, drop the predictor with the largest p-value,
# as long as that p-value exceeds 0.05:
m2 <- update(m1, . ~ . - Area)
summary(m2)
m3 <- update(m2, . ~ . - Illiteracy)
summary(m3)
m4 <- update(m3, . ~ . - Income)
summary(m4)
m5 <- update(m4, . ~ . - Population)
summary(m5)

#######
# AIC #
#######

# when no scope is given, step() performs backward
# selection, minimizing AIC:
m6 <- lm(Life.Exp ~ ., data=statedata)
step(m6)

################
# adjusted R^2 #
################

x <- statedata[,-4]   # all predictors (column 4 is Life.Exp)
y <- statedata[,4]    # the response, Life.Exp
m7 <- leaps(x,y,method="adjr2")
m7
maxadjr(m7,10)

# sort the models by adjusted R-squared.
# (we avoid the name 'which', which would mask
# the built-in function which())
ind <- order(m7$adjr2, decreasing=TRUE)
wh <- m7$which[ind,]
nr <- nrow(wh)   # number of models
nc <- ncol(wh)   # number of predictors
par(mfrow=c(2,1))
image(1:nr, 1:nc, wh, col=c("white","black"),
      xlab="Models, ordered by adjusted R^2",
      ylab="Predictors")
image(1:10, 1:nc, wh[1:10,], col=c("white","black"),
      xlab="Best 10 models, ordered by adjusted R^2",
      ylab="Predictors")
# names of the predictors, in the order of the
# image rows:
colnames(x)

####################################
# add transformations of variables #
####################################

# this is one easy way to make boxplots, with
# automatic titles:
dimnames(state.x77)
par(mfrow=c(3,3))
for (i in 1:8){
  boxplot(state.x77[,i], main=dimnames(state.x77)[[2]][i])
}

# take logs of Population, Illiteracy and Area
# (columns 1, 3 and 7 of x):
xtransf <- cbind(log(x[,1]), log(x[,3]), log(x[,7]))
colnames(xtransf) <- c("log.Population","log.Illiteracy","log.Area")

# this is another easy way to make boxplots, but
# without titles:
par(mfrow=c(2,2))
apply(xtransf,2,boxplot)

# we can just add the transformed variables to the original
# variables, and let R decide which ones are best
x <- cbind(x, xtransf)
m8 <- leaps(x,y,method="adjr2")
maxadjr(m8,10)

ind <- order(m8$adjr2, decreasing=TRUE)
wh <- m8$which[ind,]
nr <- nrow(wh)
nc <- ncol(wh)
par(mfrow=c(2,1))
image(1:nr, 1:nc, wh, col=c("white","black"),
      xlab="Models, ordered by adjusted R^2",
      ylab="Predictors")
image(1:10, 1:nc, wh[1:10,], col=c("white","black"),
      xlab="Best 10 models, ordered by adjusted R^2",
      ylab="Predictors")

# note that collinearity is not a big problem.
# for example, Population and log(Population) are
# highly correlated, but just one of these two
# ends up in the model.
round(cor(x),2)
image(1:nc, 1:nc, abs(cor(x)),
      col=gray(seq(1,0,by=-.05)),
      xlab="Predictors", ylab="Predictors")
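# earlier we noted that we may want to rerun the analysis
# without Hawaii and Arkansas. a minimal sketch of that
# check, refitting the full model and the final
# alpha_drop=0.05 model (Murder, HS.Grad, Frost) on the
# reduced data. the object names (statedata.red, m1.red,
# m5.red) are just illustrative; the row names of statedata
# are the state abbreviations, so Hawaii and Arkansas are
# "HI" and "AR":
statedata.red <- statedata[!(row.names(statedata) %in% c("HI","AR")), ]
m1.red <- lm(Life.Exp ~ ., data=statedata.red)
summary(m1.red)
m5.red <- lm(Life.Exp ~ Murder + HS.Grad + Frost, data=statedata.red)
summary(m5.red)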