################
# MISSING VALUES
################
# R script to reproduce results in Faraway, Ch. 14

#############
# read data #
#############
library(faraway)
data(chmiss)
summary(chmiss)
nrow(chmiss)
names(chmiss)
pairs(chmiss)

#########################################################
# by default, R removes all cases that are not complete #
# this can lead to a huge loss of data                  #
#########################################################
# note that we can write a regression like this:
# the 'dot' means that we regress on all other variables
# by using 'data=chmiss', we do not first need to attach the data
m1 <- lm(involact ~ ., data=chmiss)
summary(m1)

#########################################################
# remove independent variables with many missing values #
#########################################################
# do not regress on age, because it has 5 missing observations:
m2 <- lm(involact ~ race+fire+theft+income, data=chmiss)
summary(m2)

#########################
# impute missing values #
#########################
# compute the means of the independent variables,
# using the apply() command
# this does not work as intended: every column with missing values gets mean NA
cmeans <- apply(chmiss,2,mean)
cmeans
# but this does work, because na.rm=T drops the missing values first:
cmeans <- apply(chmiss,2,mean,na.rm=T)
cmeans
# and then impute these means for the missing values:
imputed.chmiss <- chmiss
for(i in c(1,2,3,4,6)){
  imputed.chmiss[is.na(chmiss[,i]),i] <- cmeans[i]
}
# note that we do not impute missing values in the 5th column,
# because the 5th column (involact) is the dependent variable
# check the result:
summary(imputed.chmiss)
# refit the full regression on the imputed data (this overwrites the earlier m2):
m2 <- lm(involact ~ ., data=imputed.chmiss)
summary(m2)

##############
# COLLINEARITY
##############
# we can compute the correlation between all variables at
# once, so we don't have to do this pairwise!
# the following is not useful, because the missing values produce NA entries:
cor(chmiss)
# this version only uses cases with complete data:
# (note we use the command round, to limit the number
# of digits in the output, and store the result in cm
# rather than c, so we do not mask the built-in c() function)
cm <- round(cor(chmiss, use="complete.obs"),digits=2)
cm
# this version computes the correlation between each pair of variables
# using all complete pairs of observations on those variables. This
# can result in covariance or correlation matrices which are not
# positive semidefinite.
cm <- round(cor(chmiss, use="pairwise.complete.obs"),digits=2)
cm
# if there are many variables, it is easier to make a plot of the
# correlation matrix, where values that are high in absolute value
# are dark, and values that are low in absolute value are light grey
n <- nrow(cm)
image(1:n,1:n,abs(cm),col=gray(seq(1,0,by=-.05)),xlab="Predictors",ylab="Predictors")

#################
# MODEL SELECTION
#################
# R code to reproduce results in Faraway, Ch. 10

###########################
# read libraries and data #
###########################
library(faraway)
library(leaps)
data(state)
statedata <- data.frame(state.x77,row.names=state.abb,check.names=T)

######################
# backward selection #
######################
m1 <- lm(Life.Exp ~ ., data=statedata)
summary(m1)
m2 <- update(m1, . ~ . - Area)
summary(m2)
m3 <- update(m2, . ~ . - Illiteracy)
summary(m3)
m4 <- update(m3, . ~ . - Income)
summary(m4)
m5 <- update(m4, . ~ . - Population)
summary(m5)
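
###################################
# backward selection, automated   #
###################################
# A sketch, not part of Faraway's script: the elimination above can be
# automated with drop1(), which gives the F-test for dropping each term
# in turn. The 0.05 cutoff and the name m.back are choices made here
# purely for illustration.
m.back <- lm(Life.Exp ~ ., data=statedata)
repeat{
  dr <- drop1(m.back, test="F")
  pvals <- dr[["Pr(>F)"]][-1]                    # first row is '<none>'
  if(max(pvals) < 0.05) break                    # all remaining terms significant
  worst <- rownames(dr)[-1][which.max(pvals)]    # least significant predictor
  m.back <- update(m.back, as.formula(paste(". ~ . -", worst)))
}
summary(m.back)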
#######
# AIC #
#######
m6 <- lm(Life.Exp ~ ., data=statedata)
step(m6)

################
# adjusted R^2 #
################
x <- statedata[,-4]
y <- statedata[,4]
m7 <- leaps(x,y,method="adjr2")
maxadjr(m7,10)
# order the models by adjusted R^2 and plot which predictors they contain
# (store the indicator matrix in wh rather than which, so we do not mask
# the built-in which() function)
ind <- order(m7$adjr2, decreasing=T)
wh <- m7$which[ind,]
nr <- nrow(wh)
nc <- ncol(wh)
par(mfrow=c(2,1))
image(1:nr,1:nc,wh,col=c("white","black"),xlab="Models, ordered by adjusted R^2",ylab="Predictors")
image(1:10,1:nc,wh[1:10,],col=c("white","black"),xlab="Best 10 models, ordered by adjusted R^2",ylab="Predictors")

####################################
# add transformations of variables #
####################################
# this is one easy way to make boxplots, with automatic titles:
dimnames(state.x77)
par(mfrow=c(3,3))
for (i in 1:8){
  boxplot(state.x77[,i], main=dimnames(state.x77)[[2]][i])
}
# log-transform Population, Illiteracy and Area (columns 1, 3 and 7 of x),
# which look right-skewed in the boxplots, and give the new columns names:
xtransf <- cbind(log(x[,1]),log(x[,3]),log(x[,7]))
colnames(xtransf) <- c("log.Population","log.Illiteracy","log.Area")
# this is another easy way to make boxplots, but without titles:
par(mfrow=c(2,2))
apply(xtransf,2,boxplot)
# we can just add the transformed variables to the original
# variables, and let R decide which ones are best
x <- cbind(x,xtransf)
m8 <- leaps(x,y,method="adjr2")
maxadjr(m8,10)
ind <- order(m8$adjr2, decreasing=T)
wh <- m8$which[ind,]
nr <- nrow(wh)
nc <- ncol(wh)
par(mfrow=c(2,1))
image(1:nr,1:nc,wh,col=c("white","black"),xlab="Models, ordered by adjusted R^2",ylab="Predictors")
image(1:10,1:nc,wh[1:10,],col=c("white","black"),xlab="Best 10 models, ordered by adjusted R^2",ylab="Predictors")
# note that collinearity is not a big problem.
# for example, Population and log(Population) are highly correlated,
# but just one of these two ends up in the model.
round(cor(x),2)
image(1:nc,1:nc,abs(cor(x)),col=gray(seq(1,0,by=-.05)),xlab="Predictors",ylab="Predictors")

#########################
# some quick diagnostics #
#########################
m.final <- lm(Life.Exp ~ log(Population) + Murder + HS.Grad + Frost, data=statedata)
par(mfrow=c(2,3))   # room for all five diagnostic plots
plot(m.final, which=1:5)

# removing Hawaii (state 11 in alphabetical order) leads to the same best models
# with the leaps() function (x already contains the transformed variables):
keep <- (1:50)[-11]
m8 <- leaps(x[keep,],y[keep],method="adjr2")
maxadjr(m8,10)
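
#################################################
# extra: checking influence more systematically #
#################################################
# A sketch, not part of Faraway's script: instead of dropping Hawaii by hand,
# we can rank the states by Cook's distance for the final model and refit
# without whichever state is most influential. The names cd and m.noinfl
# are chosen here purely for illustration.
cd <- cooks.distance(m.final)
sort(cd, decreasing=TRUE)[1:3]    # the three most influential states
m.noinfl <- lm(Life.Exp ~ log(Population) + Murder + HS.Grad + Frost,
               data=statedata, subset=(cd < max(cd)))
summary(m.noinfl)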