################
# MISSING VALUES
################

# R script to reproduce results in Faraway, Ch 14

#############
# read data #
#############

library(faraway)

data(chmiss)
?chmiss
summary(chmiss)
nrow(chmiss)
names(chmiss)
pairs(chmiss)

###########################################
# By default, R removes all cases that
# are not complete. This can lead to a
# huge loss of data.
###########################################

# Note that we can write a regression as
# below. The 'dot' means that we regress on
# all other variables. By using 'data=chmiss',
# we do not first need to attach the data.
m1 <- lm(involact ~ ., data=chmiss)
summary(m1)

###########################################
# remove independent variables with many
# missing values
###########################################

# do not regress on age, because it has
# 5 missing observations:
m2 <- lm(involact ~ race+fire+theft+income, data=chmiss)
summary(m2)

#########################
# impute missing values #
#########################

# compute the means of the independent
# variables, using the command apply.
# this does not work: columns with missing
# values give NA:
cmeans <- apply(chmiss,2,mean)
cmeans
# but this does work:
cmeans <- apply(chmiss,2,mean,na.rm=TRUE)
cmeans

# and then impute these means for the
# missing values:
imputed.chmiss <- chmiss
for(i in c(1,2,3,4,6)){
  imputed.chmiss[is.na(chmiss[,i]),i] <- cmeans[i]
}
# note that we do not impute missing values
# in the 5th column, because the 5th column
# (involact) is the dependent variable.

# check the result:
summary(imputed.chmiss)
m3 <- lm(involact ~ ., data=imputed.chmiss)
summary(m3)

# of course we have now changed the data.
# a better way to deal with missing values
# is to use multiple imputation, for
# example using the R package mice.
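# a minimal sketch of that approach, assuming the mice
# package is installed. mice's defaults are used: m=5
# imputed data sets, created by predictive mean matching
# for every variable with missing values. The seed is
# chosen arbitrarily, only to make the sketch reproducible.
library(mice)
imp <- mice(chmiss, m=5, seed=123, printFlag=FALSE)
# fit the regression on each of the 5 imputed data sets:
fits <- with(imp, lm(involact ~ race + fire + theft + age + income))
# pool the 5 fits (Rubin's rules), so the standard errors
# reflect the uncertainty due to the imputation:
summary(pool(fits))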
##############
# COLLINEARITY
##############

# we can compute the correlation between
# all variables at once, so we don't have
# to do this pairwise!

# the following does not work, because of
# the missing values:
cor(chmiss)

# this version only uses cases with complete
# data. (note we use the command round to
# limit the number of digits in the output;
# we also avoid the name 'c', which would
# mask the built-in function c())
cmat <- round(cor(chmiss, use="complete.obs"), digits=2)
cmat

# this version computes the correlation between
# each pair of variables, using all complete
# pairs of observations on those variables.
# This can result in covariance or correlation
# matrices which are not positive semidefinite.
cmat <- round(cor(chmiss, use="pairwise.complete.obs"), digits=2)
cmat

# if there are many variables, it is easier
# to make a plot of the correlation matrix,
# where values that are high in absolute value
# are dark, and values that are low in
# absolute value are light grey:
n <- nrow(cmat)
par(mfrow=c(1,1))
image(1:n, 1:n, abs(cmat), col=gray(seq(1,0,by=-.05)),
      xlab="Predictors", ylab="Predictors")

#################
# MODEL SELECTION
#################

# R code to reproduce results in Faraway, Ch 10

###########################
# read libraries and data #
###########################

library(faraway)
library(leaps)

data(state)
statedata <- data.frame(state.x77, row.names=state.abb, check.names=TRUE)
?state.x77

######################
# backward selection #
######################

m1 <- lm(Life.Exp ~ ., data=statedata)
par(mfrow=c(3,2))
plot(m1, which=1:5)
# we may want to rerun the analysis at the
# end without Hawaii and Arkansas
summary(m1)

# backward selection with alpha_drop=0.05: at each
# step, drop the predictor with the largest p-value,
# as long as that p-value exceeds 0.05:
m2 <- update(m1, . ~ . - Area)
summary(m2)
m3 <- update(m2, . ~ . - Illiteracy)
summary(m3)
m4 <- update(m3, . ~ . - Income)
summary(m4)
m5 <- update(m4, . ~ . - Population)
summary(m5)

#######
# AIC #
#######

# when no scope is given, step() performs backward
# selection, minimizing AIC:
m6 <- lm(Life.Exp ~ ., data=statedata)
step(m6)

################
# adjusted R^2 #
################

x <- statedata[,-4]   # all predictors (column 4 is Life.Exp)
y <- statedata[,4]    # the response, Life.Exp
m7 <- leaps(x,y,method="adjr2")
m7
maxadjr(m7,10)

# sort the models by adjusted R-squared.
# (we avoid the name 'which', which would mask
# the built-in function which())
ind <- order(m7$adjr2, decreasing=TRUE)
wh <- m7$which[ind,]
nr <- nrow(wh)   # number of models
nc <- ncol(wh)   # number of predictors
par(mfrow=c(2,1))
image(1:nr, 1:nc, wh, col=c("white","black"),
      xlab="Models, ordered by adjusted R^2",
      ylab="Predictors")
image(1:10, 1:nc, wh[1:10,], col=c("white","black"),
      xlab="Best 10 models, ordered by adjusted R^2",
      ylab="Predictors")
# names of the predictors, in the order of the
# image rows:
colnames(x)

####################################
# add transformations of variables #
####################################

# this is one easy way to make boxplots, with
# automatic titles:
dimnames(state.x77)
par(mfrow=c(3,3))
for (i in 1:8){
  boxplot(state.x77[,i], main=dimnames(state.x77)[[2]][i])
}

# take logs of Population, Illiteracy and Area
# (columns 1, 3 and 7 of x):
xtransf <- cbind(log(x[,1]), log(x[,3]), log(x[,7]))
colnames(xtransf) <- c("log.Population","log.Illiteracy","log.Area")

# this is another easy way to make boxplots, but
# without titles:
par(mfrow=c(2,2))
apply(xtransf,2,boxplot)

# we can just add the transformed variables to the original
# variables, and let R decide which ones are best
x <- cbind(x, xtransf)
m8 <- leaps(x,y,method="adjr2")
maxadjr(m8,10)

ind <- order(m8$adjr2, decreasing=TRUE)
wh <- m8$which[ind,]
nr <- nrow(wh)
nc <- ncol(wh)
par(mfrow=c(2,1))
image(1:nr, 1:nc, wh, col=c("white","black"),
      xlab="Models, ordered by adjusted R^2",
      ylab="Predictors")
image(1:10, 1:nc, wh[1:10,], col=c("white","black"),
      xlab="Best 10 models, ordered by adjusted R^2",
      ylab="Predictors")

# note that collinearity is not a big problem.
# for example, Population and log(Population) are
# highly correlated, but just one of these two
# ends up in the model.
round(cor(x),2)
image(1:nc, 1:nc, abs(cor(x)),
      col=gray(seq(1,0,by=-.05)),
      xlab="Predictors", ylab="Predictors")
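# earlier we noted that we may want to rerun the analysis
# without Hawaii and Arkansas. a minimal sketch of that
# check, refitting the full model and the final
# alpha_drop=0.05 model (Murder, HS.Grad, Frost) on the
# reduced data. the object names (statedata.red, m1.red,
# m5.red) are just illustrative; the row names of statedata
# are the state abbreviations, so Hawaii and Arkansas are
# "HI" and "AR":
statedata.red <- statedata[!(row.names(statedata) %in% c("HI","AR")), ]
m1.red <- lm(Life.Exp ~ ., data=statedata.red)
summary(m1.red)
m5.red <- lm(Life.Exp ~ Murder + HS.Grad + Frost, data=statedata.red)
summary(m5.red)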