# Rscript to reproduce results in Faraway Ch 12 # Chicago insurance redlining # Brief explanation of the data (source: Andrews and Herzberg (1985)) # Several communities in Chicago claimed that insurance companies # were redlining their neighborhoods. # Redlining = canceling insurance policies or refusing to renew. # Data is given by zipcode: # race: racial decomposition in percent minority # fire: fires per 100?? housing units in 1975 # theft: theft per 1000 population in 1975 # age: percent of housing units built before 1939 # involact: new FAIR plan policies and renewals per 100 housing units # in the first half of 1978 (FAIR plan policies are mostly # given after normal insurance is denied). # income: median family income # Dependent variable = involact # We will focus on relationship between race and involact # remove all objects rm(list=ls(all=TRUE)) ############# # read data # ############# library(faraway) data(chicago) # remove variable 'volact' and rescale 'income' ch <- data.frame(chicago[,1:4],involact=chicago[,6], income=chicago[,7]/1000) summary(ch) # note high spread in 'race' -> good for our analysis ############### # exploration # ############### # histograms par(mfrow=c(2,3)) for (i in 1:6){ hist(ch[,i], main=names(ch)[i],xlab=names(ch)[i],prob=T) lines(density(ch[,i])) } # boxplots par(mfrow=c(2,3)) for (i in 1:6){ boxplot(ch[,i],main=names(ch)[i]) } # dependent variable versus each independent variable par(mfrow=c(2,3)) for (i in c(seq(1,4),6)){ plot(ch[,i],ch[,5],xlab=names(ch)[i],ylab=names(ch)[5]) lines(loess.smooth(ch[,i],ch[,5])) } plot(log(ch[,6]),ch[,5],xlab="log(income)",ylab=names(ch)[5]) lines(loess.smooth(log(ch[,6]),ch[,5])) # compute correlations round(cor(ch),2) ########################## # full model diagnostics # ########################## m <- lm(involact ~ race + fire + theft + age + income, data=ch) summary(m) par(mfrow=c(2,3)) plot(m, which=c(1:5)) # ZIP codes 60610, 60607, 60621 stick out. # these are cases 6, 24, and 35 # what is special about these observations? # histograms, with value of ZIP code 60610 indicated: par(mfrow=c(2,3)) for (i in 1:6){ hist(ch[,i], main=c(names(ch)[i],"zip code 60610"), xlab=names(ch)[i], prob=T) lines(density(ch[,i])) abline(v=ch[6,i],col="blue",lwd=2) } # histograms, with value of ZIP code 60607 indicated: par(mfrow=c(2,3)) for (i in 1:6){ hist(ch[,i], main=c(names(ch)[i],"zip code 60607"), xlab=names(ch)[i], prob=T) lines(density(ch[,i])) abline(v=ch[24,i],col="blue",lwd=2) } # histograms, with value of ZIP code 60621 indicated: par(mfrow=c(2,3)) for (i in 1:6){ hist(ch[,i], main=c(names(ch)[i],"zip code 60621"), xlab=names(ch)[i], prob=T) lines(density(ch[,i])) abline(v=ch[35,i],col="blue",lwd=2) } ################################ # remove these three zip codes # ################################ m <- lm(involact ~ race + fire + theft + age + income, data=ch, subset=c(1:47)[-c(6,24,35)]) par(mfrow=c(3,2)) plot(m, which=c(1:5)) # what is a reasonable cut-off for Cook's distance? # 4/(n-k-1): 4/(45-5-1) # there are still influential cases, but we'll keep the rest of the data # note only fire is highly significant now, # all other variables are insignificant summary(m) # find the model with the best AIC score: step(m) # the model includes race, fire, theft and age # find the model with the best BIC score: step(m, k=log(45)) # the model just includes race and fire # find the model with the best adjusted R^2 score: y <- ch$involact[-c(6,24,35)] x <- cbind(ch[,1:4],income=ch[,6]) x <- x[-c(6,24,35),] library(leaps) m <- leaps(x,y,method="adjr2") ind <- order(m$adjr2, decreasing=T) which <- m$which[ind,] nr <- nrow(which) nc <- ncol(which) par(mfrow=c(1,1)) image(c(1:nr),c(1:nc),which,col=c("white","black"), xlab="Models, ordered by adjusted R^2",ylab="Predictors") # the best model includes race, fire, theft and age m <- leaps(x,y,method="Cp") Cpplot(m) # so we have several competing models. Two of them are: # m0: involact ~ race + fire (BIC) # m1: involact ~ race + fire + theft + age (AIC, R^2, Cp) # let's take a look at them: m0 <- lm(involact ~ race + fire, data=ch, subset=c(1:47)[-c(6,24,35)]) summary(m0) m1 <- lm(involact ~ race + fire + theft + age, data=ch, subset=c(1:47)[-c(6,24,35)]) summary(m1) anova(m0,m1) par(mfrow=c(3,2)) plot(m0, which=c(1:5)) # there are some highly influential observations par(mfrow=c(3,2)) plot(m1, which=c(1:5)) # what happens if we use all obsevations? m0.f <- lm(involact ~ race + fire, data=ch) summary(m0.f) par(mfrow=c(3,2)) plot(m0.f, which=c(1:5)) m1.f <- lm(involact ~ race + fire + theft + age, data=ch) summary(m1.f) par(mfrow=c(3,2)) plot(m1.f, which=c(1:5)) # what can we conclude? # there does seem to be a relationship between race and involact, while # controlling for a subset of the other variables. # possible critique on this analysis: # - involact is not a perfect measure of insurance redlining # - we have aggregate data (zip-codes). hence we cannot draw conclusions # about individual houses/families in the zip-codes (ecological fallacy) # - the size of the effect is not very large. the largest value of the # response is only 2.2 per 100 # - there may be a hidden variable that causes the correlation between # race and involact