# Rscript to reproduce results in Faraway Ch 12
# Chicago insurance redlining

# Brief explanation of the data (source: Andrews and Herzberg (1985)):
# Several communities in Chicago claimed that insurance companies
# were redlining their neighborhoods.
# Redlining = canceling insurance policies or refusing to renew.

# Data is given by zip code:
#   race:     racial composition, in percent minority
#   fire:     fires per 100 housing units in 1975
#   theft:    thefts per 1000 people in 1975
#   age:      percent of housing units built before 1939
#   involact: new FAIR plan policies and renewals per 100 housing units
#             in the first half of 1978 (FAIR plan policies are mostly
#             issued after normal insurance is denied)
#   income:   median family income

# Dependent variable = involact
# We will focus on the relationship between race and involact.
# In particular: is there discrimination?

# remove all objects
rm(list=ls(all=TRUE))

#############
# read data #
#############

library(faraway)
data(chicago)

# remove variable 'volact' and rescale 'income'
ch <- data.frame(chicago[,1:4], involact=chicago[,6], income=chicago[,7]/1000)
summary(ch)
# note the high spread in 'race' -> good for our analysis

###############
# exploration #
###############

# histograms
par(mfrow=c(2,3))
for (i in 1:6){
  hist(ch[,i], main=names(ch)[i], xlab=names(ch)[i], prob=TRUE)
  lines(density(ch[,i]))
}

# boxplots
par(mfrow=c(2,3))
for (i in 1:6){
  boxplot(ch[,i], main=names(ch)[i])
}

# dependent variable versus each independent variable
par(mfrow=c(2,3))
for (i in c(seq(1,4),6)){
  plot(ch[,i], ch[,5], xlab=names(ch)[i], ylab=names(ch)[5])
  lines(loess.smooth(ch[,i], ch[,5]))
}

# we'll consider log2(income) because of its better interpretation:
plot(log2(ch[,6]), ch[,5], xlab="log2(income)", ylab=names(ch)[5])
lines(loess.smooth(log2(ch[,6]), ch[,5]))

# compute correlations
ch.new <- ch
ch.new[,6] <- log2(ch[,6])
names(ch.new)[6] <- "log2(income)"
round(cor(ch.new), 2)

##########################
# full model diagnostics #
##########################

m <- lm(involact ~ race + fire + theft + age + log2(income), data=ch)
summary(m)
# race is very highly significant (p < 0.001)

par(mfrow=c(2,3))
plot(m, which=c(1:5))
# ZIP codes 60610 and 60607 (and perhaps 60621) stick out;
# these are cases 6 and 24 (and 35).
# What is special about these observations?

# histograms, with the value of ZIP code 60610 indicated:
par(mfrow=c(2,3))
for (i in 1:6){
  hist(ch[,i], main=c(names(ch)[i],"zip code 60610"), xlab=names(ch)[i], prob=TRUE)
  lines(density(ch[,i]))
  abline(v=ch[6,i], col="blue", lwd=2)
}

# histograms, with the value of ZIP code 60607 indicated:
par(mfrow=c(2,3))
for (i in 1:6){
  hist(ch[,i], main=c(names(ch)[i],"zip code 60607"), xlab=names(ch)[i], prob=TRUE)
  lines(density(ch[,i]))
  abline(v=ch[24,i], col="blue", lwd=2)
}

# histograms, with the value of ZIP code 60621 indicated:
par(mfrow=c(2,3))
for (i in 1:6){
  hist(ch[,i], main=c(names(ch)[i],"zip code 60621"), xlab=names(ch)[i], prob=TRUE)
  lines(density(ch[,i]))
  abline(v=ch[35,i], col="blue", lwd=2)
}

########################################
# remove the 60610 and 60607 zip codes #
########################################

m <- lm(involact ~ race + fire + theft + age + log2(income),
        data=ch, subset=c(1:47)[-c(6,24)])
par(mfrow=c(3,2))
plot(m, which=c(1:5))
# What is a reasonable cut-off for Cook's distance?
# A common rule of thumb is 4/(n-k-1), here 4/(45-5-1)
# (a quick check against this cut-off is sketched below).
# There are still influential cases, but we'll keep the rest of the data.

# note that only fire is very highly significant now;
# race is less strong than before, but still significant (p < 0.05)
summary(m)
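# a quick sketch of the influence check mentioned above, assuming the
# rule-of-thumb cut-off 4/(n-k-1); it reuses the model 'm' fitted without
# cases 6 and 24
cd <- cooks.distance(m)      # one Cook's distance per retained case
n <- length(cd)              # 45 observations after dropping cases 6 and 24
k <- length(coef(m)) - 1     # 5 predictors
cutoff <- 4/(n - k - 1)      # rule-of-thumb threshold, about 0.10 here
round(cd[cd > cutoff], 3)    # cases exceeding the cut-off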
### quadratic terms? ###

# should we include quadratic terms?
# we don't transform involact and race, to keep a clear interpretation
m.quad <- lm(involact ~ race + poly(fire,2) + poly(theft,2) + poly(age,2) +
             poly(log2(income),2), data=ch, subset=c(1:47)[-c(6,24)])
anova(m, m.quad)
# the p-value is not significant -> we do not reject the null hypothesis
# that the model without quadratic terms is adequate.

### variable selection ###

# find the model with the "best" AIC score:
step(m)
# the model includes race, fire, theft and age

# find the model with the "best" BIC score:
step(m, k=log(45))
# the model includes just race and fire

# find the model with the best adjusted R^2 score:
y <- ch$involact[-c(6,24)]
x <- cbind(ch[,1:4], "log2(income)"=log2(ch[,6]))
x <- x[-c(6,24),]
library(leaps)
library(faraway)
m <- leaps(x, y, method="adjr2")
ind <- order(m$adjr2, decreasing=TRUE)
which <- m$which[ind,]
nr <- nrow(which)
nc <- ncol(which)
par(mfrow=c(1,1))
image(c(1:nr), c(1:nc), which, col=c("white","black"),
      xlab="Models, ordered by adjusted R^2", ylab="Predictors")
# the best model includes race, fire, theft and age
maxadjr(m, 20)

# find the best model according to Mallows' Cp criterion:
m <- leaps(x, y, method="Cp")
par(mfrow=c(1,1))
Cpplot(m)
# 1,2,3,4 (race, fire, theft, age) seems best
# 1,2,4 (race, fire, age) is also pretty good

# so we have several competing models. Three of them are:
#   m0: involact ~ race + fire                (BIC)
#   m1: involact ~ race + fire + age          (runner-up in Cp)
#   m2: involact ~ race + fire + theft + age  (AIC, adjusted R^2, Cp)

# let's take a look at them:
m0 <- lm(involact ~ race + fire, data=ch, subset=c(1:47)[-c(6,24)])
summary(m0)
m1 <- lm(involact ~ race + fire + age, data=ch, subset=c(1:47)[-c(6,24)])
summary(m1)
m2 <- lm(involact ~ race + fire + theft + age, data=ch, subset=c(1:47)[-c(6,24)])
summary(m2)
# race is significant in all of them

# there are some highly influential observations
par(mfrow=c(3,2))
plot(m0, which=c(1:5))
par(mfrow=c(3,2))
plot(m1, which=c(1:5))
par(mfrow=c(3,2))
plot(m2, which=c(1:5))

# take out zip codes 60612, 60621, 60653 (cases 23, 30, 35):
m0 <- lm(involact ~ race + fire, data=ch,
         subset=c(1:47)[-c(6,23,24,30,35)])
summary(m0)
m1 <- lm(involact ~ race + fire + age, data=ch,
         subset=c(1:47)[-c(6,23,24,30,35)])
summary(m1)
m2 <- lm(involact ~ race + fire + theft + age, data=ch,
         subset=c(1:47)[-c(6,23,24,30,35)])
summary(m2)

# what happens if we use all observations?
m0 <- lm(involact ~ race + fire, data=ch)
summary(m0)
m1 <- lm(involact ~ race + fire + age, data=ch)
summary(m1)
m2 <- lm(involact ~ race + fire + theft + age, data=ch)
summary(m2)

# what can we conclude?
# There does seem to be a relationship between race and involact, while
# controlling for a subset of the other variables.

# possible critiques of this analysis:
# - involact is not a perfect measure of insurance redlining.
# - the size of the effect is not very large: the largest value of the
#   response is only 2.2 per 100. If there is discrimination, relatively
#   few people are affected.
# - there may be a hidden variable that causes the correlation between
#   race and involact, though a likely candidate is hard to think of.
# - we have aggregate data (zip codes), so we cannot draw conclusions
#   about individual houses/families in the zip codes (ecological fallacy).
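# a rough sketch of the size of the race effect, using the all-observations
# model m2 fitted just above: the 95% CI for the race coefficient, and the
# implied change in involact across the observed range of 'race' (holding
# the other predictors fixed)
confint(m2)["race", ]                      # 95% confidence interval for race
coef(m2)["race"] * diff(range(ch$race))    # implied change in involact over
                                           # the full observed range of race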