# R-code logistic regression

# remove all objects
rm(list=ls(all=TRUE))

library(car)
data(Chile)
attach(Chile)
names(Chile)
?Chile

####################
# look at the data #
####################

# look at the first 3 lines of the data set:
Chile[1:3,]

# how big is the data set?
nrow(Chile)
ncol(Chile)

# take a look at a summary of the data:
summary(Chile)

####################
# prepare the data #
####################

# remove people who had missing values for 'vote' or 'statusquo'
data <- Chile[(!is.na(vote)) & (!is.na(statusquo)),]

# only keep the people who were planning to vote yes or no
data <- data[data$vote=="Y" | data$vote=="N",]
attach(data)
nr <- nrow(data)
nr

# check if everything is ok now
summary(data)

# turn 'vote' into a numeric vector, where 0="N" and 1="Y"
vote.num <- 1*(vote=="Y")

# check if it worked
summary(vote.num)

# plot data
plot(statusquo,vote.num)
abline(h=0)
abline(h=1)

# we cannot see the points very well, so we jitter the values of
# vote for better visibility. we use jittering for plotting
# purposes only; we always compute with the real data!
plot(statusquo,jitter(vote.num,amount=.1))
abline(h=0)
abline(h=1)

############################
# nonparametric regression #
############################

# order the data by statusquo
ind <- order(statusquo)
statusquo <- statusquo[ind]
vote.num <- vote.num[ind]

# check if it worked
plot(statusquo)

# compute local averages over a moving window of 201 observations.
# the loop starts at i=101 so that the window (i-100):(i+100)
# stays inside 1:nr
y.np <- NULL
for (i in 101:(nr-100)){
  average <- mean(vote.num[(i-100):(i+100)])
  y.np <- c(y.np, average)
}

# plot the result
plot(statusquo,jitter(vote.num,amount=.1))
lines(statusquo[101:(nr-100)],y.np,col="red",lwd=2)
abline(h=0)
abline(h=1)
lines(loess.smooth(statusquo, vote.num), lwd=2)
# the loess curve seems to fit poorly

#####################
# linear regression #
#####################

# fit model
m1 <- lm(vote.num ~ statusquo)

# plot result
plot(statusquo,jitter(vote.num,amount=.1))
lines(statusquo[101:(nr-100)],y.np,col="red",lwd=2)
abline(m1,col="blue",lwd=2,lty=2)
abline(h=0)
abline(h=1)
# this didn't work very well: the fitted line drops below 0 and rises
# above 1, so it cannot be interpreted as a probability

#######################
# logistic regression #
#######################

# fit logistic model
m3 <- glm(vote.num~statusquo,family=binomial(logit))
summary(m3)
(a3 <- m3$coef[1])
(b3 <- m3$coef[2])
plot(statusquo,jitter(vote.num,amount=.1))
lines(statusquo[101:(nr-100)],y.np,col="red",lwd=2)
abline(m1,col="blue",lwd=2,lty=2)
lines(statusquo,1/(1+exp(-(a3+b3*statusquo))),col="darkgreen",lwd=2)
abline(h=0)
abline(h=1)

#########################
# testing nested models #
#########################

# deviance = -2*loglikelihood.
# consider two models M0 (with q parameters) and M1 (with p parameters),
# such that M0 is a submodel of M1.
# if M0 is true, the difference in deviance between the two models has
# (approximately) a chi-squared distribution with p-q degrees of freedom.

# small illustration: fit the intercept-only model
m4 <- glm(vote.num~1, family=binomial(logit))
summary(m4)

# difference in deviance between m4 and m3:
m4$deviance - m3$deviance

# difference in degrees of freedom: m3 has 2 parameters, m4 has 1,
# so df = 2 - 1 = 1

# p-value:
pchisq(m4$deviance - m3$deviance, df=1, lower.tail=FALSE)
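
############################################
# appendix: alternative built-in functions #
############################################

# a minimal sketch showing that the fitted logistic curve can also be
# drawn with predict() instead of computing 1/(1+exp(-(a3+b3*statusquo)))
# by hand. this assumes the objects m3, statusquo and vote.num created
# above are still in the workspace; since the data were ordered by
# statusquo, lines() draws the same curve as before.
p.hat <- predict(m3, type="response")   # fitted probabilities P(vote = "Y")
plot(statusquo, jitter(vote.num, amount=.1))
lines(statusquo, p.hat, col="darkgreen", lwd=2)
abline(h=0)
abline(h=1)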
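
# a minimal sketch of the same nested-model test done with anova(),
# which computes the deviance difference and (with test="Chisq") the
# p-value in one step. this assumes m3 and m4 from above still exist;
# the p-value should match the pchisq() call above.
anova(m4, m3, test="Chisq")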