# R-code logistic regression

# remove all objects
rm(list=ls(all=TRUE))

library(car)
data(Chile)
attach(Chile)
names(Chile)
?Chile

####################
# look at the data #
####################

# look at the first 3 lines of the data set:
Chile[1:3,]

# how big is the data set?
nrow(Chile)
ncol(Chile)

# take a look at a summary of the data:
summary(Chile)

####################
# prepare the data #
####################

# remove people who had missing values for 'vote' or 'statusquo'
data <- Chile[(!is.na(vote)) & (!is.na(statusquo)),]

# only keep the people who were planning to vote yes or no
data <- data[data$vote=="Y" | data$vote=="N",]
attach(data)
nr <- nrow(data)
nr

# check if everything is ok now
summary(data)

# turn 'vote' into a numeric vector, where 0="N" and 1="Y"
vote.num <- 1*(vote=="Y")

# check if it worked
summary(vote.num)

# plot data
plot(statusquo,vote.num)
abline(h=0)
abline(h=1)

# we cannot see the points very well, so we jitter the values of
# vote for better visibility. we use jittering for plotting
# purposes only; we always compute with the real data!
plot(statusquo,jitter(vote.num,amount=.1))
abline(h=0)
abline(h=1)

############################
# nonparametric regression #
############################

# order the data by statusquo
ind <- order(statusquo)
statusquo <- statusquo[ind]
vote.num <- vote.num[ind]

# check if it worked
plot(statusquo)

# compute local averages over a moving window of 201 observations.
# the loop starts at i=101 so that the window (i-100):(i+100)
# stays inside 1:nr
y.np <- NULL
for (i in 101:(nr-100)){
  average <- mean(vote.num[(i-100):(i+100)])
  y.np <- c(y.np, average)
}

# plot the result
plot(statusquo,jitter(vote.num,amount=.1))
lines(statusquo[101:(nr-100)],y.np,col="red",lwd=2)
abline(h=0)
abline(h=1)
lines(loess.smooth(statusquo, vote.num), lwd=2)
# the loess curve seems to fit poorly

#####################
# linear regression #
#####################

# fit model
m1 <- lm(vote.num ~ statusquo)

# plot result
plot(statusquo,jitter(vote.num,amount=.1))
lines(statusquo[101:(nr-100)],y.np,col="red",lwd=2)
abline(m1,col="blue",lwd=2,lty=2)
abline(h=0)
abline(h=1)
# this didn't work very well: the fitted line drops below 0 and rises
# above 1, so it cannot be interpreted as a probability

#######################
# logistic regression #
#######################

# fit logistic model
m3 <- glm(vote.num~statusquo,family=binomial(logit))
summary(m3)
(a3 <- m3$coef[1])
(b3 <- m3$coef[2])
plot(statusquo,jitter(vote.num,amount=.1))
lines(statusquo[101:(nr-100)],y.np,col="red",lwd=2)
abline(m1,col="blue",lwd=2,lty=2)
lines(statusquo,1/(1+exp(-(a3+b3*statusquo))),col="darkgreen",lwd=2)
abline(h=0)
abline(h=1)

#########################
# testing nested models #
#########################

# deviance = -2*loglikelihood.
# consider two models M0 (with q parameters) and M1 (with p parameters),
# such that M0 is a submodel of M1.
# if M0 is true, the difference in deviance between the two models has
# (approximately) a chi-squared distribution with p-q degrees of freedom.

# small illustration: fit the intercept-only model
m4 <- glm(vote.num~1, family=binomial(logit))
summary(m4)

# difference in deviance between m4 and m3:
m4$deviance - m3$deviance

# difference in degrees of freedom: m3 has 2 parameters, m4 has 1,
# so df = 2 - 1 = 1

# p-value:
pchisq(m4$deviance - m3$deviance, df=1, lower.tail=FALSE)
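
############################################
# appendix: alternative built-in functions #
############################################

# a minimal sketch showing that the fitted logistic curve can also be
# drawn with predict() instead of computing 1/(1+exp(-(a3+b3*statusquo)))
# by hand. this assumes the objects m3, statusquo and vote.num created
# above are still in the workspace; since the data were ordered by
# statusquo, lines() draws the same curve as before.
p.hat <- predict(m3, type="response")   # fitted probabilities P(vote = "Y")
plot(statusquo, jitter(vote.num, amount=.1))
lines(statusquo, p.hat, col="darkgreen", lwd=2)
abline(h=0)
abline(h=1)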
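
# a minimal sketch of the same nested-model test done with anova(),
# which computes the deviance difference and (with test="Chisq") the
# p-value in one step. this assumes m3 and m4 from above still exist;
# the p-value should match the pchisq() call above.
anova(m4, m3, test="Chisq")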