## Series 4 (Applied Multivariate Statistics)
#####################################################################

# Exercise 5
############

## Load the data
load(file="~/dataU6.rda")

# a) Make a plot of X1 vs. X2. 
plot(dat, col=cl,pch=cl, xlab="X1",ylab="X2")
##In which direction will the first PC and the first LD be?

# b) Project the points onto the first PC and investigate how well
#    the groups are separated.
colMeans(dat) # confirm centering
?princomp # let's look at the helpfile of princomp first
pc <- princomp(dat) 
summary(pc, loadings=TRUE)
pc$scores # what are these scores? 
plot(pc$scores[,1],rep(0,200),col=cl,pch=cl) # projection on first PC


# c) Project the points onto the first LD and investigate how well
#    the groups are separated.
library(MASS)
?lda
ld <- lda(dat,cl)
ld 
?predict.lda # look at the helpfile of predict.lda 
ld.pred <- predict(ld)$x
plot(ld.pred,rep(0,200),col=cl,pch=cl)

# d) What is the (leave-one-out) cross validation error for LDA on this data set?
ld.cv <- lda(dat,cl,CV=TRUE)
str(ld.cv)
res <- data.frame(est=ld.cv$class,true=cl) # data.frame with estimated and true class labels
table(res) # make a table of estimated and true class labels


# Exercise 6
############

load(file="~/dataU6.rda")

# The data consists of
str(datSA)
str(clSA)
str(datSATest)
str(clSATest)

# a) Have a look at the meaning of the variables involved. 
library(MASS)
library(ElemStatLearn)
?SAheart # get some infromation on the data
str(SAheart)

# b) Look at a scatterplot matrix and the summary of datSA
pairs(datSA) 

# c) Compute LDA and project datSA onto the first LD. Visualize the result.
#    Can you detect any separation of the two groups?
ld <- lda(datSA,clSA)
ld.pred <- predict(ld)$x
plot(ld.pred,rep(0,length(ld.pred)),col=clSA,pch=clSA)


# d) Now we want to assess how confident we can be in predictions of
#    LDA on new patients. For this,we use (leave-one-out) CV.
#    What error rate would you expect for new samples from the same population?
lda.cv <- lda(datSA,clSA,CV=TRUE)
dfSA.cv <- data.frame(est=lda.cv$class,truth=clSA)
tab.cv <- table(dfSA.cv) # makes a 2x2 table
tab.cv
sum(diag(tab.cv))/nrow(dfSA.cv) # percentage of correctly classified patients


# e) Make a prediction for the 50 new patients (datSATest).
#    You can check the predictions using the true labels (clSATest).
#    What error rate do you observe? Does it agree with the CV estimate?
lda.fit <- lda(datSA,clSA)
lda.pred <- predict(lda.fit,newdata=datSATest) # predict the class labels for the test data set
dfSA <- data.frame(est=lda.pred$class, truth=clSATest)
tab <- table(dfSA)
tab
sum(diag(tab))/nrow(dfSA)