## Series 4 (Applied Multivariate Statistics) ##################################################################### # Exercise 5 ############ ## Load the data load(file="~/dataU6.rda") # a) Make a plot of X1 vs. X2. plot(dat, col=cl,pch=cl, xlab="X1",ylab="X2") ##In which direction will the first PC and the first LD be? # b) Project the points onto the first PC and investigate how well # the groups are separated. colMeans(dat) # confirm centering ?princomp # let's look at the helpfile of princomp first pc <- princomp(dat) summary(pc, loadings=TRUE) pc$scores # what are these scores? plot(pc$scores[,1],rep(0,200),col=cl,pch=cl) # projection on first PC # c) Project the points onto the first LD and investigate how well # the groups are separated. library(MASS) ?lda ld <- lda(dat,cl) ld ?predict.lda # look at the helpfile of predict.lda ld.pred <- predict(ld)$x plot(ld.pred,rep(0,200),col=cl,pch=cl) # d) What is the (leave-one-out) cross validation error for LDA on this data set? ld.cv <- lda(dat,cl,CV=TRUE) str(ld.cv) res <- data.frame(est=ld.cv$class,true=cl) # data.frame with estimated and true class labels table(res) # make a table of estimated and true class labels # Exercise 6 ############ load(file="~/dataU6.rda") # The data consists of str(datSA) str(clSA) str(datSATest) str(clSATest) # a) Have a look at the meaning of the variables involved. library(MASS) library(ElemStatLearn) ?SAheart # get some infromation on the data str(SAheart) # b) Look at a scatterplot matrix and the summary of datSA pairs(datSA) # c) Compute LDA and project datSA onto the first LD. Visualize the result. # Can you detect any separation of the two groups? ld <- lda(datSA,clSA) ld.pred <- predict(ld)$x plot(ld.pred,rep(0,length(ld.pred)),col=clSA,pch=clSA) # d) Now we want to assess how confident we can be in predictions of # LDA on new patients. For this,we use (leave-one-out) CV. # What error rate would you expect for new samples from the same population? lda.cv <- lda(datSA,clSA,CV=TRUE) dfSA.cv <- data.frame(est=lda.cv$class,truth=clSA) tab.cv <- table(dfSA.cv) # makes a 2x2 table tab.cv sum(diag(tab.cv))/nrow(dfSA.cv) # percentage of correctly classified patients # e) Make a prediction for the 50 new patients (datSATest). # You can check the predictions using the true labels (clSATest). # What error rate do you observe? Does it agree with the CV estimate? lda.fit <- lda(datSA,clSA) lda.pred <- predict(lda.fit,newdata=datSATest) # predict the class labels for the test data set dfSA <- data.frame(est=lda.pred$class, truth=clSATest) tab <- table(dfSA) tab sum(diag(tab))/nrow(dfSA)