## Series 5 (Applied Multivariate Statistics)
####################################################################

# Exercise 1
############

# a) Load the data and have a first look at it.
fos <- read.table("http://stat.ethz.ch/Teaching/Datasets/WBL/fossilien.dat")
dim(fos)
head(fos)  # shows the first 6 rows of the dataset

# b) We would like to test whether the body size is associated
#    with the environmental conditions during that period. Fit
#    a multivariate regression. Target variables are sAngle,
#    lLength and rWidth; predictors are SST.Mean, Salinity and
#    lChlorophyll.
lm.fit <- lm(cbind(sAngle, lLength, rWidth) ~ SST.Mean + Salinity + lChlorophyll,
             data = fos)

# c) Use a Wilks test to check whether any predictor has an
#    influence on any target variable.
anova(lm.fit, test = "Wilks")

# d) Does lChlorophyll have an equally significant effect on
#    all three responses?
summary(lm.fit)

# Exercise 2
############

# a) Load the data and have a first look at it.
bn <- read.table("http://stat.ethz.ch/Teaching/Datasets/WBL/banknot.dat")
dim(bn)
head(bn)
str(bn)

# b) We would like to know whether the other variables can separate
#    the unforged from the forged banknotes significantly.
#    Write a small program that returns the p-value of the test.
t.test(LENGTH ~ CODE, data = bn)
# The p-value is 0.0058: the unforged and the forged banknotes differ in their length.

# In order to use apply() we define a function that applies the t-test
# to a single variable and returns the p-value. The grouping variable is CODE.
f.ttest <- function(y) {
  r.ttest <- t.test(y ~ bn[, "CODE"])
  r.ttest$p.value
}
apply(bn[, -1], 2, f.ttest)

# c) Install the package ICSNP. Look at the help file of
#    the function HotellingsT2.
library(ICSNP)
?HotellingsT2

# d) Use Hotelling's T-test for unpaired groups in order to decide
#    whether the unforged banknotes differ from the forged ones.
x <- bn[bn$CODE == 0, 2:7]
y <- bn[bn$CODE == 1, 2:7]
HotellingsT2(x, y)

# Exercise 3
############
library(MASS)
library(cluster)
library(mclust)
d.banknot <- read.table("http://stat.ethz.ch/Teaching/Datasets/WBL/banknot.dat")

# a) Make a clustering with Mclust() from the package mclust using
#    the maximum likelihood method.
ml.banknot <- Mclust(d.banknot[, -1])
# What number of clusters and what model do you propose?
plot(ml.banknot, what = "BIC")

# b) Make a table with the misclassification of the model-based method
#    with respect to CODE. Keep in mind: CODE=0 are the genuine banknotes
#    and CODE=1 the forged ones.
ml.cluster <- Mclust(d.banknot[, -1], modelNames = "EEE", G = 4)$classification
table(ml.cluster, d.banknot[, 1])
# Make a pairs plot of the variables.
pairs(d.banknot[, -1], col = d.banknot[, 1] + 1, pch = ml.cluster)

# c) Carry out the PAM algorithm for the same number of clusters as
#    above and the euclidean metric.
pam.cluster <- pam(d.banknot[, -1], k = 4, metric = "euclidean")$clustering
# Make a table with the "misclassification" of the model-based
# method compared to the PAM algorithm.
table(pam.cluster, ml.cluster)
# Make a pairs plot of the variables.
pairs(d.banknot[, -1], col = pam.cluster, pch = ml.cluster)

# Exercise 4
############

# Reading in the data:
library(cluster)
library(MASS)
empl <- read.table("http://stat.ethz.ch/Teaching/Datasets/WBL/empl2.dat", header = TRUE)
labempl <- rownames(empl)

# a) First look at the data using the scatterplot. Can you find clusters by eye?
plot(empl, panel = function(x, y) text(x, y, labels = labempl, xpd = TRUE))
# Turkey and Yugoslavia (TR, YU), and sometimes also Greece (GR),
# often lie a bit far off from the rest.
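# Optional extra (a sketch added here, not part of the original exercise sheet):
# a PCA biplot condenses all variables into one 2-d view, which can make the
# far-off states easier to spot than the full scatterplot matrix above.
empl.pca <- princomp(empl)    # classical PCA on the raw employment data
biplot(empl.pca, cex = 0.7)   # TR, YU (and GR) should stand apart here as well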
# b) Calculate the euclidean distances between the states. Which two states
#    are first combined into a cluster?
dis.empl <- daisy(empl)
round(dis.empl, 2)
min(dis.empl)
# The pair with the smallest distance is merged first: the first cluster is {GB, B}.

# c) Carry out a hierarchical cluster analysis by hand using the
#    "Single Linkage" method.

# d) Carry out the previous cluster analysis using the function agnes().
#    Verify your result of c) by comparing the first five steps.
sing.empl <- agnes(empl, method = "single")
sing.empl
plot(sing.empl, which.plots = 2)
# As expected we get the same clustering as in c).

# e) Carry out the cluster analysis with the same distances but with the
#    methods average and complete. Compare the dendrograms of all three
#    methods (including the single linkage method).
comp.empl <- agnes(empl, method = "complete")
aver.empl <- agnes(empl, method = "average")
par(mfrow = c(1, 3))
plot(sing.empl, which.plots = 2)
plot(comp.empl, which.plots = 2)
plot(aver.empl, which.plots = 2)

# f) Group the states into k clusters. Choose for instance k = 3 and k = 4.
#    Compare the different methods. Also draw an MDS plot and mark the
#    observed groups of states with colors (for one k and one method).
# Single
r.4scl <- cutree(sing.empl, k = 4)
split(labempl, r.4scl)
# Average
r.4acl <- cutree(aver.empl, k = 4)
split(labempl, r.4acl)
# Complete
r.4ccl <- cutree(comp.empl, k = 4)
split(labempl, r.4ccl)

r.mds <- cmdscale(daisy(empl))
par(mfrow = c(2, 2))
plot(r.mds, type = "n", asp = 1, main = "'Single' clustering, MDS coordinates")
text(r.mds, labempl, col = 1 + r.4scl)
plot(r.mds, type = "n", asp = 1, main = "'Complete' clustering, MDS coordinates")
text(r.mds, labempl, col = 1 + r.4ccl)
plot(r.mds, type = "n", asp = 1, main = "'Average' clustering, MDS coordinates")
text(r.mds, labempl, col = 1 + r.4acl)

# Exercise 5
############

# Reading in the data:
d.bank.org <- read.table("http://stat.ethz.ch/Teaching/Datasets/WBL/banknot.dat")
d.bank <- d.bank.org[, c("CODE", "BOTTOM", "DIAGONAL")]

# a) K-means algorithm:
# Computing the 2 clusters with the K-means method.
set.seed(10)
t.bank <- dist(d.bank[, -1], method = "euclidean")
kmean.bank <- kmeans(d.bank[, -1], centers = 2)
kmean.sl <- silhouette(kmean.bank$cluster, t.bank)
# Misclassification:
table(kmean.bank$cluster, d.bank[, 1])
# Silhouette plot:
plot(kmean.sl)

# b) PAM algorithm:
# Computing the 2 clusters with PAM.
pam.bank <- pam(d.bank[, -1], k = 2, metric = "euclidean")
# Misclassification:
table(pam.bank$clustering, d.bank[, 1])
# Silhouette plot:
plot(pam.bank, which.plots = 2)
# With which.plots = 1 the clusters are displayed as ellipses.
plot(pam.bank, which.plots = 1)

# c) Compare the two results. Make a table of the differences between
#    the two clusterings.
table(pam.bank$clustering, kmean.bank$cluster)
# The classifications are the same.

# d) We would like to show by means of a simple simulation that the K-means
#    algorithm finds local minima.
# K-means algorithm:
set.seed(10)
v.einer1 <- rep(1, dim(d.bank)[1])   # one "1" per observation, used to count cluster sizes
v.einer2 <- rep(1, 100)              # one "1" per simulation run
t.kmeans <- NULL
for (i in 1:100) {
  kmean.bank.cluster <- kmeans(d.bank[, -1], centers = 3)$cluster
  # record the sorted cluster sizes of this run
  t.kmeans <- cbind(t.kmeans,
                    sort(aggregate(v.einer1, by = list(kmean.bank.cluster), FUN = sum)[, 2]))
}
trans.kmeans <- t(t.kmeans)
# count how often each triple of cluster sizes occurs over the 100 runs
m.kmeans <- aggregate(v.einer2,
                      by = list(trans.kmeans[, 1], trans.kmeans[, 2], trans.kmeans[, 3]),
                      FUN = sum)
m.kmeans
# We find 5 different triples of cluster sizes.
# Note: the triples only record cluster sizes. Two clusterings with the same
# cluster sizes can still assign the observations differently.
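# A more direct way to see the local minima (an added sketch, not part of the
# original solution): record the criterion that kmeans() minimises, the total
# within-cluster sum of squares, for each of 100 random starts. If several
# distinct values appear, different starts have ended in different local optima.
set.seed(10)
wss <- replicate(100, kmeans(d.bank[, -1], centers = 3)$tot.withinss)
table(round(wss, 1))   # more than one value => different local optima were reached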
# PAM algorithm:
anz.sim <- 100
t.centers <- 3
v.einer1 <- rep(1, dim(d.bank)[1])   # one "1" per observation, used to count cluster sizes
v.einer2 <- rep(1, anz.sim)          # one "1" per simulation run
t.pam <- NULL
for (i in 1:anz.sim) {
  pam.bank.cluster <- pam(d.bank[, -1], k = t.centers)$clustering
  # record the sorted cluster sizes of this run
  t.pam <- cbind(t.pam,
                 sort(aggregate(v.einer1, by = list(pam.bank.cluster), FUN = sum)[, 2]))
}
trans.pam <- t(t.pam)
# count how often each triple of cluster sizes occurs over the runs
m.pam <- aggregate(v.einer2,
                   by = list(trans.pam[, 1], trans.pam[, 2], trans.pam[, 3]),
                   FUN = sum)
m.pam
# We only get one triple. The PAM algorithm appears to be more stable.
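# As an additional check (a sketch, not part of the original solution): the
# adjusted Rand index from the mclust package measures the agreement of two
# clusterings (1 = identical up to relabelling). Repeated pam() runs should
# agree perfectly with a reference run, while repeated kmeans() runs need not.
library(mclust)   # for adjustedRandIndex()
pam.ref <- pam(d.bank[, -1], k = 3)$clustering
km.ref  <- kmeans(d.bank[, -1], centers = 3)$cluster
ari.pam <- replicate(20, adjustedRandIndex(pam.ref, pam(d.bank[, -1], k = 3)$clustering))
ari.km  <- replicate(20, adjustedRandIndex(km.ref, kmeans(d.bank[, -1], centers = 3)$cluster))
summary(ari.pam)   # expected: all exactly 1, since pam() is deterministic here
summary(ari.km)    # expected: values below 1 for some runs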