# Rscript on clustering ############# # iris data # ############# data(iris) # 2-dimensional array ?iris iris[,5] table(iris[,5]) # setosa: 1-50, versicolor: 51-100, virginica: 101-150 pairs(iris[,1:4], pch=c(rep("S",50),rep("C",50),rep("V",50)), col=c(rep("black",50),rep("red",50),rep("blue",50))) ########################### # hierarchical clustering # ########################### dat.easy.2gr <- iris[c(1:10,51:60), 1:4] # 10xS, 10xC dat.diff.2gr <- iris[c(51:60,101:110), 1:4] # 10xC, 10xV dat.3gr <- iris[c(1:10,51:60,101:110), 1:4] # 10xS, 10xC, 10xV # we now pretend we don't know the species anymore, # and we will see how well the clustering methods recover the species. # compute distance measure # possible methods: euclidean, maximum, manhattan, canberra, # binary, or minkowski ?dist # illustrate some of the different distances using a small example in R^2: a <- rbind(c(1,2), # first point c(4,6), # second point c(0,0), # third point c(0,8)) # fourth point dist(a, method="euclidean") dist(a, method="maximum") dist(a, method="manhattan") dist(a, method="canberra") dist(a, method="binary") dist(a, method="minkowski", p=1) dist(a, method="minkowski", p=2) # note: some of the latter distance are rarely used # compute the distances for the iris data dist.eu <- dist(dat.easy.2gr, method="euclidean") # default method round(dist.eu, 3) dist.ma <- dist(dat.easy.2gr, method="maximum") dist.ma # single linkage clustering for dat.easy.2gr, euclidean distance: res <- hclust(dist(dat.easy.2gr, method="euclidean"), method="single") plot(res, labels=c(1:10, 51:60)) # note: # "method" inside command dist() specifies how to compute distance between points # "method" inside command hclust() specifies how to compute distance between clusters # single linkage clustering for dat.easy.2gr, maximum distance: res <- hclust(dist(dat.easy.2gr, method="maximum"), method="single") plot(res, labels=c(1:10, 51:60)) # single linkage clustering for dat.diff.2gr, euclidean distance: res <- hclust(dist(dat.diff.2gr, method="euclidean"), method="single") plot(res, labels=c(51:60, 101:110)) # single linkage clustering for dat.3gr, euclidean distance: res <- hclust(dist(dat.3gr, method="euclidean"), method="single") plot(res, labels=c(1:10, 51:60, 101:110)) # complete linkage clustering for dat.3gr, euclidean distance: res <- hclust(dist(dat.3gr, method="euclidean"), method="complete") plot(res, labels=c(1:10,51:60,101:110)) # what does the height of the dendogram mean? # the height tells you at which distance clusters were joined together summary(dist(dat.3gr, method="euclidean")) # average linkage clustering for dat.3gr, euclidean distance: res <- hclust(dist(dat.3gr, method="euclidean"), method="average") plot(res, labels=c(1:10,51:60,101:110)) cutree(res, h=3) abline(h=3, lty=2, col="red") cutree(res, h=1.4) abline(h=1.4, lty=2, col="blue") # note that it doesn't work so well for the tibetan skull data: source("chap7tibetskull.dat") summary(Tibet) res <- hclust(dist(Tibet[,1:5]), method="average") plot(res, labels=Tibet$Type) # but this is not surprising if we look at a pairs plot of these data: pairs(Tibet[,1:5], pch=c(rep("1", 17),rep("2",15)), col=c(rep("red",17), rep("blue",15))) ###################### # k-means clustering # ###################### dat <- iris[,1:4] res <- kmeans(dat,3) res # discuss output # compute within cluster sum of squares by hand: # all observations in cluster 1 minus the center of cluster 1: dat[res$clust==1,1:4] - rep(res$center[1,], each=res$size[1]) # euclidean distance for the points in cluster 1 to the center of cluster 1: apply( (dat[res$clust==1,]-rep(res$center[1,], each=res$size[1]))^2, 1, sum) # within cluster sum of squares for cluster 1: sum( apply( (dat[res$clust==1,]-rep(res$center[1,], each=res$size[1]))^2, 1, sum) ) # note output of kmeans is random, due to random starting values of clusters: kmeans(dat.3gr,3)$cluster kmeans(dat.3gr,3)$cluster kmeans(dat.3gr,3)$cluster kmeans(dat.3gr,3)$cluster kmeans(dat.3gr,3)$cluster kmeans(dat.3gr,3)$cluster kmeans(dat.3gr,3)$cluster kmeans(dat.3gr,3)$cluster kmeans(dat.3gr,3)$cluster # we can construct a scree plot in order to choose the right value of k: res <- rep(NA, 10) # res[k] contains the sum of the k within cluster sum of squares, # when k-means algorithm is used with k clusters # compute res[1]: res[1] <- sum( apply( scale(dat, center=TRUE, scale=FALSE)^2, 1, sum) ) # compute res[2]-res[10]: for (k in 2:10){ res[k] <- sum(kmeans(dat, k)$withinss) } # construct scree plot: plot(c(1:10),res) abline(h=0) ########################## # model based clustering # ########################## # in order to run this code, you first need to install the package 'mclust' library(mclust) res <- Mclust(iris[,1:4]) plot(res, data=iris) res # understand the output a little better: ?Mclust ?mclustModelNames round(res$z, 3) res$classification round(res$uncertainty, 3) res <- Mclust(Tibet[,1:5], G=c(1:15)) res res$classification round(res$uncertainty, 3) # compare to hierachical clustering plot(hclust(dist(Tibet[,1:5]), method="average"), labels=res$classification)