# Rscript for linear discriminant analysis # Download zip-file "chap7tibetskull.dat" # from http://biostatistics.iop.kcl.ac.uk/publications/everitt/ # Unzip the file into your working directory source("chap7tibetskull.dat") # the data are now available as dataframe 'Tibet' ########################## # Info about these data: # ########################## # The data were collected in southeastern and eastern Tibet # Skulls 1-17 were found in one place (Type 1), # and skulls 18-32 were found in another place (Type 2). # On each skull, we have 5 measurements, all in millimeter: # Length: greatest length of skull # Breadth: greatest horizontal breadth of skull # Height: height of skull # Fheight: upper face height # Fbreadth: face breadth, between outermost points of cheek bones ####################### # Compute lda by hand # ####################### attach(Tibet) Tibet1 <- Tibet[Type==1,c(1:5)] Tibet2 <- Tibet[Type==2,c(1:5)] n1 <- nrow(Tibet1) n2 <- nrow(Tibet2) n <- n1+n2 p <- ncol(Tibet1) # compute sample mean vectors: sample.mean1 <- apply(Tibet1, 2, mean) sample.mean2 <- apply(Tibet2, 2, mean) # compute pooled estimate for the covariance matrix: S.u <- ((n1-1)*var(Tibet1) + (n2-1)*var(Tibet2))/(n-2) S.u.inverse <- solve(S.u) # compute lda: a <- S.u.inverse %*% (sample.mean1-sample.mean2) ############################################### # Compare to 'lda' function from MASS library # ############################################### # Use lda function from the MASS library: library(MASS) dis <- lda(Type ~ Length + Breadth + Height + Fheight + Fbreadth, data=Tibet, prior=c(0.5,0.5)) # Let's compare the two approaches. # Vector of discriminant function coefficients from MASS: dis$scaling # Vector of discriminant function coefficients computed by hand: a # The output seems different, but this is just a scaling factor: dis$scaling / a ######################################## # Classifying new observations by hand # ######################################## newdata <- rbind(c(171, 140.5, 127, 69.5, 137), c(179, 132, 140, 72, 138.5)) # compute average lda score for both groups: lda.1 <- sample.mean1 %*% a lda.1 lda.2 <- sample.mean2 %*% a lda.2 cutoff <- (lda.1 + lda.2)/2 cutoff # rule: classify observation to Type 1 if lda score is larger than # -30.46349, and classify it to Type 2 otherwise # compute scores for newdata: newdata %*% a # conclusion: # classify the first observation to Type 1 # classify the second observation to Type 2 ####################################################### # Classifying new observations using the MASS library # ####################################################### # put 'newdata' in a data frame: dimnames(newdata) <- list(NULL, c("Length","Breadth","Height", "Fheight","Fbreadth")) newdata.frame <- data.frame(newdata) # use function 'predict': pred <- predict(dis, newdata=newdata.frame) pred # predicted class memberships: pred$class ############################################### # Assess performance of the prediction method # ############################################### # predict group membership of each observation using the discriminant # function that we found: group <- predict(dis, method="plug-in")$class group # compare to true group membership: Type # tabulate the results: table(group, Type) # misclassification rate: (3+3)/n # this method is too optimistic. why?? # better method: leave-one-out predictions <- array(NA, n) for (i in 1:n){ dat <- Tibet[-i,] dis <- lda(Type ~ Length+Breadth+Height+Fheight+Fbreadth, data=dat, prior=c(0.5,0.5)) predictions[i] <- predict(dis, newdata=Tibet[i,c(1:5)])$class } # results: predictions # compare to true group memberships: Type # tabulate results: table(predictions, Type) # misclassification rate: (6+5)/n