############ ## Case Study: Hepthatlon ############ library(MVA) ?heptathlon dim(heptathlon) head(heptathlon) ## Explore... stars(heptathlon[,-8], draw.segments = TRUE, key.loc = c(-0.7,5)) ## Rescale, so that "large value = good performance" heptathlon$hurdles <- with(heptathlon, max(hurdles)-hurdles) heptathlon$run200m <- with(heptathlon, max(run200m)-run200m) heptathlon$run800m <- with(heptathlon, max(run800m)-run800m) dat <- heptathlon[,-8] stars(dat, draw.segments = TRUE, key.loc = c(-0.7,5)) library(mvoutlier) chisq.plot(dat) dat[25,] ## outlier: I decide to leave her out, but need to report this and the reasons I had dat2 <- dat[-25,] heptathlon2 <- heptathlon[-25,] cor(dat2) pairs(dat2) ## long jump - hurdles / javelin - run800m heptathlon_pca <- princomp(dat2, cor = TRUE) summary(heptathlon_pca, loadings = TRUE) heptathlon_pca$score[,1] plot(heptathlon_pca) ## Goal 1: Dimension reduction - Use up to first e.g. 3 PC's hep_reduced <- heptathlon_pca$scores[,1:3] hep_reduced ## transform new data points into PC coordinates dat2 nd <- dat2[1:2,] ## keep first observation for sanity check, replace second observation (or more) ## with new observations nd[2,] <- c(3.5, 1.7, 15, 4, 7, 43, 33) rownames(nd)[2] <- "Mrs. New (XYZ)" nd predict(heptathlon_pca, newdata = nd) heptathlon_pca$score[1,] ## old observation was predicted OK ## Goal 2: Define a score for clear ranking - Use first PC cor(heptathlon2$score, heptathlon_pca$score[,1]) plot(heptathlon2$score, heptathlon_pca$score[,1]) identify(heptathlon2$score, heptathlon_pca$score[,1], labels = rownames(heptathlon2)) ## Good agreement btw. Olympic scoring rule and first PC; ## only Mrs. Dimitrova (BUL) should be annoyed ... ## Note: The sign of the PC coordinates is arbitrary ## Biplot ?biplot biplot(heptathlon_pca)