## Series 2 (Applied Multivariate Statistics) ##################################################################### # Exercise 1 ############ library(MASS) library(mice) library(missForest) # load data url <- "http://stat.ethz.ch/Teaching/Datasets/WBL/" dat <- read.csv(paste(url, "USairpollutionNA.csv", sep=""), row.names = 1) md.pattern(dat) # impute the dataset using missForest set.seed(123) imp <- missForest(dat) head(imp$ximp) # get the imputation error imp$OOBerror # draw stars plot stars(imp$ximp, draw.segments = TRUE, key.loc = c(20,1)) datOrig <- read.csv(paste(url, "USairpollution.csv", sep=""), row.names = 1, header = TRUE) stars(datOrig, draw.segments = TRUE, key.loc = c(20,1)) # Exercise 2 ############ library(mice) library(missForest) # declare settings n <- 50 nreps <- 100 mu <- c(125, 125) sigma <- matrix( c(625, 375, 375, 625), 2,2) # initialize matrices resMean <- matrix(NA, nreps, 3) colnames(resMean) <- c("MCAR", "MAR", "MNAR") resSd <- matrix(NA, nreps, 3) colnames(resSd) <- c("MCAR", "MAR", "MNAR") # compute results for (i in 1:nreps) { ## Simulate the data: dat <- mvrnorm(n, mu, sigma) ## MCAR mis <- which(runif(n) < 0.73) datMCAR <- dat datMCAR[mis,2] <- NA resMean[i, 1] <- mean(datMCAR[,2], na.rm = TRUE) resSd[i, 1] <- sd(datMCAR[,2], na.rm = TRUE) ## MAR mis <- which(dat[,1] <= 140) datMAR <- dat datMAR[mis,2] <- NA resMean[i, 2] <- mean(datMAR[,2], na.rm = TRUE) resSd[i, 2] <- sd(datMAR[,2], na.rm = TRUE) ## MNAR ## mis <- which(dat[,2] <= 140) datMNAR <- dat datMNAR[mis,2] <- NA resMean[i, 3] <- mean(datMNAR[,2], na.rm = TRUE) resSd[i, 3] <- sd(datMNAR[,2], na.rm = TRUE) } colMeans(resMean) colMeans(resSd) # Exercise 3 ############ # load data library(MVA) # you might have a different path for the data !!! load("/u/sfs/datasets/WBL/WWIIleaders.rda") WWIIleaders # visualize similarities using non-metrical MDS set.seed(123) WWIImds <- isoMDS(WWIIleaders) plot(WWIImds$points, type = "n") text(WWIImds$points, labels = attr(WWIIleaders, "Labels")) WWIImds$stress # draw Shepard plot sh <- Shepard(WWIIleaders, WWIImds$points) plot(sh$x, sh$y) abline(0,1,lty='dashed') # Exercise 4 ############ # load data library(cluster) library(MASS) t.url <- "http://stat.ethz.ch/Teaching/Datasets/WBL/ch-dist.dat" chdist <- read.table(t.url, header=T) m.chdist <- as.matrix(chdist) t.dist0 <- c(m.chdist) # use metric MDS koord <- cmdscale(chdist) t.dist1 <- c(as.matrix(dist(koord))) # plot results plot(-koord, col="red", asp=1) text(-koord, labels=rownames(koord), pos=1) p.p1 <- plot(t.dist0~t.dist1) abline(0,1,lty=3) # use non-metric MDS koord2 <- isoMDS(m.chdist) t.dist2 <- c(as.matrix(dist(koord2$points))) # plot results plot(-koord2$points) text(-koord2$points, labels=rownames(koord2$points), pos=1) p.p1 <- plot(t.dist0~t.dist2) abline(0,1,lty=3)