## Demo script: data I/O, revision of multivariate methods, PCA and LDA.
## NOTE: hard-coded Windows paths are kept from the original lecture setup.

## Data input/output ------------------------------------------------------
setwd("C:/Users/kalischm/ETH/teaching/12/ams/vorlesung/v7.2/")

dat1 <- matrix(rnorm(6), 3, 2)
dat2 <- matrix(rnorm(4), 2, 2)

## Create .csv files
write.csv(dat1, file = "data1.csv")
write.csv(dat2, file = "data2.csv")

## Create an .rda file holding both objects
save(dat1, dat2, file = "data.rda")
## Convenient alternative if you have many variables:
## save.image(file = "data.rda")  # saves ALL workspace variables

ls()
rm(list = ls())   # clear the workspace to demonstrate reloading
ls()
load("data.rda")  # restores dat1 and dat2 under their original names
ls()

rm(list = ls())
ls()
datNew1 <- read.csv("data1.csv")
datNew2 <- read.csv("data2.csv")
ls()

############
## Revision
############

## Cov, Cor, Mahalanobis --------------------------------------------------
setwd("C:/Users/kalischm/ETH/teaching/12/ams/vorlesung/v1.1/")
x <- read.csv(file = "USairpollution.csv", header = TRUE, row.names = 1)
mu <- colMeans(x)
cm <- cov(x)
cor(x)
?mahalanobis
mahalanobis(x[1, ], center = mu, cov = cm)  # distance of the first observation
mahalanobis(x, center = mu, cov = cm)       # distances of all observations

## Stars plot
## FIX: the argument is 'draw.segments' (plural); 'draw.segment' only
## worked through R's partial argument matching.
stars(x, key.loc = c(15, 1.5), flip.labels = FALSE, draw.segments = TRUE)

## Mosaic plot with shading
library(vcd)
df <- read.csv2("C:/Users/kalischm/ETH/teaching/12/ams/vorlesung/v2.1/marriage.csv",
                header = TRUE)
df
tab <- xtabs(Freq ~ ., data = df)  # weighted contingency table
tab
## NOTE(review): this structable() call counts data-frame rows and ignores
## the Freq column; structable(Freq ~ Edu + Mar, data = df) would match the
## xtabs() above — confirm which is intended.
structable(~ Edu + Mar, data = df)
mosaic(tab, shade = TRUE)

## Finding outliers -------------------------------------------------------
setwd("C:/Users/kalischm/ETH/teaching/12/ams/vorlesung/v2.2/")
load(file = "simpleExample.rda")  # provides 'dat'
library(mvoutlier)
par(mfrow = c(1, 2))
plot(dat, xlim = c(-5, 5), ylim = c(-5, 5))
chisq.plot(dat)  # chi-square Q-Q plot for multivariate outlier detection

## Multiple Imputation ----------------------------------------------------
library(mice)
## Overview of the missingness pattern
md.pattern(boys)
## Impute missing values, fit lm on each completed data set, then pool
imp <- mice(boys, m = 5, seed = 43)
fmMI <- with(imp, lm(hc ~ age + hgt + wgt))
poolRes <- pool(fmMI)
round(summary(poolRes), 2)

## Non-metric MDS on Gower dissimilarity ----------------------------------
library(cluster)
library(MASS)  # FIX: needed for isoMDS() here and lda() below; was missing
?flower
flower
## Variable types: A, S, A, N, O, O, I, I
d2 <- daisy(flower, metric = "gower",
            type = list(asymm = c(1, 3), symm = 2))
d2
cmd2 <- isoMDS(d2)
plot(cmd2$points[, 1], -cmd2$points[, 2], type = "n")
text(cmd2$points[, 1], -cmd2$points[, 2], labels = rownames(flower))

## PCA ---------------------------------------------------------------------
load("C:/Users/kalischm/ETH/teaching/12/ams/vorlesung/v7.2/pca.rda")
## Transformed heptathlon data
heptathlon_pca <- princomp(dat2, cor = TRUE)
summary(heptathlon_pca, loadings = TRUE)
plot(heptathlon_pca)  # "scree plot"
heptathlon_pca$scores[, 1]    # a one-dimensional "index"
heptathlon_pca$scores[, 1:3]  # keep the 3 dimensions explaining most variation
biplot(heptathlon_pca)

## LDA ---------------------------------------------------------------------
?iris
lda.fit <- lda(Species ~ ., data = iris)  # lda() comes from MASS
lda.fit
ld.pred <- predict(lda.fit, dimen = 2)
plot(ld.pred$x, asp = 1, col = as.numeric(iris$Species),
     xlab = "LD 1", ylab = "LD 2")

## Leave-one-out cross-validated class predictions.
## FIX: 'CV = TRUE' is an argument of lda(), NOT of predict.lda();
## predict(lda.fit, CV = TRUE) silently ignored it and returned the
## over-optimistic resubstitution predictions.
lda.cv <- lda(Species ~ ., data = iris, CV = TRUE)
df <- data.frame(est = lda.cv$class, truth = iris$Species)
tab <- table(df)  # confusion matrix
tab
1 - sum(diag(tab)) / nrow(iris)  # LOO-CV error rate (~2%) -> very good predictions