## Demo script: data I/O, revision of multivariate methods, PCA and LDA.
## NOTE: hard-coded Windows paths are kept from the original lecture setup.

## Data input/output ------------------------------------------------------
setwd("C:/Users/kalischm/ETH/teaching/12/ams/vorlesung/v7.2/")

dat1 <- matrix(rnorm(6), 3, 2)
dat2 <- matrix(rnorm(4), 2, 2)

## Create .csv files
write.csv(dat1, file = "data1.csv")
write.csv(dat2, file = "data2.csv")

## Create an .rda file holding both objects
save(dat1, dat2, file = "data.rda")
## Convenient alternative if you have many variables:
## save.image(file = "data.rda")  # saves ALL workspace variables

ls()
rm(list = ls())   # clear the workspace to demonstrate reloading
ls()
load("data.rda")  # restores dat1 and dat2 under their original names
ls()

rm(list = ls())
ls()
datNew1 <- read.csv("data1.csv")
datNew2 <- read.csv("data2.csv")
ls()

############
## Revision
############

## Cov, Cor, Mahalanobis --------------------------------------------------
setwd("C:/Users/kalischm/ETH/teaching/12/ams/vorlesung/v1.1/")
x <- read.csv(file = "USairpollution.csv", header = TRUE, row.names = 1)
mu <- colMeans(x)
cm <- cov(x)
cor(x)
?mahalanobis
mahalanobis(x[1, ], center = mu, cov = cm)  # distance of the first observation
mahalanobis(x, center = mu, cov = cm)       # distances of all observations

## Stars plot
## FIX: the argument is 'draw.segments' (plural); 'draw.segment' only
## worked through R's partial argument matching.
stars(x, key.loc = c(15, 1.5), flip.labels = FALSE, draw.segments = TRUE)

## Mosaic plot with shading
library(vcd)
df <- read.csv2("C:/Users/kalischm/ETH/teaching/12/ams/vorlesung/v2.1/marriage.csv",
                header = TRUE)
df
tab <- xtabs(Freq ~ ., data = df)  # weighted contingency table
tab
## NOTE(review): this structable() call counts data-frame rows and ignores
## the Freq column; structable(Freq ~ Edu + Mar, data = df) would match the
## xtabs() above — confirm which is intended.
structable(~ Edu + Mar, data = df)
mosaic(tab, shade = TRUE)

## Finding outliers -------------------------------------------------------
setwd("C:/Users/kalischm/ETH/teaching/12/ams/vorlesung/v2.2/")
load(file = "simpleExample.rda")  # provides 'dat'
library(mvoutlier)
par(mfrow = c(1, 2))
plot(dat, xlim = c(-5, 5), ylim = c(-5, 5))
chisq.plot(dat)  # chi-square Q-Q plot for multivariate outlier detection

## Multiple Imputation ----------------------------------------------------
library(mice)
## Overview of the missingness pattern
md.pattern(boys)
## Impute missing values, fit lm on each completed data set, then pool
imp <- mice(boys, m = 5, seed = 43)
fmMI <- with(imp, lm(hc ~ age + hgt + wgt))
poolRes <- pool(fmMI)
round(summary(poolRes), 2)

## Non-metric MDS on Gower dissimilarity ----------------------------------
library(cluster)
library(MASS)  # FIX: needed for isoMDS() here and lda() below; was missing
?flower
flower
## Variable types: A, S, A, N, O, O, I, I
d2 <- daisy(flower, metric = "gower",
            type = list(asymm = c(1, 3), symm = 2))
d2
cmd2 <- isoMDS(d2)
plot(cmd2$points[, 1], -cmd2$points[, 2], type = "n")
text(cmd2$points[, 1], -cmd2$points[, 2], labels = rownames(flower))

## PCA ---------------------------------------------------------------------
load("C:/Users/kalischm/ETH/teaching/12/ams/vorlesung/v7.2/pca.rda")
## Transformed heptathlon data
heptathlon_pca <- princomp(dat2, cor = TRUE)
summary(heptathlon_pca, loadings = TRUE)
plot(heptathlon_pca)  # "scree plot"
heptathlon_pca$scores[, 1]    # a one-dimensional "index"
heptathlon_pca$scores[, 1:3]  # keep the 3 dimensions explaining most variation
biplot(heptathlon_pca)

## LDA ---------------------------------------------------------------------
?iris
lda.fit <- lda(Species ~ ., data = iris)  # lda() comes from MASS
lda.fit
ld.pred <- predict(lda.fit, dimen = 2)
plot(ld.pred$x, asp = 1, col = as.numeric(iris$Species),
     xlab = "LD 1", ylab = "LD 2")

## Leave-one-out cross-validated class predictions.
## FIX: 'CV = TRUE' is an argument of lda(), NOT of predict.lda();
## predict(lda.fit, CV = TRUE) silently ignored it and returned the
## over-optimistic resubstitution predictions.
lda.cv <- lda(Species ~ ., data = iris, CV = TRUE)
df <- data.frame(est = lda.cv$class, truth = iris$Species)
tab <- table(df)  # confusion matrix
tab
1 - sum(diag(tab)) / nrow(iris)  # LOO-CV error rate (~2%) -> very good predictions