## Series 6 (Applied Multivariate Statistics) #####################################################################

# Exercise 1 ############

# a) Load the package MASS. Have a look at the data set shuttle.
library(MASS)
?shuttle

# b) For fitting and visualizing a tree we will need the functions rpart,
#    plot, text and print in the package rpart. Have a look at the
#    corresponding help files (?plot.rpart, ?text.rpart, etc.).
?rpart
?plot.rpart
?text.rpart
?print.rpart

# c) Train a tree using rpart. Plot the result. Also look at a text
#    representation of the tree using the print function and compare it
#    with the plot.
library(rpart)
sh.fit <- rpart(use ~ ., data = shuttle, method = "class")
plot(sh.fit, margin = 0.2, uniform = TRUE)
text(sh.fit, use.n = TRUE)
print(sh.fit)  ## text representation
# The plot and the text representation describe exactly the same tree.

# d) How many cases are misclassified?
table(predict(sh.fit, type = "class"), shuttle$use)
# The off-diagonal entries of this confusion table are the misclassified cases.

# e) Should the autolander be used in a situation with vis = yes and
#    error = MM? Solve by looking at the plot and at the text representation.
# No, the autolander should not be used.

# f) Thus, create a tree that perfectly describes the opinion of the experts
#    (i.e., don't use pruning). Create a plot and a text representation of
#    the resulting tree.
sh.max <- rpart(use ~ ., data = shuttle, method = "class",
                minsplit = 1, cp = 0)
plot(sh.max, margin = 0.2, uniform = TRUE)
text(sh.max, use.n = TRUE)
print(sh.max)  ## text representation

# g) Create a postscript file of the tree you found.
post(sh.max, file = "shuttleTree.ps",
     title = "Guideline for Space Shuttle Autolander")

# Exercise 2 ############

# a) Load package MASS and read the help file of the data set.
library(MASS)
?fgl

# b) Fit a tree with the default settings. Plot it and look at the text
#    representation. Load the package rpart first.
library(rpart)
fgl.fit <- rpart(type ~ ., data = fgl)
plot(fgl.fit, margin = 0.2, uniform = TRUE)
text(fgl.fit, use.n = TRUE, cex = 0.7)
print(fgl.fit)

# c) Fit a tree that fits the 214 analyses perfectly and plot it.
fgl.fit <- rpart(type ~ ., data = fgl, minsplit = 1, cp = 0)
plot(fgl.fit, margin = 0.2, uniform = TRUE)
text(fgl.fit, use.n = TRUE, cex = 0.5)

# d) Investigate the cross-validation error in relation to the complexity
#    parameter.
printcp(fgl.fit)  # table of cp vs. cross-validated error (xerror)
plotcp(fgl.fit)   # the same information as a plot

# e) Choose a complexity parameter that seems reasonable to you and prune
#    the model accordingly.
fgl.fit2 <- prune(fgl.fit, cp = 0.027)
plot(fgl.fit2, margin = 0.2, uniform = TRUE)
text(fgl.fit2, use.n = TRUE, cex = 0.7)

# Exercise 3 ############

# a) Have a look at the data set "spam" in the package "ElemStatLearn".
library(ElemStatLearn)
?spam
head(spam)

# b) Fit a random forest (library(randomForest)) with the default settings.
library(randomForest)
set.seed(123)
system.time(rf.spam <- randomForest(spam ~ ., data = spam))  # about 17 sec
rf.spam

# c) Plot the error rate vs. the number of fitted trees. How many trees are
#    necessary? Refit the model with the chosen number of trees. How long
#    does it take now?
plot(rf.spam)  # the error rate flattens out; about 100 trees are enough
set.seed(123)
system.time(rf.spam <- randomForest(spam ~ ., data = spam,
                                    ntree = 100))  # about 3 sec
rf.spam

# d) Have a look at the output. What error rate do you expect for new
#    predictions (OOB error rate)? What is the error rate in the
#    'spam' class?
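# The numbers asked for in d) can also be read off the fitted object
# directly; a small check using the 'confusion' and 'err.rate' components
# of a randomForest classification fit:
rf.spam$confusion            # last column: class-wise error rates
tail(rf.spam$err.rate, 1)    # OOB and class-wise error after the last tree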
# OOB error rate: about 4.6%; error rate in the 'spam' class: about 7%.

# e) Suppose we get a new email and want to predict the spam label. For
#    simplicity, we refit the random forest on 2601 randomly chosen emails
#    and keep the remaining 2000 emails as test set. How does the OOB error
#    compare with the error on the test set? (use ntree = 100 and
#    set.seed(123))
set.seed(123)
idx <- sample(1:nrow(spam), 2601)
dTrain <- spam[idx, ]
dTest  <- spam[-idx, ]
rf.train <- randomForest(spam ~ ., data = dTrain, ntree = 100)
rf.train  ## OOB error: 5.04%
pred.rf <- predict(rf.train, newdata = dTest)
mean(pred.rf != dTest$spam)  ## test error: 5.4%
# I.e., the OOB error is a very good approximation of the test error.

# f) Suppose we don't want to compute all variables for each new incoming
#    mail, but only use the best 5. Which 5 variables should we choose?
#    Compare the OOB error using all variables, the best 5 and the worst 5
#    (according to the decrease in accuracy; use ntree = 100 and seed 123).
set.seed(123)
rf.spam <- randomForest(spam ~ ., data = spam, importance = TRUE,
                        ntree = 100)
tmp <- varImpPlot(rf.spam, n.var = ncol(spam) - 1)
rf.spam  ## OOB error: 4.5%
# The 5 most and least important variables can be read off the importance
# plot: the top variables have more influence than the bottom ones. Note
# that, depending on your operating system, you may get slightly different
# variables.
rf.best <- randomForest(spam ~ A.52 + A.55 + A.7 + A.56 + A.53,
                        data = spam, ntree = 100)
rf.best
rf.worst <- randomForest(spam ~ A.38 + A.47 + A.54 + A.48 + A.34,
                         data = spam, ntree = 100)
rf.worst
# I.e., variable importance indeed selects the variables that are important
# for prediction.

# Exercise 4 ############

# a) Load the data on forensic glass again.
library(MASS)
?fgl

# b) Fit a tree using the default settings of rpart.
library(rpart)
fgl.tree <- rpart(type ~ ., data = fgl)

# c) Fit a random forest using the default settings of the function
#    'randomForest' in package 'randomForest'.
library(randomForest)
set.seed(99)
fgl.rf <- randomForest(type ~ ., data = fgl)

# e) Perform a leave-one-out cross-validation.
set.seed(83)
nreps <- nrow(fgl)
resRF <- resT <- rep(NA, nreps)
for (i in 1:nreps) {
  cat("i =", i, "\n")
  dTrain <- fgl[-i, ]
  dTest  <- fgl[i, ]
  rf.fit <- randomForest(type ~ ., data = dTrain)
  t.fit  <- rpart(type ~ ., data = dTrain)
  resRF[i] <- predict(rf.fit, newdata = dTest) != dTest$type
  resT[i]  <- predict(t.fit, newdata = dTest, type = "class") != dTest$type
}
# Calculate the misclassification rate for both methods.
mean(resRF)  ## misclassification rate RF:   19%
mean(resT)   ## misclassification rate tree: 28%

# g) Perform an LDA.
lda.fit <- lda(type ~ ., data = fgl, CV = TRUE)
summary(lda.fit)  # with CV = TRUE, the LOO predictions are in lda.fit$class
mean(lda.fit$class != fgl$type)  ## misclassification rate LDA: 35%
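# A compact side-by-side view of the three leave-one-out estimates computed
# above (a small sketch; the exact values depend on the seeds used):
round(c(RF   = mean(resRF),
        Tree = mean(resT),
        LDA  = mean(lda.fit$class != fgl$type)), 3)
# The random forest clearly outperforms both the single tree and LDA here.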