##############
## Regression with Random Forest
##############
library(randomForest)
library(MVA)
?USairpollution

fit.rf <- randomForest(SO2 ~ ., data = USairpollution)
fit.rf

##############
## Classification with Random Forest
##############
library(MASS)
?fgl

## Fit RF with default settings
fit.rf <- randomForest(type ~ ., data = fgl)
fit.rf

## Plot error vs. number of trees
plot(fit.rf)
## around 100 trees is clearly enough

## refit with 100 trees
system.time(fit.rf <- randomForest(type ~ ., data = fgl, ntree = 100))

## predict class labels on new observations
## (for simplicity, I take some old observations)
idx <- c(1, 73, 150, 169, 181, 205)
datNew <- fgl[idx, ]
predict(fit.rf, newdata = datNew)  ## predicted labels
fgl$type[idx]                      ## true labels for comparison

## compare with a single classification tree
## (leave-one-out CV for both methods)
library(rpart)  ## needed for rpart() below
set.seed(123)
nreps <- nrow(fgl)
resRF <- resT <- rep(NA, nreps)
for (i in 1:nreps) {
  cat("i =", i, "\n")
  dTrain <- fgl[-i, ]
  dTest  <- fgl[i, ]
  rf.fit <- randomForest(type ~ ., data = dTrain, ntree = 100)
  t.fit  <- rpart(type ~ ., data = dTrain)
  resRF[i] <- predict(rf.fit, newdata = dTest) != dTest$type
  resT[i]  <- predict(t.fit, newdata = dTest, type = "class") != dTest$type
}
mean(resRF)  ## misclassification rate RF:   20%
mean(resT)   ## misclassification rate tree: 28%

## Compare with LDA (also leave-one-out CV)
lda.fit <- lda(type ~ ., data = fgl, CV = TRUE)
mean(lda.fit$class != fgl$type)  ## 35%

## Variable importance
rf.fgl <- randomForest(type ~ ., data = fgl, importance = TRUE)
varImpPlot(rf.fgl, n.var = ncol(fgl) - 1)
rf.fgl

## forest on three important predictors only ...
rf.good <- randomForest(type ~ Mg + RI + Al, data = fgl)
rf.good
## ... vs. a forest on three unimportant predictors
rf.bad <- randomForest(type ~ Fe + Si + K, data = fgl)
rf.bad
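
##############
## Extra: variable importance for the regression forest
##############
## A minimal sketch (not part of the original script): the regression
## example above never looks at importance. Refitting on USairpollution
## with importance = TRUE gives %IncMSE and IncNodePurity per predictor,
## analogous to the classification case. fit.rf.reg is a new name chosen
## here to avoid clobbering the classification fit.rf.
fit.rf.reg <- randomForest(SO2 ~ ., data = USairpollution,
                           importance = TRUE)
importance(fit.rf.reg)   ## %IncMSE and IncNodePurity per predictor
varImpPlot(fit.rf.reg)   ## same information as a dot chart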
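
##############
## Extra: OOB error as a cheap alternative to leave-one-out CV
##############
## Sketch: each tree is grown on a bootstrap sample, so the observations
## it leaves out ("out-of-bag") provide an almost free error estimate,
## stored row-by-row in fit.rf$err.rate. The last row is the OOB error
## after all trees; it should be close to the ~20% LOO estimate above.
fit.rf$err.rate[fit.rf$ntree, "OOB"]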
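
##############
## Extra: tuning mtry with tuneRF
##############
## Sketch: mtry (the number of predictors tried at each split) is the
## main tuning parameter of a random forest. tuneRF() from the
## randomForest package searches over mtry starting from the default,
## scaling by stepFactor and keeping steps that improve the OOB error.
set.seed(123)
tuneRF(x = fgl[, names(fgl) != "type"], y = fgl$type,
       ntreeTry = 100, stepFactor = 2, improve = 0.05)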
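
##############
## Extra: partial dependence for an important predictor
##############
## Sketch: partialPlot() shows the marginal effect of one predictor on
## the prediction for a chosen class. Mg ranked highly in the importance
## plot above; "WinF" is one of the glass types in fgl and is picked here
## purely for illustration.
partialPlot(rf.fgl, pred.data = fgl, x.var = Mg, which.class = "WinF",
            main = "Partial dependence of class WinF on Mg")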