##############
## Regression with Random Forest
##############
library(randomForest)
library(MVA)
?USairpollution

fit.rf <- randomForest(SO2 ~ ., data = USairpollution)
fit.rf

##############
## Classification with Random Forest
##############
library(MASS)
?fgl

## Fit RF with default settings
fit.rf <- randomForest(type ~ ., data = fgl)
fit.rf

## Plot error vs. number of trees
plot(fit.rf)
## around 100 trees is clearly enough

## refit with 100 trees
system.time(fit.rf <- randomForest(type ~ ., data = fgl, ntree = 100))

## predict class labels on new observations
## (for simplicity, I take some old observations)
idx <- c(1, 73, 150, 169, 181, 205)
datNew <- fgl[idx, ]
predict(fit.rf, newdata = datNew)  ## predicted labels
fgl$type[idx]                      ## true labels for comparison

## compare with a single classification tree
## (leave-one-out CV for both methods)
library(rpart)  ## needed for rpart() below
set.seed(123)
nreps <- nrow(fgl)
resRF <- resT <- rep(NA, nreps)
for (i in 1:nreps) {
  cat("i =", i, "\n")
  dTrain <- fgl[-i, ]
  dTest  <- fgl[i, ]
  rf.fit <- randomForest(type ~ ., data = dTrain, ntree = 100)
  t.fit  <- rpart(type ~ ., data = dTrain)
  resRF[i] <- predict(rf.fit, newdata = dTest) != dTest$type
  resT[i]  <- predict(t.fit, newdata = dTest, type = "class") != dTest$type
}
mean(resRF)  ## misclassification rate RF:   20%
mean(resT)   ## misclassification rate tree: 28%

## Compare with LDA (also leave-one-out CV)
lda.fit <- lda(type ~ ., data = fgl, CV = TRUE)
mean(lda.fit$class != fgl$type)  ## 35%

## Variable importance
rf.fgl <- randomForest(type ~ ., data = fgl, importance = TRUE)
varImpPlot(rf.fgl, n.var = ncol(fgl) - 1)
rf.fgl

## forest on three important predictors only ...
rf.good <- randomForest(type ~ Mg + RI + Al, data = fgl)
rf.good
## ... vs. a forest on three unimportant predictors
rf.bad <- randomForest(type ~ Fe + Si + K, data = fgl)
rf.bad
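
##############
## Extra: variable importance for the regression forest
##############
## A minimal sketch (not part of the original script): the regression
## example above never looks at importance. Refitting on USairpollution
## with importance = TRUE gives %IncMSE and IncNodePurity per predictor,
## analogous to the classification case. fit.rf.reg is a new name chosen
## here to avoid clobbering the classification fit.rf.
fit.rf.reg <- randomForest(SO2 ~ ., data = USairpollution,
                           importance = TRUE)
importance(fit.rf.reg)   ## %IncMSE and IncNodePurity per predictor
varImpPlot(fit.rf.reg)   ## same information as a dot chart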
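
##############
## Extra: OOB error as a cheap alternative to leave-one-out CV
##############
## Sketch: each tree is grown on a bootstrap sample, so the observations
## it leaves out ("out-of-bag") provide an almost free error estimate,
## stored row-by-row in fit.rf$err.rate. The last row is the OOB error
## after all trees; it should be close to the ~20% LOO estimate above.
fit.rf$err.rate[fit.rf$ntree, "OOB"]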
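
##############
## Extra: tuning mtry with tuneRF
##############
## Sketch: mtry (the number of predictors tried at each split) is the
## main tuning parameter of a random forest. tuneRF() from the
## randomForest package searches over mtry starting from the default,
## scaling by stepFactor and keeping steps that improve the OOB error.
set.seed(123)
tuneRF(x = fgl[, names(fgl) != "type"], y = fgl$type,
       ntreeTry = 100, stepFactor = 2, improve = 0.05)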
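
##############
## Extra: partial dependence for an important predictor
##############
## Sketch: partialPlot() shows the marginal effect of one predictor on
## the prediction for a chosen class. Mg ranked highly in the importance
## plot above; "WinF" is one of the glass types in fgl and is picked here
## purely for illustration.
partialPlot(rf.fgl, pred.data = fgl, x.var = Mg, which.class = "WinF",
            main = "Partial dependence of class WinF on Mg")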