## Script to explore properties of decision trees in 2D
## Author : Sylvain Robert
## FS 2014, 16.05.2014
## FS 2016, 20.05.2016 updated by Sylvain

## Content:
## 0. load the data
## 1. training
## 2. pruning
## 3. variance of trees
## 4. bagging

###################################################################
## 0. load the data ###############################################

.pardefault <- par(no.readonly = TRUE) ## to reset to default later on

## load the data:
banana <- read.table(file='banana.dat', sep=',')

## sample a training set
set.seed(11)
banana <- banana[sample.int(nrow(banana), 200), ]

## format:
colnames(banana) <- c('X1', 'X2', 'Y')
banana[,'Y'] <- as.factor(banana[,'Y'])
y.colors <- c('pink', 'palegreen3')[banana$Y]
y.symbol <- c(2,3)[banana$Y]

## plot:
xlims <- c(-3,3); ylims <- c(-2.5,2.5)
plot(X2~X1, col=y.colors, data=banana, pch=y.symbol,
     xlim=xlims, ylim=ylims)

###################################################################
## 1. training ####################################################

## Overfit a tree:
## the cost-complexity parameter cp = 0.0 means that we do not
## penalize the complexity of the tree at all.
## minsplit = 10 means that we do not split nodes with fewer than 10
## points (try minsplit = 1 and see how big the tree gets).
require(rpart)
tree <- rpart(Y ~ ., data = banana,
              control = rpart.control(cp = 0.0, minsplit = 10))

## nice plot:
require(rpart.plot)
prp(tree, extra=1, type=1,
    box.col=c('pink', 'palegreen3')[tree$frame$yval])

## Interpretation:
## go down the tree
## X2 >= 0.59: go left if yes, right if no
## in each node: -1  = majority class of the node
##               112 = number of points from class -1
##               88  = number of points from class +1

## function to plot the 2D partitioning:
## hackish solution similar to the use of contours for LDA/QDA:
## predict on a grid, plot with image()
tree.2d <- function(tree, data=banana, ...){
  x1 <- seq(xlims[1], xlims[2], length.out=200)
  x2 <- seq(ylims[1], ylims[2], length.out=200)
  newdat <- expand.grid(X1=x1, X2=x2)
  y.pred <- predict(tree, newdata=newdat, type='class')
  z <- matrix(as.numeric(y.pred), nrow=length(x1), ncol=length(x2))
  image(x1, x2, z, col=c('pink', 'palegreen3'), xlab=NA, ylab=NA, ...)
  ## overlay the data:
  y.colors <- c('pink', 'palegreen3')[data$Y]
  y.symbol <- c(2,3)[data$Y]
  y.symbol.full <- c(16,17)[data$Y]
  points(X2~X1, data=data, col=y.colors, pch=y.symbol.full)
  points(X2~X1, data=data, pch=y.symbol)
}

## partitioning for the overfitted tree:
tree.2d(tree, axes=FALSE, main='non-pruned tree')

###################################################################
## 2. PRUNING #####################################################

## explore trees of decreasing complexity (== increasing cp)
complexities <- sort(unique(tree$frame$complexity))
for (i in seq_along(complexities)){
  complexity <- complexities[i]
  ## plot the tree, with the branches that would be pruned shown in gray
  cols <- ifelse(tree$frame$complexity > complexity, 1, "darkgray")
  par(mfrow=c(2,1), mar=c(.5,0,1,0))
  prp(tree, col=cols, branch.col=cols, split.col=cols,
      main=paste('CP =', round(complexity,3)),
      box.col=c('pink', 'palegreen3')[tree$frame$yval])
  p.tree <- prune.rpart(tree, cp=complexity)
  tree.2d(p.tree, axes=FALSE)
  devAskNewPage(TRUE)
}
devAskNewPage(FALSE)

## Select the optimal tree:
par(.pardefault)

### 1 std-error rule:
plotcp(tree)
abline(v=7, col='red')
## choose tree size = 8 ==> 7 splits
## if not clear enough, check more precisely with the table
printcp(tree)

## optimal cp:
opt.cp <- tree$cptable[7, 'CP']
## check: 1. find the minimal xerror
##        2. add xstd to the minimal xerror => xerror limit
##        3. take the simplest tree (largest CP) with xerror below this limit
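
## A small sketch of the same three steps done programmatically; it should
## give the cp chosen manually above (the identical logic is used again,
## automatically, in section 3 below):
cptable <- tree$cptable
min.ind <- which.min(cptable[, "xerror"])
xerr.lim <- cptable[min.ind, "xerror"] + cptable[min.ind, "xstd"]
cp.check <- cptable[cptable[, "xerror"] < xerr.lim, "CP"][1]
c(manual = opt.cp, automatic = cp.check)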
## Prune the tree:
opt.tree <- prune.rpart(tree, cp=opt.cp)

## plot it:
prp(opt.tree, extra=1, type=1,
    box.col=c('pink', 'palegreen3')[opt.tree$frame$yval])
tree.2d(opt.tree, axes=FALSE)

###################################################################
## 3. Large variance of trees #####################################

## load the full dataset
banana.full <- read.table(file='banana.dat', sep=',')
colnames(banana.full) <- c('X1', 'X2', 'Y')
banana.full[,'Y'] <- as.factor(banana.full[,'Y'])
y.full.colors <- c('pink', 'palegreen3')[banana.full$Y]
y.full.symbol <- c(2,3)[banana.full$Y]

## Repeat this as many times as you want:
## observe how different the fitted trees are: high variance!
## If you increase n, they change less (of course).
n <- 100 # 800
resample <- 'yes'
par(.pardefault)
while(! resample %in% c('n','no')){
  ## sample the data:
  banana <- banana.full[sample.int(nrow(banana.full), n), ]
  ## fit a tree:
  tree <- rpart(Y ~ ., data = banana,
                control = rpart.control(cp = 0.0, minsplit = 10))
  ## automatic 1 std-error rule:
  cptable <- tree$cptable
  min.ind <- which.min(cptable[,"xerror"])
  min.lim <- cptable[min.ind, "xerror"] + cptable[min.ind, "xstd"]
  cp.opt <- cptable[(cptable[,"xerror"] < min.lim), "CP"][1]
  ## pruning:
  opt.tree <- prune.rpart(tree, cp=cp.opt)
  ## plotting:
  par(mfrow=c(2,1), mar=c(0.5,0,1,0))
  prp(opt.tree, extra=1, type=1,
      main=paste('CP =', round(cp.opt,3)),
      box.col=c('pink', 'palegreen3')[opt.tree$frame$yval])
  tree.2d(opt.tree, axes=FALSE, xlim=c(-3,3), ylim=c(-3,3))
  print('Do you want to continue: Y/n')
  resample <- readline()
}

###################################################################
## 4. BAGGING of trees ############################################

## Trees have a lot of variance, as we just saw.
## A natural idea is to take the average decision over many trees,
## each trained on a different bootstrap sample.
## Pushing this idea a tiny bit further would lead to random forests,
## but that will be for another time!

## function to train one tree on the subsample of the data determined
## by ind.training and return the predicted probabilities for each
## class on a test set:
one.tree <- function(data, ind.training, x.test=newdat){
  tree <- rpart(Y ~ ., data = data[ind.training, ],
                control = rpart.control(cp=0, minsplit = 1))
  ## choose the optimal cp according to the 1-std-error rule:
  min.ind <- which.min(tree$cptable[,"xerror"])
  min.lim <- tree$cptable[min.ind, "xerror"] + tree$cptable[min.ind, "xstd"]
  cp.opt <- tree$cptable[(tree$cptable[,"xerror"] < min.lim), "CP"][1]
  tree.sample <- prune.rpart(tree, cp=cp.opt)
  predict(tree.sample, newdata=x.test, type='prob')
}

## grid:
x1 <- seq(-2.5, 2.5, length.out=100)
x2 <- seq(-2.5, 2.5, length.out=100)
newdat <- expand.grid(X1=x1, X2=x2)

## choose a particular dataset:
banana <- banana.full[sample.int(nrow(banana.full), 200), ]
## format:
colnames(banana) <- c('X1', 'X2', 'Y')
banana[,'Y'] <- as.factor(banana[,'Y'])

## Train B trees on B different bootstrap samples:
## try different B
B <- 100 # 10, 100, 500, 1000
n <- 200
many.tree <- replicate(B, one.tree(banana,
                                   sample.int(nrow(banana), n, replace = TRUE)))

## average over the B repetitions:
mean.tree <- apply(many.tree, c(1,2), mean)
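
## A minimal sketch (not needed for the plots below) to put a number on the
## benefit of averaging: compare the misclassification error of a single tree
## with that of the bagged ensemble. Assumption: banana.full serves as an
## approximate test set; the 200 training points are contained in it, so the
## estimate is slightly optimistic. Note that this refits B trees.
lev <- levels(banana.full$Y)
prob.single <- one.tree(banana, seq_len(nrow(banana)), x.test = banana.full)
prob.bag <- apply(replicate(B, one.tree(banana,
                                        sample.int(nrow(banana), n, replace = TRUE),
                                        x.test = banana.full)),
                  c(1,2), mean)
err.single <- mean(ifelse(prob.single[,1] > 0.5, lev[1], lev[2]) != banana.full$Y)
err.bag    <- mean(ifelse(prob.bag[,1]    > 0.5, lev[1], lev[2]) != banana.full$Y)
c(single = err.single, bagged = err.bag)  ## the bagged error is typically lower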
## plot the majority class:
## Observe how the partitioning becomes more and more 'curvy' and
## follows the 'banana' shape of the data better and better!
par(.pardefault)
y.pred <- ifelse(mean.tree[,1] > 0.5, 1, 2)
z <- matrix(as.numeric(y.pred), nrow=length(x1), ncol=length(x2))
image(x1, x2, z, col=c('pink', 'palegreen3'),
      main=paste('Bag of trees of size =', B))

## plot the probability (intensity of the color = probability of this class)
rbPal <- colorRampPalette(c('palegreen3','pink'))
## discretized color coding of the probabilities (not used in the plot below):
y.prob <- rbPal(10)[as.numeric(cut(mean.tree[,1], breaks = 10))]
z.prob <- matrix(as.numeric(mean.tree[,1]), nrow=length(x1), ncol=length(x2))
image(x1, x2, z.prob, col=rbPal(20),
      main=paste('Bag of trees of size =', B))

## overlay with a plot of the original data:
y.colors <- c('pink', 'palegreen3')[banana$Y]
y.symbol <- c(2,3)[banana$Y]
y.symbol.full <- c(16,17)[banana$Y]
points(X2~X1, data=banana, col=y.colors, pch=y.symbol.full)
points(X2~X1, data=banana, pch=y.symbol)

## more errors in the zones of lower probability...
## low probability == more uncertainty
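
## A minimal sketch to back up this last remark. Assumption: we simply
## re-predict the bagged probabilities at the 200 training points (refitting
## B trees once more) and compare the ensemble's confidence on correctly and
## incorrectly classified points.
prob.train <- apply(replicate(B, one.tree(banana,
                                          sample.int(nrow(banana), n, replace = TRUE),
                                          x.test = banana)),
                    c(1,2), mean)
## probability assigned to the predicted class = 'confidence' of the ensemble:
conf.train <- pmax(prob.train[,1], prob.train[,2])
pred.train <- ifelse(prob.train[,1] > 0.5,
                     levels(banana$Y)[1], levels(banana$Y)[2])
## mean confidence for misclassified (FALSE) vs. correctly classified (TRUE)
## points -- the errors should sit in the less confident regions:
tapply(conf.train, pred.train == banana$Y, mean)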