[R] Recursive-SVM (R-SVM)
Nair, Murlidharan T
mnair at iusb.edu
Mon Jan 22 21:50:20 CET 2007
I am trying to implement a simple R-SVM example using the iris data (only two of the classes are used, and the data is included in the code below). I am running into some errors. I am not an expert on SVMs. If anyone has used it, I would appreciate their help. I am appending the code below.
Thanks../Murli
#######################################################
### R-code for R-SVM
### use leave-one-out, N-fold, or bootstrap resampling to permute the data for external CV
### build an SVM model, use the mean-balanced weight to sort genes on the training set,
### and recursively eliminate the least important genes
### author: Dr. Xin Lu, Research Scientist
### Biostatistics Department, Harvard School of Public Health
library(e1071)
## read in SVM-formatted data from filename
## the format follows the definition used by SVMTorch:
## the first line contains 2 integers: nSample nFeature+1,
## followed by a matrix with one row per sample, the last column being the +/- 1 class label
## returns a list with x (feature matrix) and y (class label factor)
ReadSVMdata <- function(filename)
{
dd <- read.table( filename, header=F, skip=1)
x <- as.matrix( dd[, 1:(ncol(dd)-1)] )
y <- factor( dd[, ncol(dd)] )
ret <- list(x=x, y=y)
}
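## --- illustrative sketch (not from the original post): what an SVMTorch-style
## input file for ReadSVMdata() could look like; the temporary file and the
## numbers in it are made up purely for illustration
toyfile <- tempfile(fileext=".txt")
writeLines(c("4 3",          # 4 samples, 2 features + 1 label column
             "5.1 3.5  1",
             "4.9 3.0  1",
             "6.4 3.2 -1",
             "6.9 3.1 -1"), toyfile)
toy <- ReadSVMdata(toyfile)
str(toy)   # a list with x: 4 x 2 numeric matrix, y: factor with levels "-1" "1"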
## create a decreasing ladder for recursive feature elimination
CreatLadder <- function( Ntotal, pRatio=0.75, Nmin=5 )
{
x <- vector()
x[1] <- Ntotal
for( i in 1:100 )
{
pp <- round(x[i] * pRatio)
if( pp == x[i] )
{
pp <- pp-1
}
if( pp >= Nmin )
{
x[i+1] <- pp
} else
{
break
}
}
x
}
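## --- illustrative sketch: with the default pRatio and Nmin, CreatLadder()
## drops roughly 25% of the remaining features per step until Nmin is reached;
## e.g. CreatLadder(20) should give approximately 20 15 11 8 6
CreatLadder(20)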
## R-SVM core code
## input:
## x: row matrix of data
## y: class label: 1 / -1 for 2 classes
## CVtype:
## integer: N fold CV
## "LOO": leave-one-out CV
## "bootstrape": bootstrape CV
## CVnum: number of CVs
## LOO: set to the sample size
## N-fold and bootstrap: user defined, defaults to the sample size
## output: a named list
## Error: a vector of CV errors, one per ladder level
## SelFreq: a matrix of the frequency with which each gene is selected at each level,
## with each column corresponding to a level of selection
## and each row to a gene
## The most important genes at each level are those selected most frequently
RSVM <- function(x, y, ladder, CVtype, CVnum=0 )
{
## check if y is binary response
Ytype <- names(table(y))
if( length(Ytype) != 2)
{
print("ERROR!! RSVM can only deal with 2-class problem")
return(0)
}
## class mean
m1 <- apply(x[ which(y==Ytype[1]), ], 2, mean)
m2 <- apply(x[ which(y==Ytype[2]), ], 2, mean)
md <- m1-m2
yy <- vector( length=length(y))
yy[which(y==Ytype[1])] <- 1
yy[which(y==Ytype[2])] <- -1
y <- yy
## check ladder
if( min(diff(ladder)) >= 0 )
{
print("ERROR!! ladder must be monotonously decreasing")
return(0);
}
if( ladder[1] != ncol(x) )
{
ladder <- c(ncol(x), ladder)
}
nSample <- nrow(x)
nGene <- ncol(x)
SampInd <- seq(1, nSample)
if( CVtype == "LOO" )
{
CVnum <- nSample
} else
{
if( CVnum == 0 )
{
CVnum <- nSample
}
}
## vector for test error and number of tests
ErrVec <- vector( length=length(ladder))
names(ErrVec) <- paste("Lev_", ladder, sep="")
nTests <- 0
SelFreq <- matrix( 0, nrow=nGene, ncol=length(ladder))
colnames(SelFreq) <- paste("Lev_", ladder, sep="")
## for each CV
for( i in 1:CVnum )
{
## split data
if( CVtype == "LOO" )
{
TestInd <- i
TrainInd <- SampInd[ -TestInd]
} else
{
if( CVtype == "bootstrape" )
{
TrainInd <- sample(SampInd, nSample, replace=T )
TestInd <- SampInd[ which(!(SampInd %in% TrainInd ))]
} else
{
## Nfold
TrainInd <- sample(SampInd, nSample*(CVtype-1)/CVtype )
TestInd <- SampInd[ which(!(SampInd %in% TrainInd ))]
}
}
nTests <- nTests + length(TestInd)
## at each level, train an SVM model and record the test error
xTrain <- x[TrainInd, ]
yTrain <- y[TrainInd]
xTest <- x[TestInd,]
yTest <- y[TestInd]
## indices of the genes used at the current level
SelInd <- seq(1, nGene)
for( gLevel in 1:length(ladder) )
{
## record the genes selected at this ladder level
SelFreq[SelInd, gLevel] <- SelFreq[SelInd, gLevel] +1
## train the SVM model and record the test error
svmres <- svm(xTrain[, SelInd], yTrain, scale=F, type="C-classification", kernel="linear" )
if( CVtype == "LOO" )
{
svmpred <- predict(svmres, matrix(xTest[SelInd], nrow=1) )
} else
{
svmpred <- predict(svmres, xTest[, SelInd] )
}
ErrVec[gLevel] <- ErrVec[gLevel] + sum(svmpred != yTest )
## mean-balanced weight vector used to rank the genes
W <- t(svmres$coefs*yTrain[svmres$index]) %*% svmres$SV * md[SelInd]
rkW <- rank(W)
if( gLevel < length(ladder) )
{
SelInd <- SelInd[which(rkW > (ladder[gLevel] - ladder[gLevel+1]))]
}
}
}
ret <- list(ladder=ladder, Error=ErrVec/nTests, SelFreq=SelFreq)
}
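## --- illustrative sketch of the elimination arithmetic inside RSVM():
## going from a 4-gene level to a 3-gene level drops 4 - 3 = 1 gene, i.e. only
## genes whose mean-balanced weight ranks above 1 survive (weights made up here)
W.toy  <- c(0.9, 0.1, 0.5, 0.3)  # illustrative mean-balanced weights for 4 genes
rk.toy <- rank(W.toy)            # 4 1 3 2
which(rk.toy > 4 - 3)            # genes 1, 3 and 4 are kept; gene 2 is dropped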
## pick the ladder level with the smallest CV error and report the genes
## selected most frequently at that level
SummaryRSVM <- function( RSVMres )
{
ERInd <- max( which(RSVMres$Error == min(RSVMres$Error)) )
MinLevel <- RSVMres$ladder[ERInd]
FreqVec <- RSVMres$SelFreq[, ERInd]
SelInd <- which( rank(FreqVec) >= (RSVMres$ladder[1]-MinLevel) )
# cat("Min CV error of", min(RSVMres$Error), "at", MinLevel, "genes\n")
ret <- list( MinER=min(RSVMres$Error), MinLevel=MinLevel, SelInd=SelInd)
}
###########################################
#my code starts below
#data<-ReadSVMdata("iris_r-svm.txt")
#The data read from the file is given below.
data<-structure(list(x = structure(c(5.1, 4.9, 4.7, 4.6, 5, 5.4, 4.6,
5, 4.4, 4.9, 5.4, 4.8, 4.8, 4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1,
5.4, 5.1, 4.6, 5.1, 4.8, 5, 5, 5.2, 5.2, 4.7, 4.8, 5.4, 5.2,
5.5, 4.9, 5, 5.5, 4.9, 4.4, 5.1, 5, 4.5, 4.4, 5, 5.1, 4.8, 5.1,
4.6, 5.3, 5, 7, 6.4, 6.9, 5.5, 6.5, 5.7, 6.3, 4.9, 6.6, 5.2,
5, 5.9, 6, 6.1, 5.6, 6.7, 5.6, 5.8, 6.2, 5.6, 5.9, 6.1, 6.3,
6.1, 6.4, 6.6, 6.8, 6.7, 6, 5.7, 5.5, 5.5, 5.8, 6, 5.4, 6, 6.7,
6.3, 5.6, 5.5, 5.5, 6.1, 5.8, 5, 5.6, 5.7, 5.7, 6.2, 5.1, 5.7,
3.5, 3, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.4, 3,
3, 4, 4.4, 3.9, 3.5, 3.8, 3.8, 3.4, 3.7, 3.6, 3.3, 3.4, 3, 3.4,
3.5, 3.4, 3.2, 3.1, 3.4, 4.1, 4.2, 3.1, 3.2, 3.5, 3.6, 3, 3.4,
3.5, 2.3, 3.2, 3.5, 3.8, 3, 3.8, 3.2, 3.7, 3.3, 3.2, 3.2, 3.1,
2.3, 2.8, 2.8, 3.3, 2.4, 2.9, 2.7, 2, 3, 2.2, 2.9, 2.9, 3.1,
3, 2.7, 2.2, 2.5, 3.2, 2.8, 2.5, 2.8, 2.9, 3, 2.8, 3, 2.9, 2.6,
2.4, 2.4, 2.7, 2.7, 3, 3.4, 3.1, 2.3, 3, 2.5, 2.6, 3, 2.6, 2.3,
2.7, 3, 2.9, 2.9, 2.5, 2.8, 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4,
1.5, 1.4, 1.5, 1.5, 1.6, 1.4, 1.1, 1.2, 1.5, 1.3, 1.4, 1.7, 1.5,
1.7, 1.5, 1, 1.7, 1.9, 1.6, 1.6, 1.5, 1.4, 1.6, 1.6, 1.5, 1.5,
1.4, 1.5, 1.2, 1.3, 1.4, 1.3, 1.5, 1.3, 1.3, 1.3, 1.6, 1.9, 1.4,
1.6, 1.4, 1.5, 1.4, 4.7, 4.5, 4.9, 4, 4.6, 4.5, 4.7, 3.3, 4.6,
3.9, 3.5, 4.2, 4, 4.7, 3.6, 4.4, 4.5, 4.1, 4.5, 3.9, 4.8, 4,
4.9, 4.7, 4.3, 4.4, 4.8, 5, 4.5, 3.5, 3.8, 3.7, 3.9, 5.1, 4.5,
4.5, 4.7, 4.4, 4.1, 4, 4.4, 4.6, 4, 3.3, 4.2, 4.2, 4.2, 4.3,
3, 4.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2,
0.2, 0.1, 0.1, 0.2, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2, 0.4, 0.2, 0.5,
0.2, 0.2, 0.4, 0.2, 0.2, 0.2, 0.2, 0.4, 0.1, 0.2, 0.2, 0.2, 0.2,
0.1, 0.2, 0.2, 0.3, 0.3, 0.2, 0.6, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2,
1.4, 1.5, 1.5, 1.3, 1.5, 1.3, 1.6, 1, 1.3, 1.4, 1, 1.5, 1, 1.4,
1.3, 1.4, 1.5, 1, 1.5, 1.1, 1.8, 1.3, 1.5, 1.2, 1.3, 1.4, 1.4,
1.7, 1.5, 1, 1.1, 1, 1.2, 1.6, 1.5, 1.6, 1.5, 1.3, 1.3, 1.3,
1.2, 1.4, 1.2, 1, 1.3, 1.2, 1.3, 1.3, 1.1, 1.3), .Dim = c(100,
4), .Dimnames = list(c("1", "2", "3", "4", "5", "6", "7", "8",
"9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
"20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
"31", "32", "33", "34", "35", "36", "37", "38", "39", "40", "41",
"42", "43", "44", "45", "46", "47", "48", "49", "50", "51", "52",
"53", "54", "55", "56", "57", "58", "59", "60", "61", "62", "63",
"64", "65", "66", "67", "68", "69", "70", "71", "72", "73", "74",
"75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85",
"86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96",
"97", "98", "99", "100"), c("V1", "V2", "V3", "V4"))), y = structure(c(2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), .Label = c("-1",
"1"), class = "factor")), .Names = c("x", "y"))
x <- data$x
y <- data$y
## the elimination ladder must be built from the number of features (ncol(x) = 4),
## not from the number of samples; with only 4 features, Nmin is lowered from its default of 5
ladder <- CreatLadder(ncol(x), Nmin=2)
RSVM(x, y, ladder, "LOO")
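## --- illustrative sketch of inspecting the R-SVM output (object names are
## illustrative); RSVM() and SummaryRSVM() return their result lists invisibly
## (the last statement of each is an assignment), so assign them to look inside
rsvm.res <- RSVM(x, y, ladder, "LOO")
rsvm.res$Error     # LOO error at each ladder level
rsvm.res$SelFreq   # how often each feature survives to each level
summ <- SummaryRSVM(rsvm.res)
summ$MinER         # smallest cross-validation error over the ladder
summ$MinLevel      # number of genes kept at that level
summ$SelInd        # indices of the most frequently selected genes at that level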