[R] Recursive-SVM (R-SVM)
Nair, Murlidharan T
mnair at iusb.edu
Mon Jan 22 21:50:20 CET 2007
I am trying to implement a simple R-SVM example using the iris data (only two of the classes are used, and the data is included in the code below). I am running into some errors. I am not an expert on SVMs. If anyone has used it, I would appreciate their help. I am appending the code below.
Thanks../Murli
#######################################################
### R-code for R-SVM
### use leave-one-out, N-fold, or bootstrap resampling to permute the data for external CV
### build an SVM model, use the mean-balanced weight to sort genes on the training set,
### and recursively eliminate the least important genes
### author: Dr. Xin Lu, Research Scientist
### Biostatistics Department, Harvard School of Public Health
library(e1071)
## read in SVM-formatted data from filename
## the format follows the definition used by SVMTorch:
## the first line contains 2 integers: nSample nFeature+1,
## followed by a matrix with one row per sample, the last column being the +/- 1 class label
## returns a list with x (feature matrix) and y (class label factor)
ReadSVMdata <- function(filename)
{
dd <- read.table( filename, header=F, skip=1)
x <- as.matrix( dd[, 1:(ncol(dd)-1)] )
y <- factor( dd[, ncol(dd)] )
ret <- list(x=x, y=y)
}
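## --- illustrative sketch (not from the original post): what an SVMTorch-style
## input file for ReadSVMdata() could look like; the temporary file and the
## numbers in it are made up purely for illustration
toyfile <- tempfile(fileext=".txt")
writeLines(c("4 3",          # 4 samples, 2 features + 1 label column
             "5.1 3.5  1",
             "4.9 3.0  1",
             "6.4 3.2 -1",
             "6.9 3.1 -1"), toyfile)
toy <- ReadSVMdata(toyfile)
str(toy)   # a list with x: 4 x 2 numeric matrix, y: factor with levels "-1" "1"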
## create a decreasing ladder for recursive feature elimination
CreatLadder <- function( Ntotal, pRatio=0.75, Nmin=5 )
{
x <- vector()
x[1] <- Ntotal
for( i in 1:100 )
{
pp <- round(x[i] * pRatio)
if( pp == x[i] )
{
pp <- pp-1
}
if( pp >= Nmin )
{
x[i+1] <- pp
} else
{
break
}
}
x
}
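## --- illustrative sketch: with the default pRatio and Nmin, CreatLadder()
## drops roughly 25% of the remaining features per step until Nmin is reached;
## e.g. CreatLadder(20) should give approximately 20 15 11 8 6
CreatLadder(20)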
## R-SVM core code
## input:
## x: row matrix of data
## y: class label: 1 / -1 for 2 classes
## CVtype:
## integer: N fold CV
## "LOO": leave-one-out CV
## "bootstrape": bootstrape CV
## CVnum: number of CVs
## LOO: set to the sample size
## N-fold and bootstrap: user defined, defaults to the sample size
## output: a named list
## Error: a vector of CV errors, one per ladder level
## SelFreq: a matrix of the frequency with which each gene is selected at each level,
## with each column corresponding to a level of selection
## and each row to a gene
## The most important genes at each level are those selected most frequently
RSVM <- function(x, y, ladder, CVtype, CVnum=0 )
{
## check if y is binary response
Ytype <- names(table(y))
if( length(Ytype) != 2)
{
print("ERROR!! RSVM can only deal with 2-class problem")
return(0)
}
## class mean
m1 <- apply(x[ which(y==Ytype[1]), ], 2, mean)
m2 <- apply(x[ which(y==Ytype[2]), ], 2, mean)
md <- m1-m2
yy <- vector( length=length(y))
yy[which(y==Ytype[1])] <- 1
yy[which(y==Ytype[2])] <- -1
y <- yy
## check ladder
if( min(diff(ladder)) >= 0 )
{
print("ERROR!! ladder must be monotonously decreasing")
return(0);
}
if( ladder[1] != ncol(x) )
{
ladder <- c(ncol(x), ladder)
}
nSample <- nrow(x)
nGene <- ncol(x)
SampInd <- seq(1, nSample)
if( CVtype == "LOO" )
{
CVnum <- nSample
} else
{
if( CVnum == 0 )
{
CVnum <- nSample
}
}
## vector for test error and number of tests
ErrVec <- vector( length=length(ladder))
names(ErrVec) <- paste("Lev_", ladder, sep="")
nTests <- 0
SelFreq <- matrix( 0, nrow=nGene, ncol=length(ladder))
colnames(SelFreq) <- paste("Lev_", ladder, sep="")
## for each CV
for( i in 1:CVnum )
{
## split data
if( CVtype == "LOO" )
{
TestInd <- i
TrainInd <- SampInd[ -TestInd]
} else
{
if( CVtype == "bootstrape" )
{
TrainInd <- sample(SampInd, nSample, replace=T )
TestInd <- SampInd[ which(!(SampInd %in% TrainInd ))]
} else
{
## Nfold
TrainInd <- sample(SampInd, nSample*(CVtype-1)/CVtype )
TestInd <- SampInd[ which(!(SampInd %in% TrainInd ))]
}
}
nTests <- nTests + length(TestInd)
## at each level, train an SVM model and record the test error
xTrain <- x[TrainInd, ]
yTrain <- y[TrainInd]
xTest <- x[TestInd,]
yTest <- y[TestInd]
## indices of the genes used at the current level
SelInd <- seq(1, nGene)
for( gLevel in 1:length(ladder) )
{
## record the genes selected at this ladder level
SelFreq[SelInd, gLevel] <- SelFreq[SelInd, gLevel] +1
## train the SVM model and record the test error
svmres <- svm(xTrain[, SelInd], yTrain, scale=F, type="C-classification", kernel="linear" )
if( CVtype == "LOO" )
{
svmpred <- predict(svmres, matrix(xTest[SelInd], nrow=1) )
} else
{
svmpred <- predict(svmres, xTest[, SelInd] )
}
ErrVec[gLevel] <- ErrVec[gLevel] + sum(svmpred != yTest )
## mean-balanced weight vector used to rank the genes
W <- t(svmres$coefs*yTrain[svmres$index]) %*% svmres$SV * md[SelInd]
rkW <- rank(W)
if( gLevel < length(ladder) )
{
SelInd <- SelInd[which(rkW > (ladder[gLevel] - ladder[gLevel+1]))]
}
}
}
ret <- list(ladder=ladder, Error=ErrVec/nTests, SelFreq=SelFreq)
}
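## --- illustrative sketch of the elimination arithmetic inside RSVM():
## going from a 4-gene level to a 3-gene level drops 4 - 3 = 1 gene, i.e. only
## genes whose mean-balanced weight ranks above 1 survive (weights made up here)
W.toy  <- c(0.9, 0.1, 0.5, 0.3)  # illustrative mean-balanced weights for 4 genes
rk.toy <- rank(W.toy)            # 4 1 3 2
which(rk.toy > 4 - 3)            # genes 1, 3 and 4 are kept; gene 2 is dropped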
## pick the ladder level with the smallest CV error and report the genes
## selected most frequently at that level
SummaryRSVM <- function( RSVMres )
{
ERInd <- max( which(RSVMres$Error == min(RSVMres$Error)) )
MinLevel <- RSVMres$ladder[ERInd]
FreqVec <- RSVMres$SelFreq[, ERInd]
SelInd <- which( rank(FreqVec) >= (RSVMres$ladder[1]-MinLevel) )
# cat("Min CV error of", min(RSVMres$Error), "at", MinLevel, "genes\n")
ret <- list( MinER=min(RSVMres$Error), MinLevel=MinLevel, SelInd=SelInd)
}
###########################################
#my code starts below
#data<-ReadSVMdata("iris_r-svm.txt")
#The data read from the file is given below.
data<-structure(list(x = structure(c(5.1, 4.9, 4.7, 4.6, 5, 5.4, 4.6,
5, 4.4, 4.9, 5.4, 4.8, 4.8, 4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1,
5.4, 5.1, 4.6, 5.1, 4.8, 5, 5, 5.2, 5.2, 4.7, 4.8, 5.4, 5.2,
5.5, 4.9, 5, 5.5, 4.9, 4.4, 5.1, 5, 4.5, 4.4, 5, 5.1, 4.8, 5.1,
4.6, 5.3, 5, 7, 6.4, 6.9, 5.5, 6.5, 5.7, 6.3, 4.9, 6.6, 5.2,
5, 5.9, 6, 6.1, 5.6, 6.7, 5.6, 5.8, 6.2, 5.6, 5.9, 6.1, 6.3,
6.1, 6.4, 6.6, 6.8, 6.7, 6, 5.7, 5.5, 5.5, 5.8, 6, 5.4, 6, 6.7,
6.3, 5.6, 5.5, 5.5, 6.1, 5.8, 5, 5.6, 5.7, 5.7, 6.2, 5.1, 5.7,
3.5, 3, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.4, 3,
3, 4, 4.4, 3.9, 3.5, 3.8, 3.8, 3.4, 3.7, 3.6, 3.3, 3.4, 3, 3.4,
3.5, 3.4, 3.2, 3.1, 3.4, 4.1, 4.2, 3.1, 3.2, 3.5, 3.6, 3, 3.4,
3.5, 2.3, 3.2, 3.5, 3.8, 3, 3.8, 3.2, 3.7, 3.3, 3.2, 3.2, 3.1,
2.3, 2.8, 2.8, 3.3, 2.4, 2.9, 2.7, 2, 3, 2.2, 2.9, 2.9, 3.1,
3, 2.7, 2.2, 2.5, 3.2, 2.8, 2.5, 2.8, 2.9, 3, 2.8, 3, 2.9, 2.6,
2.4, 2.4, 2.7, 2.7, 3, 3.4, 3.1, 2.3, 3, 2.5, 2.6, 3, 2.6, 2.3,
2.7, 3, 2.9, 2.9, 2.5, 2.8, 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4,
1.5, 1.4, 1.5, 1.5, 1.6, 1.4, 1.1, 1.2, 1.5, 1.3, 1.4, 1.7, 1.5,
1.7, 1.5, 1, 1.7, 1.9, 1.6, 1.6, 1.5, 1.4, 1.6, 1.6, 1.5, 1.5,
1.4, 1.5, 1.2, 1.3, 1.4, 1.3, 1.5, 1.3, 1.3, 1.3, 1.6, 1.9, 1.4,
1.6, 1.4, 1.5, 1.4, 4.7, 4.5, 4.9, 4, 4.6, 4.5, 4.7, 3.3, 4.6,
3.9, 3.5, 4.2, 4, 4.7, 3.6, 4.4, 4.5, 4.1, 4.5, 3.9, 4.8, 4,
4.9, 4.7, 4.3, 4.4, 4.8, 5, 4.5, 3.5, 3.8, 3.7, 3.9, 5.1, 4.5,
4.5, 4.7, 4.4, 4.1, 4, 4.4, 4.6, 4, 3.3, 4.2, 4.2, 4.2, 4.3,
3, 4.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2,
0.2, 0.1, 0.1, 0.2, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2, 0.4, 0.2, 0.5,
0.2, 0.2, 0.4, 0.2, 0.2, 0.2, 0.2, 0.4, 0.1, 0.2, 0.2, 0.2, 0.2,
0.1, 0.2, 0.2, 0.3, 0.3, 0.2, 0.6, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2,
1.4, 1.5, 1.5, 1.3, 1.5, 1.3, 1.6, 1, 1.3, 1.4, 1, 1.5, 1, 1.4,
1.3, 1.4, 1.5, 1, 1.5, 1.1, 1.8, 1.3, 1.5, 1.2, 1.3, 1.4, 1.4,
1.7, 1.5, 1, 1.1, 1, 1.2, 1.6, 1.5, 1.6, 1.5, 1.3, 1.3, 1.3,
1.2, 1.4, 1.2, 1, 1.3, 1.2, 1.3, 1.3, 1.1, 1.3), .Dim = c(100,
4), .Dimnames = list(c("1", "2", "3", "4", "5", "6", "7", "8",
"9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
"20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
"31", "32", "33", "34", "35", "36", "37", "38", "39", "40", "41",
"42", "43", "44", "45", "46", "47", "48", "49", "50", "51", "52",
"53", "54", "55", "56", "57", "58", "59", "60", "61", "62", "63",
"64", "65", "66", "67", "68", "69", "70", "71", "72", "73", "74",
"75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85",
"86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96",
"97", "98", "99", "100"), c("V1", "V2", "V3", "V4"))), y = structure(c(2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), .Label = c("-1",
"1"), class = "factor")), .Names = c("x", "y"))
x <- data$x
y <- data$y
## the elimination ladder must be built from the number of features (ncol(x) = 4),
## not from the number of samples; with only 4 features, Nmin is lowered from its default of 5
ladder <- CreatLadder(ncol(x), Nmin=2)
RSVM(x, y, ladder, "LOO")
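## --- illustrative sketch of inspecting the R-SVM output (object names are
## illustrative); RSVM() and SummaryRSVM() return their result lists invisibly
## (the last statement of each is an assignment), so assign them to look inside
rsvm.res <- RSVM(x, y, ladder, "LOO")
rsvm.res$Error     # LOO error at each ladder level
rsvm.res$SelFreq   # how often each feature survives to each level
summ <- SummaryRSVM(rsvm.res)
summ$MinER         # smallest cross-validation error over the ladder
summ$MinLevel      # number of genes kept at that level
summ$SelInd        # indices of the most frequently selected genes at that level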