[BioC] How to do k-fold validation using SVM
Ramon Diaz
rdiaz@cnio.es
Mon, 27 Jan 2003 14:37:39 +0100
Sorry for jumping into the thread so late: I just read the posting today.

Anyway, I have used the following code a couple of times, and maybe it is of
some help to you. In each "round" (for each set of training data) I select
the best K genes (where best means with the largest absolute value of the t
statistic) and then fit the SVM using those K genes.

For a couple of reasons, I use mt.maxT (from the multtest library) to get the
t statistic, but you can modify the function at your convenience (such as an
ANOVA for more than two classes, or whatever you want). Note also that I use
a linear kernel.

Hope it helps,

Ramón
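
To run the code you need the multtest library (for mt.maxT) and the e1071
library (for svm):

library(multtest)  ## provides mt.maxT
library(e1071)     ## provides svm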
gene.select <- function(data, class, size = NULL, threshold = NULL) {
    ## 'threshold' is accepted but not used here.
    ## t.stat <- apply(data, 2, function(x) {abs(t.test(x ~ class)$statistic)})
    ## is slower than:
    tmp <- mt.maxT(t(data), class, B = 1)[, c(1, 2)]  ## columns: gene index, t statistic
    ## mt.maxT returns the genes sorted by decreasing |t|, so the first
    ## 'size' rows are the best genes.
    selected <- tmp[seq_len(size), 1]
    return(selected)
}
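
For instance, on a made-up toy data set (two classes coded 0/1, as mt.maxT
expects; the object names are invented for illustration):

set.seed(1)
toy.data <- matrix(rnorm(20 * 100), nrow = 20)  ## made-up: 20 subjects, 100 genes
toy.class <- rep(c(0, 1), each = 10)
gene.select(toy.data, toy.class, size = 5)  ## indices of the 5 top-ranked genes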
cross.valid.svm <- function(data, y, knumber = 10, size = 200) {
    ## data is structured as subjects in rows, genes in columns
    ## (and thus is transposed inside gene.select to be fed to mt.maxT).
    ## If you want leave-one-out, set knumber = NULL.
    ## size is the number of genes used when building the classifier:
    ## those are the "best" genes, based on a t statistic.

    ## First, select the data subsets for cross-validation.
    if(is.null(knumber)) {
        knumber <- length(y)
        leave.one.out <- TRUE
    }
    else leave.one.out <- FALSE
    N <- length(y)
    if(knumber > N) stop("knumber has to be <= number of subjects")
    reps <- floor(N / knumber)
    reps.last <- N - (knumber - 1) * reps
    index.select <- c(rep(seq(from = 1, to = (knumber - 1)), reps),
                      rep(knumber, reps.last))
    index.select <- sample(index.select)

    cv.errors <- matrix(-99, nrow = knumber, ncol = 4)
    ## Fit a model for each fold.
    for(sample.number in 1:knumber) {
        ## Gene selection, using only the training folds.
        gene.subset <- gene.select(data[index.select != sample.number, ],
                                   y[index.select != sample.number],
                                   size = size)

        ## Fit the svm on the training folds and predict the held-out fold.
        y.f <- factor(y)
        test.set <- data[index.select == sample.number, gene.subset]
        if(is.null(dim(test.set)))
            test.set <- matrix(test.set, nrow = 1)  ## for leave-one-out
        predicted <- predict(svm(data[index.select != sample.number, gene.subset],
                                 y.f[index.select != sample.number],
                                 kernel = "linear"),
                             test.set)

        ## The classes are assumed to be coded 0/1.
        cv.errors[sample.number, 1] <- length(which((y.f[index.select == sample.number] == 1)
                                                    & predicted == 0))
        cv.errors[sample.number, 2] <- length(which((y.f[index.select == sample.number] == 0)
                                                    & predicted == 1))
        cv.errors[sample.number, 3] <- length(which(y.f[index.select == sample.number] != predicted))
        cv.errors[sample.number, 4] <- length(predicted)
    }
    cv.errors <- data.frame(cv.errors)
    names(cv.errors) <- c("true.1.pred.0", "true.0.pred.1",
                          "total.error", "number.predicted")
    average.error.rate <- sum(cv.errors[, 3]) / sum(cv.errors[, 4])
    return(list(cv.errors = cv.errors,
                average.error.rate = average.error.rate))
}
## An example call (matrix.covar: subjects x genes; class.vector: the class labels):
cross.valid.svm(matrix.covar, class.vector, knumber = 10, size = 10)
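
If you want something that runs without your own data, here is a simulated
call (all object names made up; the data are pure noise):

set.seed(2)
sim.data <- matrix(rnorm(40 * 500), nrow = 40)  ## made-up: 40 subjects, 500 genes
sim.class <- rep(c(0, 1), each = 20)
res <- cross.valid.svm(sim.data, sim.class, knumber = 10, size = 20)
res$average.error.rate  ## should hover around 0.5, since there is no signal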
--
Ramón Díaz-Uriarte
Bioinformatics Unit
Centro Nacional de Investigaciones Oncológicas (CNIO)
(Spanish National Cancer Center)
Melchor Fernández Almagro, 3
28029 Madrid (Spain)
http://bioinfo.cnio.es/~rdiaz
[Attachment: cv-svm.R]
cross.valid.optim.svm <- function(data, y, knumber = 10, size = 200) {
    ## data is structured as subjects in rows, genes in columns
    ## (and thus is transposed inside gene.select to be fed to mt.maxT).
    ## If you want leave-one-out, set knumber = NULL.
    ## size is the number of genes used when building the classifier:
    ## those are the "best" genes, based on a t statistic.

    ## Gene selection. Note: unlike cross.valid.svm above, the genes are
    ## selected once, from the complete data set, outside the
    ## cross-validation loop.
    gene.subset <- gene.select(data, y, size = size)

    ## First, select the data subsets for cross-validation.
    if(is.null(knumber)) {
        knumber <- length(y)
        leave.one.out <- TRUE
    }
    else leave.one.out <- FALSE
    N <- length(y)
    if(knumber > N) stop("knumber has to be <= number of subjects")
    reps <- floor(N / knumber)
    reps.last <- N - (knumber - 1) * reps
    index.select <- c(rep(seq(from = 1, to = (knumber - 1)), reps),
                      rep(knumber, reps.last))
    index.select <- sample(index.select)
    cv.errors <- matrix(-99, nrow = knumber, ncol = 4)
    ## Fit a model for each fold.
    for(sample.number in 1:knumber) {
        ## Fit the svm on the training folds and predict the held-out fold.
        y.f <- factor(y)
        test.set <- data[index.select == sample.number, gene.subset]
        if(is.null(dim(test.set)))
            test.set <- matrix(test.set, nrow = 1)  ## for leave-one-out
        predicted <- predict(svm(data[index.select != sample.number, gene.subset],
                                 y.f[index.select != sample.number],
                                 kernel = "linear"),
                             test.set)
        ## The classes are assumed to be coded 0/1.
        cv.errors[sample.number, 1] <- length(which((y.f[index.select == sample.number] == 1)
                                                    & predicted == 0))
        cv.errors[sample.number, 2] <- length(which((y.f[index.select == sample.number] == 0)
                                                    & predicted == 1))
        cv.errors[sample.number, 3] <- length(which(y.f[index.select == sample.number] != predicted))
        cv.errors[sample.number, 4] <- length(predicted)
    }
    cv.errors <- data.frame(cv.errors)
    names(cv.errors) <- c("true.1.pred.0", "true.0.pred.1",
                          "total.error", "number.predicted")
    average.error.rate <- sum(cv.errors[, 3]) / sum(cv.errors[, 4])
    return(list(cv.errors = cv.errors,
                average.error.rate = average.error.rate))
}
gene.select <- function(data, class, size = NULL, threshold = NULL) {
    ## 'threshold' is accepted but not used here.
    ## t.stat <- apply(data, 2, function(x) {abs(t.test(x ~ class)$statistic)})
    ## is slower than:
    tmp <- mt.maxT(t(data), class, B = 1)[, c(1, 2)]  ## columns: gene index, t statistic
    selected <- tmp[seq_len(size), 1]  ## mt.maxT sorts by decreasing |t|
    return(selected)
}
cross.valid.optim.svm(matrix.data, censored, knumber = 10, size = 10)
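
As a quick sanity check (a made-up example, not part of the attachment), the
two versions can differ noticeably on pure-noise data, because
cross.valid.optim.svm selects the genes from the complete data set before
cross-validating:

set.seed(3)
noise.data <- matrix(rnorm(40 * 500), nrow = 40)  ## made-up: 40 subjects, 500 noise genes
noise.class <- rep(c(0, 1), each = 20)
## Selection inside each fold: error should be near 0.5.
cross.valid.svm(noise.data, noise.class, knumber = 10, size = 20)$average.error.rate
## Selection once on all the data: typically optimistic (below 0.5), since
## the selection step has already seen the test folds.
cross.valid.optim.svm(noise.data, noise.class, knumber = 10, size = 20)$average.error.rate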