[R] predict.fda difficulties

Mon Oct 15 14:33:01 CEST 2001

Dear Dr Hornik / R help list,

I am using the mda package and in particular the fda routine to classify a
set of 162 neurons falling in to 11 neuronal cell types according to 17
morphological variables.  I would like to use a cross-validation approach in
which I split the data, train with one part amd then test the predictive
accuracy of the discriminant functions with the remaining part.  However I
have come across a problem doing this with predict.fda.  If I try using the
trained FDA object to predict the test data, I frequently get an error
message like:

Error in factor(pclass, labels = dimnames(means)[[1]]) :
    invalid labels; length 11 should be 1 or 10

As far as I can tell this is because the test data set is small and one of
the classes has no predicted members.  I have attached some sample code
which produces this error.  I have also attached the traceback().  I am
using R 1.3.1 on MacOS 9.1 and the mda package is the version that Stefano
Iacus made for this release of R.

Am I trying to do something stupid?  Is there a step I have not taken?  Or
is this a bug in the predict.fda routine?  I would be very grateful for any
guidance you could offer.  Many thanks,

Greg Jefferis.
__________________________________________________________________________
Greg Jefferis,                          Lab Address: Liqun Luo, Herrin 144
Neurosciences PhD Programme &                e-mail: jefferis at stanford.edu
Dept Biological Sciences,                       Lab: (650) 725 5809
Gilbert Biology Building,                       Fax: (650) 723 0589
371 Serra Mall,
Stanford, CA 94305-5020.                       Home: (650) 497 1135

__________________________________________________________________________
Toy Example
____________

# Dummy cross-validation routine

set.seed(7771)

TotalN<-162
TestN<-80
TrainN<-TotalN-TestN
# Make a dummy frame with 11 kinds of "CellType" labels and 17 normal
variables
DummyCellTypes<-floor(runif(TotalN,0,11)+1)
DummyParams<-data.frame(CellType=DummyCellTypes,
    matrix(rnorm(TotalN*17),TotalN,17))
DummyParams$CellType<-factor(DummyParams$CellType)
table(DummyParams$CellType)

TrainFDAs<-list(100)
TestConfusions<-list(100)
TestErrs<-rep(-1,100)
i<-1
z<-0
runsum<-0

while(i<=100 & z<1000){
    z<-z+1
    TrainSample<-sample(1:TotalN,TrainN)

    # Check to see if any CellTypes are missing from the training
    # or test set as this will certainly cause an error.
    TrainMissing<-any(table(DummyParams$CellType[TrainSample])==0)
    TestMissing<-any(table(DummyParams$CellType[-TrainSample])==0)

    if(!TrainMissing & !TestMissing){

       TrainFDAs[[i]]<-fda(CellType~.,data=DummyParams[TrainSample,])

       #Predict CellTypes of the test data set with the training set FDA
       TP<-predict(TrainFDAs[[i]], DummyParams[-TrainSample,])
       TestConfusions[[i]]<-confusion(TP,DummyParams$CellType[-TrainSample])

       #Keep track of the error rate
       runsum<-runsum+attr(TestConfusions[[i]],"error")
       i<-i+1      
    }
}
cat("Mean prediction error : ",runsum/i,"for ", i," cross-validation
runs\n")

__________________________________________________________________________
traceback()
_______________
> traceback()
5: stop(paste("invalid labels; length", nl, "should be 1 or",
length(levels)))
4: factor(pclass, labels = dimnames(means)[[1]])
3: switch(type, variates = return(x), class = {
       n <- nrow(x)
       prior <- 2 * log(prior)
       mindist <- dist(x, means[1, ], dimension) - prior[1]
       pclass <- rep(1, n)
       for (i in seq(2, J)) {
           ndist <- dist(x, means[i, ], dimension) - prior[i]
           l <- ndist < mindist
           pclass[l] <- i
           mindist[l] <- ndist[l]
       }
       return(factor(pclass, labels = dimnames(means)[[1]]))
   }, posterior = {
       pclass <- matrix(0, nrow(x), J)
       for (i in seq(J)) pclass[, i] <- exp(-0.5 * dist(x, means[i,
           ], dimension)) * prior[i]
       dimnames(pclass) <- list(dimnames(x)[[1]], dimnames(means)[[1]])
       return(pclass/drop(pclass %*% rep(1, J)))
   }, hierarchical = {
       prior <- 2 * log(prior)
       Pclass <- vector("list", length(dimension.set))
       names(Pclass) <- paste("D", dimension.set, sep = "")
       for (ad in seq(along = dimension.set)) {
           d <- dimension.set[ad]
           dd <- seq(d)
           mindist <- dist(x[, dd, drop = F], means[1, dd, drop = F],
               d) - prior[1]
           pclass <- rep(1, nrow(x))
           for (i in seq(2, J)) {
               ndist <- dist(x[, dd, drop = F], means[i, dd, drop = F],
                   d) - prior[i]
               l <- ndist < mindist
               pclass[l] <- i
               mindist[l] <- ndist[l]
           }
           levels(pclass) <- dimnames(means)[[1]]
           Pclass[[ad]] <- pclass
       }
       rownames <- dimnames(x)[[1]]
       if (is.null(rownames))
           rownames <- paste(seq(nrow(x)))
       return(structure(Pclass, class = "data.frame", row.names = rownames,
           dimensions = dimension.set))
   }, distances = {
       dclass <- matrix(0, nrow(x), J)
       for (i in seq(J)) dclass[, i] <- dist(x, means[i, ], dimension)
       dimnames(dclass) <- list(dimnames(x)[[1]], dimnames(means)[[1]])
       return(dclass)
   })
2: predict.fda(TrainFDAs[[i]], DummyParams[-TrainSample, ])
1: predict(TrainFDAs[[i]], DummyParams[-TrainSample, ])

-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-
r-help mailing list -- Read http://www.ci.tuwien.ac.at/~hornik/R/R-FAQ.html
Send "info", "help", or "[un]subscribe"
(in the "body", not the subject !)  To: r-help-request at stat.math.ethz.ch
_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._