[R] Nnet and AIC: selection of a parsimonious parameterisation
Ben Rhelp
benrhelp at yahoo.co.uk
Wed Jan 5 20:58:54 CET 2011
Hi All,
I am trying to use a neural network for my work, but I am not sure about my
approach to selecting a parsimonious model. In R, nnet does not define an AIC
for a feed-forward neural network with a single hidden layer. Is this because
the AIC does not make sense mathematically in this case?
For example, is this pseudo code sensible?
Thanks in advance for your help. I am sorry if this has been answered before,
but I haven't found an answer in the archives.
Below is an implementation of this idea based on the MASS (Modern Applied
Statistics with S) code from Chapter 8.
Cheers,
Ben
--------------------------------------------------------------------------------
Pseudo code
--------------------------------------------------------------------------------
Define RSS as:
    RSS = (1 - alpha) * RSS(identification set) + alpha * RSS(validation set)
and AIC as:
    AIC = 2*np + N*log(RSS)
where np is the number of non-null parameters of the neural network and N is
the sample size (based on
http://en.wikipedia.org/wiki/Akaike_information_criterion).
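To make these two definitions concrete, here is a minimal sketch in R; the
helper names rssBlend and aicNnet are illustrative choices of mine, not part
of nnet:

## Hypothetical helpers, just restating the definitions above
rssBlend <- function(rssIdent, rssValid, alpha) {
    ## weighted combination of identification- and validation-set RSS
    (1 - alpha) * rssIdent + alpha * rssValid
}
aicNnet <- function(wts, RSS, N) {
    np <- sum(wts != 0)  # count only the non-null parameters
    2 * np + N * log(RSS)
}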
Assume a feed-forward neural network with a single hidden layer and a
maximum number of hidden neurons (maxSize).

For size = 1 to maxSize:
    Optimise the decay.
    Select the neural network with the smallest AIC for this size and decay,
    using random starting parameterisations and a random identification set.
    For the diagonal elements of the Hessian, from smallest to largest:
        Set the corresponding parameter to 0.
        If AIC(i) > AIC(i-1), break.

The neural network selected is the one with the smallest AIC overall.
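To show the pruning step in isolation before the full implementation, here is
a small self-contained toy example; the data, network size, and the helper
name aicOf are arbitrary choices of mine (nnWtsPruning in the code below does
the same thing inside the cross-validation loop):

library(nnet)
set.seed(1)
d <- data.frame(x = runif(50), y = runif(50))
d$z <- sin(2 * pi * d$x) + 0.1 * rnorm(50)
fit <- nnet(z ~ x + y, d, size = 3, linout = TRUE, maxit = 500,
            trace = FALSE, Hess = TRUE)
aicOf <- function(wts, rss, n) 2 * sum(wts != 0) + n * log(rss)
best <- aicOf(fit$wts, sum((d$z - predict(fit, d))^2), nrow(d))
for (k in order(diag(fit$Hessian))) {    # smallest curvature first
    cand <- fit
    cand$wts[k] <- 0                     # zero out one more weight
    rss.k <- sum((d$z - predict(cand, d))^2)
    if (aicOf(cand$wts, rss.k, nrow(d)) > best) break
    fit <- cand
    best <- aicOf(fit$wts, rss.k, nrow(d))
}
sum(fit$wts != 0)                        # weights surviving the prune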
--------------------------------------------------------------------------------
An example based on the cpus data in Chapter 8 of MASS
--------------------------------------------------------------------------------
library(nnet)
library(MASS)
# From Chapter 6, for comparisons
set.seed(123)
cpus.samp <-
c(3, 5, 6, 7, 8, 10, 11, 16, 20, 21, 22, 23, 24, 25, 29, 33, 39, 41, 44, 45,
46, 49, 57, 58, 62, 63, 65, 66, 68, 69, 73, 74, 75, 76, 78, 83, 86,
88, 98, 99, 100, 103, 107, 110, 112, 113, 115, 118, 119, 120, 122,
124, 125, 126, 127, 132, 136, 141, 144, 146, 147, 148, 149, 150, 151,
152, 154, 156, 157, 158, 159, 160, 161, 163, 166, 167, 169, 170, 173,
174, 175, 176, 177, 183, 184, 187, 188, 189, 194, 195, 196, 197, 198,
199, 202, 204, 205, 206, 208, 209)
cpus2 <- cpus[, 2:8] # excludes names, authors’ predictions
cpus3 <- with(cpus2, data.frame(syct = syct - 2, mmin = mmin - 3,
                                mmax = mmax - 4, cach = cach / 256,
                                chmin = chmin / 100, chmax = chmax / 100,
                                perf = perf))
CVnn.cpus <- function(formula, data = cpus3[cpus.samp, ], maxSize = 10,
                      decayRange = c(0, 0.2), nreps = 5, nifold = 10,
                      alpha = 9/10, linout = TRUE, skip = TRUE,
                      maxit = 1000, ...) {
  ## nreps = number of attempts to fit a nnet model with randomly chosen
  ## initial parameters; the fit with the smallest weighted RSS is kept
  ## Prune weights one at a time, in order of increasing Hessian diagonal,
  ## and stop as soon as the AIC stops improving (ri comes from the
  ## enclosing environment)
  nnWtsPruning <- function(nn, data, alpha, i) {
    truth <- log10(data$perf)
    RSS <- (1 - alpha) *
      sum((truth[ri != i] - predict(nn, data[ri != i, ]))^2) +
      alpha * sum((truth[ri == i] - predict(nn, data[ri == i, ]))^2)
    AIC <- 2 * sum(nn$wts != 0) + length(data$perf) * log(RSS)
    nn.tmp <- nn
    for (j in seq_along(nn$wts)) {
      ## zero the weight with the j-th smallest Hessian diagonal element
      nn.tmp$wts[order(diag(nn.tmp$Hessian))[j]] <- 0
      RSS.tmp <- (1 - alpha) *
        sum((truth[ri != i] - predict(nn.tmp, data[ri != i, ]))^2) +
        alpha * sum((truth[ri == i] - predict(nn.tmp, data[ri == i, ]))^2)
      AIC.tmp <- 2 * sum(nn.tmp$wts != 0) + length(data$perf) * log(RSS.tmp)
      if (is.nan(AIC.tmp) || AIC.tmp > AIC) {
        cat("\n j", j, "AIC", AIC.tmp, "AIC_1", AIC, "\n")
        break
      } else {
        nn <- nn.tmp; AIC <- AIC.tmp; RSS <- RSS.tmp
      }
    }
    list(choice = sqrt(RSS / 100), nparam = sum(nn$wts != 0), AIC = AIC, nn = nn)
  }
  ## Modified CVnn1: fits nreps networks per fold and keeps the one with the
  ## smallest weighted RSS; with optimFlag = TRUE it returns only that RSS
  ## (for optimize()), otherwise it prunes the best network and reports it
  CVnn1 <- function(decay, formula, data, nreps = 1, ri, size, linout, skip,
                    maxit, optimFlag = FALSE, alpha) {
    truth <- log10(data$perf)
    RSS <- Inf; nn <- NULL; ii <- 1
    for (i in sort(unique(ri))) {
      for (rep in 1:nreps) {
        nn.tmp <- nnet(formula, data[ri != i, ], trace = FALSE, size = size,
                       decay = decay, linout = linout, skip = skip,
                       maxit = maxit, Hess = TRUE)
        ## weighted RSS, with the (1 - alpha) weighting as defined above
        RSS.tmp <- (1 - alpha) *
          sum((truth[ri != i] - predict(nn.tmp, data[ri != i, ]))^2) +
          alpha * sum((truth[ri == i] - predict(nn.tmp, data[ri == i, ]))^2)
        if (RSS.tmp < RSS) { RSS <- RSS.tmp; nn <- nn.tmp; ii <- i }
      }
    }
    if (optimFlag) {
      RSS
    } else {
      prn <- nnWtsPruning(nn, data, alpha, ii)
      list(choice = prn$choice, nparam = prn$nparam,
           nparaminit = length(nn$wts), AIC = prn$AIC, nn1 = prn$nn)
    }
  }
  maxSize <- maxSize + 1   # one extra step so that size = 0 (skip-layer only) is tried
  choice <- numeric(maxSize); nparam <- numeric(maxSize)
  lambdaj <- numeric(maxSize); AIC <- numeric(maxSize)
  nparamInit <- numeric(maxSize)
  ri <- sample(nifold, nrow(data), replace = TRUE)   # random fold labels
  size <- seq(maxSize - 1, 0); minAIC <- Inf
  for (j in 1:maxSize) {
    ## optimise the decay for this size
    tlambda <- optimize(CVnn1, decayRange, tol = 0.0001, formula, data,
                        nreps = nreps, ri = ri, size = size[j],
                        linout = linout, skip = skip, maxit = maxit,
                        optimFlag = TRUE, alpha = 0.5)
    lambdaj[j] <- tlambda[[1]]
    cat(" size =", size[j], "decay =", round(tlambda[[1]], 3), "\n")
    nntmp <- CVnn1(decay = lambdaj[j], formula, data, nreps = nreps, ri = ri,
                   size = size[j], alpha = alpha,
                   linout = linout, skip = skip, maxit = maxit, ...)
    choice[j] <- nntmp$choice; nparam[j] <- nntmp$nparam
    nparamInit[j] <- nntmp$nparaminit; AIC[j] <- nntmp$AIC
    ## track the smallest AIC seen so far and keep that pruned network
    if (nntmp$AIC <= minAIC) { minAIC <- nntmp$AIC; nn <- nntmp$nn1 }
  }
  ## Display the result; choice already holds sqrt(RSS/100), so report it as the fit
  tr <- cbind(size = size, decay = lambdaj, fit = choice, nparam = nparam,
              nparamInit = nparamInit, AIC = AIC)
  modelOptimAIC <- which.min(AIC)
  list(tr = tr, modelOptimAIC = modelOptimAIC, nn = nn)
}
(nn.res = CVnn.cpus(log10(perf) ~ ., data = cpus3[cpus.samp,]))
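If it helps, the returned list can then be inspected along these lines (this
just reads back the components defined above):

nn.res$tr[nn.res$modelOptimAIC, ]  # summary row with the smallest AIC
sum(nn.res$nn$wts != 0)            # non-null weights in the pruned network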