[R] Nnet and AIC: selection of a parsimonious parameterisation
Ben Rhelp
benrhelp at yahoo.co.uk
Wed Jan 5 20:58:54 CET 2011
Hi All,
I am trying to use a neural network for my work, but I am not sure about my
approach to selecting a parsimonious model. In R, nnet does not define an AIC
for a feed-forward neural network with a single hidden layer. Is this because
the AIC does not make sense mathematically in this case?
For example, is this pseudo code sensible?
Thanks in advance for your help. I am sorry if this has been answered before,
but I haven't found an answer in the archives.
Below is an implementation of this idea based on the MASS (Modern Applied
Statistics with S) code from Chapter 8.
Cheers,
Ben
--------------------------------------------------------------------------------
Pseudo code
--------------------------------------------------------------------------------
Define RSS as:
    RSS = (1 - alpha) * RSS(identification set) + alpha * RSS(validation set)
and AIC as:
    AIC = 2*np + N*log(RSS)
where np is the number of non-null parameters of the neural network and N is
the sample size (based on
http://en.wikipedia.org/wiki/Akaike_information_criterion).
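To make these two definitions concrete, here is a minimal sketch in R; the
helper names rssBlend and aicNnet are illustrative choices of mine, not part
of nnet:

## Hypothetical helpers, just restating the definitions above
rssBlend <- function(rssIdent, rssValid, alpha) {
    ## weighted combination of identification- and validation-set RSS
    (1 - alpha) * rssIdent + alpha * rssValid
}
aicNnet <- function(wts, RSS, N) {
    np <- sum(wts != 0)  # count only the non-null parameters
    2 * np + N * log(RSS)
}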
Assume a feed-forward neural network with a single hidden layer and a
maximum number of hidden neurons (maxSize).

For size = 1 to maxSize:
    Optimise the decay.
    Select the neural network with the smallest AIC for this size and decay,
    using random starting parameterisations and a random identification set.
    For the diagonal elements of the Hessian, from smallest to largest:
        Set the corresponding parameter to 0.
        If AIC(i) > AIC(i-1), break.

The neural network selected is the one with the smallest AIC overall.
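To show the pruning step in isolation before the full implementation, here is
a small self-contained toy example; the data, network size, and the helper
name aicOf are arbitrary choices of mine (nnWtsPruning in the code below does
the same thing inside the cross-validation loop):

library(nnet)
set.seed(1)
d <- data.frame(x = runif(50), y = runif(50))
d$z <- sin(2 * pi * d$x) + 0.1 * rnorm(50)
fit <- nnet(z ~ x + y, d, size = 3, linout = TRUE, maxit = 500,
            trace = FALSE, Hess = TRUE)
aicOf <- function(wts, rss, n) 2 * sum(wts != 0) + n * log(rss)
best <- aicOf(fit$wts, sum((d$z - predict(fit, d))^2), nrow(d))
for (k in order(diag(fit$Hessian))) {    # smallest curvature first
    cand <- fit
    cand$wts[k] <- 0                     # zero out one more weight
    rss.k <- sum((d$z - predict(cand, d))^2)
    if (aicOf(cand$wts, rss.k, nrow(d)) > best) break
    fit <- cand
    best <- aicOf(fit$wts, rss.k, nrow(d))
}
sum(fit$wts != 0)                        # weights surviving the prune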
--------------------------------------------------------------------------------
An example based on the cpus data in Chapter 8 of MASS
--------------------------------------------------------------------------------
library(nnet)
library(MASS)
# From Chapter 6, for comparisons
set.seed(123)
cpus.samp <-
c(3, 5, 6, 7, 8, 10, 11, 16, 20, 21, 22, 23, 24, 25, 29, 33, 39, 41, 44, 45,
46, 49, 57, 58, 62, 63, 65, 66, 68, 69, 73, 74, 75, 76, 78, 83, 86,
88, 98, 99, 100, 103, 107, 110, 112, 113, 115, 118, 119, 120, 122,
124, 125, 126, 127, 132, 136, 141, 144, 146, 147, 148, 149, 150, 151,
152, 154, 156, 157, 158, 159, 160, 161, 163, 166, 167, 169, 170, 173,
174, 175, 176, 177, 183, 184, 187, 188, 189, 194, 195, 196, 197, 198,
199, 202, 204, 205, 206, 208, 209)
cpus2 <- cpus[, 2:8] # excludes names, authors’ predictions
cpus3 <- with(cpus2, data.frame(syct = syct - 2, mmin = mmin - 3,
                                mmax = mmax - 4, cach = cach / 256,
                                chmin = chmin / 100, chmax = chmax / 100,
                                perf = perf))
CVnn.cpus <- function(formula, data = cpus3[cpus.samp, ], maxSize = 10,
                      decayRange = c(0, 0.2), nreps = 5, nifold = 10,
                      alpha = 9/10, linout = TRUE, skip = TRUE,
                      maxit = 1000, ...) {
  ## nreps = number of attempts to fit a nnet model with randomly chosen
  ## initial parameters; the fit with the smallest weighted RSS is kept
  ## Prune weights one at a time, in order of increasing Hessian diagonal,
  ## and stop as soon as the AIC stops improving (ri comes from the
  ## enclosing environment)
  nnWtsPruning <- function(nn, data, alpha, i) {
    truth <- log10(data$perf)
    RSS <- (1 - alpha) *
      sum((truth[ri != i] - predict(nn, data[ri != i, ]))^2) +
      alpha * sum((truth[ri == i] - predict(nn, data[ri == i, ]))^2)
    AIC <- 2 * sum(nn$wts != 0) + length(data$perf) * log(RSS)
    nn.tmp <- nn
    for (j in seq_along(nn$wts)) {
      ## zero the weight with the j-th smallest Hessian diagonal element
      nn.tmp$wts[order(diag(nn.tmp$Hessian))[j]] <- 0
      RSS.tmp <- (1 - alpha) *
        sum((truth[ri != i] - predict(nn.tmp, data[ri != i, ]))^2) +
        alpha * sum((truth[ri == i] - predict(nn.tmp, data[ri == i, ]))^2)
      AIC.tmp <- 2 * sum(nn.tmp$wts != 0) + length(data$perf) * log(RSS.tmp)
      if (is.nan(AIC.tmp) || AIC.tmp > AIC) {
        cat("\n j", j, "AIC", AIC.tmp, "AIC_1", AIC, "\n")
        break
      } else {
        nn <- nn.tmp; AIC <- AIC.tmp; RSS <- RSS.tmp
      }
    }
    list(choice = sqrt(RSS / 100), nparam = sum(nn$wts != 0), AIC = AIC, nn = nn)
  }
  ## Modified CVnn1: fits nreps networks per fold and keeps the one with the
  ## smallest weighted RSS; with optimFlag = TRUE it returns only that RSS
  ## (for optimize()), otherwise it prunes the best network and reports it
  CVnn1 <- function(decay, formula, data, nreps = 1, ri, size, linout, skip,
                    maxit, optimFlag = FALSE, alpha) {
    truth <- log10(data$perf)
    RSS <- Inf; nn <- NULL; ii <- 1
    for (i in sort(unique(ri))) {
      for (rep in 1:nreps) {
        nn.tmp <- nnet(formula, data[ri != i, ], trace = FALSE, size = size,
                       decay = decay, linout = linout, skip = skip,
                       maxit = maxit, Hess = TRUE)
        ## weighted RSS, with the (1 - alpha) weighting as defined above
        RSS.tmp <- (1 - alpha) *
          sum((truth[ri != i] - predict(nn.tmp, data[ri != i, ]))^2) +
          alpha * sum((truth[ri == i] - predict(nn.tmp, data[ri == i, ]))^2)
        if (RSS.tmp < RSS) { RSS <- RSS.tmp; nn <- nn.tmp; ii <- i }
      }
    }
    if (optimFlag) {
      RSS
    } else {
      prn <- nnWtsPruning(nn, data, alpha, ii)
      list(choice = prn$choice, nparam = prn$nparam,
           nparaminit = length(nn$wts), AIC = prn$AIC, nn1 = prn$nn)
    }
  }
  maxSize <- maxSize + 1   # one extra step so that size = 0 (skip-layer only) is tried
  choice <- numeric(maxSize); nparam <- numeric(maxSize)
  lambdaj <- numeric(maxSize); AIC <- numeric(maxSize)
  nparamInit <- numeric(maxSize)
  ri <- sample(nifold, nrow(data), replace = TRUE)   # random fold labels
  size <- seq(maxSize - 1, 0); minAIC <- Inf
  for (j in 1:maxSize) {
    ## optimise the decay for this size
    tlambda <- optimize(CVnn1, decayRange, tol = 0.0001, formula, data,
                        nreps = nreps, ri = ri, size = size[j],
                        linout = linout, skip = skip, maxit = maxit,
                        optimFlag = TRUE, alpha = 0.5)
    lambdaj[j] <- tlambda[[1]]
    cat(" size =", size[j], "decay =", round(tlambda[[1]], 3), "\n")
    nntmp <- CVnn1(decay = lambdaj[j], formula, data, nreps = nreps, ri = ri,
                   size = size[j], alpha = alpha,
                   linout = linout, skip = skip, maxit = maxit, ...)
    choice[j] <- nntmp$choice; nparam[j] <- nntmp$nparam
    nparamInit[j] <- nntmp$nparaminit; AIC[j] <- nntmp$AIC
    ## track the smallest AIC seen so far and keep that pruned network
    if (nntmp$AIC <= minAIC) { minAIC <- nntmp$AIC; nn <- nntmp$nn1 }
  }
  ## Display the result; choice already holds sqrt(RSS/100), so report it as the fit
  tr <- cbind(size = size, decay = lambdaj, fit = choice, nparam = nparam,
              nparamInit = nparamInit, AIC = AIC)
  modelOptimAIC <- which.min(AIC)
  list(tr = tr, modelOptimAIC = modelOptimAIC, nn = nn)
}
(nn.res = CVnn.cpus(log10(perf) ~ ., data = cpus3[cpus.samp,]))
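If it helps, the returned list can then be inspected along these lines (this
just reads back the components defined above):

nn.res$tr[nn.res$modelOptimAIC, ]  # summary row with the smallest AIC
sum(nn.res$nn$wts != 0)            # non-null weights in the pruned network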