[R] Prediction intervals for zero inflated Poisson regression

Tue Dec 16 16:44:37 CET 2008

Thierry,

Simon had written some code for this but we never got round to fully 
integrate it into the "pscl" package. A file pb.R is attached, but as a 
disclaimer: I haven't looked at this code for a while. It still seems to 
work (an example is included at the end) but please check.

hth,
Z

On Tue, 16 Dec 2008, ONKELINX, Thierry wrote:

> Dear all,
>
> I'm using zeroinfl() from the pscl-package for zero inflated Poisson
> regression. I would like to calculate (aproximate) prediction intervals
> for the fitted values. The package itself does not provide them. Can
> this be calculated analyticaly? Or do I have to use bootstrap?
>
> What I tried until now is to use bootstrap to estimate these intervals.
> Any comments on the code are welcome. The data and the model are based
> on the examples in zeroinfl().
>
> #aproximate prediction intervals with Poisson regression
> fm_pois <- glm(art ~ fem, data = bioChemists, family = poisson)
> newdata <- na.omit(unique(bioChemists[, "fem", drop = FALSE]))
> prediction <- predict(fm_pois, newdata = newdata, se.fit = TRUE)
> ci <- data.frame(exp(prediction$fit + matrix(prediction$se.fit, ncol =
> 1) %*% c(-1.96, 1.96)))
> newdata$fit <- exp(prediction$fit)
> newdata <- cbind(newdata, ci)
> newdata$model <- "Poisson"
>
> library(pscl)
> #aproximate prediction intervals with zero inflated poisson regression
> fm_zip <- zeroinfl(art ~ fem | 1, data = bioChemists)
> fit <- predict(fm_zip)
> Pearson <- resid(fm_zip, type = "pearson")
> VarComp <- resid(fm_zip, type = "response") / Pearson
> fem <- bioChemists$fem
> bootstrap <- replicate(999, {
>    yStar <- pmax(round(fit + sample(Pearson) * VarComp, 0), 0)
>    predict(zeroinfl(yStar ~ fem | 1), newdata = newdata)
> })
> newdata0 <- newdata
> newdata0$fit <- predict(fm_zip, newdata = newdata, type = "response")
> newdata0[, 3:4] <- t(apply(bootstrap, 1, quantile, c(0.025, 0.975)))
> newdata0$model <- "Zero inflated"
>
> #compare the intervals in a nice plot.
> newdata <- rbind(newdata, newdata0)
> library(ggplot2)
> ggplot(newdata, aes(x = fem, y = fit, min = X1, max = X2, colour =
> model)) + geom_point(position = position_dodge(width = 0.4)) +
> geom_errorbar(position = position_dodge(width = 0.4))
>
>
> Best regards,
>
> Thierry
>
> ------------------------------------------------------------------------
> ----
> ir. Thierry Onkelinx
> Instituut voor natuur- en bosonderzoek / Research Institute for Nature
> and Forest
> Cel biometrie, methodologie en kwaliteitszorg / Section biometrics,
> methodology and quality assurance
> Gaverstraat 4
> 9500 Geraardsbergen
> Belgium
> tel. + 32 54/436 185
> Thierry.Onkelinx at inbo.be
> www.inbo.be
>
> To call in the statistician after the experiment is done may be no more
> than asking him to perform a post-mortem examination: he may be able to
> say what the experiment died of.
> ~ Sir Ronald Aylmer Fisher
>
> The plural of anecdote is not data.
> ~ Roger Brinner
>
> The combination of some data and an aching desire for an answer does not
> ensure that a reasonable answer can be extracted from a given body of
> data.
> ~ John Tukey
>
> Dit bericht en eventuele bijlagen geven enkel de visie van de schrijver weer
> en binden het INBO onder geen enkel beding, zolang dit bericht niet bevestigd is
> door een geldig ondertekend document. The views expressed in  this message
> and any annex are purely those of the writer and may not be regarded as stating
> an official position of INBO, as long as the message is not confirmed by a duly
> signed document.
>
> ______________________________________________
> R-help at r-project.org mailing list
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.
>
>
-------------- next part --------------
## parametric bootstrap

predict.zeroinfl <- function(object, newdata, type = c("response", "prob"),

                             se=FALSE,MC=1000,level=.95,

                             na.action = na.pass, ...)

{

    type <- match.arg(type)

    ## if no new data supplied

    if(missing(newdata)){

      rval <- object$fitted.values

        if(!is.null(object$x)) {

	  X <- object$x$count

	  Z <- object$x$zero

	}

        else if(!is.null(object$model)) {

          X <- model.matrix(object$terms$count, object$model, contrasts = object$contrasts$count)

          Z <- model.matrix(object$terms$zero,  object$model, contrasts = object$contrasts$zero)	

	}

        else {

	  stop("no X and/or Z matrices can be extracted from fitted model")

	}

      if(type == "prob") {

        mu <- exp(X %*% object$coefficients$count)[,1]

        phi <- object$linkinv(Z %*% object$coefficients$zero)[,1]

      }

    }

    else {

      mf <- model.frame(delete.response(object$terms$full), newdata, na.action = na.action, xlev = object$levels)

      X <- model.matrix(delete.response(object$terms$count), mf, contrasts = object$contrasts$count)

      Z <- model.matrix(delete.response(object$terms$zero),  mf, contrasts = object$contrasts$zero)

      mu <- exp(X %*% object$coefficients$count)[,1]

      phi <- object$linkinv(Z %*% object$coefficients$zero)[,1]

      rval <- (1-phi) * mu

    }   

    if(se & !is.null(X) & !is.null(Z)){

      require(mvtnorm)

      vc <- -solve(object$optim$hessian)

      kx <- length(object$coefficients$count)

      kz <- length(object$coefficients$zero)

      parms <- object$optim$par

      if(type!="prob"){

        yhat.sim <- matrix(NA,MC,dim(X)[1])

        for(i in 1:MC){

          cat(paste("MC iterate",i,"of",MC,"\n"))

          parms.sim <- rmvnorm(n=1,mean=parms,sigma=vc)

          beta <- parms.sim[1:kx]

          gamma <- parms.sim[(kx+1):(kx+kz)]

          mu.sim <- exp(X%*%beta)[,1]

          phi.sim <- object$linkinv(Z%*%gamma)[,1]

          yhat.sim[i,] <- (1-phi.sim)*mu.sim

        }

      }

      out <- list()

      out$lower <- apply(yhat.sim,2,quantile,(1-level)/2)

      out$upper <- apply(yhat.sim,2,quantile,1-((1-level)/2))

      out$se <- apply(yhat.sim,2,sd)

    }

    ## predicted probabilities

    if(type == "prob") {

      if(!is.null(object$y)) y <- object$y

        else if(!is.null(object$model)) y <- model.response(object$model)

	else stop("predicted probabilities cannot be computed for fits with y = FALSE and model = FALSE")

      yUnique <- min(y):max(y)

      nUnique <- length(yUnique)

      rval <- matrix(NA, nrow = length(rval), ncol = nUnique)

      dimnames(rval) <- list(rownames(X), yUnique)

      switch(object$dist,

             "poisson" = {

    	       rval[, 1] <- phi + (1-phi) * exp(-mu)

               for(i in 2:nUnique) rval[,i] <- (1-phi) * dpois(yUnique[i], lambda = mu)

             },

	     "negbin" = {

               theta <- object$theta

               rval[, 1] <- phi + (1-phi) * dnbinom(0, mu = mu, size = theta)

               for(i in 2:nUnique) rval[,i] <- (1-phi) * dnbinom(yUnique[i], mu = mu, size = theta)

             },

	     "geometric" = {

               rval[, 1] <- phi + (1-phi) * dnbinom(0, mu = mu, size = 1)

               for(i in 2:nUnique) rval[,i] <- (1-phi) * dnbinom(yUnique[i], mu = mu, size = 1)

	     })

    }

    if(se)

      rval <- list(rval,out)

    rval

}

################################################################

## test this code

require(pscl)

data(bioChemists)

obj <- zeroinfl(art ~ . | .,

                data=bioChemists,

                dist="negbin",

                EM=TRUE)

foo <- predict(obj,se=TRUE,type="response")

indx <- order(foo[[1]])

n <- length(indx)

plot(1:n,

     foo[[1]][indx],

     type="n",

     ylim=range(cbind(foo[[2]]$lower,foo[[2]]$upper)),

     xlab="Order Statistic",

     ylab="Predicted Value")

segments(x0=1:n,x1=1:n,

         y0=foo[[2]]$lower[indx],

         y1=foo[[2]]$upper[indx],

         col=gray(.45),

         lwd=.1)

points(1:n,foo[[1]][indx],cex=.5)