[R] Prediction intervals for zero inflated Poisson regression
Achim Zeileis
Achim.Zeileis at wu-wien.ac.at
Tue Dec 16 16:44:37 CET 2008
Thierry,
Simon had written some code for this but we never got round to fully
integrate it into the "pscl" package. A file pb.R is attached, but as a
disclaimer: I haven't looked at this code for a while. It still seems to
work (an example is included at the end) but please check.
hth,
Z
On Tue, 16 Dec 2008, ONKELINX, Thierry wrote:
> Dear all,
>
> I'm using zeroinfl() from the pscl-package for zero inflated Poisson
> regression. I would like to calculate (aproximate) prediction intervals
> for the fitted values. The package itself does not provide them. Can
> this be calculated analyticaly? Or do I have to use bootstrap?
>
> What I tried until now is to use bootstrap to estimate these intervals.
> Any comments on the code are welcome. The data and the model are based
> on the examples in zeroinfl().
>
> #aproximate prediction intervals with Poisson regression
> fm_pois <- glm(art ~ fem, data = bioChemists, family = poisson)
> newdata <- na.omit(unique(bioChemists[, "fem", drop = FALSE]))
> prediction <- predict(fm_pois, newdata = newdata, se.fit = TRUE)
> ci <- data.frame(exp(prediction$fit + matrix(prediction$se.fit, ncol =
> 1) %*% c(-1.96, 1.96)))
> newdata$fit <- exp(prediction$fit)
> newdata <- cbind(newdata, ci)
> newdata$model <- "Poisson"
>
> library(pscl)
> #aproximate prediction intervals with zero inflated poisson regression
> fm_zip <- zeroinfl(art ~ fem | 1, data = bioChemists)
> fit <- predict(fm_zip)
> Pearson <- resid(fm_zip, type = "pearson")
> VarComp <- resid(fm_zip, type = "response") / Pearson
> fem <- bioChemists$fem
> bootstrap <- replicate(999, {
> yStar <- pmax(round(fit + sample(Pearson) * VarComp, 0), 0)
> predict(zeroinfl(yStar ~ fem | 1), newdata = newdata)
> })
> newdata0 <- newdata
> newdata0$fit <- predict(fm_zip, newdata = newdata, type = "response")
> newdata0[, 3:4] <- t(apply(bootstrap, 1, quantile, c(0.025, 0.975)))
> newdata0$model <- "Zero inflated"
>
> #compare the intervals in a nice plot.
> newdata <- rbind(newdata, newdata0)
> library(ggplot2)
> ggplot(newdata, aes(x = fem, y = fit, min = X1, max = X2, colour =
> model)) + geom_point(position = position_dodge(width = 0.4)) +
> geom_errorbar(position = position_dodge(width = 0.4))
>
>
> Best regards,
>
> Thierry
>
> ------------------------------------------------------------------------
> ----
> ir. Thierry Onkelinx
> Instituut voor natuur- en bosonderzoek / Research Institute for Nature
> and Forest
> Cel biometrie, methodologie en kwaliteitszorg / Section biometrics,
> methodology and quality assurance
> Gaverstraat 4
> 9500 Geraardsbergen
> Belgium
> tel. + 32 54/436 185
> Thierry.Onkelinx at inbo.be
> www.inbo.be
>
> To call in the statistician after the experiment is done may be no more
> than asking him to perform a post-mortem examination: he may be able to
> say what the experiment died of.
> ~ Sir Ronald Aylmer Fisher
>
> The plural of anecdote is not data.
> ~ Roger Brinner
>
> The combination of some data and an aching desire for an answer does not
> ensure that a reasonable answer can be extracted from a given body of
> data.
> ~ John Tukey
>
> Dit bericht en eventuele bijlagen geven enkel de visie van de schrijver weer
> en binden het INBO onder geen enkel beding, zolang dit bericht niet bevestigd is
> door een geldig ondertekend document. The views expressed in this message
> and any annex are purely those of the writer and may not be regarded as stating
> an official position of INBO, as long as the message is not confirmed by a duly
> signed document.
>
> ______________________________________________
> R-help at r-project.org mailing list
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.
>
>
-------------- next part --------------
## parametric bootstrap
predict.zeroinfl <- function(object, newdata, type = c("response", "prob"),
se=FALSE,MC=1000,level=.95,
na.action = na.pass, ...)
{
type <- match.arg(type)
## if no new data supplied
if(missing(newdata)){
rval <- object$fitted.values
if(!is.null(object$x)) {
X <- object$x$count
Z <- object$x$zero
}
else if(!is.null(object$model)) {
X <- model.matrix(object$terms$count, object$model, contrasts = object$contrasts$count)
Z <- model.matrix(object$terms$zero, object$model, contrasts = object$contrasts$zero)
}
else {
stop("no X and/or Z matrices can be extracted from fitted model")
}
if(type == "prob") {
mu <- exp(X %*% object$coefficients$count)[,1]
phi <- object$linkinv(Z %*% object$coefficients$zero)[,1]
}
}
else {
mf <- model.frame(delete.response(object$terms$full), newdata, na.action = na.action, xlev = object$levels)
X <- model.matrix(delete.response(object$terms$count), mf, contrasts = object$contrasts$count)
Z <- model.matrix(delete.response(object$terms$zero), mf, contrasts = object$contrasts$zero)
mu <- exp(X %*% object$coefficients$count)[,1]
phi <- object$linkinv(Z %*% object$coefficients$zero)[,1]
rval <- (1-phi) * mu
}
if(se & !is.null(X) & !is.null(Z)){
require(mvtnorm)
vc <- -solve(object$optim$hessian)
kx <- length(object$coefficients$count)
kz <- length(object$coefficients$zero)
parms <- object$optim$par
if(type!="prob"){
yhat.sim <- matrix(NA,MC,dim(X)[1])
for(i in 1:MC){
cat(paste("MC iterate",i,"of",MC,"\n"))
parms.sim <- rmvnorm(n=1,mean=parms,sigma=vc)
beta <- parms.sim[1:kx]
gamma <- parms.sim[(kx+1):(kx+kz)]
mu.sim <- exp(X%*%beta)[,1]
phi.sim <- object$linkinv(Z%*%gamma)[,1]
yhat.sim[i,] <- (1-phi.sim)*mu.sim
}
}
out <- list()
out$lower <- apply(yhat.sim,2,quantile,(1-level)/2)
out$upper <- apply(yhat.sim,2,quantile,1-((1-level)/2))
out$se <- apply(yhat.sim,2,sd)
}
## predicted probabilities
if(type == "prob") {
if(!is.null(object$y)) y <- object$y
else if(!is.null(object$model)) y <- model.response(object$model)
else stop("predicted probabilities cannot be computed for fits with y = FALSE and model = FALSE")
yUnique <- min(y):max(y)
nUnique <- length(yUnique)
rval <- matrix(NA, nrow = length(rval), ncol = nUnique)
dimnames(rval) <- list(rownames(X), yUnique)
switch(object$dist,
"poisson" = {
rval[, 1] <- phi + (1-phi) * exp(-mu)
for(i in 2:nUnique) rval[,i] <- (1-phi) * dpois(yUnique[i], lambda = mu)
},
"negbin" = {
theta <- object$theta
rval[, 1] <- phi + (1-phi) * dnbinom(0, mu = mu, size = theta)
for(i in 2:nUnique) rval[,i] <- (1-phi) * dnbinom(yUnique[i], mu = mu, size = theta)
},
"geometric" = {
rval[, 1] <- phi + (1-phi) * dnbinom(0, mu = mu, size = 1)
for(i in 2:nUnique) rval[,i] <- (1-phi) * dnbinom(yUnique[i], mu = mu, size = 1)
})
}
if(se)
rval <- list(rval,out)
rval
}
################################################################
## test this code
require(pscl)
data(bioChemists)
obj <- zeroinfl(art ~ . | .,
data=bioChemists,
dist="negbin",
EM=TRUE)
foo <- predict(obj,se=TRUE,type="response")
indx <- order(foo[[1]])
n <- length(indx)
plot(1:n,
foo[[1]][indx],
type="n",
ylim=range(cbind(foo[[2]]$lower,foo[[2]]$upper)),
xlab="Order Statistic",
ylab="Predicted Value")
segments(x0=1:n,x1=1:n,
y0=foo[[2]]$lower[indx],
y1=foo[[2]]$upper[indx],
col=gray(.45),
lwd=.1)
points(1:n,foo[[1]][indx],cex=.5)
More information about the R-help
mailing list