[R] mgcv: relative risk from GAM with distributed lag

Fri Jul 22 14:29:36 CEST 2022

I made a small error in the code below (not checking for NAs which are
introduced by the lag function). However, this doesn't solve the issue
I raised).

So here's the problem (with corrected code) again:

I'm not sure what to do with the 'time' variable. (I don't want to
make predictions for specific points in time). I coded as follows
(full reproducible example at bottom of email), but get a warning and
error:

N <- 1000     # number of points for smooth to be predicted
# new temperatures and lags for prediction
pred_temp <- seq(min(dat$temp, na.rm = T), max(dat$temp, na.rm = T), length = N)
pred_lag <- rep(c(0, 1,2,3,4,5,6),each=N)      ## IS IT CORRECT TO SET
UP LAG LIKE THIS?

# not sure if these covariates are required with type = "terms"
pred_humidity <- rep(median(dat$humidity, na.rm = T), N)
pred_rain <- rep(median(dat$rain, na.rm = T), N)
pd <- data.frame(temp = pred_temp, lag = pred_lag, humidity =
pred_humidity, rain = pred_rain)

predictions <- predict(mod, pd, type = "terms")

The predict line creates the following warning and error:

Warning in predict.gam(mod, pd, type = "terms") :
  not all required variables have been supplied in  newdata!

Error in model.frame.default(ff, data = newdata, na.action = na.act) :
  object is not a matrix

For ease of reference, I've (re)included the full reproducible example:

library(mgcv)
set.seed(3) # make reproducible example
simdat <- gamSim(1,400)
g <- exp(simdat$f/5)

simdat$y <- rnbinom(g,size=3,mu=g)  # negative binomial response var
simdat$time <- 1:400  # create time series

names(simdat) <- c("deaths", "temp", "humidity", "rain", "x3", "f",
"f0", "f1", "f2", "f3", "time")

# lag function based on Wood (book 2017, p.349 and gamair package
documentation p.54
# https://cran.rstudio.com/web/packages/gamair/gamair.pdf)
lagard <- function(x,n.lag=7) {
n <- length(x); X <- matrix(NA,n,n.lag)
for (i in 1:n.lag) X[i:n,i] <- x[i:n-i+1]
X
}

# set up lag, temp, rain and humidity as 7-column matrices
# to create lagged variables
dat <- list(lag=matrix(0:6,nrow(simda
t),7,byrow=TRUE),
deaths=simdat$deaths, time = simdat$time)
dat$temp <- lagard(simdat$temp)
dat$rain <- lagard(simdat$rain)
dat$humidity <- lagard(simdat$humidity)

mod <- gam(deaths~s(time, k=70) + te(temp, lag, k=c(12, 4)) +
te(humidity, lag, k=c(12, 4)) + te(rain, lag, k=c(12, 4)), data = dat,
family = nb, method = 'REML', select = TRUE)

# create prediction data
N <- 1000 # number of points for which to predict the smooths
pred_temp <- seq(min(dat$temp, na.rm = T), max(dat$temp, na.rm = T), length = N)
pred_lag <- rep(c(0, 1,2,3,4,5,6),each=N)
pred_humidity <- rep(median(dat$humidity), N)
pred_rain <- rep(median(dat$rain), N)
pd <- data.frame(temp = pred_temp, lag = pred_lag, humidity =
pred_humidity, rain = pred_rain)

# make predictions
predictions <- predict(mod, pd, type = "terms")

On Fri, 22 Jul 2022 at 12:47, jade.shodan using googlemail.com
<jade.shodan using googlemail.com> wrote:
>
> Hi Simon,
>
> Thanks for the pointers! But I'm not sure what to do with the 'time'
> variable. (I don't want to make predictions for specific points in
> time). I coded as follows (full reproducible example at bottom of
> email), but get a warning and error:
>
>
> N <- 1000     # number of points for smooth to be predicted
> # new temperatures and lags for prediction
> pred_temp <- seq(min(dat$temp, na.rm = T), max(dat$temp, na.rm = T), length = N)
> pred_lag <- rep(c(0, 1,2,3,4,5,6),each=N)      ## IS IT CORRECT TO SET
> UP LAG LIKE THIS?
>
> # not sure if these covariates are required with type = "terms"
> pred_humidity <- rep(median(dat$humidity), N)
> pred_rain <- rep(median(dat$rain), N)
> pd <- data.frame(temp = pred_temp, lag = pred_lag, humidity =
> pred_humidity, rain = pred_rain)
>
> predictions <- predict(mod, pd, type = "terms")
>
>
> The predict line creates the following warning and error:
>
> Warning in predict.gam(mod, pd, type = "terms") :
>   not all required variables have been supplied in  newdata!
>
> Error in model.frame.default(ff, data = newdata, na.action = na.act) :
>   object is not a matrix
>
>
> For ease of reference, I've (re)included the full reproducible example:
>
> library(mgcv)
> set.seed(3) # make reproducible example
> simdat <- gamSim(1,400)
> g <- exp(simdat$f/5)
>
> simdat$y <- rnbinom(g,size=3,mu=g)  # negative binomial response var
> simdat$time <- 1:400  # create time series
>
> names(simdat) <- c("deaths", "temp", "humidity", "rain", "x3", "f",
> "f0", "f1", "f2", "f3", "time")
>
> # lag function based on Wood (book 2017, p.349 and gamair package
> documentation p.54
> # https://cran.rstudio.com/web/packages/gamair/gamair.pdf)
> lagard <- function(x,n.lag=7) {
> n <- length(x); X <- matrix(NA,n,n.lag)
> for (i in 1:n.lag) X[i:n,i] <- x[i:n-i+1]
> X
> }
>
> # set up lag, temp, rain and humidity as 7-column matrices
> # to create lagged variables
> dat <- list(lag=matrix(0:6,nrow(simdat),7,byrow=TRUE),
> deaths=simdat$deaths, time = simdat$time)
> dat$temp <- lagard(simdat$temp)
> dat$rain <- lagard(simdat$rain)
> dat$humidity <- lagard(simdat$humidity)
>
> mod <- gam(deaths~s(time, k=70) + te(temp, lag, k=c(12, 4)) +
> te(humidity, lag, k=c(12, 4)) + te(rain, lag, k=c(12, 4)), data = dat,
> family = nb, method = 'REML', select = TRUE)
>
> # create prediction data
> N <- 1000 # number of points for which to predict the smooths
> pred_temp <- seq(min(dat$temp, na.rm = T), max(dat$temp, na.rm = T), length = N)
> pred_lag <- rep(c(0, 1,2,3,4,5,6),each=N)
> pred_humidity <- rep(median(dat$humidity), N)
> pred_rain <- rep(median(dat$rain), N)
> pd <- data.frame(temp = pred_temp, lag = pred_lag, humidity =
> pred_humidity, rain = pred_rain)
>
> # make predictions
> predictions <- predict(mod, pd, type = "terms")
>
>
> On Fri, 22 Jul 2022 at 09:54, Simon Wood <simon.wood using bath.edu> wrote:
> >
> >
> > On 21/07/2022 15:19, jade.shodan--- via R-help wrote:
> > > Hello everyone (incl. Simon Wood?),
> > >
> > > I'm not sure that my original question (see below, including
> > > reproducible example) was as clear as it could have been. To clarify,
> > > what I would to like to get is:
> > >
> > > 1) a perspective plot of temperature x lag x relative risk.  I know
> > > how to use plot.gam and vis.gam but don't know how to get plots on the
> > > relative risk scale as opposed to  "response" or "link".
> >
> > - You are on the log scale so I think that all you need to do is to use
> > 'predict.gam', with 'type = "terms"' to  get the predictions for the
> > te(temp, lag) term over the required grid of lags and temperatures.
> > Suppose the dataframe of prediction data is 'pd'. Now produce pd0, which
> > is identical to pd, except that the temperatures are all set to the
> > reference temperature. Use predict.gam to predict te(temp,lag) from pd0.
> > Now the exponential of the difference between the first and second
> > predictions is the required RR, which you can plot using 'persp',
> > 'contour', 'image' or whatever. If you need credible intervals see pages
> > 341-343 of my 'GAMs: An intro with R' book (2nd ed).
> >
> > > 2) a plot of relative risk (accumulated across all lags) vs
> > > temperature, given a reference temperature. An example of such a plot
> > > can be found in figure 2 (bottom) of this paper by Gasparrini et al:
> > > https://onlinelibrary.wiley.com/doi/abs/10.1002/sim.3940
> >
> > - I guess this only makes sense if you have the same temperature at all
> > lags. So this time produce a data.frame with each desired temperature
> > repeated for each lag: 'pd1'. Again use predict.gam(...,type="terms").
> > Then sum the predictions over lags for each temperature, subtract the
> > minimum, and take the exponential. Same as above for CIs.
> >
> > best,
> >
> > Simon
> >
> > > I've seen Simon Wood's response to a related issue here:
> > > https://stat.ethz.ch/pipermail/r-help/2012-May/314387.html
> > > However, I'm not sure how to apply this to time series data with
> > > distributed lag, to get the above mentioned figures.
> > >
> > > Would be really grateful for suggestions!
> > >
> > > Jade
> > >
> > > On Tue, 19 Jul 2022 at 16:07, jade.shodan using googlemail.com
> > > <jade.shodan using googlemail.com> wrote:
> > >> Dear list members,
> > >>
> > >> Does anyone know how to obtain a relative risk/ risk ratio from a GAM
> > >> with a distributed lag model implemented in mgcv? I have a GAM
> > >> predicting daily deaths from time series data consisting of daily
> > >> temperature, humidity and rainfall. The GAM includes a distributed lag
> > >> model because deaths may occur over several days following a high heat
> > >> day.
> > >>
> > >> What I'd like to do is compute (and plot) the relative risk
> > >> (accumulated across all lags) for a given temperature vs the
> > >> temperature at which the risk is lowest, with corresponding confidence
> > >> intervals. I am aware of the predict.gam function but am not sure if
> > >> and how it should be used in this case. (Additionally, I'd also like
> > >> to plot the relative risk for different lags separately).
> > >>
> > >> I apologise if this seems trivial to some. (Actually, I hope it is,
> > >> because that might mean I get a solution!) I've been looking for
> > >> examples on how to do this, but found nothing so far. Suggestions
> > >> would be very much appreciated!
> > >>
> > >> Below is a reproducible example with the GAM:
> > >>
> > >> library(mgcv)
> > >> set.seed(3) # make reproducible example
> > >> simdat <- gamSim(1,400) # simulate data
> > >> g <- exp(simdat$f/5)
> > >> simdat$y <- rnbinom(g,size=3,mu=g)  # negative binomial response var
> > >> simdat$time <- 1:400  # create time series
> > >> names(simdat) <- c("deaths", "temp", "humidity", "rain", "x3", "f",
> > >> "f0", "f1", "f2", "f3", "time")
> > >>
> > >> # lag function based on Simon Wood (book 2017, p.349 and gamair
> > >> package documentation p.54
> > >> # https://cran.rstudio.com/web/packages/gamair/gamair.pdf)
> > >> lagard <- function(x,n.lag=7) {
> > >> n <- length(x); X <- matrix(NA,n,n.lag)
> > >> for (i in 1:n.lag) X[i:n,i] <- x[i:n-i+1]
> > >> X
> > >> }
> > >>
> > >> # set up lag, temp, rain and humidity as 7-column matrices
> > >> # to create lagged variables - based on Simon Wood's example
> > >> dat <- list(lag=matrix(0:6,nrow(simdat),7,byrow=TRUE),
> > >> deaths=simdat$deaths, time = simdat$time)
> > >> dat$temp <- lagard(simdat$temp)
> > >> dat$rain <- lagard(simdat$rain)
> > >> dat$humidity <- lagard(simdat$humidity)
> > >>
> > >> mod <- gam(deaths~s(time, k=70) + te(temp, lag, k=c(12, 4)) +
> > >> te(humidity, lag, k=c(12, 4)) + te(rain, lag, k=c(12, 4)), data = dat,
> > >> family = nb, method = 'REML', select = TRUE)
> > >>
> > >> summary(mod)
> > >> plot(mod, scheme = 1)
> > >> plot(mod, scheme = 2)
> > >>
> > >> Thanks for any suggestions you may have,
> > >>
> > >> Jade
> > > ______________________________________________
> > > R-help using r-project.org mailing list -- To UNSUBSCRIBE and more, see
> > > https://stat.ethz.ch/mailman/listinfo/r-help
> > > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> > > and provide commented, minimal, self-contained, reproducible code.
> >
> > --
> > Simon Wood, School of Mathematics, University of Edinburgh,
> > https://www.maths.ed.ac.uk/~swood34/
> >