[R] predictOMatic for regression. Please try and advise me
Paul Johnson
pauljohn32 at gmail.com
Fri Apr 20 18:37:56 CEST 2012
I'm pasting below a working R file featuring a function I'd like to polish up.
I'm teaching regression this semester, and every time I come to
something that is very difficult to explain in class, I try to
simplify it by writing an R function (which eventually goes into my
package "rockchalk"). Students have a difficult time with predict and
newdata objects, so right now I'm concentrating on that problem.
Fitting regressions is pretty easy, but getting predicted values for
particular combinations of input values can be difficult when the
model is complicated. If you agree, please try out the following and
let me know how its use could be enhanced, or what other features you
might want.
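For comparison, here is the sort of thing students have to do by hand
right now: plain predict() with a hand-built newdata frame. This is just
a throwaway illustration (dat0, m0, and nd0 are made-up names for this
sketch only, not part of the file below):

## Manual approach: every predictor in the formula must appear in newdata.
dat0 <- data.frame(x = rnorm(50), g = gl(2, 25, labels = c("A", "B")))
dat0$y <- 1 + 0.5 * dat0$x + 0.3 * (dat0$g == "B") + rnorm(50)
m0 <- lm(y ~ x + g, data = dat0)
nd0 <- expand.grid(x = c(-1, 0, 1), g = levels(dat0$g))
predict(m0, newdata = nd0, interval = "confidence")

That gets tedious quickly when the model has many regressors, which is
what newdataMaker and predictOMatic below try to automate.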
I know folks are busy, so to save you the trouble of actually running
the code, I also paste in a session demonstrating one run-through.
Here's the code:
##Paul Johnson
## 2012-04-20
## Facilitate creation of "newdata" objects for use in predict
## statements (for interpretation of regression models)
## newdataMaker creates the "newdata" frame required by predict.
## It supplies a data frame with all input variables set to a
## central value (mean, mode), except for the variables the user
## wants to focus on. The user supplies a fitted model "model" and
## a focus list "fl" of variable values. See the examples below.
## The focus list must be a named list, using names of variables
## from the regression formula.
## Note: this relies on centralValues() from the rockchalk package,
## so load rockchalk before calling it.
## There is no need to call newdataMaker directly if one is
## satisfied with the results from predictOMatic.
newdataMaker <- function (model = NULL, fl = NULL){
    mf <- model.frame(model)
    mterms <- terms(model)
    mfnames <- colnames(mf)
    modelcv <- centralValues(mf)
    if (sum(!names(fl) %in% mfnames) > 0)
        stop(paste("The focus list (fl) requests variables that are not",
                   "included in the original model. The names of the",
                   "variables in the focus list must be drawn from this list:",
                   paste(mfnames, collapse = " ")))
    ## Consider "padding" the range of fl for numeric variables so that we
    ## get newdata objects including the min and max values
    ## (see the sketch just after this function).
    mixAndMatch <- expand.grid(fl)
    mixAndMatch$combo <- 1:nrow(mixAndMatch)
    newdf <- cbind(mixAndMatch,
                   modelcv[ , !colnames(modelcv) %in% colnames(mixAndMatch)])
    newdf
}
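## One possible way to implement the "padding" idea in the comment above.
## This is only a sketch; padFocusList is a hypothetical helper that is not
## used anywhere yet. It extends each numeric focus variable so the observed
## min and max from the model frame are included before expand.grid is called.
padFocusList <- function(fl, mf){
    for (v in names(fl)) {
        if (is.numeric(mf[[v]])) {
            fl[[v]] <- sort(unique(c(min(mf[[v]]), fl[[v]], max(mf[[v]]))))
        }
    }
    fl
}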
predictOMatic <- function(model = NULL, fl = NULL, ...){
    nd <- newdataMaker(model, fl)
    fit <- predict(model, newdata = nd, ...)
    cbind(fit, nd)
}
set.seed(12345)
x1 <- rnorm(100)
x2 <- rnorm(100)
x3 <- rnorm(100)
x4 <- rnorm(100)
x5 <- rpois(100, 5)
x6 <- rgamma(100, 2,1)
xcat1 <- gl(2,50, labels=c("M","F"))
xcat2 <- cut(rnorm(100), breaks=c(-Inf, 0, 0.4, 0.9, 1, Inf),
labels=c("R", "M", "D", "P", "G"))
dat <- data.frame(x1, x2, x3, x4, x5, x6, xcat1, xcat2)
rm(x1, x2, x3, x4, x5, x6, xcat1, xcat2)
dat$y <- with(dat, 0.03 + 0.1*x1 + 0.1*x2 + 0.4*x3 - 0.1*x4 + 0.01*x5 +
              0.1*x6 + 2*rnorm(100))
##ordinary regression.
m1 <- lm(y ~ x1 + x2 + x3 + x4 + x5 + x6 + xcat1 + xcat2, data=dat)
summary(m1)
## Use the fitted model's data frame for analysis
m1mf <- model.frame(m1)
## If you have rockchalk 1.5.4, run these:
library(rockchalk)
summarize(m1mf)
## otherwise, run
summary(m1mf)
## First, overview for values of xcat1
newdataMaker(m1, fl = list(xcat1 = levels(m1mf$xcat1)))
## Mix and match all combinations of xcat1 and xcat2
newdataMaker(m1, fl = list(xcat1 = levels(m1mf$xcat1),
                           xcat2 = levels(m1mf$xcat2)))
## Pick some particular values for focus
newdataMaker(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D")))
## Generate a newdata frame and predictions in one step
predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D")))
predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D")),
interval="conf")
newdf <- predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 =
c("M","D"), x1=plotSeq(dat$x1)))
plot(y ~ x1, data= model.frame(m1))
by(newdf, list(newdf$x2, newdf$xcat2), function(x) {lines(x$x1, x$fit)})
newdataMaker(m1, fl = list(x2 = c(-1,0, 1), xcat2 = c("M","D")))
predictOMatic(m1, fl = list(x2 = range(dat$x2), xcat2 = c("M","D")))
newdf <- predictOMatic(m1, fl = list(x2 = quantile(dat$x2), xcat2 = c("M","D")))
plot(y ~ x2 , data=model.frame(m1))
lines(fit ~ x2, newdf)
predictOMatic(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D")),
interval="conf")
## just gets the new data
nd <- newdataMaker(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D")))
pr <- predictOMatic(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D")),
interval="conf")
## A glm example: binomial logit. Note xcat2 has 5 levels; glm() with
## family=binomial treats the first level as "failure" and all other
## levels as "success".
m2 <- glm(xcat2 ~ x1 + x2 + x3 + xcat1, data=dat, family=binomial(logit))
summary(m2)
m2mf <- model.frame(m2)
summarize(m2mf)
predict(m2, newdata=data.frame(xcat1=c("M","F"), x1=c(1),
x2=mean(m2mf$x2), x3=mean(m2mf$x3)), se.fit=TRUE)
predictOMatic(m2, fl = list(x2 = c(-1, 1), xcat1 = c("M","F")),
              interval="conf", type="response")
## Note: predict.glm has no "interval" argument, so interval="conf" is
## silently ignored here; see the sketch just below for one way to get intervals.
############################################
################Now, the example session ###################
> newdataMaker <- function (model = NULL, fl = NULL){
+ mf <- model.frame(model)
+ mterms <- terms(model)
+ mfnames <- colnames(mf)
+ modelcv <- centralValues(mf)
+ if (sum(!names(fl) %in% mfnames) > 0) stop(cat(c("Error. The
focus list (fl) requests variables that are not included in the
original model. The names of the variables in the focus list be drawn
from this list: ", mfnames)))
+ ## Consider "padding" range of fl for numeric variables so that we
+ ## get newdata objects including the min and max values.
+ mixAndMatch <- expand.grid(fl)
+ mixAndMatch$combo <- 1:nrow(mixAndMatch)
+ newdf <- cbind(mixAndMatch, modelcv[ ,!colnames(modelcv) %in%
colnames(mixAndMatch)])
+ newdf
+ }
> predictOMatic <- function(model = NULL, fl = NULL, ...){
+ nd <- newdataMaker(model, fl)
+ fit <- predict(model, newdata=nd, ...)
+ cbind(fit, nd)
+ }
> set.seed(12345)
> x1 <- rnorm(100)
> x2 <- rnorm(100)
> x3 <- rnorm(100)
> x4 <- rnorm(100)
> x5 <- rpois(100, 5)
> x6 <- rgamma(100, 2,1)
> xcat1 <- gl(2,50, labels=c("M","F"))
> xcat2 <- cut(rnorm(100), breaks=c(-Inf, 0, 0.4, 0.9, 1, Inf), labels=c("R", "M", "D", "P", "G"))
> dat <- data.frame(x1, x2, x3, x4, x5, xcat1, xcat2)
> rm(x1, x2, x3, x4, xcat1, xcat2)
> dat$y <- with(dat, 0.03 + 0.1*x1 + 0.1*x2 + 0.4*x3 -0.1*x4 + 0.01*x5 + 0.1*x6 + 2*rnorm(100))
> m1 <- lm(y ~ x1 + x2 + x3 + x4 + x5 + x6 + xcat1 + xcat2, data=dat)
> summary(m1)
Call:
lm(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6 + xcat1 + xcat2,
data = dat)
Residuals:
Min 1Q Median 3Q Max
-3.7624 -1.1740 -0.0753 0.9174 4.9910
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.24744 0.64684 -0.383 0.702989
x1 -0.44055 0.16902 -2.607 0.010741 *
x2 -0.21342 0.18811 -1.135 0.259644
x3 0.74456 0.21434 3.474 0.000798 ***
x4 0.20394 0.20454 0.997 0.321453
x5 0.01176 0.09374 0.125 0.900426
x6 0.16072 0.14412 1.115 0.267809
xcat1F 0.81233 0.38271 2.123 0.036599 *
xcat2M 1.03244 0.51701 1.997 0.048922 *
xcat2D 0.13699 0.64662 0.212 0.832712
xcat2P -0.43391 0.83018 -0.523 0.602517
xcat2G 0.16521 0.49741 0.332 0.740567
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.805 on 88 degrees of freedom
Multiple R-squared: 0.23, Adjusted R-squared: 0.1338
F-statistic: 2.39 on 11 and 88 DF, p-value: 0.01216
> m1mf <- model.frame(m1)
> library(rockchalk)
Loading required package: MASS
Loading required package: car
Loading required package: nnet
> summarize(m1mf)
$numerics
x1 x2 x3 x4 x5 x6 y
0% -2.3800 -2.12400 -2.29000 -2.5820 1.000 0.1373 -4.8140
25% -0.5901 -0.51290 -0.64720 -0.3191 3.000 0.8342 -0.7684
50% 0.4837 0.02596 0.01349 0.3466 5.000 1.5240 0.6293
75% 0.9004 0.69840 0.63120 0.6968 6.000 2.5230 1.7550
100% 2.4770 2.65600 2.74700 2.2680 11.000 5.7050 5.7990
mean 0.2452 0.04523 -0.04621 0.2153 4.830 1.8200 0.6151
sd 1.1150 1.01100 0.93230 0.9707 2.050 1.3130 1.9390
var 1.2430 1.02300 0.86930 0.9422 4.203 1.7230 3.7620
NA's 0.0000 0.00000 0.00000 0.0000 0.000 0.0000 0.0000
$factors
xcat1 xcat2
M :50 R :46.000
F :50 M :19.000
NA's : 0 G :19.000
entropy : 1 D :10.000
normedEntropy: 1 P : 6.000
NA's : 0.000
entropy : 2.002
normedEntropy: 0.862
> ## First, overview for values of xcat1
> newdataMaker(m1, fl = list(xcat1 = levels(m1mf$xcat1)))
xcat1 combo y x1 x2 x3 x4 x5
1 M 1 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
2 F 2 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
x6 xcat2
1 1.819744 R
2 1.819744 R
> newdataMaker(m1, fl = list(xcat1 = levels(m1mf$xcat2), xcat2 = levels(m1mf$xcat2)))
xcat1 xcat2 combo y x1 x2 x3 x4 x5
1 R R 1 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
2 M R 2 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
3 D R 3 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
4 P R 4 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
5 G R 5 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
6 R M 6 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
7 M M 7 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
8 D M 8 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
9 P M 9 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
10 G M 10 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
11 R D 11 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
12 M D 12 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
13 D D 13 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
14 P D 14 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
15 G D 15 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
16 R P 16 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
17 M P 17 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
18 D P 18 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
19 P P 19 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
20 G P 20 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
21 R G 21 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
22 M G 22 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
23 D G 23 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
24 P G 24 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
25 G G 25 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
x6
1 1.819744
2 1.819744
3 1.819744
4 1.819744
5 1.819744
6 1.819744
7 1.819744
8 1.819744
9 1.819744
10 1.819744
11 1.819744
12 1.819744
13 1.819744
14 1.819744
15 1.819744
16 1.819744
17 1.819744
18 1.819744
19 1.819744
20 1.819744
21 1.819744
22 1.819744
23 1.819744
24 1.819744
25 1.819744
> newdataMaker(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D")))
x2 xcat2 combo y x1 x3 x4 x5 x6
1 0.25 M 1 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744
2 1.00 M 2 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744
3 0.25 D 3 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744
4 1.00 D 4 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744
xcat1
1 M
2 M
3 M
4 M
> predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D")))
fit x2 xcat2 combo y x1 x3 x4 x5
1 0.98241259 0.25 M 1 0.6150553 0.2451972 -0.04621158 0.2152759 4.83
2 0.82235069 1.00 M 2 0.6150553 0.2451972 -0.04621158 0.2152759 4.83
3 0.08695955 0.25 D 3 0.6150553 0.2451972 -0.04621158 0.2152759 4.83
4 -0.07310236 1.00 D 4 0.6150553 0.2451972 -0.04621158 0.2152759 4.83
x6 xcat1
1 1.819744 M
2 1.819744 M
3 1.819744 M
4 1.819744 M
> predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D")), interval="conf")
fit lwr upr x2 xcat2 combo y x1
1 0.98241259 0.1140244 1.850801 0.25 M 1 0.6150553 0.2451972
2 0.82235069 -0.1039242 1.748626 1.00 M 2 0.6150553 0.2451972
3 0.08695955 -1.1263000 1.300219 0.25 D 3 0.6150553 0.2451972
4 -0.07310236 -1.3139912 1.167787 1.00 D 4 0.6150553 0.2451972
x3 x4 x5 x6 xcat1
1 -0.04621158 0.2152759 4.83 1.819744 M
2 -0.04621158 0.2152759 4.83 1.819744 M
3 -0.04621158 0.2152759 4.83 1.819744 M
4 -0.04621158 0.2152759 4.83 1.819744 M
> newdf <- predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D"), x1=plotSeq(dat$x1)))
> plot(y ~ x1, data= model.frame(m1))
> by(newdf, list(newdf$x2, newdf$xcat2), function(x) {lines(x$x1, x$fit)})
: 0.25
: M
NULL
------------------------------------------------------------
: 1
: M
NULL
------------------------------------------------------------
: 0.25
: D
NULL
------------------------------------------------------------
: 1
: D
NULL
> newdataMaker(m1, fl = list(x2 = c(-1,0, 1), xcat2 = c("M","D")))
x2 xcat2 combo y x1 x3 x4 x5 x6 xcat1
1 -1 M 1 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M
2 0 M 2 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M
3 1 M 3 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M
4 -1 D 4 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M
5 0 D 5 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M
6 1 D 6 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M
> predictOMatic(m1, fl = list(x2 = range(dat$x2), xcat2 = c("M","D")))
fit x2 xcat2 combo y x1 x3 x4
1 1.4889658 -2.123550 M 1 0.6150553 0.2451972 -0.04621158 0.2152759
2 0.4689792 2.655788 M 2 0.6150553 0.2451972 -0.04621158 0.2152759
3 0.5935128 -2.123550 D 3 0.6150553 0.2451972 -0.04621158 0.2152759
4 -0.4264739 2.655788 D 4 0.6150553 0.2451972 -0.04621158 0.2152759
x5 x6 xcat1
1 4.83 1.819744 M
2 4.83 1.819744 M
3 4.83 1.819744 M
4 4.83 1.819744 M
> newdf <- predictOMatic(m1, fl = list(x2 = quantile(dat$x2), xcat2 = c("M","D")))
> plot(y ~ x2 , data=model.frame(m1))
> lines(y ~ x2, newdf)
> predictOMatic(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D")), interval="conf")
fit lwr upr x2 xcat2 combo y x1 x3
1 -9.635027 -28.29784 9.027789 50 M 1 0.6150553 0.2451972 -0.04621158
2 -11.769186 -34.16684 10.628471 60 M 2 0.6150553 0.2451972 -0.04621158
3 -10.530480 -29.14836 8.087395 50 D 3 0.6150553 0.2451972 -0.04621158
4 -12.664639 -35.01410 9.684824 60 D 4 0.6150553 0.2451972 -0.04621158
x4 x5 x6 xcat1
1 0.2152759 4.83 1.819744 M
2 0.2152759 4.83 1.819744 M
3 0.2152759 4.83 1.819744 M
4 0.2152759 4.83 1.819744 M
> nd <- newdataMaker(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D")))
> pr <- predictOMatic(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D")), interval="conf")
> m2 <- glm(xcat2 ~ x1 + x2 + x3 + xcat1, data=dat, family=binomial(logit))
> summary(m2)
Call:
glm(formula = xcat2 ~ x1 + x2 + x3 + xcat1, family = binomial(logit),
data = dat)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.743 -1.181 0.912 1.112 1.353
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.41371 0.29465 1.404 0.160
x1 0.09728 0.18532 0.525 0.600
x2 0.09622 0.20650 0.466 0.641
x3 -0.25314 0.22816 -1.109 0.267
xcat1F -0.57235 0.41433 -1.381 0.167
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 137.99 on 99 degrees of freedom
Residual deviance: 134.56 on 95 degrees of freedom
AIC: 144.56
Number of Fisher Scoring iterations: 4
> m2mf <- model.frame(m2)
> summarize(m2mf)
$numerics
x1 x2 x3
0% -2.3800 -2.12400 -2.29000
25% -0.5901 -0.51290 -0.64720
50% 0.4837 0.02596 0.01349
75% 0.9004 0.69840 0.63120
100% 2.4770 2.65600 2.74700
mean 0.2452 0.04523 -0.04621
sd 1.1150 1.01100 0.93230
var 1.2430 1.02300 0.86930
NA's 0.0000 0.00000 0.00000
$factors
xcat1 xcat2
M :50 R :46.000
F :50 M :19.000
NA's : 0 G :19.000
entropy : 1 D :10.000
normedEntropy: 1 P : 6.000
NA's : 0.000
entropy : 2.002
normedEntropy: 0.862
> predict(m2, newdata=data.frame(xcat1=c("M","F"), x1=c(1), x2=mean(m2mf$x2), x3=mean(m2mf$x3)), se.fit=TRUE)
$fit
1 2
0.52704389 -0.04530271
$se.fit
1 2
0.3336836 0.3139795
$residual.scale
[1] 1
> predictOMatic(m2, fl = list(x2 = c(-1, 1), xcat1 = c("M","F")), interval="conf", type="response")
fit x2 xcat1 combo xcat2 x1 x3
1 0.5873563 -1 M 1 R 0.2451972 -0.04621158
2 0.6330860 1 M 2 R 0.2451972 -0.04621158
3 0.4453938 -1 F 3 R 0.2451972 -0.04621158
4 0.4932835 1 F 4 R 0.2451972 -0.04621158
--
Paul E. Johnson
Professor, Political Science
Assoc. Director, Center for Research Methods
1541 Lilac Lane, Room 504
University of Kansas
http://pj.freefaculty.org
http://quant.ku.edu