[R] predictOMatic for regression. Please try and advise me
Paul Johnson
pauljohn32 at gmail.com
Fri Apr 20 18:37:56 CEST 2012
I'm pasting below a working R file featuring a function I'd like to polish up.
I'm teaching regression this semester, and every time I come to
something that is very difficult to explain in class, I try to
simplify it by writing an R function (which eventually goes into my
package "rockchalk"). Students have a difficult time with predict and
newdata objects, so right now I'm concentrating on that problem.
Fitting regressions is pretty easy, but getting predicted values for
particular combinations of input values can be difficult when the
model is complicated. If you agree, please try out the following and
let me know how its use could be enhanced, or what other features you
might want.
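For comparison, here is the sort of thing students have to do by hand
right now: plain predict() with a hand-built newdata frame. This is just
a throwaway illustration (dat0, m0, and nd0 are made-up names for this
sketch only, not part of the file below):

## Manual approach: every predictor in the formula must appear in newdata.
dat0 <- data.frame(x = rnorm(50), g = gl(2, 25, labels = c("A", "B")))
dat0$y <- 1 + 0.5 * dat0$x + 0.3 * (dat0$g == "B") + rnorm(50)
m0 <- lm(y ~ x + g, data = dat0)
nd0 <- expand.grid(x = c(-1, 0, 1), g = levels(dat0$g))
predict(m0, newdata = nd0, interval = "confidence")

That gets tedious quickly when the model has many regressors, which is
what newdataMaker and predictOMatic below try to automate.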
I know folks are busy, so to save you the trouble of actually running
the code, I also paste in a session demonstrating one run-through.
Here's the code:
##Paul Johnson
## 2012-04-20
## Facilitate creation of "newdata" objects for use in predict
## statements (for interpretation of regression models)
## newdataMaker creates the "newdata" frame required by predict.
## It supplies a data frame with all input variables set to a
## central value (mean, mode), except for the variables the user
## wants to focus on. The user supplies a fitted model "model" and
## a focus list "fl" of variable values. See the examples below.
## The focus list must be a named list, using names of variables
## from the regression formula.
## Note: this relies on centralValues() from the rockchalk package,
## so load rockchalk before calling it.
## There is no need to call newdataMaker directly if one is
## satisfied with the results from predictOMatic.
newdataMaker <- function (model = NULL, fl = NULL){
    mf <- model.frame(model)
    mterms <- terms(model)
    mfnames <- colnames(mf)
    modelcv <- centralValues(mf)
    if (sum(!names(fl) %in% mfnames) > 0)
        stop(paste("The focus list (fl) requests variables that are not",
                   "included in the original model. The names of the",
                   "variables in the focus list must be drawn from this list:",
                   paste(mfnames, collapse = " ")))
    ## Consider "padding" the range of fl for numeric variables so that we
    ## get newdata objects including the min and max values
    ## (see the sketch just after this function).
    mixAndMatch <- expand.grid(fl)
    mixAndMatch$combo <- 1:nrow(mixAndMatch)
    newdf <- cbind(mixAndMatch,
                   modelcv[ , !colnames(modelcv) %in% colnames(mixAndMatch)])
    newdf
}
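## One possible way to implement the "padding" idea in the comment above.
## This is only a sketch; padFocusList is a hypothetical helper that is not
## used anywhere yet. It extends each numeric focus variable so the observed
## min and max from the model frame are included before expand.grid is called.
padFocusList <- function(fl, mf){
    for (v in names(fl)) {
        if (is.numeric(mf[[v]])) {
            fl[[v]] <- sort(unique(c(min(mf[[v]]), fl[[v]], max(mf[[v]]))))
        }
    }
    fl
}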
predictOMatic <- function(model = NULL, fl = NULL, ...){
    nd <- newdataMaker(model, fl)
    fit <- predict(model, newdata = nd, ...)
    cbind(fit, nd)
}
set.seed(12345)
x1 <- rnorm(100)
x2 <- rnorm(100)
x3 <- rnorm(100)
x4 <- rnorm(100)
x5 <- rpois(100, 5)
x6 <- rgamma(100, 2,1)
xcat1 <- gl(2,50, labels=c("M","F"))
xcat2 <- cut(rnorm(100), breaks=c(-Inf, 0, 0.4, 0.9, 1, Inf),
labels=c("R", "M", "D", "P", "G"))
dat <- data.frame(x1, x2, x3, x4, x5, x6, xcat1, xcat2)
rm(x1, x2, x3, x4, x5, x6, xcat1, xcat2)
dat$y <- with(dat, 0.03 + 0.1*x1 + 0.1*x2 + 0.4*x3 - 0.1*x4 + 0.01*x5 +
              0.1*x6 + 2*rnorm(100))
##ordinary regression.
m1 <- lm(y ~ x1 + x2 + x3 + x4 + x5 + x6 + xcat1 + xcat2, data=dat)
summary(m1)
## Use the fitted model's data frame for analysis
m1mf <- model.frame(m1)
## If you have rockchalk 1.5.4, run these:
library(rockchalk)
summarize(m1mf)
## otherwise, run
summary(m1mf)
## First, overview for values of xcat1
newdataMaker(m1, fl = list(xcat1 = levels(m1mf$xcat1)))
## Mix and match all combinations of xcat1 and xcat2
newdataMaker(m1, fl = list(xcat1 = levels(m1mf$xcat1),
                           xcat2 = levels(m1mf$xcat2)))
## Pick some particular values for focus
newdataMaker(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D")))
## Generate a newdata frame and predictions in one step
predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D")))
predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D")),
interval="conf")
newdf <- predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 =
c("M","D"), x1=plotSeq(dat$x1)))
plot(y ~ x1, data= model.frame(m1))
by(newdf, list(newdf$x2, newdf$xcat2), function(x) {lines(x$x1, x$fit)})
newdataMaker(m1, fl = list(x2 = c(-1,0, 1), xcat2 = c("M","D")))
predictOMatic(m1, fl = list(x2 = range(dat$x2), xcat2 = c("M","D")))
newdf <- predictOMatic(m1, fl = list(x2 = quantile(dat$x2), xcat2 = c("M","D")))
plot(y ~ x2 , data=model.frame(m1))
lines(fit ~ x2, newdf)
predictOMatic(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D")),
interval="conf")
## just gets the new data
nd <- newdataMaker(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D")))
pr <- predictOMatic(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D")),
interval="conf")
## A glm example: binomial logit. Note xcat2 has 5 levels; glm() with
## family=binomial treats the first level as "failure" and all other
## levels as "success".
m2 <- glm(xcat2 ~ x1 + x2 + x3 + xcat1, data=dat, family=binomial(logit))
summary(m2)
m2mf <- model.frame(m2)
summarize(m2mf)
predict(m2, newdata=data.frame(xcat1=c("M","F"), x1=c(1),
x2=mean(m2mf$x2), x3=mean(m2mf$x3)), se.fit=TRUE)
predictOMatic(m2, fl = list(x2 = c(-1, 1), xcat1 = c("M","F")),
              interval="conf", type="response")
## Note: predict.glm has no "interval" argument, so interval="conf" is
## silently ignored here; see the sketch just below for one way to get intervals.
############################################
################Now, the example session ###################
> newdataMaker <- function (model = NULL, fl = NULL){
+ mf <- model.frame(model)
+ mterms <- terms(model)
+ mfnames <- colnames(mf)
+ modelcv <- centralValues(mf)
+ if (sum(!names(fl) %in% mfnames) > 0) stop(cat(c("Error. The
focus list (fl) requests variables that are not included in the
original model. The names of the variables in the focus list be drawn
from this list: ", mfnames)))
+ ## Consider "padding" range of fl for numeric variables so that we
+ ## get newdata objects including the min and max values.
+ mixAndMatch <- expand.grid(fl)
+ mixAndMatch$combo <- 1:nrow(mixAndMatch)
+ newdf <- cbind(mixAndMatch, modelcv[ ,!colnames(modelcv) %in%
colnames(mixAndMatch)])
+ newdf
+ }
> predictOMatic <- function(model = NULL, fl = NULL, ...){
+ nd <- newdataMaker(model, fl)
+ fit <- predict(model, newdata=nd, ...)
+ cbind(fit, nd)
+ }
> set.seed(12345)
> x1 <- rnorm(100)
> x2 <- rnorm(100)
> x3 <- rnorm(100)
> x4 <- rnorm(100)
> x5 <- rpois(100, 5)
> x6 <- rgamma(100, 2,1)
> xcat1 <- gl(2,50, labels=c("M","F"))
> xcat2 <- cut(rnorm(100), breaks=c(-Inf, 0, 0.4, 0.9, 1, Inf), labels=c("R", "M", "D", "P", "G"))
> dat <- data.frame(x1, x2, x3, x4, x5, xcat1, xcat2)
> rm(x1, x2, x3, x4, xcat1, xcat2)
> dat$y <- with(dat, 0.03 + 0.1*x1 + 0.1*x2 + 0.4*x3 -0.1*x4 + 0.01*x5 + 0.1*x6 + 2*rnorm(100))
> m1 <- lm(y ~ x1 + x2 + x3 + x4 + x5 + x6 + xcat1 + xcat2, data=dat)
> summary(m1)
Call:
lm(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6 + xcat1 + xcat2,
data = dat)
Residuals:
Min 1Q Median 3Q Max
-3.7624 -1.1740 -0.0753 0.9174 4.9910
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.24744 0.64684 -0.383 0.702989
x1 -0.44055 0.16902 -2.607 0.010741 *
x2 -0.21342 0.18811 -1.135 0.259644
x3 0.74456 0.21434 3.474 0.000798 ***
x4 0.20394 0.20454 0.997 0.321453
x5 0.01176 0.09374 0.125 0.900426
x6 0.16072 0.14412 1.115 0.267809
xcat1F 0.81233 0.38271 2.123 0.036599 *
xcat2M 1.03244 0.51701 1.997 0.048922 *
xcat2D 0.13699 0.64662 0.212 0.832712
xcat2P -0.43391 0.83018 -0.523 0.602517
xcat2G 0.16521 0.49741 0.332 0.740567
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.805 on 88 degrees of freedom
Multiple R-squared: 0.23, Adjusted R-squared: 0.1338
F-statistic: 2.39 on 11 and 88 DF, p-value: 0.01216
> m1mf <- model.frame(m1)
> library(rockchalk)
Loading required package: MASS
Loading required package: car
Loading required package: nnet
> summarize(m1mf)
$numerics
x1 x2 x3 x4 x5 x6 y
0% -2.3800 -2.12400 -2.29000 -2.5820 1.000 0.1373 -4.8140
25% -0.5901 -0.51290 -0.64720 -0.3191 3.000 0.8342 -0.7684
50% 0.4837 0.02596 0.01349 0.3466 5.000 1.5240 0.6293
75% 0.9004 0.69840 0.63120 0.6968 6.000 2.5230 1.7550
100% 2.4770 2.65600 2.74700 2.2680 11.000 5.7050 5.7990
mean 0.2452 0.04523 -0.04621 0.2153 4.830 1.8200 0.6151
sd 1.1150 1.01100 0.93230 0.9707 2.050 1.3130 1.9390
var 1.2430 1.02300 0.86930 0.9422 4.203 1.7230 3.7620
NA's 0.0000 0.00000 0.00000 0.0000 0.000 0.0000 0.0000
$factors
xcat1 xcat2
M :50 R :46.000
F :50 M :19.000
NA's : 0 G :19.000
entropy : 1 D :10.000
normedEntropy: 1 P : 6.000
NA's : 0.000
entropy : 2.002
normedEntropy: 0.862
> ## First, overview for values of xcat1
> newdataMaker(m1, fl = list(xcat1 = levels(m1mf$xcat1)))
xcat1 combo y x1 x2 x3 x4 x5
1 M 1 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
2 F 2 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
x6 xcat2
1 1.819744 R
2 1.819744 R
> newdataMaker(m1, fl = list(xcat1 = levels(m1mf$xcat2), xcat2 = levels(m1mf$xcat2)))
xcat1 xcat2 combo y x1 x2 x3 x4 x5
1 R R 1 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
2 M R 2 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
3 D R 3 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
4 P R 4 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
5 G R 5 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
6 R M 6 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
7 M M 7 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
8 D M 8 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
9 P M 9 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
10 G M 10 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
11 R D 11 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
12 M D 12 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
13 D D 13 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
14 P D 14 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
15 G D 15 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
16 R P 16 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
17 M P 17 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
18 D P 18 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
19 P P 19 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
20 G P 20 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
21 R G 21 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
22 M G 22 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
23 D G 23 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
24 P G 24 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
25 G G 25 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83
x6
1 1.819744
2 1.819744
3 1.819744
4 1.819744
5 1.819744
6 1.819744
7 1.819744
8 1.819744
9 1.819744
10 1.819744
11 1.819744
12 1.819744
13 1.819744
14 1.819744
15 1.819744
16 1.819744
17 1.819744
18 1.819744
19 1.819744
20 1.819744
21 1.819744
22 1.819744
23 1.819744
24 1.819744
25 1.819744
> newdataMaker(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D")))
x2 xcat2 combo y x1 x3 x4 x5 x6
1 0.25 M 1 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744
2 1.00 M 2 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744
3 0.25 D 3 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744
4 1.00 D 4 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744
xcat1
1 M
2 M
3 M
4 M
> predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D")))
fit x2 xcat2 combo y x1 x3 x4 x5
1 0.98241259 0.25 M 1 0.6150553 0.2451972 -0.04621158 0.2152759 4.83
2 0.82235069 1.00 M 2 0.6150553 0.2451972 -0.04621158 0.2152759 4.83
3 0.08695955 0.25 D 3 0.6150553 0.2451972 -0.04621158 0.2152759 4.83
4 -0.07310236 1.00 D 4 0.6150553 0.2451972 -0.04621158 0.2152759 4.83
x6 xcat1
1 1.819744 M
2 1.819744 M
3 1.819744 M
4 1.819744 M
> predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D")), interval="conf")
fit lwr upr x2 xcat2 combo y x1
1 0.98241259 0.1140244 1.850801 0.25 M 1 0.6150553 0.2451972
2 0.82235069 -0.1039242 1.748626 1.00 M 2 0.6150553 0.2451972
3 0.08695955 -1.1263000 1.300219 0.25 D 3 0.6150553 0.2451972
4 -0.07310236 -1.3139912 1.167787 1.00 D 4 0.6150553 0.2451972
x3 x4 x5 x6 xcat1
1 -0.04621158 0.2152759 4.83 1.819744 M
2 -0.04621158 0.2152759 4.83 1.819744 M
3 -0.04621158 0.2152759 4.83 1.819744 M
4 -0.04621158 0.2152759 4.83 1.819744 M
> newdf <- predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D"), x1=plotSeq(dat$x1)))
> plot(y ~ x1, data= model.frame(m1))
> by(newdf, list(newdf$x2, newdf$xcat2), function(x) {lines(x$x1, x$fit)})
: 0.25
: M
NULL
------------------------------------------------------------
: 1
: M
NULL
------------------------------------------------------------
: 0.25
: D
NULL
------------------------------------------------------------
: 1
: D
NULL
> newdataMaker(m1, fl = list(x2 = c(-1,0, 1), xcat2 = c("M","D")))
x2 xcat2 combo y x1 x3 x4 x5 x6 xcat1
1 -1 M 1 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M
2 0 M 2 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M
3 1 M 3 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M
4 -1 D 4 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M
5 0 D 5 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M
6 1 D 6 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M
> predictOMatic(m1, fl = list(x2 = range(dat$x2), xcat2 = c("M","D")))
fit x2 xcat2 combo y x1 x3 x4
1 1.4889658 -2.123550 M 1 0.6150553 0.2451972 -0.04621158 0.2152759
2 0.4689792 2.655788 M 2 0.6150553 0.2451972 -0.04621158 0.2152759
3 0.5935128 -2.123550 D 3 0.6150553 0.2451972 -0.04621158 0.2152759
4 -0.4264739 2.655788 D 4 0.6150553 0.2451972 -0.04621158 0.2152759
x5 x6 xcat1
1 4.83 1.819744 M
2 4.83 1.819744 M
3 4.83 1.819744 M
4 4.83 1.819744 M
> newdf <- predictOMatic(m1, fl = list(x2 = quantile(dat$x2), xcat2 = c("M","D")))
> plot(y ~ x2 , data=model.frame(m1))
> lines(y ~ x2, newdf)
> predictOMatic(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D")), interval="conf")
fit lwr upr x2 xcat2 combo y x1 x3
1 -9.635027 -28.29784 9.027789 50 M 1 0.6150553 0.2451972 -0.04621158
2 -11.769186 -34.16684 10.628471 60 M 2 0.6150553 0.2451972 -0.04621158
3 -10.530480 -29.14836 8.087395 50 D 3 0.6150553 0.2451972 -0.04621158
4 -12.664639 -35.01410 9.684824 60 D 4 0.6150553 0.2451972 -0.04621158
x4 x5 x6 xcat1
1 0.2152759 4.83 1.819744 M
2 0.2152759 4.83 1.819744 M
3 0.2152759 4.83 1.819744 M
4 0.2152759 4.83 1.819744 M
> nd <- newdataMaker(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D")))
> pr <- predictOMatic(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D")), interval="conf")
> m2 <- glm(xcat2 ~ x1 + x2 + x3 + xcat1, data=dat, family=binomial(logit))
> summary(m2)
Call:
glm(formula = xcat2 ~ x1 + x2 + x3 + xcat1, family = binomial(logit),
data = dat)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.743 -1.181 0.912 1.112 1.353
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.41371 0.29465 1.404 0.160
x1 0.09728 0.18532 0.525 0.600
x2 0.09622 0.20650 0.466 0.641
x3 -0.25314 0.22816 -1.109 0.267
xcat1F -0.57235 0.41433 -1.381 0.167
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 137.99 on 99 degrees of freedom
Residual deviance: 134.56 on 95 degrees of freedom
AIC: 144.56
Number of Fisher Scoring iterations: 4
> m2mf <- model.frame(m2)
> summarize(m2mf)
$numerics
x1 x2 x3
0% -2.3800 -2.12400 -2.29000
25% -0.5901 -0.51290 -0.64720
50% 0.4837 0.02596 0.01349
75% 0.9004 0.69840 0.63120
100% 2.4770 2.65600 2.74700
mean 0.2452 0.04523 -0.04621
sd 1.1150 1.01100 0.93230
var 1.2430 1.02300 0.86930
NA's 0.0000 0.00000 0.00000
$factors
xcat1 xcat2
M :50 R :46.000
F :50 M :19.000
NA's : 0 G :19.000
entropy : 1 D :10.000
normedEntropy: 1 P : 6.000
NA's : 0.000
entropy : 2.002
normedEntropy: 0.862
> predict(m2, newdata=data.frame(xcat1=c("M","F"), x1=c(1), x2=mean(m2mf$x2), x3=mean(m2mf$x3)), se.fit=TRUE)
$fit
1 2
0.52704389 -0.04530271
$se.fit
1 2
0.3336836 0.3139795
$residual.scale
[1] 1
> predictOMatic(m2, fl = list(x2 = c(-1, 1), xcat1 = c("M","F")), interval="conf", type="response")
fit x2 xcat1 combo xcat2 x1 x3
1 0.5873563 -1 M 1 R 0.2451972 -0.04621158
2 0.6330860 1 M 2 R 0.2451972 -0.04621158
3 0.4453938 -1 F 3 R 0.2451972 -0.04621158
4 0.4932835 1 F 4 R 0.2451972 -0.04621158
--
Paul E. Johnson
Professor, Political Science
Assoc. Director, Center for Research Methods
1541 Lilac Lane, Room 504
University of Kansas
http://pj.freefaculty.org
http://quant.ku.edu