---
title: "More examples"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{More examples}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---
```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```
`vignette("pminternal")` gives an introduction to the package and writing user-defined model and score functions. This vignette provides more examples of user-defined model functions for what I expect are the most commonly used modeling approaches that would not be supported by the `fit` argument.
```{r setup}
library(pminternal)
library(Hmisc)
getHdata("gusto")
gusto <- gusto[, c("sex", "age", "hyp", "htn", "hrt", "pmi", "ste", "day30")]
# rename the outcome (30-day mortality) to y
gusto$y <- gusto$day30
gusto$day30 <- NULL
# take a random subsample of 4000 observations to speed up the examples
set.seed(234)
gusto <- gusto[sample(1:nrow(gusto), size = 4000), ]
```
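Before writing any model functions, it is worth a quick look at the outcome distribution in the subsample:
```{r}
# y is 30-day mortality (0/1)
table(gusto$y)
```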
## Backward Selection
The function below implements variable selection for a logistic regression via backward elimination using AIC (when `scope` is missing, `step` defaults to backward elimination).
```{r}
stepglm <- function(data, ...){
  m <- glm(y ~ ., data = data, family = "binomial")
  step(m, trace = 0)
}

steppred <- function(model, data, ...){
  predict(model, newdata = data, type = "response")
}

validate(data = gusto, outcome = "y", model_fun = stepglm,
         pred_fun = steppred, method = "cv_optimism", B = 10)
```
In this situation it is probably best to stick with `lrm`, `fastbw`, and `validate` from the `rms` package (though note that `fastbw` differs from the default behavior of `step`), unless you want the additional calibration metrics offered by `pminternal` or want to specify your own score function (see `vignette("pminternal")`).
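For reference, a minimal sketch of the `rms` route (not evaluated; assumes `rms` is installed): `validate.lrm` repeats fast backward elimination within each bootstrap resample when `bw = TRUE`, and the fit must be stored with `x = TRUE` and `y = TRUE`.
```{r, eval = FALSE}
# sketch of the rms equivalent (not run)
f <- rms::lrm(y ~ ., data = gusto, x = TRUE, y = TRUE)
rms::validate(f, method = "boot", B = 40, bw = TRUE)
```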
## Ridge
`vignette("pminternal")` gives an example of a glm with lasso (L1) penalization. It is simple to modify this to implement ridge (L2) penalization by setting `alpha = 0`.
```{r}
# glmnet is called via glmnet:: below, so no library(glmnet) call is needed
ridge_fun <- function(data, ...){
  y <- data$y
  x <- data[, c('sex', 'age', 'hyp', 'htn', 'hrt', 'pmi', 'ste')]
  x$sex <- as.numeric(x$sex == "male")
  x$pmi <- as.numeric(x$pmi == "yes")
  x <- as.matrix(x)
  # select lambda via 10-fold cross-validation
  cv <- glmnet::cv.glmnet(x = x, y = y, alpha = 0, nfolds = 10, family = "binomial")
  lambda <- cv$lambda.min
  # refit at the selected lambda
  glmnet::glmnet(x = x, y = y, alpha = 0, lambda = lambda, family = "binomial")
}

ridge_predict <- function(model, data, ...){
  # note this is identical to lasso_predict from the "pminternal" vignette
  x <- data[, c('sex', 'age', 'hyp', 'htn', 'hrt', 'pmi', 'ste')]
  x$sex <- as.numeric(x$sex == "male")
  x$pmi <- as.numeric(x$pmi == "yes")
  x <- as.matrix(x)
  # predict.glmnet returns the linear predictor; plogis maps it to a probability
  plogis(glmnet::predict.glmnet(model, newx = x)[, 1])
}
validate(method = "cv_optimism", data = gusto,
         outcome = "y", model_fun = ridge_fun,
         pred_fun = ridge_predict, B = 10)

# the use of package::function in user-defined functions
# is especially important if you want to run
# boot_* or .632 in parallel via the cores argument
# e.g.
# validate(method = ".632", data = gusto,
#          outcome = "y", model_fun = ridge_fun,
#          pred_fun = ridge_predict, B = 100, cores = 4)
```
Rather than have two separate functions, we could specify an optional argument, `alpha`, to be supplied via `validate`. If this argument isn't supplied, the function below defaults to `alpha = 0`. The chunk below is not evaluated, so no output is printed.
```{r, eval = F}
lognet_fun <- function(data, ...){
  # extract the optional alpha argument passed through validate(...)
  dots <- list(...)
  if ("alpha" %in% names(dots)){
    alpha <- dots[["alpha"]]
  } else{
    alpha <- 0
  }
  y <- data$y
  x <- data[, c('sex', 'age', 'hyp', 'htn', 'hrt', 'pmi', 'ste')]
  x$sex <- as.numeric(x$sex == "male")
  x$pmi <- as.numeric(x$pmi == "yes")
  x <- as.matrix(x)
  cv <- glmnet::cv.glmnet(x = x, y = y, alpha = alpha, nfolds = 10, family = "binomial")
  lambda <- cv$lambda.min
  glmnet::glmnet(x = x, y = y, alpha = alpha, lambda = lambda, family = "binomial")
}
validate(method = "cv_optimism", data = gusto,
         outcome = "y", model_fun = lognet_fun,
         pred_fun = ridge_predict, B = 10, alpha = 0.5)
```
## Elastic Net
To implement a model with an elastic net penalty we need to add a step to select `alpha`. The function below evaluates `nalpha` equally spaced values of `alpha` between 0 and 1 (inclusive) and selects the `alpha` and `lambda` that minimize the cross-validated binomial deviance (this could be changed via `type.measure`). As the `cv.glmnet` documentation recommends, the same fold assignments (`foldid`) are reused across all values of `alpha` so that the comparison is not driven by fold-to-fold variation. `nalpha` is an optional argument. We don't need a new predict function here, so `ridge_predict` is reused. To save build time the chunk below is not evaluated.
```{r, eval = F}
enet_fun <- function(data, ...){
  dots <- list(...)
  if ("nalpha" %in% names(dots)){
    nalpha <- dots[["nalpha"]]
  } else{
    nalpha <- 21 # 0 to 1 in steps of 0.05
  }
  y <- data$y
  x <- data[, c('sex', 'age', 'hyp', 'htn', 'hrt', 'pmi', 'ste')]
  x$sex <- as.numeric(x$sex == "male")
  x$pmi <- as.numeric(x$pmi == "yes")
  x <- as.matrix(x)
  # use the same fold assignments for every alpha (as recommended in ?cv.glmnet)
  foldid <- sample(rep(seq_len(10), length.out = nrow(x)))
  # run 10-fold CV for each alpha
  alphas <- seq(0, 1, length.out = nalpha)
  res <- lapply(alphas, function(a){
    cv <- glmnet::cv.glmnet(x = x, y = y, alpha = a, foldid = foldid, family = "binomial")
    list(lambda = cv$lambda.min, bin.dev = min(cv$cvm))
  })
  # select the result with minimum binomial deviance
  j <- which.min(sapply(res, function(x) x$bin.dev))
  # produce the 'final' model with the selected alpha and lambda
  glmnet::glmnet(x = x, y = y, alpha = alphas[j], lambda = res[[j]][["lambda"]],
                 family = "binomial")
}
validate(method = "cv_optimism", data = gusto,
         outcome = "y", model_fun = enet_fun,
         pred_fun = ridge_predict, B = 10)
```
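As with `alpha` in the previous section, the optional `nalpha` argument can be passed directly to `validate` (again not evaluated):
```{r, eval = FALSE}
# a coarser grid: alpha from 0 to 1 in steps of 0.1
validate(method = "cv_optimism", data = gusto,
         outcome = "y", model_fun = enet_fun,
         pred_fun = ridge_predict, B = 10, nalpha = 11)
```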
## Random Forest
In the example below we use the `ranger` package to create our `model_fun` and allow for the optional arguments `num.trees`, `max.depth`, and `min.node.size`; others could be added (see `?ranger`).
```{r}
rf_fun <- function(data, ...){
  dots <- list(...)
  num.trees <- if ("num.trees" %in% names(dots)) dots[["num.trees"]] else 500
  max.depth <- if ("max.depth" %in% names(dots)) dots[["max.depth"]] else NULL
  min.node.size <- if ("min.node.size" %in% names(dots)) dots[["min.node.size"]] else 1
  # best to make sure y is a factor where '1' is level 2
  data$y <- factor(data$y, levels = 0:1)
  ranger::ranger(y ~ ., data = data, probability = TRUE,
                 num.trees = num.trees,
                 max.depth = max.depth,
                 min.node.size = min.node.size)
}

rf_predict <- function(model, data, ...){
  # column 2 holds the predicted probability of y = 1
  predict(model, data = data)$predictions[, 2]
}
validate(method = "cv_optimism", data = gusto,
         outcome = "y", model_fun = rf_fun,
         pred_fun = rf_predict, B = 10)

# instead of unlimited tree depth, constrain trees to depth 3
validate(method = "cv_optimism", data = gusto,
         outcome = "y", model_fun = rf_fun,
         pred_fun = rf_predict, B = 10, max.depth = 3)
```
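The object returned by `validate` can also be stored for later inspection. As a sketch (not evaluated), assuming `pminternal`'s `cal_plot` function is available for `validate` objects, the apparent and bias-corrected calibration curves from the depth-limited forest could be plotted as follows:
```{r, eval = FALSE}
# store the validate object and plot calibration curves
# (assumes pminternal's cal_plot; see vignette("pminternal"))
val <- validate(method = "cv_optimism", data = gusto,
                outcome = "y", model_fun = rf_fun,
                pred_fun = rf_predict, B = 10, max.depth = 3)
cal_plot(val)
```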