--- title: "Comparing SelectBoost-GAMLSS Variants" shorttitle: "Comparing SelectBoost-GAMLSS Variants" author: - name: "Frédéric Bertrand" affiliation: - Cedric, Cnam, Paris email: frederic.bertrand@lecnam.net date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Comparing SelectBoost-GAMLSS Variants} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} #file.edit(normalizePath("~/.Renviron")) LOCAL <- identical(Sys.getenv("LOCAL"), "TRUE") #LOCAL=TRUE knitr::opts_chunk$set(purl = LOCAL) knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` This vignette compares: plain `sb_gamlss`, the SelectBoost-integrated `SelectBoost_gamlss`, grid-based `sb_gamlss_c0_grid` + `autoboost_gamlss`, and the lightweight `fastboost_gamlss`. ## What you'll learn * How the base `sb_gamlss()` call differs from `SelectBoost_gamlss()` (automatic grouping). * How to sweep c0 values with `sb_gamlss_c0_grid()` and let `autoboost_gamlss()` pick a favourite. * How to generate rapid previews with `fastboost_gamlss()` before launching a large bootstrap campaign. ```{r, cache=TRUE, eval=LOCAL} library(gamlss) library(SelectBoost.gamlss) set.seed(1) n <- 500 x1 <- rnorm(n); x2 <- rnorm(n); x3 <- rnorm(n); x4 <- rnorm(n) mu <- 0.5 + 1.2*x1 - 0.8*x3 y <- gamlss.dist::rNO(n, mu = mu, sigma = 1) dat <- data.frame(y, x1, x2, x3, x4) # Baseline b0 <- sb_gamlss( y ~ 1, data = dat, family = gamlss.dist::NO(), mu_scope = ~ x1 + x2 + x3 + x4, sigma_scope = ~ x1 + x2, B = 50, pi_thr = 0.6, pre_standardize = TRUE, trace = FALSE ) plot_sb_gamlss(b0) # SelectBoost integration (single c0) sb <- SelectBoost_gamlss( y ~ 1, data = dat, family = gamlss.dist::NO(), mu_scope = ~ x1 + x2 + x3 + x4, sigma_scope = ~ x1 + x2, B = 50, pi_thr = 0.6, c0 = 0.5, pre_standardize = TRUE, trace = FALSE ) summary(sb) |> plot() # c0 grid + autoboost g <- sb_gamlss_c0_grid( y ~ 1, data = dat, family = gamlss.dist::NO(), mu_scope = ~ x1 + x2 + x3 + x4, sigma_scope = ~ x1 + x2, c0_grid = seq(0.2, 0.8, by = 0.2), B = 40, pi_thr = 0.6, pre_standardize = TRUE, trace = FALSE ) plot(g) confidence_table(g) |> head() ab <- autoboost_gamlss( y ~ 1, data = dat, family = gamlss.dist::NO(), mu_scope = ~ x1 + x2 + x3 + x4, sigma_scope = ~ x1 + x2, c0_grid = seq(0.2, 0.8, by = 0.2), B = 40, pi_thr = 0.6, pre_standardize = TRUE, trace = FALSE ) attr(ab, "chosen_c0") plot_sb_gamlss(ab) # fastboost (lightweight) fb <- fastboost_gamlss( y ~ 1, data = dat, family = gamlss.dist::NO(), mu_scope = ~ x1 + x2 + x3 + x4, sigma_scope = ~ x1 + x2, B = 30, sample_fraction = 0.6, pi_thr = 0.6, pre_standardize = TRUE, trace = FALSE ) plot_sb_gamlss(fb) ``` ## Confidence Functionals (AUSC, Coverage, Quantiles, Weighted, Conservative) We summarize the stability curves \(p_j(c0)\) into single-number scores: - **AUSC**: normalized area under \(p_j(c0)\) across the `c0_grid`. - **Thresholded AUSC** (\( (p - \pi^\star)_+ \) area): only counts confidence above the stability threshold. - **Coverage**: fraction of `c0` where \(p \ge \pi^\star\). - **Quantiles** (median/80th/90th): robust distributional summaries of stability. - **Weighted AUSC**: apply a weight function \(w(c0)\) to emphasize parts of the grid. - **Conservative AUSC**: use Wilson lower bounds for \(p\) before computing AUSC. ```{r, cache=TRUE, eval=LOCAL} # Using the grid g created above cf <- confidence_functionals( g, pi_thr = 0.6, q = c(0.5, 0.8, 0.9), weight_fun = function(c0) (1 - c0)^2, # emphasize smaller c0 conservative = TRUE, B = 40, # use lower bounds method = "trapezoid" ) head(cf) plot(cf, top = 12, label_top = 8) # Inspect stability curves of top terms top_terms <- head(cf[order(cf$rank_score, decreasing = TRUE), c("term","parameter")], 4) plot_stability_curves(g, terms = unique(top_terms$term), parameter = unique(top_terms$parameter)[1]) ``` ## Grouped penalties for factors & splines ```{r, cache=TRUE, eval=LOCAL} library(gamlss) library(SelectBoost.gamlss) set.seed(42) n <- 500 f <- factor(sample(letters[1:3], n, TRUE)) x1 <- rnorm(n); x2 <- rnorm(n) y <- gamlss.dist::rNO(n, mu = 0.3 + 1.2*x1 - 0.7*x2 + 0.5*(f=="c"), sigma = exp(0.1 + 0.2*(f=="b"))) dat <- data.frame(y, f, x1, x2) # Stepwise baseline fit_step <- sb_gamlss( y ~ 1, data = dat, family = gamlss.dist::NO(), mu_scope = ~ f + x1 + pb(x2) + x1:f, sigma_scope = ~ f + x1 + pb(x2), B = 40, pi_thr = 0.6, pre_standardize = TRUE, trace = FALSE ) # Group lasso for μ and sparse group lasso for σ fit_grp <- sb_gamlss( y ~ 1, data = dat, family = gamlss.dist::NO(), mu_scope = ~ f + x1 + pb(x2) + x1:f, sigma_scope = ~ f + x1 + pb(x2), engine = "grpreg", engine_sigma = "sgl", B = 40, pi_thr = 0.6, pre_standardize = TRUE, trace = FALSE ) plot_sb_gamlss(fit_step) plot_sb_gamlss(fit_grp) selection_table(fit_step) |> head() selection_table(fit_grp) |> head() ``` ## Tuning & Group Knockoffs ```{r, cache=TRUE, eval=LOCAL} # Tuning engines / penalties cfgs <- list( list(engine="stepGAIC"), list(engine="glmnet", glmnet_alpha=1), list(engine="grpreg", grpreg_penalty="grLasso", engine_sigma="sgl", sgl_alpha=0.9) ) base <- list( formula = y ~ 1, data = dat, family = gamlss.dist::NO(), mu_scope = ~ f + x1 + pb(x2) + x1:f, sigma_scope = ~ f + x1 + pb(x2), pi_thr = 0.6, pre_standardize = TRUE, sample_fraction = 0.7, parallel = "auto", trace = FALSE ) tuned <- tune_sb_gamlss(cfgs, base_args = base, B_small = 30) tuned$best_config # Approximate group knockoffs for mu sel_mu <- NULL if (requireNamespace("knockoff", quietly = TRUE)) { sel_mu <- tryCatch( knockoff_filter_mu(dat, response = "y", mu_scope = ~ f + x1 + pb(x2), fdr = 0.1), error = function(e) { cat("Knockoff filter failed:", conditionMessage(e), "— skipping example.\n") NULL } ) } else { cat("Package 'knockoff' not installed; skipping knockoff example.\n") } if (!is.null(sel_mu)) sel_mu ```