GRM Forests for Robust DIF Detection

Introduction

GRM Forests extend GRM Trees by creating ensembles of trees to provide more robust variable importance measures. This vignette covers:

See the vignette on getting started with grmtree package for a more detailed walkthrough of the tree-based graded response theory model (GRMTree).

Install and Load required packages

To implement the tree-based GRM (GRMTree), you will install the following packages if not previously installed.

## Install packages from CRAN repository
install.packages(c("dplyr", "grmtree"))

Once installed, load the packages as follows:

library(dplyr)        # For data manipulation
library(grmtree)      # For tree-based GRM DIF Test

Import and prepare the data

The data set used in this demonstration is a test/sample data for the package.

## Load the data
data("grmtree_data", package = "grmtree")

## Take a glimpse at the data
glimpse(grmtree_data)
#> Rows: 3,500
#> Columns: 17
#> $ MOS_Listen        <chr> "5", "4", "5", "3", "5", "2", "5", "5", "2", "3", "3…
#> $ MOS_Info          <chr> "5", "3", "5", "3", "5", "3", "5", "5", "2", "4", "3…
#> $ MOS_Advice_Crisis <chr> "5", "3", "5", "3", "5", "5", "5", "5", "4", "4", "3…
#> $ MOS_Confide       <chr> "5", "4", "4", "2", "4", "4", "5", "5", "2", "3", "3…
#> $ MOS_Advice_Want   <chr> "5", "2", "4", "3", "4", "3", "5", "5", "4", "3", "3…
#> $ MOS_Fears         <chr> "5", "2", "5", "4", "5", "4", "5", "5", "1", "2", "1…
#> $ MOS_Personal      <chr> "5", "2", "5", "1", "5", "2", "5", "5", "4", "2", "3…
#> $ MOS_Understand    <chr> "5", "2", "5", "1", "4", "3", "5", "5", "4", "2", "3…
#> $ sex               <chr> "Male", "Male", "Male", "Male", "Male", "Female", "M…
#> $ age               <dbl> 69, 66, 72, 52, 61, 57, 61, 71, 77, 65, 55, 64, 45, …
#> $ residency         <chr> "urban", "urban", "urban", "urban", "urban", "rural"…
#> $ depressed         <chr> "No", "No", "No", "Yes", "No", "No", "Yes", "No", "N…
#> $ bmi               <dbl> 22.85714, 33.59375, 24.38272, 31.14187, 25.88057, 24…
#> $ Education         <chr> "Primary/High school", "College/University", "Colleg…
#> $ job               <chr> "Unemployed", "Unemployed", "Unemployed", "Unemploye…
#> $ smoker            <chr> "No", "No", "Yes", "No", "No", "No", "Yes", "Yes", "…
#> $ multimorbidity    <chr> "2+", "2+", "1", "1", "2+", "2+", "2+", "1", "2+", "…

## Prepare the data
resp.data <- grmtree_data %>% 
  mutate_at(vars(starts_with("MOS")), as.ordered) %>% 
  mutate_at(vars(c(sex, residency, depressed,
                   Education, job, smoker,
                   multimorbidity)), as.factor) 

## Explore the data
head(resp.data)
#> # A tibble: 6 × 17
#>   MOS_Listen MOS_Info MOS_Advice_Crisis MOS_Confide MOS_Advice_Want MOS_Fears
#>   <ord>      <ord>    <ord>             <ord>       <ord>           <ord>    
#> 1 5          5        5                 5           5               5        
#> 2 4          3        3                 4           2               2        
#> 3 5          5        5                 4           4               5        
#> 4 3          3        3                 2           3               4        
#> 5 5          5        5                 4           4               5        
#> 6 2          3        5                 4           3               4        
#> # ℹ 11 more variables: MOS_Personal <ord>, MOS_Understand <ord>, sex <fct>,
#> #   age <dbl>, residency <fct>, depressed <fct>, bmi <dbl>, Education <fct>,
#> #   job <fct>, smoker <fct>, multimorbidity <fct>

## Check the structure of the data
glimpse(resp.data)
#> Rows: 3,500
#> Columns: 17
#> $ MOS_Listen        <ord> 5, 4, 5, 3, 5, 2, 5, 5, 2, 3, 3, 4, 5, 2, 5, 3, 5, 4…
#> $ MOS_Info          <ord> 5, 3, 5, 3, 5, 3, 5, 5, 2, 4, 3, 4, 5, 2, 4, 4, 4, 3…
#> $ MOS_Advice_Crisis <ord> 5, 3, 5, 3, 5, 5, 5, 5, 4, 4, 3, 4, 2, 2, 4, 4, 4, 3…
#> $ MOS_Confide       <ord> 5, 4, 4, 2, 4, 4, 5, 5, 2, 3, 3, 4, 5, 2, 5, 4, 5, 3…
#> $ MOS_Advice_Want   <ord> 5, 2, 4, 3, 4, 3, 5, 5, 4, 3, 3, 4, 2, 2, 5, 4, 4, 2…
#> $ MOS_Fears         <ord> 5, 2, 5, 4, 5, 4, 5, 5, 1, 2, 1, 4, 5, 2, 5, 4, 4, 4…
#> $ MOS_Personal      <ord> 5, 2, 5, 1, 5, 2, 5, 5, 4, 2, 3, 4, 5, 2, 5, 4, 4, 3…
#> $ MOS_Understand    <ord> 5, 2, 5, 1, 4, 3, 5, 5, 4, 2, 3, 4, 5, 2, 5, 4, 4, 3…
#> $ sex               <fct> Male, Male, Male, Male, Male, Female, Male, Female, …
#> $ age               <dbl> 69, 66, 72, 52, 61, 57, 61, 71, 77, 65, 55, 64, 45, …
#> $ residency         <fct> urban, urban, urban, urban, urban, rural, urban, urb…
#> $ depressed         <fct> No, No, No, Yes, No, No, Yes, No, No, No, Yes, No, N…
#> $ bmi               <dbl> 22.85714, 33.59375, 24.38272, 31.14187, 25.88057, 24…
#> $ Education         <fct> Primary/High school, College/University, College/Uni…
#> $ job               <fct> Unemployed, Unemployed, Unemployed, Unemployed, Unem…
#> $ smoker            <fct> No, No, Yes, No, No, No, Yes, Yes, No, No, No, Yes, …
#> $ multimorbidity    <fct> 2+, 2+, 1, 1, 2+, 2+, 2+, 1, 2+, 2+, 2+, 2+, 1, 2+, …

## Create response as outcomes
resp.data$resp <- data.matrix(resp.data[, 1:8])

GRM Forests Implementation

Define the forest control parameters

## Get help on the control parameter
# ?grmforest.control

## GRMTree control parameters with Benjamini-Hochberg 
grm_control <- grmtree.control(
  minbucket = 350,
  p_adjust = "BH", alpha = 0.05)

## Define the forest control parameters
forest_control <- grmforest.control(
  n_tree = 3, # Number of trees (Reduced for vignette build time)
  sampling = "bootstrap",  # Bootstrap method; resampling also available
  sample_fraction = 0.632,
  mtry = sqrt(9),  # Usually the square root of the number of covariates
  control = grm_control,
  remove_dead_trees = TRUE, # Remove any null GRMTree
  seed = 123
)

Grow the GRM Forest

## Fit the GRM forest
mos_forest <- grmforest(
  resp ~ sex + age + bmi + Education + 
  residency + depressed + job + multimorbidity + smoker,  
  data = resp.data,
  control = forest_control
)

## Get the summary of the fitted forest
summary(mos_forest)
print(mos_forest)

## Plot a tree in the forest
plot(mos_forest$trees[[1]])

Variable Importance

Compute the variable importance of each covariate

## Calculate the variable importance
importance <- varimp(mos_forest, seed = 123, verbose = T)

## Print the result of the variable importance
print(importance)

Example output:

           age         smoker            bmi multimorbidity            sex 
     403.07554      220.37908       39.02621       37.00120       32.06389 
     Education      residency      depressed            job 
       0.00000        0.00000        0.00000        0.00000 

Plot the variable importance of each variable

Here plot.varimp creates a bar plot of variable importance scores with options for both ggplot2 and base R graphics.

## Plot the variable importance scores (ggplot is the default)
plot(importance)

## Plot onlt the top 5 importance variables
plot(importance, xlab = "", top_n = 5)

## Plot the base R version
plot(importance, use_ggplot = FALSE)

## Custom colors
plot(importance, col = c("green", "red"))

## Rename the variable names in the order from the variable importance result
names(importance) <- c("Age", "Smoking Status", "BMI",
                       "Multimorbidity", "Sex", "Education",
                       "Residency", "Depression", "Employment")

## Now create the plot with informative names
plot(importance)

Conclusion

GRM Forests provide more stable DIF detection by aggregating across many trees. Key advantages include robust variable importance measures, reduced overfitting, and better handling of complex interactions.