Install and Load required packages

To implement the tree-based GRM (GRMTree), you will install the following packages if not previously installed.

## Install packages from CRAN repository
install.packages(c("dplyr", "grmtree"))

Once installed, load the packages as follows:

library(dplyr)        # For data manipulation
library(grmtree)      # For tree-based GRM DIF Test

Import and prepare the data

The data set used in this demonstration is a test/sample data for the package.

## Load the data
data("grmtree_data", package = "grmtree")

## Take a glimpse at the data
glimpse(grmtree_data)
#> Rows: 3,500
#> Columns: 17
#> $ MOS_Listen        <chr> "5", "4", "5", "3", "5", "2", "5", "5", "2", "3", "3…
#> $ MOS_Info          <chr> "5", "3", "5", "3", "5", "3", "5", "5", "2", "4", "3…
#> $ MOS_Advice_Crisis <chr> "5", "3", "5", "3", "5", "5", "5", "5", "4", "4", "3…
#> $ MOS_Confide       <chr> "5", "4", "4", "2", "4", "4", "5", "5", "2", "3", "3…
#> $ MOS_Advice_Want   <chr> "5", "2", "4", "3", "4", "3", "5", "5", "4", "3", "3…
#> $ MOS_Fears         <chr> "5", "2", "5", "4", "5", "4", "5", "5", "1", "2", "1…
#> $ MOS_Personal      <chr> "5", "2", "5", "1", "5", "2", "5", "5", "4", "2", "3…
#> $ MOS_Understand    <chr> "5", "2", "5", "1", "4", "3", "5", "5", "4", "2", "3…
#> $ sex               <chr> "Male", "Male", "Male", "Male", "Male", "Female", "M…
#> $ age               <dbl> 69, 66, 72, 52, 61, 57, 61, 71, 77, 65, 55, 64, 45, …
#> $ residency         <chr> "urban", "urban", "urban", "urban", "urban", "rural"…
#> $ depressed         <chr> "No", "No", "No", "Yes", "No", "No", "Yes", "No", "N…
#> $ bmi               <dbl> 22.85714, 33.59375, 24.38272, 31.14187, 25.88057, 24…
#> $ Education         <chr> "Primary/High school", "College/University", "Colleg…
#> $ job               <chr> "Unemployed", "Unemployed", "Unemployed", "Unemploye…
#> $ smoker            <chr> "No", "No", "Yes", "No", "No", "No", "Yes", "Yes", "…
#> $ multimorbidity    <chr> "2+", "2+", "1", "1", "2+", "2+", "2+", "1", "2+", "…

## Prepare the data
resp.data <- grmtree_data %>% 
  mutate_at(vars(starts_with("MOS")), as.ordered) %>% 
  mutate_at(vars(c(sex, residency, depressed,
                   Education, job, smoker,
                   multimorbidity)), as.factor) 

## Explore the data
head(resp.data)
#> # A tibble: 6 × 17
#>   MOS_Listen MOS_Info MOS_Advice_Crisis MOS_Confide MOS_Advice_Want MOS_Fears
#>   <ord>      <ord>    <ord>             <ord>       <ord>           <ord>    
#> 1 5          5        5                 5           5               5        
#> 2 4          3        3                 4           2               2        
#> 3 5          5        5                 4           4               5        
#> 4 3          3        3                 2           3               4        
#> 5 5          5        5                 4           4               5        
#> 6 2          3        5                 4           3               4        
#> # ℹ 11 more variables: MOS_Personal <ord>, MOS_Understand <ord>, sex <fct>,
#> #   age <dbl>, residency <fct>, depressed <fct>, bmi <dbl>, Education <fct>,
#> #   job <fct>, smoker <fct>, multimorbidity <fct>

## Check the structure of the data
glimpse(resp.data)
#> Rows: 3,500
#> Columns: 17
#> $ MOS_Listen        <ord> 5, 4, 5, 3, 5, 2, 5, 5, 2, 3, 3, 4, 5, 2, 5, 3, 5, 4…
#> $ MOS_Info          <ord> 5, 3, 5, 3, 5, 3, 5, 5, 2, 4, 3, 4, 5, 2, 4, 4, 4, 3…
#> $ MOS_Advice_Crisis <ord> 5, 3, 5, 3, 5, 5, 5, 5, 4, 4, 3, 4, 2, 2, 4, 4, 4, 3…
#> $ MOS_Confide       <ord> 5, 4, 4, 2, 4, 4, 5, 5, 2, 3, 3, 4, 5, 2, 5, 4, 5, 3…
#> $ MOS_Advice_Want   <ord> 5, 2, 4, 3, 4, 3, 5, 5, 4, 3, 3, 4, 2, 2, 5, 4, 4, 2…
#> $ MOS_Fears         <ord> 5, 2, 5, 4, 5, 4, 5, 5, 1, 2, 1, 4, 5, 2, 5, 4, 4, 4…
#> $ MOS_Personal      <ord> 5, 2, 5, 1, 5, 2, 5, 5, 4, 2, 3, 4, 5, 2, 5, 4, 4, 3…
#> $ MOS_Understand    <ord> 5, 2, 5, 1, 4, 3, 5, 5, 4, 2, 3, 4, 5, 2, 5, 4, 4, 3…
#> $ sex               <fct> Male, Male, Male, Male, Male, Female, Male, Female, …
#> $ age               <dbl> 69, 66, 72, 52, 61, 57, 61, 71, 77, 65, 55, 64, 45, …
#> $ residency         <fct> urban, urban, urban, urban, urban, rural, urban, urb…
#> $ depressed         <fct> No, No, No, Yes, No, No, Yes, No, No, No, Yes, No, N…
#> $ bmi               <dbl> 22.85714, 33.59375, 24.38272, 31.14187, 25.88057, 24…
#> $ Education         <fct> Primary/High school, College/University, College/Uni…
#> $ job               <fct> Unemployed, Unemployed, Unemployed, Unemployed, Unem…
#> $ smoker            <fct> No, No, Yes, No, No, No, Yes, Yes, No, No, No, Yes, …
#> $ multimorbidity    <fct> 2+, 2+, 1, 1, 2+, 2+, 2+, 1, 2+, 2+, 2+, 2+, 1, 2+, …

## Create response as outcomes
resp.data$resp <- data.matrix(resp.data[, 1:8])

GRM Forests Implementation

Define the forest control parameters

## Get help on the control parameter
# ?grmforest.control

## GRMTree control parameters with Benjamini-Hochberg 
grm_control <- grmtree.control(
  minbucket = 350,
  p_adjust = "BH", alpha = 0.05)

## Define the forest control parameters
forest_control <- grmforest.control(
  n_tree = 3, # Number of trees (Reduced for vignette build time)
  sampling = "bootstrap",  # Bootstrap method; resampling also available
  sample_fraction = 0.632,
  mtry = sqrt(9),  # Usually the square root of the number of covariates
  control = grm_control,
  remove_dead_trees = TRUE, # Remove any null GRMTree
  seed = 123
)

Grow the GRM Forest

## Fit the GRM forest
mos_forest <- grmforest(
  resp ~ sex + age + bmi + Education + 
  residency + depressed + job + multimorbidity + smoker,  
  data = resp.data,
  control = forest_control
)

## Get the summary of the fitted forest
summary(mos_forest)
print(mos_forest)

## Plot a tree in the forest
plot(mos_forest$trees[[1]])

Variable Importance

Compute the variable importance of each covariate

## Calculate the variable importance
importance <- varimp(mos_forest, seed = 123, verbose = T)

## Print the result of the variable importance
print(importance)

Example output:

           age         smoker            bmi multimorbidity            sex 
     403.07554      220.37908       39.02621       37.00120       32.06389 
     Education      residency      depressed            job 
       0.00000        0.00000        0.00000        0.00000

Plot the variable importance of each variable

Here plot.varimp creates a bar plot of variable importance scores with options for both ggplot2 and base R graphics.

## Plot the variable importance scores (ggplot is the default)
plot(importance)

## Plot onlt the top 5 importance variables
plot(importance, xlab = "", top_n = 5)

## Plot the base R version
plot(importance, use_ggplot = FALSE)

## Custom colors
plot(importance, col = c("green", "red"))

## Rename the variable names in the order from the variable importance result
names(importance) <- c("Age", "Smoking Status", "BMI",
                       "Multimorbidity", "Sex", "Education",
                       "Residency", "Depression", "Employment")

## Now create the plot with informative names
plot(importance)

GRM Forests for Robust DIF Detection

Introduction