Manuscript Figures and Tables

1 Overview

This R Markdown document includes code to reproduce the analyses, figures, and tables from the 2024 manuscript that describes invivoPKfit’s use in a case study with CvT data from over 200 Chemicals.

2 Setup

Load needed packages.

library(tidyverse, quietly = TRUE)
library(cowplot)
library(RColorBrewer)
library(ctxR)
library(flextable)
library(officer)
library(patchwork)

devtools::load_all() #load invivopkfit

#get today's date
today <- format(Sys.Date(), "%d%B%Y")

This file refers to two environment variables, OD_DIR and FIG_DIR. OD_DIR should be set to the path where you have placed the necessary input files. FIG_DIR should be set to the path where you want to save the figures and tables. You can set those environment variables locally inside R like this:

Sys.setenv(OD_DIR = "path/to/inputs/",
           FIG_DIR = "path/to/outputs/")

You’ll want to make sure the paths end with a slash.

Additionally, at one point, this code uses package ctxR to query the CompTox Chemicals Dashboard. In order to run that portion of the code, you will need to have an API key (free), which you can request by emailing ccte_api@epa.gov. Set another environment variable to contain that code as a string, like this:

Sys.setenv(ccd_api_key = "your_api_key")

Define a date for the pre-fit pk objects to read in:

read_date_pk <- "25November2024"

And a date for the results files to read from:

read_date_results <- "27November2024"

3 Pulling data from CvTdb

See pulling_iv_oral_cvtdb.Rmd.

4 Minimal PK object

The following chunk shows the column/field mapping from the cvt object and the pk object needed for invivoPKfit analysis. The cvt object is data from the CvTdb database that includes chemicals with oral or intravenous routes of administration and takes measurements from blood or plasma. The data has been processed and cleaned to facilitate its use with invivoPKfit.


### Minimal PK Object ###----
# Minimum pk object, add options later
# Note that these mappings are now a default mappings, just being verbose here.
# If there are any standard deviations that are zero or NA, but N_Subjects > 1,
# Set the n_subjects to 6 or the current value if lower than 6.
# (In Showa the mode is 6, but this condition is present in CVT_Legacy as well)
minimal_pk <- pk(data = cvt %>%
                   mutate(n_subjects_normalized = if_else(
                     n_subjects_normalized > 1 & is.na(invivPK_conc_sd),
                     min(n_subjects_normalized, 6), n_subjects_normalized)),
                 mapping = ggplot2::aes(
                   Chemical = analyte_dtxsid,
                   Chemical_Name = analyte_name_original,
                   DTXSID = analyte_dtxsid,
                   CASRN = analyte_casrn,
                   Species = species,
                   Reference = fk_extraction_document_id,
                   Media = conc_medium_normalized,
                   Route = administration_route_normalized,
                   Dose = invivPK_dose_level,
                   Dose.Units = "mg/kg",
                   Subject_ID = fk_subject_id,
                   Series_ID = fk_series_id,
                   Study_ID = fk_study_id,
                   ConcTime_ID = conc_time_id,
                   N_Subjects = n_subjects_normalized,
                   Weight = weight_kg,
                   Weight.Units = "kg",
                   Time = time_hr,
                   Time.Units = "hours",
                   Value = invivPK_conc,
                   Value.Units = "mg/L",
                   Value_SD = invivPK_conc_sd,
                   LOQ = invivPK_loq
                 ))

5 Running all possible fitting options

Here we setup the PK object with the various options for fitting and data transformations. As shown in this vignette, it is possible to setup multiple distinct options for fitting and data transformation at once prior to processing and fitting the data.

# Types of Choices:
## Dose Normalized
## Log10 transform
## Scale time
# List of options
fitopts <- expand.grid(error_model = c("pooled",
                                       "hierarchical"),
                       dose_norm = c(TRUE, FALSE),
                       log10_trans = c(TRUE, FALSE),
                       time_scale = c(
                         TRUE,
                         FALSE),
                       stringsAsFactors = FALSE)
# Make function similar to that in test_vignette authored by Caroline Ring
fit_data <- function(this_error_model,
                     this_dose_norm,
                     this_log10_trans,
                     this_time_scale,
                     n_cores = 12,
                     file_dir = Sys.getenv("OD_DIR")){
  retval <- -9
   dose_indic <- as.numeric(this_dose_norm)
  log10_indic <- as.numeric(this_log10_trans)
  time_indic <- as.numeric(this_time_scale)
  errmodel_indic <- substr(this_error_model, 1, 1)
  
  file_str <- paste0("mypk_fit_",
                     dose_indic,
                     log10_indic,
                     time_indic,
                     errmodel_indic,
                     ".Rds")
  
  cat("Fitting: ", file_str, "\n")
  
  tryCatch(
    expr = {
      this_pk <-   minimal_pk +
        facet_data(vars(Chemical, Species)) +
        settings_preprocess(keep_data_original = FALSE,
                            suppress.messages = TRUE) +
        scale_conc(dose_norm = this_dose_norm, log10_trans = this_log10_trans) +
        settings_optimx(method = c(
          "L-BFGS-B",
          "bobyqa"
          ))
      
      if(this_error_model %in% "pooled"){
        this_pk <- this_pk +
          stat_error_model(error_group = vars(Chemical, Species)) 
      } else if(this_error_model %in% c("hierarchical", "joint")){
        this_pk <- this_pk +
          stat_error_model(error_group = vars(Chemical, Species, Reference)) 
      } else{
        stop("this_error_model must be either 'pooled' or 'hierarchical'/'joint'")
      }
      
      if(this_time_scale %in% TRUE){
        this_pk <- this_pk +
          scale_time(new_units = "auto")
      }
      
      #do the fit
      this_time <- system.time(this_fit <- do_fit(this_pk, n_cores = n_cores))
      
      
      
      #save the result
      saveRDS(this_fit,
              file = paste0(file_dir,
                     file_str))
      
      cli::cli_alert_success(text = "Fitting success!")
      print(this_time)
      
      retval <- this_time[[3]]
    },
    error = function(e){
      cli::cli_alert_danger(text = paste("Analysis",
                                         file_str,
                                         "failed with error",
                                         e))
      retval <- -1
    }
  )
  
  return(retval)
}

system.time(tmp_out <- mapply(fit_data,
                              this_error_model = fitopts$error_model,
                              this_dose_norm = fitopts$dose_norm,
                              this_log10_trans = fitopts$log10_trans,
                              this_time_scale = fitopts$time_scale,
                              n_cores = 14))

mean(tmp_out)/60 # Average runtime (in minutes) per fitting option
median(tmp_out)/60
sum(tmp_out) # Looks like there's 10 extra seconds of overhead for non-fitting methods

6 Characterizing the data

Load in one of the saved fit objects; it doesn’t matter which one, since we will be using only the pre-processed data, which is the same regardless of fitting options.

my_pk <- readRDS(paste0(Sys.getenv("OD_DIR"),
                        read_date_pk,
                        "mypk_fit_110p.Rds"))

Number of observations:

get_data(my_pk) %>%
  count()

Number of unique chemicals:

get_data(my_pk) %>%
  distinct(Chemical) %>%
  count()

Number of unique references:

get_data(my_pk) %>%
  distinct(Reference) %>%
  count()

Number of unique species:

get_data(my_pk) %>%
  distinct(Species) %>%
  count()

Number of unique combinations of Chemical and Species:

get_data(my_pk) %>%
  distinct(Chemical, Species) %>%
  count()

Number of unique combinations of Chemical, Species, and Reference:

get_data(my_pk) %>%
  distinct(Chemical, Species, Reference) %>%
  count()

Number of unique combinations of Chemical, Species, Reference, Route, Media:

get_data(my_pk) %>%
  distinct(Chemical, Species, Reference, Route, Media) %>%
  count()

Number of unique combinations of Chemical, Species, Reference, Route, Media, Dose:

get_data(my_pk) %>%
  distinct(Chemical, Species, Reference, Route, Media, Dose) %>%
  count()

6.1 Number of data groups with both blood and plasma data available

get_data(my_pk) %>%
  group_by(Chemical, Species) %>%
  summarise(blood_and_plasma = all(c("blood",
                  "plasma") %in% Media)) %>%
  ungroup() %>%
  summarise(n_both = sum(blood_and_plasma))

6.2 Number of data groups where time scaling could occur

get_data(my_pk) %>%
  group_by(Chemical, Species) %>%
  summarise(t_mid = mean(range(Time)),
    t_mid_log10 = midpt_log10(Time),
            new_time_units = auto_units(y = Time,
                       from = "hours",
                       target = 10)) %>%
  ungroup() %>%
  count(new_time_units)

6.3 How many experiments (Chemical, Species, Reference, Route, Media) were flagged for potential dose dependence?

get_data_info(my_pk)$dose_norm_check %>%
  dplyr::summarise(AUC_flag = sum(!is.na(data_flag_AUC)),
            Cmax_flag = sum(!is.na(data_flag_Cmax)),
            total_multi_dose = sum(n_dose_groups>1),
            total_expts = n())

6.4 Evaluating time and concentration ranges

Supp. Figure 2: Concentration and final time-points have wide ranges across CvT datasets.

#compute duration and concentration range across experiments (Chemical, Species, Reference, Route, Media, Dose)
data_range <- my_pk$data %>%
  filter(Detect, !exclude) %>%
  group_by(Chemical, Species, Reference, Media, Route, Dose, Dose.Units) %>%
  summarize(maxT = max(Time)/24,
         concRange = log10(max(Conc)/min(Conc))) %>%
  ungroup()
   
#Panel A: histogram of day of final timepoint per experiment  
sf_2A <- data_range %>%
  ggplot(aes(x = maxT)) +
  geom_histogram(fill = "grey5",
                 bins = 40,
                 color = "grey5") +
  scale_x_log10(labels = scales::number,
                breaks = c(0.1, 1, 7, 30, 365)) +
  annotation_logticks(sides = "b") +
  labs(x = "Day of final Timepoint",
       y = "Count") +
  theme(aspect.ratio = 1,
        text = element_text(size = 10),
        panel.border = element_rect(color = "black", fill = NA, size = 1),
        panel.background = element_blank(),
        panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
        axis.ticks = element_blank(),
        axis.line = element_blank())

#panel B: histogram of fold concentration range
sf_2B <- data_range %>%
  ggplot(aes(x = concRange)) +
  geom_histogram(fill = "grey5",
                 bins = 40,
                 color = "grey5") +
  scale_y_continuous(expand = c(0, 0),
                     limits = c(0, 50),
                     breaks = c(0, 25, 50)) +
  labs(x = "log10(Concentration Range)",
       y = "Count") +
  annotation_logticks(sides = "b") +
  theme(aspect.ratio = 1,
        panel.border = element_rect(color = "black", fill = NA, size = 1),
        panel.background = element_blank(),
        panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
        strip.background = element_rect(fill = "white"),
        strip.text = element_text(face = "bold"),
        axis.ticks = element_blank(),
        axis.line = element_blank(),
        axis.text = element_text(size = 10),
        axis.title = element_text(size = 11))

sf_2AB <- plot_grid(sf_2A, sf_2B,
                    labels = c("A", "B"), 
                    nrow = 1)

sf_2AB

#save PDF version
ggsave(paste0(Sys.getenv("FIG_DIR"),
              "Manu_Files/",
              today,
              "_Supp_Fig2_ConcTimeRanges",
              ".pdf"),
       height = 3.2,
       width = 6,
       bg = "white",
       units = "in")

#save PNG version
ggsave(paste0(Sys.getenv("FIG_DIR"),
              "Manu_Files/",
              today,
              "_Supp_Fig2_ConcTimeRanges",
              ".png"),
       height = 3.2,
       width = 6,
       bg = "white",
       units = "in",
       dpi = 300)

rm(sf_2A, sf_2B, sf_2AB)

6.5 Evaluating data variability

6.5.1 Figure 3

data_cvt <- get_data(my_pk)
data_counts <- data_cvt %>%
dplyr::filter(Detect) %>%
dplyr::count(Chemical, Species, Reference, Route, Media, Dose, Time, N_Subjects,
name = "Count")
data_counts <- dplyr::left_join(data_counts, data_cvt,
by = c("Chemical", "Species", "Reference",
"Route", "Media", "Dose", "Time",
"N_Subjects")) %>%
dplyr::mutate(data_descr = dplyr::case_when(
(Count > 1 & N_Subjects == 1) ~ "Individual Data, Multiple Observations",
(Count == 1 & N_Subjects == 1) ~ "Individual Data, Single Observation",
(Count == 1 & N_Subjects > 1 & Conc_SD > 0) ~ "Grouped Data w SD",
(N_Subjects > 1 & Conc_SD == 0) ~ "Grouped Data no SD",
.default = NA_character_
))

indiv_data <- subset(data_counts,
                     subset = (data_descr %in% "Individual Data, Multiple Observations"))

indiv_data <- indiv_data %>%
dplyr::group_by(Chemical, Species, Reference, Route, Media, Dose, Time) %>%
dplyr::mutate(replicate_mean = mean(Conc, na.rm = TRUE)) %>%
  ungroup()

indiv_data_bygroup <- indiv_data %>%
  dplyr::group_by(Chemical, Species) %>%
  dplyr::summarise(AFE = 10^mean(log10(Conc/replicate_mean)),
                                                                                               AAFE = 10^mean(abs(log10(Conc/replicate_mean)))) %>%
  ungroup()

indiv_data_bygroup %>% count(AFE < 0.5)
indiv_data_bygroup %>% count(AFE > 2)

Do twofold-test; use only the information about the observations at this point (predictions will be handled later).

my_tf <- my_pk %>% twofold_test()

Figure 3 Panel A: Histogram of mean-normalized concentrations

pl3A <- my_tf$individual_data %>%
  # Plotting
  ggplot(aes(x = foldConc)) +
  geom_histogram(binwidth = 0.2,
                 fill = "grey5",
                 color = "grey5",
                 linewidth = 0.4) +
  coord_cartesian(xlim =  c(0, 3.1),
                  ylim = c(0, 1200)) +
  scale_y_continuous(expand = c(0, 0),
                     breaks = c(0, 250, 500, 750, 1000),
                     labels = c("0", "2.5", "5", "7.5", "10")) +
  expand_limits(y = 0) +
  geom_vline(xintercept = c(0.5, 2),
             color = "red3",
             linetype = "dashed",
             linewidth = 1) +
  theme_bw() +
  labs(x = "Mean-normalized concentration",
                y = "Observations (hundreds))") +
  theme(text = element_text(size = 14),
        panel.border = element_rect(color = "black", linewidth = 1, fill = NA),
        panel.background = element_blank(),
        panel.grid.major.y = element_line(color = "grey90", linewidth = 0.4),
        axis.title = element_text(face = "bold"),
        axis.line = element_blank(),
        axis.text = element_text(size = 14),
        plot.background = element_blank(),
        axis.ticks.y = element_blank())

pl3A

Figure 3 Panel B: Plot of mean-normalized concentrations vs. ADME-Normalized Time

#get time of peak concentration for use in calculated ADME-normalized time
my_nca <- nca(my_pk) %>%
  dplyr::filter(param_name %in% "tmax")

pl3B <- my_tf$individual_data %>%
  inner_join(my_nca %>%
              dplyr::select(Chemical, Species,
                            Route, Media,
                            tmax = param_value) %>%
              filter(!is.na(tmax))) %>%
  group_by(Chemical,
           Species,
           Reference,
           Route,
           Media,
           Dose) %>%
  mutate(
         normTime = ifelse(
           Route == "iv",
           1 + Time/max(Time),
           ifelse(
             Time > tmax,
             ((Time - tmax)/max(Time)) + 1,
             0.5 + Time/(2*tmax)
           )
         )) %>%
  ggplot(aes(
    x = normTime,
    y = foldConc
  )) +
  scale_x_continuous(breaks = c(1,2),
                     labels = c("tmax", "end")) +
  geom_point(alpha = 0.1, size = 0.7) +
  geom_hline(yintercept = c(0.5, 2),
             color = "red3",
             linewidth = 0.8,
             linetype = "dashed") +
  facet_grid(cols = vars(Route),
             scales = "free_x") +
  labs(x = "ADME-Normalized Time",
       y = "Mean-Normalized\nConcentration") +
  theme(text = element_text(size = 14),
        panel.border = element_rect(color = "black", fill = NA, size = 1),
        panel.background = element_blank(),
        panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
        strip.background = element_rect(fill = "white"),
        strip.text = element_text(face = "bold"),
        axis.line = element_blank(),
        axis.text = element_text(size = 14),
        axis.title = element_text(face = "bold"),
        panel.spacing = unit(0.125, units = "in"),
        plot.background = element_blank(),
        legend.position = "none")

pl3B

Paste these two plots together

pl3 <- cowplot::plot_grid(pl3A,
                          pl3B,
                          ncol = 2,
                          rel_widths = c(1,2),
                          labels = c("A", "B"))

#save PDF version
ggsave(paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
              today,
                "_Fig3.pdf"),
      pl3,
          bg = "white",
      width = 8,
      height = 4,
      units = "in")

#Save PNG version
ggsave(paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
              today,
                "_Fig3.png"),
      pl3,
          bg = "white",
      width = 8,
      height = 4,
      units = "in",
      dpi = 300)

#remove unneeded objects
rm(pl3A, pl3B, pl3)

6.5.2 Supplemental Figure 3: Histogram of data variability by time

plsupp3 <- my_tf$individual_data %>%
  inner_join(my_nca %>%
              dplyr::select(Chemical, Species,
                            Route, Media,
                            tmax = param_value) %>%
              filter(!is.na(tmax))) %>%
  group_by(Chemical,
           Species,
           Reference,
           Route,
           Media,
           Dose) %>%
  mutate(
         normTime = ifelse(
           Route == "iv",
           1 + Time/max(Time),
           ifelse(
             Time > tmax,
             ((Time - tmax)/max(Time)) + 1,
             0.5 + Time/(2*tmax)
           )
         )) %>%
  ggplot(aes(
    x = normTime
  )) +
  scale_x_continuous(breaks = c(1,2),
                     labels = c("tmax", "end")) +
  geom_histogram() +
  facet_grid(cols = vars(Route),
             scales = "free_x") +
  labs(x = "ADME-Normalized Time",
       "# Observations") +
  theme(text = element_text(size = 14),
        panel.border = element_rect(color = "black", fill = NA, size = 1),
        panel.background = element_blank(),
        panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
        strip.background = element_rect(fill = "white"),
        strip.text = element_text(face = "bold"),
        #axis.ticks = element_blank(),
        axis.line = element_blank(),
        #axis.text.x = element_blank(),
        axis.text = element_text(size = 14),
        axis.title = element_text(face = "bold"),
        panel.spacing = unit(0.125, units = "in"),
        plot.background = element_blank(),
        legend.position = "none")

#save PDF version
ggsave(paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
              today,
                "_Supp_Fig3",
                ".png"),
      plsupp3,
          bg = "white",
      width = 8,
      height = 4,
      units = "in",
      dpi = 300)

ggsave(paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
              today,
                "_Supp_Fig3",
                ".pdf"),
      plsupp3,
          bg = "white",
      width = 8,
      height = 4,
      units = "in")

7 Evaluating fitting options

Here we evaluate the various fitting options by:

Define some helper functions to do the various model evaluations.

7.1 Winning model tally

# Evaluation
# Winning Model Tally
winmodel_comparison <- function(this_error_model,
                             this_dose_norm,
                             this_log10_trans,
                             this_time_scale) {
  dose_indic <- as.numeric(this_dose_norm)
  log10_indic <- as.numeric(this_log10_trans)
  time_indic <- as.numeric(this_time_scale)
  errmodel_indic <- substr(this_error_model, 1, 1)
  
  file_str <- paste0(Sys.getenv("OD_DIR"),
                     read_date_pk,
                     "mypk_fit_",
                     dose_indic,
                     log10_indic,
                     time_indic,
                     errmodel_indic,
                     ".Rds")
  pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic)
  
  current_rds <- readRDS(file = file_str)
  
  pf <- current_rds$fit %>%
    distinct(Chemical, Species, model, method, convcode) %>%
    group_by(method, .drop = FALSE) %>%
    count(convcode) %>%
    pivot_wider(names_from = convcode,
                names_prefix = "convcode.",
                values_from = n)
    
  
  
  # Wide winning model
  suppressMessages({
    winning_model <- get_winning_model(obj = current_rds)
    wide_winning_model <- winning_model %>%
      group_by(method, model) %>%
      count() %>%
      pivot_wider(names_from = model, values_from = n) %>%
      left_join(pf)
  })
  return(wide_winning_model)
}

7.2 RMSLE for Cmax and AUC (with some tallys for extreme values)

First we need to determine how to filter out cases where NCA or tkstats predicted ridiculously large AUCs. To do this, create a function to plot \(AUC_{\infty}\) predicted by the model vs. \(AUC_{\infty}\) estimated by NCA, for each set of fitting options. Show the identity line, and also show lines for a factor of 20 above and below the identity line, since a 20-fold cutoff is one proposed cutoff. Do the same for Cmax, as well.

7.2.1 AUC_inf and Cmax comparison

AUC_plot <- function(
    this_error_model,
    this_dose_norm,
    this_log10_trans,
    this_time_scale) {
  dose_indic <- as.numeric(this_dose_norm)
  log10_indic <- as.numeric(this_log10_trans)
  time_indic <- as.numeric(this_time_scale)
  errmodel_indic <- substr(this_error_model, 1, 1)
  
  file_str <- paste0(Sys.getenv("OD_DIR"),
                     read_date_pk,
                     "mypk_fit_",
                     dose_indic,
                     log10_indic,
                     time_indic,
                     errmodel_indic,
                     ".Rds")
  pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic)
  
  message(paste0("Evaluation for: ", pk_name))
  current_rds <- readRDS(file = file_str)
  # Cmax and AUC_inf RMSEs
  #note that eval_tkstats already filters winning models
  #use finite_only = FALSE so that we can count the excluded cases
  suppressMessages(RMSLE_eval <- eval_tkstats(obj = current_rds, 
                                             dose_norm = FALSE,
                                             finite_only = FALSE,
                                             suppress.messages = TRUE) %>%
                                  mutate(Options = pk_name) %>%
                                  dplyr::select(
                                    Options,
                                    Chemical,
                                    Species,
                                    Route,
                                    Media,
                                    Reference,
                                    method,
                                    model,
                                    starts_with("Cmax"),
                                    starts_with("AUC_")
                                  ))
  
 these_opts <- paste0(dose_indic,
                     log10_indic,
                     time_indic,
                     errmodel_indic)
  
  #plot AUC infinity for tkstats vs. AUC infinity for NCA
   #show identity line and 20x above and below lines
 AUC_plot <- RMSLE_eval %>%
    dplyr::filter(is.finite(AUC_infinity.nca) & is.finite(AUC_infinity.tkstats)) %>%
    ggplot() +
    geom_point(aes(x = AUC_infinity.nca,
                   y = AUC_infinity.tkstats,
                   color = model)) +
    scale_x_log10() + scale_y_log10() +
    annotation_logticks() +
    geom_abline(aes(intercept = 0, slope =1)) +
    geom_abline(aes(intercept = log10(20), slope = 1),
                linetype = 2) +
    geom_abline(aes(intercept = log10(1/20), slope = 1),
                linetype = 2) +
      facet_grid(rows = vars(method)) +
      ggtitle(paste(these_opts,
                    "AUC_inf pred vs. AUC_inf NCA")) +
      scale_color_manual(values = c("model_1comp" = "#0398FC",
                                 "model_2comp" = "#D68E09",
                               "model_flat" = "black"),
                    name = "Winning model") +
      coord_equal()
 
 Cmax_plot <- RMSLE_eval %>%
    dplyr::filter(is.finite(Cmax.nca) & is.finite(Cmax.tkstats)) %>%
    ggplot() +
    geom_point(aes(x = Cmax.nca,
                   y = Cmax.tkstats,
                   color = model)) +
    scale_x_log10() + scale_y_log10() +
    annotation_logticks() +
    geom_abline(aes(intercept = 0, slope =1)) +
    geom_abline(aes(intercept = log10(20), slope = 1),
                linetype = 2) +
    geom_abline(aes(intercept = log10(1/20), slope = 1),
                linetype = 2) +
      facet_grid(rows = vars(method)) +
      ggtitle(paste(these_opts,
                    "Cmax pred vs. Cmax NCA")) +
      scale_color_manual(values = c("model_1comp" = "#0398FC",
                                 "model_2comp" = "#D68E09",
                               "model_flat" = "black"),
                    name = "Winning model") +
      coord_equal()

  return(
    list(AUC_plot = AUC_plot,
    Cmax_plot = Cmax_plot
    )
  )
}

Run these plots for all fitting opts

all_AUC_plots <- fitopts %>%
              dplyr::rowwise() %>%
  dplyr::summarise(this_plot = list(AUC_plot(this_error_model = error_model,
                                 this_dose_norm = dose_norm,
                                 this_log10_trans = log10_trans,
                                 this_time_scale = time_scale)
                                 )
                   )
#save the plots
pdf(file = paste0(
  Sys.getenv("FIG_DIR"),
  today,
  "_AUCinf_nca_vs_tk_plots.pdf"),
  onefile = TRUE,
  height = 5,
  width = 7)

plotlist <- all_AUC_plots %>%
  pull(this_plot)

for (x in seq_along(plotlist)) {
  print(plotlist[[x]][["AUC_plot"]])
}
dev.off()

pdf(file = paste0(
  Sys.getenv("FIG_DIR"),
  today,
  "_Cmax_nca_vs_tk_plots.pdf"),
  onefile = TRUE,
  height = 5,
  width = 7)

plotlist <- all_AUC_plots %>%
  pull(this_plot)

for (x in seq_along(plotlist)) {
  print(plotlist[[x]][["Cmax_plot"]])
}
dev.off()

rm(plotlist, all_AUC_plots)

It appears that the best filter, other than filtering out NA, Inf, zero, and negative values, would be to filter out any AUC_inf values greater than 1e7, since that cutoff identifies outliers.

Write the function for RMSLE for Cmax and AUC accordingly.


# RMSLE for Cmax and AUC
rmsles_Cmax_AUC <- function(
    this_error_model,
    this_dose_norm,
    this_log10_trans,
    this_time_scale) {
  dose_indic <- as.numeric(this_dose_norm)
  log10_indic <- as.numeric(this_log10_trans)
  time_indic <- as.numeric(this_time_scale)
  errmodel_indic <- substr(this_error_model, 1, 1)
  
  file_str <- paste0(Sys.getenv("OD_DIR"),
                     read_date_pk,
                     "mypk_fit_",
                     dose_indic,
                     log10_indic,
                     time_indic,
                     errmodel_indic,
                     ".Rds")
  pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic)
  
  message(paste0("Evaluation for: ", pk_name))
  current_rds <- readRDS(file = file_str)
  # Cmax and AUC_inf RMSEs
  #note that eval_tkstats already filters winning models
  #use finite_only = FALSE so that we can count the excluded cases
  suppressMessages(RMSLE_eval <- eval_tkstats(obj = current_rds, 
                                             dose_norm = FALSE,
                                             finite_only = FALSE,
                                             suppress.messages = TRUE) %>%
                                  mutate(Options = pk_name) %>%
                                  dplyr::select(
                                    Options,
                                    Chemical,
                                    Species,
                                    Route,
                                    Media,
                                    Reference,
                                    method,
                                    model,
                                    starts_with("Cmax"),
                                    starts_with("AUC_")
                                  ))
 
  #Couunt bad observations:
  #zero, NA, infinite
  RMSLE_badobs <- RMSLE_eval %>% 
    group_by(Options, method) %>%
    summarise(total_expts = n(),
              count_AUCinf_NCA_0 = sum(AUC_infinity.nca %in% 0),
              count_AUCinf_pred_0 = sum(AUC_infinity.tkstats %in% 0),
              count_AUCinf_NCA_isNA = sum(is.na(AUC_infinity.nca)),
              count_AUCinf_pred_isNA = sum(is.na(AUC_infinity.tkstats)),
              count_AUCinf_NCA_isInf = sum(is.infinite(AUC_infinity.nca)),
              count_AUCinf_pred_isInf = sum(is.infinite(AUC_infinity.tkstats)),
             count_Cmax_NCA_0 = sum(Cmax.nca %in% 0),
              count_Cmax_pred_0 = sum(Cmax.tkstats %in% 0),
              count_Cmax_NCA_isNA = sum(is.na(Cmax.nca)),
              count_Cmax_pred_isNA = sum(is.na(Cmax.tkstats)),
              count_Cmax_NCA_isInf = sum(is.infinite(Cmax.nca)),
              count_Cmax_pred_isInf = sum(is.infinite(Cmax.tkstats))
    ) %>%
    ungroup()
  
  #count cases where AUC or Cmax was negative,
  #or where AUC_inf was greater than 1e7
  RMSLE_neg <- RMSLE_eval %>%
    dplyr::filter(is.finite(AUC_infinity.nca),
                            is.finite(AUC_infinity.tkstats),
                            is.finite(Cmax.nca),
                            is.finite(Cmax.tkstats)) %>%
    group_by(Options, method) %>%
      summarise(count_AUCinf_NCA_lt0 = sum(AUC_infinity.nca < 0),
              count_AUCinf_pred_lt0 = sum(AUC_infinity.tkstats < 0),
              count_Cmax_NCA_lt0 = sum(AUC_infinity.nca < 0),
              count_Cmax_pred_lt0 = sum(AUC_infinity.tkstats < 0),
              count_AUCinf_NCA_gt_1e7 = sum(AUC_infinity.nca > 1e7),
               count_AUCinf_pred_gt_1e7 = sum(AUC_infinity.tkstats > 1e7),
       )
  
  RMSLE_badobs <- RMSLE_badobs %>%
    left_join(RMSLE_neg, by = c("Options",
                                    "method")) %>%
        rowwise() %>%
    dplyr::mutate(total_excl = sum(
      c_across(
      starts_with("count")
      )
      )
    )
  
  #compute RMSLEs after excluding the bad observations
  RMSLE_eval <- RMSLE_eval %>%
    dplyr::filter(is.finite(AUC_infinity.nca),
           is.finite(AUC_infinity.tkstats)) %>%
    dplyr::filter(
      AUC_infinity.nca > 0,
      AUC_infinity.tkstats > 0,
      AUC_infinity.nca < 1e7,
      AUC_infinity.tkstats < 1e7) %>%
    rowwise() %>%
    mutate(
      SLE_Cmax = (log10(Cmax.tkstats) - log10(Cmax.nca)) ^ 2,
      SLE_AUC_inf = (log10(AUC_infinity.tkstats) - log10(AUC_infinity.nca)) ^ 2
    ) %>%
    dplyr::filter(!is.na(SLE_Cmax), !is.na(SLE_AUC_inf)) %>%
    group_by(Options, method) %>%
    summarise(
      n_expts = n(),
      RMSLE_Cmax = sqrt(mean(SLE_Cmax, na.rm = FALSE)) %>% signif(digits = 4),
      RMSLE_AUC = sqrt(mean(SLE_AUC_inf, na.rm = FALSE)) %>% signif(digits = 4)
    ) %>%
    ungroup()
  
  #merge in counts of bad obs
  
  RMSLE_eval <- RMSLE_eval %>%
    dplyr::left_join(RMSLE_badobs,
              by = c("Options", "method"))
  
  return(RMSLE_eval)
}

7.3 Goodness of fit metrics

Goodness-of-fit table with R-squared, RMSLE and AIC:

# Goodness-of-fit table with R-squared, RMSLE and AIC
get_gof <- function(this_error_model,
                             this_dose_norm,
                             this_log10_trans,
                             this_time_scale) {
  dose_indic <- as.numeric(this_dose_norm)
  log10_indic <- as.numeric(this_log10_trans)
  time_indic <- as.numeric(this_time_scale)
  errmodel_indic <- substr(this_error_model, 1, 1)
  
  file_str <- paste0(Sys.getenv("OD_DIR"),
                     read_date_pk,
                     "mypk_fit_",
                     dose_indic,
                     log10_indic,
                     time_indic,
                     errmodel_indic,
                     ".Rds")
  pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic)
  
  current_rds <- readRDS(file = file_str)
  winning_model <- suppressMessages(get_winning_model(obj = current_rds))
  
  suppressMessages({
    this_rsq <- rsq.pk(current_rds,
                    use_scale_conc = FALSE,
                    rsq_group = ggplot2::vars(Chemical, Species),
                    sub_pLOQ = TRUE) %>%
      semi_join(winning_model)
    
    this_AIC <- AIC(current_rds) %>%
      semi_join(winning_model)
    
    this_rmsle <- rmse.pk(current_rds,
                      rmse_group = vars(Chemical, Species),
                      use_scale_conc = list(dose_norm = FALSE,
                                            log10_trans = TRUE),
                      sub_pLOQ = TRUE) %>%
      semi_join(winning_model)

  })
  
  # Since they are all joined by winmodel
  # all three must have same NROW
  message(
    paste0("For ", pk_name, "... outputs below must have same number of rows")
  )
  
  message(paste0("rsq: ", NROW(this_rsq)))
  message(paste0("aic: ", NROW(this_AIC)))
  message(paste0("rmsle: ", NROW(this_rmsle)))
  
  gof_df <- this_rsq %>%
    inner_join(this_AIC, by = join_by(Chemical, Species, method, model)) %>%
    inner_join(this_rmsle, by = join_by(Chemical, Species, method, model)) %>%
    mutate(Options = pk_name, .before = Chemical)
  
  return(gof_df)
}

7.4 Get all predictions


# All predictions 
get_all_preds <- function(this_error_model,
                             this_dose_norm,
                             this_log10_trans,
                             this_time_scale) {
  dose_indic <- as.numeric(this_dose_norm)
  log10_indic <- as.numeric(this_log10_trans)
  time_indic <- as.numeric(this_time_scale)
  errmodel_indic <- substr(this_error_model, 1, 1)
  
  file_str <- paste0(Sys.getenv("OD_DIR"),
                     read_date_pk,
                     "mypk_fit_",
                     dose_indic,
                     log10_indic,
                     time_indic,
                     errmodel_indic,
                     ".Rds")
  pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic)
  
  
  current_rds <- readRDS(file = file_str)
  
  # Wide winning model
  suppressMessages({
    winning_model <- get_winning_model(obj = current_rds) %>%
      select(-c(near_flat, preds_below_loq))
    this_preds <- predict(current_rds,
                          use_scale_conc = FALSE) %>%
      semi_join(winning_model) %>%
      mutate(Options = pk_name, .before = Chemical)
  })
  return(this_preds)
}

7.5 Factor of two model error

# Factor of two model error
tf_tests <- function(this_error_model,
                             this_dose_norm,
                             this_log10_trans,
                             this_time_scale) {
  dose_indic <- as.numeric(this_dose_norm)
  log10_indic <- as.numeric(this_log10_trans)
  time_indic <- as.numeric(this_time_scale)
  errmodel_indic <- substr(this_error_model, 1, 1)
  
  file_str <- paste0(Sys.getenv("OD_DIR"),
                     read_date_pk,
                     "mypk_fit_",
                     dose_indic,
                     log10_indic,
                     time_indic,
                     errmodel_indic,
                     ".Rds")
  pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic)
  current_rds <- readRDS(file = file_str)
  print(paste0("Now analyzing: ", pk_name))
  
  out <- twofold_test(current_rds,
                      sub_pLOQ = TRUE,
                      suppress_messages = TRUE)
  
  out <- out$model_error_summary %>%
    mutate(Options = pk_name)
  
  return(out)
}

7.6 Evaluate the various GOF metrics

Now call these functions and do the evaluations.

Winning model comparisons:


#winning model
system.time(wm_comp <- fitopts %>%
              dplyr::rowwise() %>%
              dplyr::mutate(winmodel_comp = list(
                winmodel_comparison(this_error_model = error_model,
                                 this_dose_norm = dose_norm,
                                 this_log10_trans = log10_trans,
                                 this_time_scale = time_scale))) %>%
              tidyr::unnest(winmodel_comp))

RMSLE for Cmax and AUC:


#RMSLE for Cmax and AUC
system.time(cmax_auc_rmsle_tbl <- fitopts %>%
              dplyr::rowwise() %>%
              dplyr::mutate(nca_comp = list(
                rmsles_Cmax_AUC(this_error_model = error_model,
                                 this_dose_norm = dose_norm,
                                 this_log10_trans = log10_trans,
                                 this_time_scale = time_scale))) %>%
              tidyr::unnest(nca_comp))

Goodness-of-fit metrics table:


#GOF table
system.time(gof_tbl <- fitopts %>%
              dplyr::rowwise() %>%
              dplyr::mutate(gof_win = list(
                get_gof(this_error_model = error_model,
                                 this_dose_norm = dose_norm,
                                 this_log10_trans = log10_trans,
                                 this_time_scale = time_scale))) %>%
              tidyr::unnest(gof_win))

Factor of two:


#factor of two
system.time(all_tf_tests <- fitopts %>%
              dplyr::rowwise() %>%
              dplyr::mutate(TF = list(
                tf_tests(error_model,
                         this_dose_norm = dose_norm,
                         this_log10_trans = log10_trans,
                         this_time_scale = time_scale))) %>%
              tidyr::unnest(TF))

Get all predictions:

#all predictions
system.time(all_preds <- fitopts %>%
              dplyr::rowwise() %>%
              dplyr::mutate(these_preds = list(
                get_all_preds(this_error_model = error_model,
                                 this_dose_norm = dose_norm,
                                 this_log10_trans = log10_trans,
                                 this_time_scale = time_scale))) %>%
              tidyr::unnest(these_preds))

#Save preds to a file as well
saveRDS(all_preds, paste0(Sys.getenv("FIG_DIR"),
                          today,
                          "_all_preds.Rds"))

rm(all_preds)

Produce and save tables for these evaluation metrics.

(win_mdl_count_by_opts <- gof_tbl %>%
  group_by(Options, method, model) %>%
  count() %>%
  pivot_wider(names_from = model,
              values_from = n) )

7.7 Rank fitting options

Rank fitting options by count of Chemical-Species data groups where RMSLE < 1

gof_tbl %>%
  group_by(Options, method) %>%
  count(RMSLE < 1) %>%
  filter(`RMSLE < 1`) %>%
  arrange(desc(n)) %>%
  print(n = 8)
# bobyqa-optimized options 010h, 110h, 110p

What about R-squared greater than 0.75?

gof_tbl %>%
  group_by(Options, method) %>%
  count(Rsq > 0.75) %>%
  filter(`Rsq > 0.75`) %>%
  arrange(desc(n)) %>%
  print(n = 8)

What about both RMSLE < 1 and R-squared greater than 0.75?

rmsle_rsq_rank <- gof_tbl %>%
  group_by(Options, method) %>%
  count(RMSLE < 1 & Rsq > 0.75) %>%
  filter(`RMSLE < 1 & Rsq > 0.75`) %>%
  arrange(desc(n)) %>%
  dplyr::rename(`# Chemical-Species data groups` = n) 

rmsle_rsq_rank %>% print(n=8)

Looking at both criteria together, it appears that the best set of fitting options is actually the simplest: 000p (no data transformation and pooled error model), bobyqa-optimized.

But also, let’s add in the Cmax/AUC RMSLE calculations: Sort the fitting options from smallest to greatest RMSLE for Cmax and AUC, and see which set of fitting options yields the lowest RMSLE.

cmax_auc_rmsle_rank <- cmax_auc_rmsle_tbl %>%
  arrange(RMSLE_Cmax, RMSLE_AUC) %>%
  select(Options, method, RMSLE_Cmax, RMSLE_AUC) 

cmax_auc_rmsle_rank %>%
  print(n = 8)

This ranking strongly suggests 110p, with 100p as a second choice (both bobyqa).

110p was ranked number 3 in the list of RMSLE and Rsq criteria 9and 100p was ranked number 5). So we select 110p bobyqa as the best when all of these metrics are considered.

7.8 Supplemental Table 2: Save evaluation results

Write the evaluation results to a spreadsheet, including the rankings.


# Write the results to file
writexl::write_xlsx(x = list(
  Fit_Counts = wm_comp,
  AUC_and_Cmax_RMSLE_summary = cmax_auc_rmsle_tbl,
  Prediction_Evaluations = gof_tbl,
  Twofold_Tests = all_tf_tests,
   RMSLE_Rsq_rank = rmsle_rsq_rank,
                         Cmax_AUC_RMSLE_rank = cmax_auc_rmsle_rank),
  path = paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
                today,
                "_Supp_Table2_eval_results",
                ".xlsx"))

rm(wm_comp, cmax_auc_rmsle_tbl, gof_tbl, all_tf_tests)

gc()

7.9 Plots for evaluation of fitting options

7.9.1 Goodness-of-fit metrics Rsq and RMSLE across fitting options

Plot R-sq vs. RMSLE for the various fitting options, color-coding by winning model.

gof_tbl <- readxl::read_excel(path = paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
                read_date_results,
                "_Supp_Table2_eval_results",
                ".xlsx"),
                sheet = "Prediction_Evaluations")

 gof_tbl <- gof_tbl %>%
  dplyr::mutate(opts = substr(Options, 1, 3),
                model_text = dplyr::case_when(model %in% "model_1comp" ~ "1comp",
                                              model %in% "model_2comp" ~ "2comp",
                                              model %in% "model_flat" ~ "flat",
                                              .default = NA_character_))

fig_rsq_rmsle <- ggplot(gof_tbl) +
  geom_point(aes(x = RMSLE,
                 y = Rsq,
                 color = model_text)) +
  facet_grid(rows = vars(opts),
             cols = vars(interaction(error_model, method)
                         )
             ) +
 scale_color_manual(values = c("1comp" = "#0398FC",
                                 "2comp" = "#D68E09",
                               "flat" = "black"),
                    name = "Winning model") +
  theme_bw()

fig_rsq_rmsle

This figure reveals two things:

First, when L-BFGS-B is the optimizer, RMSLEs are much larger for a subset of data groups compared to bobyqa.

Second, when L-BFGS-B is the optimizer, the flat model is the winning model for a much greater number of data groups, indicating that for some data groups, L-BFGS-B could not locate a reasonable set of model parameters for 1- or 2-comapartment models, but bobyqa could.

If we zoom in to RMSLE < 2:

fig_rsq_rmsle_zoom <- ggplot(gof_tbl) +
  geom_point(aes(x = RMSLE,
                 y = Rsq,
                 color = model_text)) +
  facet_grid(rows = vars(opts),
             cols = vars(interaction(error_model, method)
                         )
             ) +
 scale_color_manual(values = c("1comp" = "#0398FC",
                                 "2comp" = "#D68E09",
                               "flat" = "black"),
                    name = "Winning model") +
  coord_cartesian(xlim = c(0,1)) +
  theme_bw()

fig_rsq_rmsle_zoom

When each time-scaling fit (when “1” is in the last place of the fit options shorthand) is compared to the fit in the row immediately above it (the same dose-normalization and log10-transformations options, but without time scaling), it can be seen that time scaling either has little effect on goodness of fit, or it makes RMSLE and/or R-squared slightly worse.

Save these figures.

#PDF versions
ggsave(paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
              today,
              "_Supp_Fig4",
              ".pdf"),
       fig_rsq_rmsle,
       width = 11,
       height = 8.5)

ggsave(paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
              today,
              "_Supp_Fig5",
              ".pdf"),
       fig_rsq_rmsle_zoom,
       width = 11,
       height = 8.5)

#PNG versions
ggsave(paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
              today,
              "_Supp_Fig4",
              ".png"),
       fig_rsq_rmsle,
       width = 11,
       height = 8.5)

ggsave(paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
              today,
              "_Supp_Fig5",
              ".png"),
       fig_rsq_rmsle_zoom,
       width = 11,
       height = 8.5)

rm(fig_rsq_rmsle_zoom, fig_rsq_rmsle, gof_tbl)

7.9.2 Cmax RMSLE vs. AUC RMSLE across fitting options

Plot Cmax RMSLE vs. AUC RMSLE.

#read in table
cmax_auc_rmsle_tbl <- readxl::read_xlsx(path = paste0(Sys.getenv("FIG_DIR"),
                                                   "Manu_Files/",
                                                  read_date_results,
                                                  "_Supp_Table2_eval_results.xlsx"),
                                     sheet = "AUC_and_Cmax_RMSLE_summary")
#make plot
cmax_auc_rmsle_tbl <- cmax_auc_rmsle_tbl %>%
  dplyr::mutate(opts = substr(Options, 1, 3))

cmax_auc_rmsle_fig <- ggplot(cmax_auc_rmsle_tbl) +
  geom_point(aes(x = RMSLE_AUC,
                 y = RMSLE_Cmax,
                 color = opts),
             size = 4) +
  facet_grid(rows = vars(error_model),
             cols = vars(method)) +
  scale_color_brewer(palette = "Paired",
                     name = "Fit options") +
  theme_bw()

cmax_auc_rmsle_fig

#save plot
#PDF version
ggsave(paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
              today,
              "_Supp_Fig6",
                ".pdf"),
      cmax_auc_rmsle_fig,
      width = 7,
      height = 5)

#PNG version
ggsave(paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
              today,
              "_Supp_Fig6",
                ".png"),
      cmax_auc_rmsle_fig,
      width = 7,
      height = 5,
      units = "in",
      dpi = 300)

rm(cmax_auc_rmsle_fig, cmax_auc_rmsle_tbl)

This figure reveals that overall, L-BFGS-B produced fits that did not match NCA-estimated Cmax or AUC as well as bobyqa fits. Considering only the bobyqa fits, the pooled error model achieved the smallest differences between NCA and model-predicted Cmax and AUC, and it did so with the two sets of fitting options that included both dose-normalization and log10-transformation.

Plot predicted vs. observed concentrations for everything, using the winning fit options and the winning model for each data group.

all_preds <- readRDS(paste0(Sys.getenv("FIG_DIR"),
                          read_date_results,
                          "_all_preds.Rds"))

#select winning fit opts: 110p bobyqa

all_preds_prep <- all_preds %>%
  filter(Options %in% c("110p"),
         method %in% "bobyqa") %>%
  mutate(Conc_est_sub = dplyr::if_else(Conc_est < pLOQ,
         pLOQ,
         Conc_est),
         Conc_est_belowLOQ = dplyr::if_else(Conc_est < pLOQ,
         TRUE,
         FALSE),
         ID = as.factor(paste(Chemical, Chemical_Name, Species)),
         .keep = "unused") %>%
  select(ID, Options,
         dose_norm, log10_trans, time_scale,
         Route, Media, Dose,
         Conc, Conc_est_sub, Conc_est_belowLOQ,
         Time, Reference)

split_preds <- split(all_preds_prep, all_preds_prep$ID)

sp_plots <- lapply(names(split_preds), function(i) {
  ggplot(data = split_preds[[i]],
         mapping = aes(x = Conc,
                       y = Conc_est_sub,
                       color = as.factor(Dose),
                       shape = Reference)) +
    xlab("Observed conc, mg/L") +
    ylab("Predicted conc, mg/L") +
    geom_point(size = 2) +
    geom_abline(slope = 1) +
    facet_grid(Route ~ Media, 
               scales = "free_y") +
    scale_y_log10() +
    scale_x_log10() +
    annotation_logticks(sides = "bl") +
    scale_color_viridis_d(name = "Dose, mg/kg") +
    theme_bw() +
    labs(title = i) +
    theme(legend.position = "bottom",
          plot.title = element_text(hjust = 0.5, face = "bold"))
}
)

# Very BIG FILE
pdf(file = paste0(
  Sys.getenv("FIG_DIR"),
  "Manu_Files/",
  today,
   "_log10_predvsobs_plots_110pbobyqa_all.pdf"),
  onefile = TRUE, height = 8, width = 8)
for (x in seq_along(sp_plots)) {
  print(sp_plots[[x]])
}
dev.off()

rm(sp_plots, split_preds, all_preds_prep, all_preds)

8 Analysis Plots and Tables for Best Set of Fitting Options

Begin by collating the data needed for plots. These data are generally accessible using functions/methods for evaluating fitted pk objects.

my_pk <- readRDS(paste0(Sys.getenv("OD_DIR"),
                        read_date_pk,
                        "mypk_fit_110p.Rds"))

# Need the following data for any of the subsequent plots

# after fitting pk object for all cvt objects or reading the fitted pk object in `setup`
winmodel <- get_winning_model(my_pk, method = "bobyqa")
my_preds <- semi_join(predict(my_pk, use_scale_conc = FALSE, method = "bobyqa"), winmodel) 
my_residuals <- residuals(my_pk, use_scale_conc = FALSE, method = "bobyqa") %>%
  semi_join(winmodel) 
my_tkstats <- eval_tkstats(my_pk, method = "bobyqa") %>% semi_join(winmodel)
my_nca <- get_nca(my_pk)
all_my_data <- get_data(my_pk)

#merge together long form coefs and long form coef SDs

my_tf <- twofold_test(my_pk, method = "bobyqa")  

NROW(all_my_data) > NROW(my_preds) # THis must be true... or else try distinct(my_preds)

my_coef_sd <- coef_sd(my_pk, method = "bobyqa") %>% #this will take a few minutes to run
  semi_join(winmodel) #keep only coefs & SDs for winning model

# Writing file to xlsx
writexl::write_xlsx(x = list(predictions = my_preds,
                             tkstats = my_tkstats,
                             coefs_sds = my_coef_sd),
                    path = paste0(Sys.getenv("FIG_DIR"),
                                  "Manu_Files/",
                                  today,
                                  "_Supp_Table3",
                                  ".xlsx"))

Compute fold errors, r-squared, AIC, RMSLE, RMSE.


fe_df <- fold_error(my_pk,
                     sub_pLOQ = TRUE,
                     method = "bobyqa") %>% semi_join(winmodel)

# May need to change the use_scale_conc defaults
r2_df <- rsq(my_pk, use_scale_conc = FALSE,
             sub_pLOQ = TRUE,
             method = "bobyqa") %>%
  semi_join(winmodel)  
myAIC <- AIC(my_pk, method = "bobyqa") %>%
  semi_join(winmodel)

myRMSLE <- rmse(my_pk,
               use_scale_conc = list(dose_norm = FALSE,
                                     log10_trans = TRUE),
               sub_pLOQ = TRUE,
               method = "bobyqa") %>% 
  semi_join(winmodel)  

myRMSE <- rmse(my_pk,
               rmse_group = ggplot2::vars(Chemical,
                                          Species,
                                          Route,
                                          Media,
                                          Dose),
               sub_pLOQ = TRUE,
               method = "bobyqa") %>%
  semi_join(winmodel) 

chem_name_translate <- all_my_data %>%
  dplyr::select(Chemical, Chemical_Name) %>%
  dplyr::filter(!(Chemical_Name %in% c("2,4-d",
                                     "methylene chloride",
                                     "benzo[a]pyrene"))) %>%
  dplyr::distinct() 
#these chemical names are filtered out because they are alternate/secondary versions of names for the same DTXSID that is already identified by another name

8.0.1 Supp Figure 7: invivoPKfit model performance

Overall fold error distributions.

my_tf$model_error_all %>%
  filter(model %in% c("model_1comp", "model_2comp")) %>%
  mutate(Route.model = factor(paste(Route, model),
                              levels = c("iv model_1comp",
                                         "iv model_2comp",
                                         "oral model_1comp",
                                         "oral model_2comp"))) %>%
  ggplot(aes(x = Fold_Error,
             fill = Route.model)) +
  geom_histogram(color = NA,
                 position = position_stack(),
                 bins = 50) +
  scale_x_log10(labels = scales::label_math(format = log10)) +
  annotation_logticks(sides = "b") +
    scale_fill_brewer(palette = "Paired",
                      name = "Route/winning model") +
  expand_limits(y = 0) +
  geom_vline(xintercept = c(0.5, 2), color = "black", linetype = "dashed") +
  labs(x = "Predicted/Observed",
       y = "Number of Observations") +
  theme_classic() +
  theme(aspect.ratio = 1,
        panel.border = element_rect(color = "black", fill = NA),
        panel.grid.major = element_line(color = "grey95", linewidth = 0.4)) +
  coord_cartesian(xlim = c(10^(-1.5), 100))


ggsave(paste0(Sys.getenv("FIG_DIR"),
               "Manu_Files/",
                                  today,
              "_Supp_Fig7_PredictedObserved_compartmentModels.png"),
       height = 5, width = 7)

ggsave(paste0(Sys.getenv("FIG_DIR"),
               "Manu_Files/",
                                  today,
              "_Supp_Fig7_PredictedObserved_compartmentModels.pdf"),
       height = 5, width = 7)
  

8.0.2 Figure 4 and Supplemental Figures 4 & 5: Model performance vs Data Variability

Here we examine the performance of “best fit” models with relation to data variability. Here, because we are trying to specifically compare data variability with model performance, we look at data with replicated timepoint observations, which are largely data with individual animal data.

my_tf$indiv_data_test_fold_errors %>%
  select(model, starts_with("percent_")) %>%
  glimpse()

pl_4A <- my_tf$indiv_data_fold_errors %>%
  # For Plotting
  ggplot(aes(
    y = Fold_Error,
    x = foldConc
  )) +
  geom_bin2d(bins = 40, color = NA) +
  geom_hline(yintercept = c(0.5, 2), linetype = "dashed") +
  geom_vline(xintercept = c(0.5, 2), linetype = "dashed") +
  scale_x_log10(labels = scales::label_math(format = log10),
                limits = c(0.001, 20)) +
  scale_y_log10(labels = scales::label_math(format = log10),
                limits = c(0.0001, 10000)) +
  annotation_logticks(sides = "bl") +
  scale_fill_viridis_c(option = "cividis",
                       limits = c(1, 100),
                       oob = scales::oob_squish) +
  labs(y = "Model Error",
       x = "Data Variability",
       fill = "Count") +
  theme(aspect.ratio = 1,
        panel.border = element_rect(color = "black", fill = NA, size = 1.5),
        panel.background = element_blank(),
        panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
        strip.background = element_rect(fill = "white"),
        strip.text = element_text(face = "bold"),
        axis.ticks = element_blank(),
        axis.line = element_blank(),
        axis.text = element_text(size = 14),
        axis.title = element_text(size = 14)) 

pl_4A
ggsave(paste0(Sys.getenv("FIG_DIR"),
              today,
              "_Fig4A_ModelPerformance_vDataVariability.png"),
       height = 5,
       width = 7,
       device = "png",
       dpi = 300,
       units = "in")  

Create table.

  
my_table <- my_tf$indiv_data_test_fold_errors %>%
  ungroup() %>%
  filter(model %in% "All") %>%
  select(starts_with("percent_")) %>% t() %>%
  round(digits = 2) %>%
  as.data.frame() %>%
  rownames_to_column(var = "percentages") %>%
  mutate(percentages = case_when(
    percentages == "percent_both_within" ~ "Both within a factor of 2",
    percentages == "percent_both_outside" ~ "Both outside a factor of 2",
    percentages == "percent_model_outside" ~ "Model too variable",
    percentages == "percent_data_outside" ~ "Data too variable"
  ))
  
names(my_table) <- c(" ", "Overall (%)")

flextable(my_table) %>%
  border_inner() %>%
  fontsize(part = "all", size = 11) %>%
  bold(part = "all", j = 2) %>%
  autofit()

table_plot <- gen_grob(flextable(my_table) %>%
                         border_inner() %>%
                         fontsize(part = "all", size = 10) %>%
                         bold(part = "all", j = 2) %>%
                         autofit(),
                       autowidths = TRUE,
                       fit = "width")


dual_plot <- plot_grid(pl_4A,
                       table_plot,
                       ncol = 1,
                       align = "hv",
          rel_heights = c(1, 0.5),
          rel_widths = c(1, 0.5)
          )

dual_plot

ggsave(paste0(Sys.getenv("FIG_DIR"),
              "Manu_Files/",
              today,
              "_Fig4.png"),
       plot = dual_plot,
       height = 4.5,
       width = 6,
       device = "png",
       bg = "white",
       dpi = 300,
       units = "in")  

8.0.3 Figure 5: Multiple goodness-of-fit metrics validate model performance

Here we analyze and validate model performance per data group using R-squared, RMSE, and fraction of predictions that are within 2-fold for each Chemical, Species, Route, Media, and Dose groupings.

combined_gof_df <- my_tf$model_error_all %>%
  group_by(Chemical, Species, model, method) %>%
  summarize(within_2fold = sum(between(Fold_Error, 0.5, 2))/n()) %>%
  left_join(r2_df) %>%
  left_join(myAIC) %>%
  left_join(myRMSLE) %>%
  semi_join(winmodel)
  

mypl <- ggplot(data = combined_gof_df,
               aes(
                 x = within_2fold,
                 y = Rsq,
                 color = model,
               )) +
  geom_point() +
  geom_abline(slope = 1, color = "black", linetype = "longdash") +
  scale_color_manual(values = c("#0398FC", "#D68E09", "black")) +
    #coord_cartesian(xlim = c(0,1), ylim = c(0,1)) +
  labs(x = "Fraction of predictions within 2-fold",
       y = "R-squared value") +
  theme(#aspect.ratio = 1,
        panel.border = element_rect(color = "black", fill = NA, size = 1),
        panel.background = element_blank(),
        panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
        axis.ticks = element_blank(),
        axis.line = element_blank(),
        legend.position = "none",
        legend.title = element_blank(),
        legend.key = element_blank()
  )
  
panelA_plot <- ggExtra::ggMarginal(mypl,
                                   groupFill = TRUE,
                    type = "histogram",
                    xparams = list(binwidth = 0.05),
                    yparams = list(binwidth = 0.05))
panelA_plot

panelB_plot <- ggplot(data = myRMSE,
               aes(
                 x = RMSE,
                 fill = model
               )) + 
  scale_x_log10() +
  annotation_logticks(sides = "b") +
  geom_histogram(bins = 50,
                 position = position_stack(),
                 color = NA) +
  labs(y = "Count") +
  scale_fill_manual(values = c("#0398FC", "#D68E09", "grey10")) +
   scale_color_manual(values = c("#0398FC", "#D68E09", "grey10"), guide = "none") +
  theme(text = element_text(size = 10),
        #aspect.ratio = 1,
    panel.border = element_rect(color = "black", fill = NA, size = 1),
        panel.background = element_blank(),
        panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
        axis.ticks = element_blank(),
        axis.line = element_blank(),
        strip.background = element_blank(),
    panel.spacing.y = unit(0.125, units = "in"),
        legend.title = element_blank(),
    legend.position = "bottom")
panelB_plot

plAB <- patchwork::wrap_elements(panelA_plot) + panelB_plot +
  plot_annotation(tag_levels = "A") + plot_layout(guides = 'collect', widths = c(1,0.9)) & theme(legend.position = "bottom") & guides(color = "none")

#save PNG version
ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
                         "Manu_Files/",
                         today,
                         "_Fig5.png"),
       plAB,
       height = 4,
       width = 6.5,
       bg = "white",
       units = "in")

#save PDF version
ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
                         "Manu_Files/",
                         today,
                         "_Fig5.pdf"),
       plAB,
       height = 4,
       width = 6.5,
       bg = "white")

8.0.4 Supp. Fig. 8 Examples fits for chemicals with R-squared and within 2-fold

pl <- plot(my_pk,
           method = "bobyqa",
           best_fit = TRUE,
           use_scale_conc = list(dose_norm = TRUE,
                                 log10_trans = FALSE),
           drop_nonDetect = FALSE)

cvt %>% filter(curation_set_tag == "CVT_Showa") %>%
  pull(analyte_dtxsid) -> Showa_chems


my_rsq <- rsq(my_pk, method = "bobyqa")
my_rsq %>% semi_join(winmodel) %>% filter(Chemical %in% c("DTXSID2020139",
                            "DTXSID4023917",
                            "DTXSID3061635",
                            "DTXSID8030760"))

#High rsq: DTXSID2020139 and DTXSID4023917
#Low rsq: DTXSID3061635 and DTXSID8030760

fe_df %>% filter(Chemical %in% c("DTXSID2020139",
                                  "DTXSID4023917",
                                  "DTXSID3061635",
                                  "DTXSID8030760"), Species %in% "rat") %>% group_by(Chemical, Species) %>% summarise(frac_2fold = sum(Fold_Error < 2 & Fold_Error > 0.5)/n())

# # A tibble: 4 × 3
# # Groups:   Chemical [4]
#   Chemical      Species frac_2fold
#   <chr>         <chr>        <dbl>
# 1 DTXSID2020139 rat          0.271
# 2 DTXSID3061635 rat          0.125
# 3 DTXSID4023917 rat          0.953
# 4 DTXSID8030760 rat          0.778

#High frac 2fold: DTXSID4023917 and DTXSID8030760
#Low frac 2fold: DTXSID2020139 and DTXSID3061635

pl_sub <- pl %>%
  filter(Chemical %in% c("DTXSID2020139",
                         "DTXSID4023917",
                         "DTXSID3061635",
                         "DTXSID8030760"),
         Species %in% "rat") 

ex_fits <- pl_sub %>%
  pull(final_plot)
ex_fits <- set_names(ex_fits,
          pl_sub$Chemical)



cowplot::plot_grid(plotlist = list(
  #Low frac 2fold and high Rsq
  ex_fits[["DTXSID2020139"]] +
                                     scale_color_manual(values = c("#a6bddb",
                                                                   "#74a9cf",
                                                                   "#41bbc4",
                                                                   "#2b8cbe",
                                                                   "#045a8d")) +
                                     theme(legend.position = "none") +
                                     xlim(0,30),
  #high frac 2fold and high Rsq
                                   ex_fits[["DTXSID4023917"]] +
                                     scale_color_manual(values = c("black",
                                                                   "grey40")) +
                                     theme(legend.position = "none"),
  
  #low frac 2fold and low rsq
                                    ex_fits[["DTXSID3061635"]] +
                                     scale_color_manual(values = c("#006837")) +
                                     theme(legend.position = "none"),
  
  #high frac2fold and low rsq
                                   ex_fits[["DTXSID8030760"]] +
                                     scale_color_manual(values = "magenta3") +
                                     theme(legend.position = "none")
                                  ),
                   scale = 0.85)

ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
                         "Manu_Files/",
                         today,
                         "_Supp_Fig8.png"),
       height = 12,
       width = 12)

ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
                         "Manu_Files/",
                         today,
                         "_Supp_Fig8.pdf"),
       height = 12,
       width = 12)

8.0.4.1 Supp. Fig 9: plots of the five data groups that were best fit by the null model

flat_chems <- winmodel %>%
  filter(model %in% "model_flat") %>%
  distinct(Chemical, Species)
print(flat_chems)

pl <- plot(my_pk,
method = "bobyqa",
best_fit = FALSE,
use_scale_conc = list(dose_norm = TRUE,
log10_trans = FALSE),
drop_nonDetect = FALSE)

flat_fits <- pl %>% 
  inner_join(flat_chems) %>%
                              pull(final_plot)

cowplot::plot_grid(plotlist = flat_fits)

ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
                         "Manu_Files/",
                         today,
                         "_Supp_Fig9.pdf"),
       height = 8,
       width = 16,
       units = "in")

ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
                         "Manu_Files/",
                         today,
                         "_Supp_Fig9.png"),
       bg = "white",
       height =8,
       width = 16,
       dpi = 300,
       units = "in")

8.0.5 Figure 6: Comparing derived TK stats with human TK stats from Lombardo et al.

This meta-analysis compares the TK stats from model fits with other published studies.

cvt_DTXSIDs <- unique(my_pk$data$Chemical)
# First things first, let's load the data from Lombardo et al.
lombardo <- readxl::read_xlsx(
  paste0(Sys.getenv("OD_DIR"),
         "Lombardo2018-Supplemental_82966_revised_corrected.xlsx"),
  skip = 8)

#use ctxR to search the Dashboard for DTXSIDs corresponding to these CASRNs
lombardo_dtxsid <- ctxR::chemical_equal_batch(
  word_list = unique(lombardo$`CAS #`),
  API_key = Sys.getenv("ccd_api_key")
)

#merge in the DTXSIDs
lombardo <- lombardo %>%
  left_join(lombardo_dtxsid %>% select(searchValue, dtxsid, preferredName),
            by = c("CAS #" = "searchValue"))

#any left unidentified: search by name
lombardo_missing <- lombardo %>%
  filter(is.na(dtxsid))

missing_names <- lombardo_missing %>%
  pull(Name)

lombardo_dtxsid_name <- ctxR::chemical_equal_batch(
  word_list = unique(missing_names),
  API_key = Sys.getenv("ccd_api_key")
)

lombardo_missing <- lombardo_missing %>%
  left_join(lombardo_dtxsid_name %>% 
              select(searchValue, dtxsid, preferredName),
            by = c("Name" = "searchValue") )

lombardo <- bind_rows(lombardo %>% filter(!is.na(dtxsid)),
                      lombardo_missing)

lombard_abbr <- lombardo %>%
  dplyr::select(dtxsid,
                preferredName,
                Vss = `human VDss (L/kg)`,
                CLtot = `human CL (mL/min/kg)`,
                halflife = `terminal  t1/2 (h)` ) %>%
  dplyr::mutate(Species = 'human',
                source = "Lombardo et al.",
                CLtot = CLtot*60/1000 # Lombardo reports this as mL/min/kg, need L/h/kg
  ) %>%
  arrange(halflife)

my_pk_vals <- my_tkstats %>%
  dplyr::select(dtxsid = Chemical,
                Species,
                model,
                halflife = halflife.tkstats,
                Vss = Vss.tkstats,
                CLtot = CLtot.tkstats) %>%
  dplyr::distinct() %>%
  dplyr::mutate(source = "invivoPKfit") %>%
  dplyr::filter(model != "model_flat",
                !is.na(halflife))

lombardo_inner <- my_pk_vals %>%
  inner_join(lombard_abbr, by = "dtxsid")
#79 chemicals in common

lombardo_comparison <- bind_rows(lombard_abbr %>%
                                   filter(
                                     dtxsid %in%
                                       lombardo_inner$dtxsid
                                   ),
                                 my_pk_vals %>%
                                   filter(
                                     dtxsid %in%
                                       lombardo_inner$dtxsid
                                   )
)

#merge in chemical names

lombardo_comparison <- lombardo_comparison %>%
  left_join(chem_name_translate,
            by = c("dtxsid" = "Chemical"))

#create a factor variable for chemical names that sorts chemicals from highest to lowest Lombardo halflife

chemnames_levels <- lombardo_comparison %>%
  filter(source %in% "Lombardo et al.") %>%
  arrange(halflife) %>%
  pull(Chemical_Name)


lombardo_comparison <- lombardo_comparison %>%
  mutate(Chemical_Name = factor(Chemical_Name,
                                levels = chemnames_levels))

#reshape longer and plot
lombardo_comparison %>%
  pivot_longer(cols = c("Vss", "CLtot", "halflife")) %>%
  mutate(name = factor(name, levels = c("halflife", "Vss", "CLtot"))) %>%
  ggplot(mapping = aes(
    x = value,
    y = Chemical_Name
  )) +
  geom_point(aes(color = source,
                 shape = Species),
             #position = position_dodge(0.7),
             size = 5/.pt, stroke = 2.5/.pt) +
  facet_grid(cols = vars(name),
             scales = "free_x") +
  scale_x_log10( labels = scales::label_math(format = log10)) +
  annotation_logticks(sides = "b") +
  scale_shape_manual(values = c("human" = 3, "rat" = 21)) +
  scale_color_manual(values = c("black", "#1064c9")) +
  guides(color = guide_legend(override.aes = list(shape = 21),
                              nrow = 2,
                              title.position = "top",
                              title.hjust = 0.5),
         shape = guide_legend(nrow = 2,
                              title.position = "top",
                              title.hjust = 0.5)) +
  theme(panel.border = element_rect(color = "black", fill = NA, linewidth = 1.5),
        panel.background = element_blank(),
        panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
        strip.background = element_rect(fill = "white"),
        strip.text = element_text(face = "bold", size = 12),
        text = element_text(size = 10),
        axis.ticks = element_blank(),
        axis.line = element_blank(),
        axis.text.y = element_text(face = "bold", size = 6),
        axis.text.x = element_text(size = 10),
        axis.title.y = element_blank(),
        axis.title.x = element_blank(),
        legend.position = "bottom",
        legend.key = element_blank(),
        legend.title = element_text(face = "bold"),
        legend.text = element_text())

ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
                         "Manu_Files/",
                         today,
                         "_Fig6.png"),
       height = 7,
       width = 6.5,
       device = "png", dpi = 300)

ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
                         "Manu_Files/",
                         today,
                         "_Fig6.pdf"),
       height = 7,
       width = 6.5)

8.0.6 Supp. Fig. 10: Fgutabs comparison with literature values compiled by Musther et al. (2014)

Compare with literature values of Fgutabs from the supplemental material of Musther et al. 2014.

Read in these literature values and reshape them to prepare for fitting.

species_cols <- c(paste("Mouse",
       c("Dosing Formulation",
                    "Strain",
                    "Sex",
                    "F% Mean",
                    "F% Range"),
      sep = "."),

paste("Rat",
       c("Dosing Formulation",
                    "Strain",
                    "Sex",
                    "F% Mean",
         "WSD",
                    "F% Range"),
      sep = "."),

paste("Dog",
       c("Dosing Formulation",
                    "Strain",
                    "Sex",
                    "F% Mean",
         "WSD",
                    "F% Range"),
      sep = "."),

paste("Non-Human Primate",
       c("Dosing Formulation",
                    "Strain",
                    "Sex",
                    "F% Mean",
         "WSD",
                    "F% Range"),
      sep = "."),

paste("Human",
       c("Dosing Formulation",
                    "Sex",
                    "F% Mean",
         "WSD",
                    "F% Range"),
      sep = ".")
)



musther2014 <- readxl::read_excel(
  path = paste0(Sys.getenv("OD_DIR"),
                "SupTableMusther2014.xlsx"),
  sheet = "Sheet1",
  skip = 1,
  col_names = c( "Compound",
                 "CAS",
                 "MW", "Ion Class",
               species_cols,
                "References.mouse",
                "References.rat",
                "References.dog",
                "References.NHP",
                "References.human"
                ),
  col_types = "text"
  ) 

#pivot longer
musther2014_long <- musther2014 %>%
  select(c(Compound,
           CAS,
         contains("F% Mean"))) %>%
  pivot_longer(cols = !c(Compound, CAS),
               names_to = c("Species", "stat"),
               names_sep = "\\.") %>%
  mutate(value = gsub(x=value,
                      pattern = "<",
                      replacement = ""),
         value_num = as.numeric(value))

Use ctxR to get DTXSIDs from CAS number.

musther2014_dtxsid <- ctxR::chemical_equal_batch(word_list = unique(musther2014_long$CAS), API_key = Sys.getenv("ccd_api_key"))

musther2014_long <- musther2014_long %>%
  left_join(musther2014_dtxsid %>% select(searchValue, dtxsid, preferredName),
            by = c("CAS" = "searchValue"))

Get Fgutabs estimates from the fitted pk object.

fgutabs <- coef(my_pk, method = "bobyqa") %>%
  distinct() %>%
   semi_join(winmodel) %>% #keep only winning models
  rowwise() %>%
  mutate(Fgutabs = coefs_vector["Fgutabs"]) %>%
  filter(!is.na(Fgutabs))

How many chemicals overlapped at all

length(intersect(musther2014_long$dtxsid,
                 get_data(my_pk)$Chemical))

Join with the literature estimates, keeping only cases with data in both tables

fgutabs_pk_musther <- fgutabs %>% inner_join(musther2014_long,
                                             by = c("Chemical" = "dtxsid"))

Make a plot:

#capitalize invivopkfit species to harmonize with Musther
fgutabs_pk_musther <- fgutabs_pk_musther %>%
  dplyr::mutate(Species.x =  stringr::str_to_sentence(Species.x))

ggplot(fgutabs_pk_musther) +
  geom_point(aes(y = preferredName,
                 x = Fgutabs,
                 shape = Species.x,
                 color = "invivopkfit"),
             size =4,
             stroke = 2) +
  geom_point(aes(y = preferredName,
                 x = value_num/100,
                 shape = Species.y,
                 color = "Musther et al. 2014"),
             size = 4,
             stroke = 2) +
  scale_shape_manual(values = c("Rat" = 21,
                                "Mouse" = 22,
                                "Dog" = 23,
                                "Non-Human Primate" = 24,
                                "Human" = 3),
                     breaks = c("Rat", "Mouse", "Dog", "Non-Human Primate", "Human")) +
  
  theme_bw() +
  theme(axis.title.y = element_blank(),
        legend.title = element_blank(),
        axis.title.x = element_text(size = 12),
        axis.text = element_text(size = 12),
        legend.text = element_text(size = 12)
        )

ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
                         "Manu_Files/",
                         today,
                         "_Supp_Fig10.pdf"),
       height = 5,
       width = 8)

ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
                          "Manu_Files/",
                         today,
                         "_Supp_Fig10.png"),
       height = 5,
       width = 8,
       device = "png", dpi = 300)

8.0.7 Supp. Fig. 11: Parallelization decreases runtime of invivoPKfit

Benchmarking for parallelization vs. non-parallelization

# Randomize the order of chemical species
chem_spec <- cvt %>%
  dplyr::distinct(analyte_dtxsid, species) %>%
  dplyr::slice_sample(prop = 1)

test_group_size <- seq(25, 105, by = 10)
fit_options <- expand.grid(error_model = c("pooled",
                                           "hierarchical"),
                           dose_norm = FALSE,
                           log10_trans = TRUE,
                           time_scale = c(
                             TRUE,
                             FALSE),
                           stringsAsFactors = FALSE)

fit_options <- tidyr::expand_grid(fit_options, test_group_size)

fit_options <- fit_options %>% rowwise() %>%
  mutate(this_data = list({
    tmp <- chem_spec %>% slice_head(n = test_group_size) %>%
      left_join(cvt, by = join_by(analyte_dtxsid, species)) %>% 
      pk() +
        scale_time(new_units = ifelse(!time_scale, 
                                      "identity",
                                      "auto")) +
        scale_conc(dose_norm = dose_norm, log10_trans = log10_trans) +
        settings_preprocess(suppress.messages = TRUE)
    do_prefit(tmp)
  }
  ),
  data_size = chem_spec %>% slice_head(n = test_group_size) %>%
    left_join(cvt, by = join_by(analyte_dtxsid, species)) %>%
    NROW())

# Organization of the benchmarking
# For each fit_option, return the four values of system.time() that we want
df_out <- fit_options %>%
  rowwise() %>%
  mutate(tim_1core = system.time(do_fit(this_data))[["elapsed"]],
         tim_4core = system.time(do_fit(this_data, n_cores = 4))[["elapsed"]],
         tim_7core = system.time(do_fit(this_data, n_cores = 7))[["elapsed"]],
         tim_12core = system.time(do_fit(this_data, n_cores = 12))[["elapsed"]]
         )

df_out <- df_out %>% select(-this_data)
gc()

full_long <- df_out %>%
  pivot_longer(cols = c(tim_1core:tim_12core),
                        names_to = "N_cores",
                        values_to = "Time_s") %>%
  group_by(N_cores, test_group_size, data_size) %>%
  summarize(mean_time = mean(Time_s, na.rm = TRUE)/60,
            max_time = max(Time_s, na.rm = TRUE)/60,
            min_time = min(Time_s, na.rm = TRUE)/60
            ) %>%
  mutate(N_cores = as.numeric(str_extract(N_cores, "[:digit:]+")))

df_out %>% clipr::write_clip()
ggplot(full_long,
       aes(x = test_group_size,
           y = mean_time,
           color = as.factor(N_cores))) +
  geom_point() +
  geom_linerange(aes(ymin = min_time,
                ymax = max_time)) +
  geom_line(aes(group = N_cores)) +
  labs(x = "Number of Data Groups",
       y = "Runtime (minutes)",
       title = "Number of Processing Cores",
       subtitle = "(with min/max runtime range)") +
  facet_grid(cols = vars(N_cores)) +
  scale_color_manual(values = c("black", "#40c679", "#31a354", "#006837")) +
  coord_fixed(ratio = 8) +
  theme(panel.border = element_rect(color = "black", fill = NA, size = 1.5),
        panel.background = element_blank(),
        panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
        legend.position = "none",
        plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5),
        axis.ticks = element_blank(),
        axis.line = element_blank(),
        strip.background = element_blank())


ggsave(paste0(Sys.getenv("FIG_DIR"),
              "Manu_Files/",
              today,
              "_Supp_Fig11.png"),
       width = 5,
       height = 2.5,
       units = "in")