6 Characterizing the data

Load in one of the saved fit objects; it doesn’t matter which one, since we will be using only the pre-processed data, which is the same regardless of fitting options.

my_pk <- readRDS(paste0(Sys.getenv("OD_DIR"),
                        read_date_pk,
                        "mypk_fit_110p.Rds"))

Number of observations:

get_data(my_pk) %>%
  count()

Number of unique chemicals:

get_data(my_pk) %>%
  distinct(Chemical) %>%
  count()

Number of unique references:

get_data(my_pk) %>%
  distinct(Reference) %>%
  count()

Number of unique species:

get_data(my_pk) %>%
  distinct(Species) %>%
  count()

Number of unique combinations of Chemical and Species:

get_data(my_pk) %>%
  distinct(Chemical, Species) %>%
  count()

Number of unique combinations of Chemical, Species, and Reference:

get_data(my_pk) %>%
  distinct(Chemical, Species, Reference) %>%
  count()

Number of unique combinations of Chemical, Species, Reference, Route, Media:

get_data(my_pk) %>%
  distinct(Chemical, Species, Reference, Route, Media) %>%
  count()

Number of unique combinations of Chemical, Species, Reference, Route, Media, Dose:

get_data(my_pk) %>%
  distinct(Chemical, Species, Reference, Route, Media, Dose) %>%
  count()

6.1 Number of data groups with both blood and plasma data available

get_data(my_pk) %>%
  group_by(Chemical, Species) %>%
  summarise(blood_and_plasma = all(c("blood",
                  "plasma") %in% Media)) %>%
  ungroup() %>%
  summarise(n_both = sum(blood_and_plasma))

6.2 Number of data groups where time scaling could occur

get_data(my_pk) %>%
  group_by(Chemical, Species) %>%
  summarise(t_mid = mean(range(Time)),
    t_mid_log10 = midpt_log10(Time),
            new_time_units = auto_units(y = Time,
                       from = "hours",
                       target = 10)) %>%
  ungroup() %>%
  count(new_time_units)

6.3 How many experiments (Chemical, Species, Reference, Route, Media) were flagged for potential dose dependence?

get_data_info(my_pk)$dose_norm_check %>%
  dplyr::summarise(AUC_flag = sum(!is.na(data_flag_AUC)),
            Cmax_flag = sum(!is.na(data_flag_Cmax)),
            total_multi_dose = sum(n_dose_groups>1),
            total_expts = n())

6.4 Evaluating time and concentration ranges

Supp. Figure 2: Concentration and final time-points have wide ranges across CvT datasets.

#compute duration and concentration range across experiments (Chemical, Species, Reference, Route, Media, Dose)
data_range <- my_pk$data %>%
  filter(Detect, !exclude) %>%
  group_by(Chemical, Species, Reference, Media, Route, Dose, Dose.Units) %>%
  summarize(maxT = max(Time)/24,
         concRange = log10(max(Conc)/min(Conc))) %>%
  ungroup()
   
#Panel A: histogram of day of final timepoint per experiment  
sf_2A <- data_range %>%
  ggplot(aes(x = maxT)) +
  geom_histogram(fill = "grey5",
                 bins = 40,
                 color = "grey5") +
  scale_x_log10(labels = scales::number,
                breaks = c(0.1, 1, 7, 30, 365)) +
  annotation_logticks(sides = "b") +
  labs(x = "Day of final Timepoint",
       y = "Count") +
  theme(aspect.ratio = 1,
        text = element_text(size = 10),
        panel.border = element_rect(color = "black", fill = NA, size = 1),
        panel.background = element_blank(),
        panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
        axis.ticks = element_blank(),
        axis.line = element_blank())

#panel B: histogram of fold concentration range
sf_2B <- data_range %>%
  ggplot(aes(x = concRange)) +
  geom_histogram(fill = "grey5",
                 bins = 40,
                 color = "grey5") +
  scale_y_continuous(expand = c(0, 0),
                     limits = c(0, 50),
                     breaks = c(0, 25, 50)) +
  labs(x = "log10(Concentration Range)",
       y = "Count") +
  annotation_logticks(sides = "b") +
  theme(aspect.ratio = 1,
        panel.border = element_rect(color = "black", fill = NA, size = 1),
        panel.background = element_blank(),
        panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
        strip.background = element_rect(fill = "white"),
        strip.text = element_text(face = "bold"),
        axis.ticks = element_blank(),
        axis.line = element_blank(),
        axis.text = element_text(size = 10),
        axis.title = element_text(size = 11))

sf_2AB <- plot_grid(sf_2A, sf_2B,
                    labels = c("A", "B"), 
                    nrow = 1)

sf_2AB

#save PDF version
ggsave(paste0(Sys.getenv("FIG_DIR"),
              "Manu_Files/",
              today,
              "_Supp_Fig2_ConcTimeRanges",
              ".pdf"),
       height = 3.2,
       width = 6,
       bg = "white",
       units = "in")

#save PNG version
ggsave(paste0(Sys.getenv("FIG_DIR"),
              "Manu_Files/",
              today,
              "_Supp_Fig2_ConcTimeRanges",
              ".png"),
       height = 3.2,
       width = 6,
       bg = "white",
       units = "in",
       dpi = 300)

rm(sf_2A, sf_2B, sf_2AB)

6.5 Evaluating data variability

6.5.1 Figure 3

data_cvt <- get_data(my_pk)
data_counts <- data_cvt %>%
dplyr::filter(Detect) %>%
dplyr::count(Chemical, Species, Reference, Route, Media, Dose, Time, N_Subjects,
name = "Count")
data_counts <- dplyr::left_join(data_counts, data_cvt,
by = c("Chemical", "Species", "Reference",
"Route", "Media", "Dose", "Time",
"N_Subjects")) %>%
dplyr::mutate(data_descr = dplyr::case_when(
(Count > 1 & N_Subjects == 1) ~ "Individual Data, Multiple Observations",
(Count == 1 & N_Subjects == 1) ~ "Individual Data, Single Observation",
(Count == 1 & N_Subjects > 1 & Conc_SD > 0) ~ "Grouped Data w SD",
(N_Subjects > 1 & Conc_SD == 0) ~ "Grouped Data no SD",
.default = NA_character_
))

indiv_data <- subset(data_counts,
                     subset = (data_descr %in% "Individual Data, Multiple Observations"))

indiv_data <- indiv_data %>%
dplyr::group_by(Chemical, Species, Reference, Route, Media, Dose, Time) %>%
dplyr::mutate(replicate_mean = mean(Conc, na.rm = TRUE)) %>%
  ungroup()

indiv_data_bygroup <- indiv_data %>%
  dplyr::group_by(Chemical, Species) %>%
  dplyr::summarise(AFE = 10^mean(log10(Conc/replicate_mean)),
                                                                                               AAFE = 10^mean(abs(log10(Conc/replicate_mean)))) %>%
  ungroup()

indiv_data_bygroup %>% count(AFE < 0.5)
indiv_data_bygroup %>% count(AFE > 2)

Do twofold-test; use only the information about the observations at this point (predictions will be handled later).

my_tf <- my_pk %>% twofold_test()

Figure 3 Panel A: Histogram of mean-normalized concentrations

pl3A <- my_tf$individual_data %>%
  # Plotting
  ggplot(aes(x = foldConc)) +
  geom_histogram(binwidth = 0.2,
                 fill = "grey5",
                 color = "grey5",
                 linewidth = 0.4) +
  coord_cartesian(xlim =  c(0, 3.1),
                  ylim = c(0, 1200)) +
  scale_y_continuous(expand = c(0, 0),
                     breaks = c(0, 250, 500, 750, 1000),
                     labels = c("0", "2.5", "5", "7.5", "10")) +
  expand_limits(y = 0) +
  geom_vline(xintercept = c(0.5, 2),
             color = "red3",
             linetype = "dashed",
             linewidth = 1) +
  theme_bw() +
  labs(x = "Mean-normalized concentration",
                y = "Observations (hundreds))") +
  theme(text = element_text(size = 14),
        panel.border = element_rect(color = "black", linewidth = 1, fill = NA),
        panel.background = element_blank(),
        panel.grid.major.y = element_line(color = "grey90", linewidth = 0.4),
        axis.title = element_text(face = "bold"),
        axis.line = element_blank(),
        axis.text = element_text(size = 14),
        plot.background = element_blank(),
        axis.ticks.y = element_blank())

pl3A

Figure 3 Panel B: Plot of mean-normalized concentrations vs. ADME-Normalized Time

#get time of peak concentration for use in calculated ADME-normalized time
my_nca <- nca(my_pk) %>%
  dplyr::filter(param_name %in% "tmax")

pl3B <- my_tf$individual_data %>%
  inner_join(my_nca %>%
              dplyr::select(Chemical, Species,
                            Route, Media,
                            tmax = param_value) %>%
              filter(!is.na(tmax))) %>%
  group_by(Chemical,
           Species,
           Reference,
           Route,
           Media,
           Dose) %>%
  mutate(
         normTime = ifelse(
           Route == "iv",
           1 + Time/max(Time),
           ifelse(
             Time > tmax,
             ((Time - tmax)/max(Time)) + 1,
             0.5 + Time/(2*tmax)
           )
         )) %>%
  ggplot(aes(
    x = normTime,
    y = foldConc
  )) +
  scale_x_continuous(breaks = c(1,2),
                     labels = c("tmax", "end")) +
  geom_point(alpha = 0.1, size = 0.7) +
  geom_hline(yintercept = c(0.5, 2),
             color = "red3",
             linewidth = 0.8,
             linetype = "dashed") +
  facet_grid(cols = vars(Route),
             scales = "free_x") +
  labs(x = "ADME-Normalized Time",
       y = "Mean-Normalized\nConcentration") +
  theme(text = element_text(size = 14),
        panel.border = element_rect(color = "black", fill = NA, size = 1),
        panel.background = element_blank(),
        panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
        strip.background = element_rect(fill = "white"),
        strip.text = element_text(face = "bold"),
        axis.line = element_blank(),
        axis.text = element_text(size = 14),
        axis.title = element_text(face = "bold"),
        panel.spacing = unit(0.125, units = "in"),
        plot.background = element_blank(),
        legend.position = "none")

pl3B

Paste these two plots together

pl3 <- cowplot::plot_grid(pl3A,
                          pl3B,
                          ncol = 2,
                          rel_widths = c(1,2),
                          labels = c("A", "B"))

#save PDF version
ggsave(paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
              today,
                "_Fig3.pdf"),
      pl3,
          bg = "white",
      width = 8,
      height = 4,
      units = "in")

#Save PNG version
ggsave(paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
              today,
                "_Fig3.png"),
      pl3,
          bg = "white",
      width = 8,
      height = 4,
      units = "in",
      dpi = 300)

#remove unneeded objects
rm(pl3A, pl3B, pl3)

6.5.2 Supplemental Figure 3: Histogram of data variability by time

plsupp3 <- my_tf$individual_data %>%
  inner_join(my_nca %>%
              dplyr::select(Chemical, Species,
                            Route, Media,
                            tmax = param_value) %>%
              filter(!is.na(tmax))) %>%
  group_by(Chemical,
           Species,
           Reference,
           Route,
           Media,
           Dose) %>%
  mutate(
         normTime = ifelse(
           Route == "iv",
           1 + Time/max(Time),
           ifelse(
             Time > tmax,
             ((Time - tmax)/max(Time)) + 1,
             0.5 + Time/(2*tmax)
           )
         )) %>%
  ggplot(aes(
    x = normTime
  )) +
  scale_x_continuous(breaks = c(1,2),
                     labels = c("tmax", "end")) +
  geom_histogram() +
  facet_grid(cols = vars(Route),
             scales = "free_x") +
  labs(x = "ADME-Normalized Time",
       "# Observations") +
  theme(text = element_text(size = 14),
        panel.border = element_rect(color = "black", fill = NA, size = 1),
        panel.background = element_blank(),
        panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
        strip.background = element_rect(fill = "white"),
        strip.text = element_text(face = "bold"),
        #axis.ticks = element_blank(),
        axis.line = element_blank(),
        #axis.text.x = element_blank(),
        axis.text = element_text(size = 14),
        axis.title = element_text(face = "bold"),
        panel.spacing = unit(0.125, units = "in"),
        plot.background = element_blank(),
        legend.position = "none")

#save PDF version
ggsave(paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
              today,
                "_Supp_Fig3",
                ".png"),
      plsupp3,
          bg = "white",
      width = 8,
      height = 4,
      units = "in",
      dpi = 300)

ggsave(paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
              today,
                "_Supp_Fig3",
                ".pdf"),
      plsupp3,
          bg = "white",
      width = 8,
      height = 4,
      units = "in")

7 Evaluating fitting options

Here we evaluate the various fitting options by:

Counting the number of 1-, 2-compartment and flat model fits that had lowest AIC (winning model).
Summarizing the root mean squared log10 error between model fit and NCA approximation of:
- Cmax
- AUC_infinity
- (Counts and filters extreme values of log10(model/NCA) > log10(20) for AUC_infinity)
- (Counts and filters non-finite and zero values of AUC_infinity)
A summary goodness-of-fit table for each data group’s winning model and fitting options that includes:
- R-squared
- RMSLE
- AIC
All predictions for each data group and fitting option (winning models only)
Model errors within a factor of two for each fitting option

Define some helper functions to do the various model evaluations.

7.1 Winning model tally

# Evaluation
# Winning Model Tally
winmodel_comparison <- function(this_error_model,
                             this_dose_norm,
                             this_log10_trans,
                             this_time_scale) {
  dose_indic <- as.numeric(this_dose_norm)
  log10_indic <- as.numeric(this_log10_trans)
  time_indic <- as.numeric(this_time_scale)
  errmodel_indic <- substr(this_error_model, 1, 1)
  
  file_str <- paste0(Sys.getenv("OD_DIR"),
                     read_date_pk,
                     "mypk_fit_",
                     dose_indic,
                     log10_indic,
                     time_indic,
                     errmodel_indic,
                     ".Rds")
  pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic)
  
  current_rds <- readRDS(file = file_str)
  
  # Wide winning model
  suppressMessages({
    winning_model <- get_winning_model(obj = current_rds) 
    
    wide_winning_model <- winning_model %>%
      dplyr::group_by(method, model) %>%
      dplyr::count() %>%
      tidyr::pivot_wider(names_from = model,
                         values_from = n)
  })
  return(wide_winning_model)
}

7.2 RMSLE for Cmax and AUC (with some tallys for extreme values)

First we need to determine how to filter out cases where NCA or tkstats predicted ridiculously large AUCs. To do this, create a function to plot \(AUC_{\infty}\) predicted by the model vs. \(AUC_{\infty}\) estimated by NCA, for each set of fitting options. Show the identity line, and also show lines for a factor of 20 above and below the identity line, since a 20-fold cutoff is one proposed cutoff. Do the same for Cmax, as well.

7.2.1 AUC_inf and Cmax comparison

AUC_plot <- function(
    this_error_model,
    this_dose_norm,
    this_log10_trans,
    this_time_scale) {
  dose_indic <- as.numeric(this_dose_norm)
  log10_indic <- as.numeric(this_log10_trans)
  time_indic <- as.numeric(this_time_scale)
  errmodel_indic <- substr(this_error_model, 1, 1)
  
  file_str <- paste0(Sys.getenv("OD_DIR"),
                     read_date_pk,
                     "mypk_fit_",
                     dose_indic,
                     log10_indic,
                     time_indic,
                     errmodel_indic,
                     ".Rds")
  pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic)
  
  message(paste0("Evaluation for: ", pk_name))
  current_rds <- readRDS(file = file_str)
  # Cmax and AUC_inf RMSEs
  #note that eval_tkstats already filters winning models
  #use finite_only = FALSE so that we can count the excluded cases
  suppressMessages(RMSLE_eval <- eval_tkstats(obj = current_rds, 
                                             dose_norm = FALSE,
                                             finite_only = FALSE,
                                             suppress.messages = TRUE) %>%
                                  mutate(Options = pk_name) %>%
                                  dplyr::select(
                                    Options,
                                    Chemical,
                                    Species,
                                    Route,
                                    Media,
                                    Reference,
                                    method,
                                    model,
                                    starts_with("Cmax"),
                                    starts_with("AUC_")
                                  ))
  
 these_opts <- paste0(dose_indic,
                     log10_indic,
                     time_indic,
                     errmodel_indic)
  
  #plot AUC infinity for tkstats vs. AUC infinity for NCA
   #show identity line and 20x above and below lines
 AUC_plot <- RMSLE_eval %>%
    dplyr::filter(is.finite(AUC_infinity.nca) & is.finite(AUC_infinity.tkstats)) %>%
    ggplot() +
    geom_point(aes(x = AUC_infinity.nca,
                   y = AUC_infinity.tkstats,
                   color = model)) +
    scale_x_log10() + scale_y_log10() +
    annotation_logticks() +
    geom_abline(aes(intercept = 0, slope =1)) +
    geom_abline(aes(intercept = log10(20), slope = 1),
                linetype = 2) +
    geom_abline(aes(intercept = log10(1/20), slope = 1),
                linetype = 2) +
      facet_grid(rows = vars(method)) +
      ggtitle(paste(these_opts,
                    "AUC_inf pred vs. AUC_inf NCA")) +
      scale_color_manual(values = c("model_1comp" = "#0398FC",
                                 "model_2comp" = "#D68E09",
                               "model_flat" = "black"),
                    name = "Winning model") +
      coord_equal()
 
 Cmax_plot <- RMSLE_eval %>%
    dplyr::filter(is.finite(Cmax.nca) & is.finite(Cmax.tkstats)) %>%
    ggplot() +
    geom_point(aes(x = Cmax.nca,
                   y = Cmax.tkstats,
                   color = model)) +
    scale_x_log10() + scale_y_log10() +
    annotation_logticks() +
    geom_abline(aes(intercept = 0, slope =1)) +
    geom_abline(aes(intercept = log10(20), slope = 1),
                linetype = 2) +
    geom_abline(aes(intercept = log10(1/20), slope = 1),
                linetype = 2) +
      facet_grid(rows = vars(method)) +
      ggtitle(paste(these_opts,
                    "Cmax pred vs. Cmax NCA")) +
      scale_color_manual(values = c("model_1comp" = "#0398FC",
                                 "model_2comp" = "#D68E09",
                               "model_flat" = "black"),
                    name = "Winning model") +
      coord_equal()

  return(
    list(AUC_plot = AUC_plot,
    Cmax_plot = Cmax_plot
    )
  )
}

Run these plots for all fitting opts

all_AUC_plots <- fitopts %>%
              dplyr::rowwise() %>%
  dplyr::summarise(this_plot = list(AUC_plot(this_error_model = error_model,
                                 this_dose_norm = dose_norm,
                                 this_log10_trans = log10_trans,
                                 this_time_scale = time_scale)
                                 )
                   )
#save the plots
pdf(file = paste0(
  Sys.getenv("FIG_DIR"),
  today,
  "_AUCinf_nca_vs_tk_plots.pdf"),
  onefile = TRUE,
  height = 5,
  width = 7)

plotlist <- all_AUC_plots %>%
  pull(this_plot)

for (x in seq_along(plotlist)) {
  print(plotlist[[x]][["AUC_plot"]])
}
dev.off()

pdf(file = paste0(
  Sys.getenv("FIG_DIR"),
  today,
  "_Cmax_nca_vs_tk_plots.pdf"),
  onefile = TRUE,
  height = 5,
  width = 7)

plotlist <- all_AUC_plots %>%
  pull(this_plot)

for (x in seq_along(plotlist)) {
  print(plotlist[[x]][["Cmax_plot"]])
}
dev.off()

rm(plotlist, all_AUC_plots)

It appears that the best filter, other than filtering out NA, Inf, zero, and negative values, would be to filter out any AUC_inf values greater than 1e7, since that cutoff identifies outliers.

Write the function for RMSLE for Cmax and AUC accordingly.


# RMSLE for Cmax and AUC
rmsles_Cmax_AUC <- function(
    this_error_model,
    this_dose_norm,
    this_log10_trans,
    this_time_scale) {
  dose_indic <- as.numeric(this_dose_norm)
  log10_indic <- as.numeric(this_log10_trans)
  time_indic <- as.numeric(this_time_scale)
  errmodel_indic <- substr(this_error_model, 1, 1)
  
  file_str <- paste0(Sys.getenv("OD_DIR"),
                     read_date_pk,
                     "mypk_fit_",
                     dose_indic,
                     log10_indic,
                     time_indic,
                     errmodel_indic,
                     ".Rds")
  pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic)
  
  message(paste0("Evaluation for: ", pk_name))
  current_rds <- readRDS(file = file_str)
  # Cmax and AUC_inf RMSEs
  #note that eval_tkstats already filters winning models
  #use finite_only = FALSE so that we can count the excluded cases
  suppressMessages(RMSLE_eval <- eval_tkstats(obj = current_rds, 
                                             dose_norm = FALSE,
                                             finite_only = FALSE,
                                             suppress.messages = TRUE) %>%
                                  mutate(Options = pk_name) %>%
                                  dplyr::select(
                                    Options,
                                    Chemical,
                                    Species,
                                    Route,
                                    Media,
                                    Reference,
                                    method,
                                    model,
                                    starts_with("Cmax"),
                                    starts_with("AUC_")
                                  ))
 
  #Couunt bad observations:
  #zero, NA, infinite
  RMSLE_badobs <- RMSLE_eval %>% 
    group_by(Options, method) %>%
    summarise(total_expts = n(),
              count_AUCinf_NCA_0 = sum(AUC_infinity.nca %in% 0),
              count_AUCinf_pred_0 = sum(AUC_infinity.tkstats %in% 0),
              count_AUCinf_NCA_isNA = sum(is.na(AUC_infinity.nca)),
              count_AUCinf_pred_isNA = sum(is.na(AUC_infinity.tkstats)),
              count_AUCinf_NCA_isInf = sum(is.infinite(AUC_infinity.nca)),
              count_AUCinf_pred_isInf = sum(is.infinite(AUC_infinity.tkstats)),
             count_Cmax_NCA_0 = sum(Cmax.nca %in% 0),
              count_Cmax_pred_0 = sum(Cmax.tkstats %in% 0),
              count_Cmax_NCA_isNA = sum(is.na(Cmax.nca)),
              count_Cmax_pred_isNA = sum(is.na(Cmax.tkstats)),
              count_Cmax_NCA_isInf = sum(is.infinite(Cmax.nca)),
              count_Cmax_pred_isInf = sum(is.infinite(Cmax.tkstats))
    ) %>%
    ungroup()
  
  #count cases where AUC or Cmax was negative,
  #or where AUC_inf was greater than 1e7
  RMSLE_neg <- RMSLE_eval %>%
    dplyr::filter(is.finite(AUC_infinity.nca),
                            is.finite(AUC_infinity.tkstats),
                            is.finite(Cmax.nca),
                            is.finite(Cmax.tkstats)) %>%
    group_by(Options, method) %>%
      summarise(count_AUCinf_NCA_lt0 = sum(AUC_infinity.nca < 0),
              count_AUCinf_pred_lt0 = sum(AUC_infinity.tkstats < 0),
              count_Cmax_NCA_lt0 = sum(AUC_infinity.nca < 0),
              count_Cmax_pred_lt0 = sum(AUC_infinity.tkstats < 0),
              count_AUCinf_NCA_gt_1e7 = sum(AUC_infinity.nca > 1e7),
               count_AUCinf_pred_gt_1e7 = sum(AUC_infinity.tkstats > 1e7),
       )
  
  RMSLE_badobs <- RMSLE_badobs %>%
    left_join(RMSLE_neg, by = c("Options",
                                    "method")) %>%
        rowwise() %>%
    dplyr::mutate(total_excl = sum(
      c_across(
      starts_with("count")
      )
      )
    )
  
  #compute RMSLEs after excluding the bad observations
  RMSLE_eval <- RMSLE_eval %>%
    dplyr::filter(is.finite(AUC_infinity.nca),
           is.finite(AUC_infinity.tkstats)) %>%
    dplyr::filter(
      AUC_infinity.nca > 0,
      AUC_infinity.tkstats > 0,
      AUC_infinity.nca < 1e7,
      AUC_infinity.tkstats < 1e7) %>%
    rowwise() %>%
    mutate(
      SLE_Cmax = (log10(Cmax.tkstats) - log10(Cmax.nca)) ^ 2,
      SLE_AUC_inf = (log10(AUC_infinity.tkstats) - log10(AUC_infinity.nca)) ^ 2
    ) %>%
    dplyr::filter(!is.na(SLE_Cmax), !is.na(SLE_AUC_inf)) %>%
    group_by(Options, method) %>%
    summarise(
      n_expts = n(),
      RMSLE_Cmax = sqrt(mean(SLE_Cmax, na.rm = FALSE)) %>% signif(digits = 4),
      RMSLE_AUC = sqrt(mean(SLE_AUC_inf, na.rm = FALSE)) %>% signif(digits = 4)
    ) %>%
    ungroup()
  
  #merge in counts of bad obs
  
  RMSLE_eval <- RMSLE_eval %>%
    dplyr::left_join(RMSLE_badobs,
              by = c("Options", "method"))
  
  return(RMSLE_eval)
}

7.3 Goodness of fit metrics

Goodness-of-fit table with R-squared, RMSLE and AIC:

# Goodness-of-fit table with R-squared, RMSLE and AIC
get_gof <- function(this_error_model,
                             this_dose_norm,
                             this_log10_trans,
                             this_time_scale) {
  dose_indic <- as.numeric(this_dose_norm)
  log10_indic <- as.numeric(this_log10_trans)
  time_indic <- as.numeric(this_time_scale)
  errmodel_indic <- substr(this_error_model, 1, 1)
  
  file_str <- paste0(Sys.getenv("OD_DIR"),
                     read_date_pk,
                     "mypk_fit_",
                     dose_indic,
                     log10_indic,
                     time_indic,
                     errmodel_indic,
                     ".Rds")
  pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic)
  
  current_rds <- readRDS(file = file_str)
  winning_model <- suppressMessages(get_winning_model(obj = current_rds))
  
  suppressMessages({
    this_rsq <- rsq.pk(current_rds,
                    use_scale_conc = FALSE,
                    rsq_group = ggplot2::vars(Chemical, Species),
                    sub_pLOQ = TRUE) %>%
      semi_join(winning_model)
    
    this_AIC <- AIC(current_rds) %>%
      semi_join(winning_model)
    
    this_rmsle <- rmse.pk(current_rds,
                      rmse_group = vars(Chemical, Species),
                      use_scale_conc = list(dose_norm = FALSE,
                                            log10_trans = TRUE),
                      sub_pLOQ = TRUE) %>%
      semi_join(winning_model)

  })
  
  # Since they are all joined by winmodel
  # all three must have same NROW
  message(
    paste0("For ", pk_name, "... outputs below must have same number of rows")
  )
  
  message(paste0("rsq: ", NROW(this_rsq)))
  message(paste0("aic: ", NROW(this_AIC)))
  message(paste0("rmsle: ", NROW(this_rmsle)))
  
  gof_df <- this_rsq %>%
    inner_join(this_AIC, by = join_by(Chemical, Species, method, model)) %>%
    inner_join(this_rmsle, by = join_by(Chemical, Species, method, model)) %>%
    mutate(Options = pk_name, .before = Chemical)
  
  return(gof_df)
}

7.4 Get all predictions


# All predictions 
get_all_preds <- function(this_error_model,
                             this_dose_norm,
                             this_log10_trans,
                             this_time_scale) {
  dose_indic <- as.numeric(this_dose_norm)
  log10_indic <- as.numeric(this_log10_trans)
  time_indic <- as.numeric(this_time_scale)
  errmodel_indic <- substr(this_error_model, 1, 1)
  
  file_str <- paste0(Sys.getenv("OD_DIR"),
                     read_date_pk,
                     "mypk_fit_",
                     dose_indic,
                     log10_indic,
                     time_indic,
                     errmodel_indic,
                     ".Rds")
  pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic)
  
  
  current_rds <- readRDS(file = file_str)
  
  # Wide winning model
  suppressMessages({
    winning_model <- get_winning_model(obj = current_rds) %>%
      select(-c(near_flat, preds_below_loq))
    this_preds <- predict(current_rds,
                          use_scale_conc = FALSE) %>%
      semi_join(winning_model) %>%
      mutate(Options = pk_name, .before = Chemical)
  })
  return(this_preds)
}

7.5 Factor of two model error

# Factor of two model error
tf_tests <- function(this_error_model,
                             this_dose_norm,
                             this_log10_trans,
                             this_time_scale) {
  dose_indic <- as.numeric(this_dose_norm)
  log10_indic <- as.numeric(this_log10_trans)
  time_indic <- as.numeric(this_time_scale)
  errmodel_indic <- substr(this_error_model, 1, 1)
  
  file_str <- paste0(Sys.getenv("OD_DIR"),
                     read_date_pk,
                     "mypk_fit_",
                     dose_indic,
                     log10_indic,
                     time_indic,
                     errmodel_indic,
                     ".Rds")
  pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic)
  current_rds <- readRDS(file = file_str)
  print(paste0("Now analyzing: ", pk_name))
  
  out <- twofold_test(current_rds,
                      sub_pLOQ = TRUE,
                      suppress_messages = TRUE)
  
  out <- out$model_error_summary %>%
    mutate(Options = pk_name)
  
  return(out)
}

7.6 Evaluate the various GOF metrics

Now call these functions and do the evaluations.

Winning model comparisons:


#winning model
system.time(wm_comp <- fitopts %>%
              dplyr::rowwise() %>%
              dplyr::mutate(winmodel_comp = list(
                winmodel_comparison(this_error_model = error_model,
                                 this_dose_norm = dose_norm,
                                 this_log10_trans = log10_trans,
                                 this_time_scale = time_scale))) %>%
              tidyr::unnest(winmodel_comp))

RMSLE for Cmax and AUC:


#RMSLE for Cmax and AUC
system.time(cmax_auc_rmsle_tbl <- fitopts %>%
              dplyr::rowwise() %>%
              dplyr::mutate(nca_comp = list(
                rmsles_Cmax_AUC(this_error_model = error_model,
                                 this_dose_norm = dose_norm,
                                 this_log10_trans = log10_trans,
                                 this_time_scale = time_scale))) %>%
              tidyr::unnest(nca_comp))

Goodness-of-fit metrics table:


#GOF table
system.time(gof_tbl <- fitopts %>%
              dplyr::rowwise() %>%
              dplyr::mutate(gof_win = list(
                get_gof(this_error_model = error_model,
                                 this_dose_norm = dose_norm,
                                 this_log10_trans = log10_trans,
                                 this_time_scale = time_scale))) %>%
              tidyr::unnest(gof_win))

Factor of two:


#factor of two
system.time(all_tf_tests <- fitopts %>%
              dplyr::rowwise() %>%
              dplyr::mutate(TF = list(
                tf_tests(error_model,
                         this_dose_norm = dose_norm,
                         this_log10_trans = log10_trans,
                         this_time_scale = time_scale))) %>%
              tidyr::unnest(TF))

Get all predictions:

#all predictions
system.time(all_preds <- fitopts %>%
              dplyr::rowwise() %>%
              dplyr::mutate(these_preds = list(
                get_all_preds(this_error_model = error_model,
                                 this_dose_norm = dose_norm,
                                 this_log10_trans = log10_trans,
                                 this_time_scale = time_scale))) %>%
              tidyr::unnest(these_preds))


#Save preds to a file as well
saveRDS(all_preds, paste0(Sys.getenv("FIG_DIR"),
                          today,
                          "_all_preds.Rds"))

rm(all_preds)

Produce and save tables for these evaluation metrics.

(win_mdl_count_by_opts <- gof_tbl %>%
  group_by(Options, method, model) %>%
  count() %>%
  pivot_wider(names_from = model,
              values_from = n) )

7.7 Rank fitting options

Rank fitting options by count of Chemical-Species data groups where RMSLE < 1

gof_tbl %>%
  group_by(Options, method) %>%
  count(RMSLE < 1) %>%
  filter(`RMSLE < 1`) %>%
  arrange(desc(n)) %>%
  print(n = 8)
# bobyqa-optimized options 010h, 110h, 110p

What about R-squared greater than 0.75?

gof_tbl %>%
  group_by(Options, method) %>%
  count(Rsq > 0.75) %>%
  filter(`Rsq > 0.75`) %>%
  arrange(desc(n)) %>%
  print(n = 8)

What about both RMSLE < 1 and R-squared greater than 0.75?

rmsle_rsq_rank <- gof_tbl %>%
  group_by(Options, method) %>%
  count(RMSLE < 1 & Rsq > 0.75) %>%
  filter(`RMSLE < 1 & Rsq > 0.75`) %>%
  arrange(desc(n)) %>%
  dplyr::rename(`# Chemical-Species data groups` = n) 

rmsle_rsq_rank %>% print(n=8)

Looking at both criteria together, it appears that the best set of fitting options is actually the simplest: 000p (no data transformation and pooled error model), bobyqa-optimized.

But also, let’s add in the Cmax/AUC RMSLE calculations: Sort the fitting options from smallest to greatest RMSLE for Cmax and AUC, and see which set of fitting options yields the lowest RMSLE.

cmax_auc_rmsle_rank <- cmax_auc_rmsle_tbl %>%
  arrange(RMSLE_Cmax, RMSLE_AUC) %>%
  select(Options, method, RMSLE_Cmax, RMSLE_AUC) 

cmax_auc_rmsle_rank %>%
  print(n = 8)

This ranking strongly suggests 110p, with 100p as a second choice (both bobyqa).

110p was ranked number 3 in the list of RMSLE and Rsq criteria 9and 100p was ranked number 5). So we select 110p bobyqa as the best when all of these metrics are considered.

7.8 Supplemental Table 2: Save evaluation results

Write the evaluation results to a spreadsheet, including the rankings.


# Write the results to file
writexl::write_xlsx(x = list(
  Fit_Counts = wm_comp,
  AUC_and_Cmax_RMSLE_summary = cmax_auc_rmsle_tbl,
  Prediction_Evaluations = gof_tbl,
  Twofold_Tests = all_tf_tests,
   RMSLE_Rsq_rank = rmsle_rsq_rank,
                         Cmax_AUC_RMSLE_rank = cmax_auc_rmsle_rank),
  path = paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
                today,
                "_Supp_Table2_eval_results",
                ".xlsx"))

rm(wm_comp, cmax_auc_rmsle_tbl, gof_tbl, all_tf_tests)

gc()

7.9 Plots for evaluation of fitting options

7.9.1 Goodness-of-fit metrics Rsq and RMSLE across fitting options

Plot R-sq vs. RMSLE for the various fitting options, color-coding by winning model.

gof_tbl <- readxl::read_excel(path = paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
                today,
                "_Supp_Table2_eval_results",
                ".xlsx"),
                sheet = "Prediction_Evaluations")

 gof_tbl <- gof_tbl %>%
  dplyr::mutate(opts = substr(Options, 1, 3),
                model_text = dplyr::case_when(model %in% "model_1comp" ~ "1comp",
                                              model %in% "model_2comp" ~ "2comp",
                                              model %in% "model_flat" ~ "flat",
                                              .default = NA_character_))

fig_rsq_rmsle <- ggplot(gof_tbl) +
  geom_point(aes(x = RMSLE,
                 y = Rsq,
                 color = model_text)) +
  facet_grid(rows = vars(opts),
             cols = vars(interaction(error_model, method)
                         )
             ) +
 scale_color_manual(values = c("1comp" = "#0398FC",
                                 "2comp" = "#D68E09",
                               "flat" = "black"),
                    name = "Winning model") +
  theme_bw()

fig_rsq_rmsle

This figure reveals two things:

First, when L-BFGS-B is the optimizer, RMSLEs are much larger for a subset of data groups compared to bobyqa.

Second, when L-BFGS-B is the optimizer, the flat model is the winning model for a much greater number of data groups, indicating that for some data groups, L-BFGS-B could not locate a reasonable set of model parameters for 1- or 2-comapartment models, but bobyqa could.

If we zoom in to RMSLE < 2:

fig_rsq_rmsle_zoom <- ggplot(gof_tbl) +
  geom_point(aes(x = RMSLE,
                 y = Rsq,
                 color = model_text)) +
  facet_grid(rows = vars(opts),
             cols = vars(interaction(error_model, method)
                         )
             ) +
 scale_color_manual(values = c("1comp" = "#0398FC",
                                 "2comp" = "#D68E09",
                               "flat" = "black"),
                    name = "Winning model") +
  coord_cartesian(xlim = c(0,1)) +
  theme_bw()

fig_rsq_rmsle_zoom

When each time-scaling fit (when “1” is in the last place of the fit options shorthand) is compared to the fit in the row immediately above it (the same dose-normalization and log10-transformations options, but without time scaling), it can be seen that time scaling either has little effect on goodness of fit, or it makes RMSLE and/or R-squared slightly worse.

Save these figures.

#PDF versions
ggsave(paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
              today,
              "_Supp_Fig4",
              ".pdf"),
       fig_rsq_rmsle,
       width = 11,
       height = 8.5)

ggsave(paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
              today,
              "_Supp_Fig5",
              ".pdf"),
       fig_rsq_rmsle_zoom,
       width = 11,
       height = 8.5)

#PNG versions
ggsave(paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
              today,
              "_Supp_Fig4",
              ".png"),
       fig_rsq_rmsle,
       width = 11,
       height = 8.5)

ggsave(paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
              today,
              "_Supp_Fig5",
              ".png"),
       fig_rsq_rmsle_zoom,
       width = 11,
       height = 8.5)

rm(fig_rsq_rmsle_zoom, fig_rsq_rmsle, gof_tbl)

7.9.2 Cmax RMSLE vs. AUC RMSLE across fitting options

Plot Cmax RMSLE vs. AUC RMSLE.

#read in table
cmax_auc_rmsle_tbl <- readxl::read_xlsx(path = paste0(Sys.getenv("FIG_DIR"),
                                                   "Manu_Files/",
                                                  today,
                                                  "_Supp_Table2_eval_results.xlsx"),
                                     sheet = "AUC_and_Cmax_RMSLE_summary")
#make plot
cmax_auc_rmsle_tbl <- cmax_auc_rmsle_tbl %>%
  dplyr::mutate(opts = substr(Options, 1, 3))

cmax_auc_rmsle_fig <- ggplot(cmax_auc_rmsle_tbl) +
  geom_point(aes(x = RMSLE_AUC,
                 y = RMSLE_Cmax,
                 color = opts),
             size = 4) +
  facet_grid(rows = vars(error_model),
             cols = vars(method)) +
  scale_color_brewer(palette = "Paired",
                     name = "Fit options") +
  theme_bw()

cmax_auc_rmsle_fig

#save plot
#PDF version
ggsave(paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
              today,
              "_Supp_Fig6",
                ".pdf"),
      cmax_auc_rmsle_fig,
      width = 7,
      height = 5)

#PNG version
ggsave(paste0(Sys.getenv("FIG_DIR"),
                "Manu_Files/",
              today,
              "_Supp_Fig6",
                ".png"),
      cmax_auc_rmsle_fig,
      width = 7,
      height = 5,
      units = "in",
      dpi = 300)

rm(cmax_auc_rmsle_fig, cmax_auc_rmsle_tbl)

This figure reveals that overall, L-BFGS-B produced fits that did not match NCA-estimated Cmax or AUC as well as bobyqa fits. Considering only the bobyqa fits, the pooled error model achieved the smallest differences between NCA and model-predicted Cmax and AUC, and it did so with the two sets of fitting options that included both dose-normalization and log10-transformation.

Plot predicted vs. observed concentrations for everything, using the winning fit options and the winning model for each data group.

all_preds <- readRDS(paste0(Sys.getenv("FIG_DIR"),
                          today,
                          "_all_preds.Rds"))

#select winning fit opts: 110p bobyqa

all_preds_prep <- all_preds %>%
  filter(Options %in% c("110p"),
         method %in% "bobyqa") %>%
  mutate(Conc_est_sub = dplyr::if_else(Conc_est < pLOQ,
         pLOQ,
         Conc_est),
         Conc_est_belowLOQ = dplyr::if_else(Conc_est < pLOQ,
         TRUE,
         FALSE),
         ID = as.factor(paste(Chemical, Chemical_Name, Species)),
         .keep = "unused") %>%
  select(ID, Options,
         dose_norm, log10_trans, time_scale,
         Route, Media, Dose,
         Conc, Conc_est_sub, Conc_est_belowLOQ,
         Time, Reference)

split_preds <- split(all_preds_prep, all_preds_prep$ID)

sp_plots <- lapply(names(split_preds), function(i) {
  ggplot(data = split_preds[[i]],
         mapping = aes(x = Conc,
                       y = Conc_est_sub,
                       color = as.factor(Dose),
                       shape = Reference)) +
    xlab("Observed conc, mg/L") +
    ylab("Predicted conc, mg/L") +
    geom_point(size = 2) +
    geom_abline(slope = 1) +
    facet_grid(Route ~ Media, 
               scales = "free_y") +
    scale_y_log10() +
    scale_x_log10() +
    annotation_logticks(sides = "bl") +
    scale_color_viridis_d(name = "Dose, mg/kg") +
    theme_bw() +
    labs(title = i) +
    theme(legend.position = "bottom",
          plot.title = element_text(hjust = 0.5, face = "bold"))
}
)

# Very BIG FILE
pdf(file = paste0(
  Sys.getenv("FIG_DIR"),
  "Manu_Files/",
  today,
   "_log10_predvsobs_plots_110pbobyqa_all.pdf"),
  onefile = TRUE, height = 8, width = 8)
for (x in seq_along(sp_plots)) {
  print(sp_plots[[x]])
}
dev.off()

rm(sp_plots, split_preds, all_preds_prep, all_preds)

8 Analysis Plots and Tables for Best Set of Fitting Options

Begin by collating the data needed for plots. These data are generally accessible using functions/methods for evaluating fitted pk objects.

Read in the fitted pk object for the best set of fitting options.

my_pk <- readRDS(paste0(Sys.getenv("OD_DIR"),
                        read_date_pk,
                        "mypk_fit_110p.Rds"))

Get winning model; predictions; residuals; TK stats; NCA results; twofold test results; coefficients; coefficient SDs.

# after fitting pk object for all cvt objects or reading the fitted pk object in `setup`
winmodel <- get_winning_model(my_pk, method = "bobyqa")
my_preds <- semi_join(predict(my_pk, use_scale_conc = FALSE, method = "bobyqa"), winmodel) 
my_residuals <- residuals(my_pk, use_scale_conc = FALSE, method = "bobyqa") %>%
  semi_join(winmodel) 
my_tkstats <- eval_tkstats(my_pk, method = "bobyqa") %>% semi_join(winmodel)
my_nca <- get_nca(my_pk)
all_my_data <- get_data(my_pk)

#merge together long form coefs and long form coef SDs

my_tf <- twofold_test(my_pk, method = "bobyqa")  

NROW(all_my_data) > NROW(my_preds) # THis must be true... or else try distinct(my_preds)

my_coef_sd <- coef_sd(my_pk, method = "bobyqa") %>% #this will take a few minutes to run
  semi_join(winmodel) #keep only coefs & SDs for winning model

# Writing file to xlsx
writexl::write_xlsx(x = list(predictions = my_preds,
                             tkstats = my_tkstats,
                             coefs_sds = my_coef_sd),
                    path = paste0(Sys.getenv("FIG_DIR"),
                                  "Manu_Files/",
                                  today,
                                  "_Supp_Table3",
                                  ".xlsx"))

Compute fold errors, r-squared, AIC, RMSLE, RMSE.


fe_df <- fold_error(my_pk,
                     sub_pLOQ = TRUE,
                     method = "bobyqa") %>% semi_join(winmodel)

r2_df <- rsq(my_pk,
             use_scale_conc = FALSE,
             sub_pLOQ = TRUE,
             method = "bobyqa") %>%
  semi_join(winmodel)

myAIC <- AIC(my_pk, method = "bobyqa") %>%
  semi_join(winmodel)

myRMSLE <- rmse(my_pk,
               use_scale_conc = list(dose_norm = FALSE,
                                     log10_trans = TRUE),
               sub_pLOQ = TRUE,
               method = "bobyqa") %>% 
  semi_join(winmodel)  

myRMSE <- rmse(my_pk,
               rmse_group = ggplot2::vars(Chemical,
                                          Species,
                                          Route,
                                          Media,
                                          Dose),
               sub_pLOQ = TRUE,
               method = "bobyqa") %>%
  semi_join(winmodel)

8.0.1 Supp Figure 7: invivoPKfit model performance

Overall fold error distributions.

my_tf$model_error_all %>%
  filter(model %in% c("model_1comp", "model_2comp")) %>%
  mutate(Route.model = factor(paste(Route, model),
                              levels = c("iv model_1comp",
                                         "iv model_2comp",
                                         "oral model_1comp",
                                         "oral model_2comp"))) %>%
  ggplot(aes(x = Fold_Error,
             fill = Route.model)) +
  geom_histogram(color = NA,
                 position = position_stack(),
                 bins = 50) +
  scale_x_log10(labels = scales::label_math(format = log10)) +
  annotation_logticks(sides = "b") +
    scale_fill_brewer(palette = "Paired",
                      name = "Route/winning model") +
  expand_limits(y = 0) +
  geom_vline(xintercept = c(0.5, 2), color = "black", linetype = "dashed") +
  labs(x = "Predicted/Observed",
       y = "Number of Observations") +
  theme_classic() +
  theme(aspect.ratio = 1,
        panel.border = element_rect(color = "black", fill = NA),
        panel.grid.major = element_line(color = "grey95", linewidth = 0.4)) +
  coord_cartesian(xlim = c(10^(-1.5), 100))


ggsave(paste0(Sys.getenv("FIG_DIR"),
               "Manu_Files/",
                                  today,
              "_Supp_Fig7_PredictedObserved_compartmentModels.png"),
       height = 5, width = 7)

ggsave(paste0(Sys.getenv("FIG_DIR"),
               "Manu_Files/",
                                  today,
              "_Supp_Fig7_PredictedObserved_compartmentModels.pdf"),
       height = 5, width = 7)

8.0.2 Figure 4 and Supplemental Figures 4 & 5: Model performance vs Data Variability

Here we examine the performance of “best fit” models with relation to data variability. Here, because we are trying to specifically compare data variability with model performance, we look at data with replicated timepoint observations, which are largely data with individual animal data.

my_tf$indiv_data_test_fold_errors %>%
  select(model, starts_with("percent_")) %>%
  glimpse()

pl_4A <- my_tf$indiv_data_fold_errors %>%
  # For Plotting
  ggplot(aes(
    y = Fold_Error,
    x = foldConc
  )) +
  geom_bin2d(bins = 40, color = NA) +
  geom_hline(yintercept = c(0.5, 2), linetype = "dashed") +
  geom_vline(xintercept = c(0.5, 2), linetype = "dashed") +
  scale_x_log10(labels = scales::label_math(format = log10),
                limits = c(0.001, 20)) +
  scale_y_log10(labels = scales::label_math(format = log10),
                limits = c(0.0001, 10000)) +
  annotation_logticks(sides = "bl") +
  scale_fill_viridis_c(option = "cividis",
                       limits = c(1, 100),
                       oob = scales::oob_squish) +
  labs(y = "Model Error",
       x = "Data Variability",
       fill = "Count") +
  theme(aspect.ratio = 1,
        panel.border = element_rect(color = "black", fill = NA, size = 1.5),
        panel.background = element_blank(),
        panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
        strip.background = element_rect(fill = "white"),
        strip.text = element_text(face = "bold"),
        axis.ticks = element_blank(),
        axis.line = element_blank(),
        axis.text = element_text(size = 14),
        axis.title = element_text(size = 14)) 

pl_4A
ggsave(paste0(Sys.getenv("FIG_DIR"),
              today,
              "_Fig4A_ModelPerformance_vDataVariability.png"),
       height = 5,
       width = 7,
       device = "png",
       dpi = 300,
       units = "in")

Create table.

  
my_table <- my_tf$indiv_data_test_fold_errors %>%
  ungroup() %>%
  filter(model %in% "All") %>%
  select(starts_with("percent_")) %>% t() %>%
  round(digits = 2) %>%
  as.data.frame() %>%
  rownames_to_column(var = "percentages") %>%
  mutate(percentages = case_when(
    percentages == "percent_both_within" ~ "Both within a factor of 2",
    percentages == "percent_both_outside" ~ "Both outside a factor of 2",
    percentages == "percent_model_outside" ~ "Model too variable",
    percentages == "percent_data_outside" ~ "Data too variable"
  ))
  
names(my_table) <- c(" ", "Overall (%)")

flextable(my_table) %>%
  border_inner() %>%
  fontsize(part = "all", size = 11) %>%
  bold(part = "all", j = 2) %>%
  autofit()

table_plot <- gen_grob(flextable(my_table) %>%
                         border_inner() %>%
                         fontsize(part = "all", size = 10) %>%
                         bold(part = "all", j = 2) %>%
                         autofit(),
                       autowidths = TRUE,
                       fit = "width")


dual_plot <- plot_grid(pl_4A,
                       table_plot,
                       ncol = 1,
                       align = "hv",
          rel_heights = c(1, 0.5),
          rel_widths = c(1, 0.5)
          )

dual_plot

ggsave(paste0(Sys.getenv("FIG_DIR"),
              "Manu_Files/",
              today,
              "_Fig4.png"),
       plot = dual_plot,
       height = 4.5,
       width = 6,
       device = "png",
       bg = "white",
       dpi = 300,
       units = "in")

8.0.3 Figure 5: Multiple goodness-of-fit metrics validate model performance

Here we analyze and validate model performance per data group using R-squared, RMSE, and fraction of predictions that are within 2-fold for each Chemical, Species, Route, Media, and Dose groupings.

combined_gof_df <- my_tf$model_error_all %>%
  group_by(Chemical, Species, model, method) %>%
  summarize(within_2fold = sum(between(Fold_Error, 0.5, 2))/n()) %>%
  left_join(r2_df) %>%
  left_join(myAIC) %>%
  left_join(myRMSLE) %>%
  semi_join(winmodel)
  

mypl <- ggplot(data = combined_gof_df,
               aes(
                 x = within_2fold,
                 y = Rsq,
                 color = model,
               )) +
  geom_point() +
  geom_abline(slope = 1, color = "black", linetype = "longdash") +
  scale_color_manual(values = c("#0398FC", "#D68E09", "black")) +
    #coord_cartesian(xlim = c(0,1), ylim = c(0,1)) +
  labs(x = "Fraction of predictions within 2-fold",
       y = "R-squared value") +
  theme(#aspect.ratio = 1,
        panel.border = element_rect(color = "black", fill = NA, size = 1),
        panel.background = element_blank(),
        panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
        axis.ticks = element_blank(),
        axis.line = element_blank(),
        legend.position = "none",
        legend.title = element_blank(),
        legend.key = element_blank()
  )
  
panelA_plot <- ggExtra::ggMarginal(mypl,
                                   groupFill = TRUE,
                    type = "histogram",
                    xparams = list(binwidth = 0.05),
                    yparams = list(binwidth = 0.05))
panelA_plot

panelB_plot <- ggplot(data = myRMSE,
               aes(
                 x = RMSE,
                 fill = model
               )) + 
  scale_x_log10() +
  annotation_logticks(sides = "b") +
  geom_histogram(bins = 50,
                 position = position_stack(),
                 color = NA) +
  labs(y = "Count") +
  scale_fill_manual(values = c("#0398FC", "#D68E09", "grey10")) +
   scale_color_manual(values = c("#0398FC", "#D68E09", "grey10"), guide = "none") +
  theme(text = element_text(size = 10),
        #aspect.ratio = 1,
    panel.border = element_rect(color = "black", fill = NA, size = 1),
        panel.background = element_blank(),
        panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
        axis.ticks = element_blank(),
        axis.line = element_blank(),
        strip.background = element_blank(),
    panel.spacing.y = unit(0.125, units = "in"),
        legend.title = element_blank(),
    legend.position = "bottom")
panelB_plot

plAB <- patchwork::wrap_elements(panelA_plot) + panelB_plot +
  plot_annotation(tag_levels = "A") + plot_layout(guides = 'collect', widths = c(1,0.9)) & theme(legend.position = "bottom") & guides(color = "none")

#save PNG version
ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
                         "Manu_Files/",
                         today,
                         "_Fig5.png"),
       plAB,
       height = 4,
       width = 6.5,
       bg = "white",
       units = "in")

#save PDF version
ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
                         "Manu_Files/",
                         today,
                         "_Fig5.pdf"),
       plAB,
       height = 4,
       width = 6.5,
       bg = "white")

8.0.4 Figure 7: Examples fits for chemicals with R-squared and within 2-fold

pl <- plot(my_pk,
           method = "bobyqa",
           best_fit = TRUE,
           use_scale_conc = list(dose_norm = TRUE,
                                 log10_trans = FALSE),
           drop_nonDetect = FALSE)

cvt %>% filter(curation_set_tag == "CVT_Showa") %>%
  pull(analyte_dtxsid) -> Showa_chems


my_rsq <- rsq(my_pk, method = "bobyqa")
my_rsq %>% semi_join(winmodel) %>% filter(Chemical %in% c("DTXSID2020139",
                            "DTXSID4023917",
                            "DTXSID3061635",
                            "DTXSID8030760"))

#High rsq: DTXSID2020139 and DTXSID4023917
#Low rsq: DTXSID3061635 and DTXSID8030760

fe_df %>% filter(Chemical %in% c("DTXSID2020139",
                                  "DTXSID4023917",
                                  "DTXSID3061635",
                                  "DTXSID8030760"), Species %in% "rat") %>% group_by(Chemical, Species) %>% summarise(frac_2fold = sum(Fold_Error < 2 & Fold_Error > 0.5)/n())

# # A tibble: 4 × 3
# # Groups:   Chemical [4]
#   Chemical      Species frac_2fold
#   <chr>         <chr>        <dbl>
# 1 DTXSID2020139 rat          0.271
# 2 DTXSID3061635 rat          0.125
# 3 DTXSID4023917 rat          0.953
# 4 DTXSID8030760 rat          0.778

#High frac 2fold: DTXSID4023917 and DTXSID8030760
#Low frac 2fold: DTXSID2020139 and DTXSID3061635

pl_sub <- pl %>%
  filter(Chemical %in% c("DTXSID2020139",
                         "DTXSID4023917",
                         "DTXSID3061635",
                         "DTXSID8030760"),
         Species %in% "rat") 

ex_fits <- pl_sub %>%
  pull(final_plot)
ex_fits <- set_names(ex_fits,
          pl_sub$Chemical)



cowplot::plot_grid(plotlist = list(
  #Low frac 2fold and high Rsq
  ex_fits[["DTXSID2020139"]] +
                                     scale_color_manual(values = c("#a6bddb",
                                                                   "#74a9cf",
                                                                   "#41bbc4",
                                                                   "#2b8cbe",
                                                                   "#045a8d")) +
                                     theme(legend.position = "none") +
                                     xlim(0,30) +
    ggtitle("A: DTXSID2020139 rat"),
  #high frac 2fold and high Rsq
                                   ex_fits[["DTXSID4023917"]] +
                                     scale_color_manual(values = c("black",
                                                                   "grey40")) +
                                     theme(legend.position = "none") +
    ggtitle("B: DTXSID4023917 rat"),
  
  #low frac 2fold and low rsq
                                    ex_fits[["DTXSID3061635"]] +
                                     scale_color_manual(values = c("#006837")) +
                                     theme(legend.position = "none") +
    ggtitle("C: DTXSID3061635 rat"),
  
  #high frac2fold and low rsq
                                   ex_fits[["DTXSID8030760"]] +
                                     scale_color_manual(values = "magenta3") +
                                     theme(legend.position = "none") +
    ggtitle("D: DTXSID8030760 rat")
                                  ),
                   scale = 0.85)

ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
                         "Manu_Files/",
                         today,
                         "_Fig7.png"),
       height = 12,
       width = 12)

ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
                         "Manu_Files/",
                         today,
                         "_Fig7.pdf"),
       height = 12,
       width = 12)

8.0.4.1 Supp. Fig 8: plots of the five data groups that were best fit by the null model

flat_chems <- winmodel %>%
  filter(model %in% "model_flat") %>%
  distinct(Chemical, Species)
print(flat_chems)

pl <- plot(my_pk,
method = "bobyqa",
best_fit = FALSE,
use_scale_conc = list(dose_norm = TRUE,
log10_trans = FALSE),
drop_nonDetect = FALSE)

flat_fits <- pl %>% 
  inner_join(flat_chems) %>%
                              pull(final_plot)

cowplot::plot_grid(plotlist = flat_fits)

ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
                         "Manu_Files/",
                         today,
                         "_Supp_Fig8.pdf"),
       height = 8,
       width = 16,
       units = "in")

ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
                         "Manu_Files/",
                         today,
                         "_Supp_Fig8.png"),
       bg = "white",
       height =8,
       width = 16,
       dpi = 300,
       units = "in")

8.0.5 Figure 6: Comparing derived TK stats with human TK stats from Lombardo et al.

This meta-analysis compares the TK stats from model fits with other published studies.

cvt_DTXSIDs <- unique(my_pk$data$Chemical)
# First things first, let's load the data from Lombardo et al.
lombardo <- readxl::read_xlsx(
  paste0(Sys.getenv("OD_DIR"),
         "Lombardo2018-Supplemental_82966_revised_corrected.xlsx"),
  skip = 8)

#use ctxR to search the Dashboard for DTXSIDs corresponding to these CASRNs
lombardo_dtxsid <- ctxR::chemical_equal_batch(
  word_list = unique(lombardo$`CAS #`)
)

#merge in the DTXSIDs
lombardo <- lombardo %>%
   dplyr::left_join(lombardo_dtxsid$valid %>%
               dplyr::select(searchValue, dtxsid, preferredName),
            by = c("CAS #" = "searchValue"))

#any left unidentified: search by name
lombardo_missing <- lombardo %>%
  dplyr:: filter(is.na(dtxsid))

missing_names <- lombardo_missing %>%
   dplyr::pull(Name)

lombardo_dtxsid_name <- ctxR::chemical_equal_batch(
  word_list = unique(missing_names)
)

lombardo_missing <- lombardo_missing %>%
   dplyr::left_join(lombardo_dtxsid_name$valid %>% 
               dplyr::select(searchValue, dtxsid, preferredName),
            by = c("Name" = "searchValue") )

lombardo <-  dplyr::bind_rows(lombardo %>% filter(!is.na(dtxsid)),
                      lombardo_missing)

lombard_abbr <- lombardo %>%
  dplyr::select(dtxsid,
                preferredName,
                Vss = `human VDss (L/kg)`,
                CLtot = `human CL (mL/min/kg)`,
                halflife = `terminal  t1/2 (h)` ) %>%
  dplyr::mutate(Species = 'human',
                source = "Lombardo et al.",
                CLtot = CLtot*60/1000 # Lombardo reports this as mL/min/kg, need L/h/kg
  ) %>%
   dplyr::arrange(halflife)

my_pk_vals <- my_tkstats %>%
  dplyr::select(dtxsid = Chemical,
                Species,
                model,
                halflife = halflife.tkstats,
                Vss = Vss.tkstats,
                CLtot = CLtot.tkstats) %>%
  dplyr::distinct() %>%
  dplyr::mutate(source = "invivoPKfit") %>%
  dplyr::filter(model != "model_flat",
                !is.na(halflife))

lombardo_inner <- my_pk_vals %>%
   dplyr::inner_join(lombard_abbr, by = "dtxsid")
#79 chemicals in common

lombardo_comparison <-  dplyr::bind_rows(lombard_abbr %>%
                                    dplyr::filter(
                                     dtxsid %in%
                                       lombardo_inner$dtxsid
                                   ),
                                 my_pk_vals %>%
                                    dplyr::filter(
                                     dtxsid %in%
                                       lombardo_inner$dtxsid
                                   )
)

#merge in chemical names

#get preferred names by DTXSID
all_details <- ctxR::get_chemical_details_batch(
  unique(lombardo_comparison$dtxsid)
)

lombardo_comparison <- lombardo_comparison %>%
  select(-preferredName) %>%
  left_join(all_details %>% select(dtxsid, preferredName))

#create a factor variable for chemical names that sorts chemicals from highest to lowest Lombardo halflife

chemnames_levels <- lombardo_comparison %>%
   dplyr::filter(source %in% "Lombardo et al.") %>%
   dplyr::arrange(halflife) %>%
   dplyr::pull(preferredName)


lombardo_comparison <- lombardo_comparison %>%
   dplyr::mutate(preferredName = factor(preferredName,
                                levels = chemnames_levels))

#get invivoPKfit species for plot-splitting purposes
ivpk_species <- lombardo_comparison %>%
  dplyr::filter(source %in% "invivoPKfit") %>%
  dplyr::distinct(dtxsid, Species) %>%
  dplyr::rename(invivopkfit_species = Species)

lombardo_comparison <- lombardo_comparison %>%
  dplyr::left_join(ivpk_species,
                   by = "dtxsid")

#reshape longer and plot
lombardo_comparison %>%
   tidyr::pivot_longer(cols = c("Vss", "CLtot", "halflife")) %>%
   dplyr::mutate(name = factor(name,
                               levels = c("halflife", "Vss", "CLtot")
                               )
                 ) %>%
  ggplot(mapping = aes(
    x = value,
    y = preferredName
  )) +
  geom_point(aes(shape = source,
                 color = source),
             #position = position_dodge(0.7),
             size = 5/.pt, stroke = 2.5/.pt) +
  facet_grid(cols = vars(name),
             rows = vars(invivopkfit_species),
             scales = "free",
             space =  "free_y",
             drop = TRUE) +
  scale_x_log10( labels = scales::label_math(format = log10)) +
  #annotation_logticks(sides = "b") +
  scale_shape_manual(values = c("Lombardo et al." = 3,
                                "invivoPKfit" = 22)) +
  scale_color_manual(values = c("#d95f02", "#1b9e77")) +
  guides(x = "axis_logticks") +
  theme(panel.border = element_rect(color = "black",
                                    fill = NA,
                                    linewidth = 1.5),
        panel.background = element_blank(),
        panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
         panel.spacing.y = unit(0, "line"),
        strip.background = element_rect(fill = "white"),
        strip.text.x = element_text(face = "bold", size = 12),
       strip.text.y = element_blank(),
        text = element_text(size = 10),
        axis.ticks = element_blank(),
        axis.line = element_blank(),
        axis.text.y = element_text(face = "bold", size = 6),
        axis.text.x = element_text(size = 10),
        axis.title.y = element_blank(),
        axis.title.x = element_blank(),
        legend.position = "bottom",
        legend.key = element_blank(),
        legend.title = element_text(face = "bold"),
        legend.text = element_text())

ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
                         "Manu_Files/",
                         today,
                         "_Fig6.png"),
       height = 11,
       width = 10,
       units = "in",
       device = "png",
       dpi = 300)

ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
                         "Manu_Files/",
                         today,
                         "_Fig6.pdf"),
       height = 11,
       width = 10,
       units = "in")

8.0.6 Supp. Fig. 9: Fgutabs comparison with literature values compiled by Musther et al. (2014)

Compare with literature values of Fgutabs from the supplemental material of Musther et al. 2014.

Read in these literature values and reshape them to prepare for fitting.

species_cols <- c(paste("Mouse",
       c("Dosing Formulation",
                    "Strain",
                    "Sex",
                    "F% Mean",
                    "F% Range"),
      sep = "."),

paste("Rat",
       c("Dosing Formulation",
                    "Strain",
                    "Sex",
                    "F% Mean",
         "WSD",
                    "F% Range"),
      sep = "."),

paste("Dog",
       c("Dosing Formulation",
                    "Strain",
                    "Sex",
                    "F% Mean",
         "WSD",
                    "F% Range"),
      sep = "."),

paste("Non-Human Primate",
       c("Dosing Formulation",
                    "Strain",
                    "Sex",
                    "F% Mean",
         "WSD",
                    "F% Range"),
      sep = "."),

paste("Human",
       c("Dosing Formulation",
                    "Sex",
                    "F% Mean",
         "WSD",
                    "F% Range"),
      sep = ".")
)



musther2014 <- readxl::read_excel(
  path = paste0(Sys.getenv("OD_DIR"),
                "SupTableMusther2014.xlsx"),
  sheet = "Sheet1",
  skip = 1,
  col_names = c( "Compound",
                 "CAS",
                 "MW", "Ion Class",
               species_cols,
                "References.mouse",
                "References.rat",
                "References.dog",
                "References.NHP",
                "References.human"
                ),
  col_types = "text"
  ) 

#pivot longer
musther2014_long <- musther2014 %>%
  select(c(Compound,
           CAS,
         contains("F% Mean"))) %>%
  pivot_longer(cols = !c(Compound, CAS),
               names_to = c("Species", "stat"),
               names_sep = "\\.") %>%
  mutate(value = gsub(x=value,
                      pattern = "<",
                      replacement = ""),
         value_num = as.numeric(value))

Use ctxR to get DTXSIDs from CAS number.

musther2014_dtxsid <- ctxR::chemical_equal_batch(word_list = unique(musther2014_long$CAS))

musther2014_long <- musther2014_long %>%
  left_join(musther2014_dtxsid$valid %>% select(searchValue, dtxsid, preferredName),
            by = c("CAS" = "searchValue"))

Get Fgutabs estimates from the fitted pk object.

fgutabs <- coef(my_pk, method = "bobyqa") %>%
  distinct() %>%
   semi_join(winmodel) %>% #keep only winning models
  rowwise() %>%
  mutate(Fgutabs = coefs_vector["Fgutabs"]) %>%
  filter(!is.na(Fgutabs))

How many chemicals overlapped at all

length(intersect(musther2014_long$dtxsid,
                 get_data(my_pk)$Chemical))

Join with the literature estimates, keeping only cases with data in both tables

fgutabs_pk_musther <- fgutabs %>% inner_join(musther2014_long,
                                             by = c("Chemical" = "dtxsid"))

Make a plot:

#capitalize invivopkfit species to harmonize with Musther
fgutabs_pk_musther <- fgutabs_pk_musther %>%
  dplyr::mutate(Species.x =  stringr::str_to_sentence(Species.x))

ggplot(fgutabs_pk_musther) +
  geom_point(aes(y = preferredName,
                 x = Fgutabs,
                 shape = Species.x,
                 color = "invivopkfit"),
             size =4,
             stroke = 2) +
  geom_point(aes(y = preferredName,
                 x = value_num/100,
                 shape = Species.y,
                 color = "Musther et al. 2014"),
             size = 4,
             stroke = 2) +
  scale_shape_manual(values = c("Rat" = 21,
                                "Mouse" = 22,
                                "Dog" = 23,
                                "Non-Human Primate" = 24,
                                "Human" = 3),
                     breaks = c("Rat", "Mouse", "Dog", "Non-Human Primate", "Human")) +
  
  theme_bw() +
  theme(axis.title.y = element_blank(),
        legend.title = element_blank(),
        axis.title.x = element_text(size = 12),
        axis.text = element_text(size = 12),
        legend.text = element_text(size = 12)
        )

ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
                         "Manu_Files/",
                         today,
                         "_Supp_Fig9.pdf"),
       height = 5,
       width = 8)

ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
                          "Manu_Files/",
                         today,
                         "_Supp_Fig9.png"),
       height = 5,
       width = 8,
       device = "png", dpi = 300)

8.0.7 Supp. Fig. 10: Parallelization decreases runtime of invivoPKfit

Benchmarking for parallelization vs. non-parallelization

# Randomize the order of chemical species
chem_spec <- cvt %>%
  dplyr::distinct(analyte_dtxsid, species) %>%
  dplyr::slice_sample(prop = 1)

test_group_size <- seq(25, 105, by = 10)
fit_options <- expand.grid(error_model = c("pooled",
                                           "hierarchical"),
                           dose_norm = FALSE,
                           log10_trans = TRUE,
                           time_scale = c(
                             TRUE,
                             FALSE),
                           stringsAsFactors = FALSE)

fit_options <- tidyr::expand_grid(fit_options, test_group_size)

fit_options <- fit_options %>% rowwise() %>%
  mutate(this_data = list({
    tmp <- chem_spec %>% slice_head(n = test_group_size) %>%
      left_join(cvt, by = join_by(analyte_dtxsid, species)) %>% 
      pk() +
        scale_time(new_units = ifelse(!time_scale, 
                                      "identity",
                                      "auto")) +
        scale_conc(dose_norm = dose_norm, log10_trans = log10_trans) +
        settings_preprocess(suppress.messages = TRUE)
    do_prefit(tmp)
  }
  ),
  data_size = chem_spec %>% slice_head(n = test_group_size) %>%
    left_join(cvt, by = join_by(analyte_dtxsid, species)) %>%
    NROW())

# Organization of the benchmarking
# For each fit_option, return the four values of system.time() that we want
df_out <- fit_options %>%
  rowwise() %>%
  mutate(tim_1core = system.time(do_fit(this_data))[["elapsed"]],
         tim_4core = system.time(do_fit(this_data, n_cores = 4))[["elapsed"]],
         tim_7core = system.time(do_fit(this_data, n_cores = 7))[["elapsed"]],
         tim_12core = system.time(do_fit(this_data, n_cores = 12))[["elapsed"]]
         )

df_out <- df_out %>% select(-this_data)
gc()

full_long <- df_out %>%
  pivot_longer(cols = c(tim_1core:tim_12core),
                        names_to = "N_cores",
                        values_to = "Time_s") %>%
  group_by(N_cores, test_group_size, data_size) %>%
  summarize(mean_time = mean(Time_s, na.rm = TRUE)/60,
            max_time = max(Time_s, na.rm = TRUE)/60,
            min_time = min(Time_s, na.rm = TRUE)/60
            ) %>%
  mutate(N_cores = as.numeric(str_extract(N_cores, "[:digit:]+")))

df_out %>% clipr::write_clip()
ggplot(full_long,
       aes(x = test_group_size,
           y = mean_time,
           color = as.factor(N_cores))) +
  geom_point() +
  geom_linerange(aes(ymin = min_time,
                ymax = max_time)) +
  geom_line(aes(group = N_cores)) +
  labs(x = "Number of Data Groups",
       y = "Runtime (minutes)",
       title = "Number of Processing Cores",
       subtitle = "(with min/max runtime range)") +
  facet_grid(cols = vars(N_cores)) +
  scale_color_manual(values = c("black", "#40c679", "#31a354", "#006837")) +
  coord_fixed(ratio = 8) +
  theme(panel.border = element_rect(color = "black", fill = NA, size = 1.5),
        panel.background = element_blank(),
        panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
        legend.position = "none",
        plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5),
        axis.ticks = element_blank(),
        axis.line = element_blank(),
        strip.background = element_blank())


ggsave(paste0(Sys.getenv("FIG_DIR"),
              "Manu_Files/",
              today,
              "_Supp_Fig10.png"),
       width = 5,
       height = 2.5,
       units = "in")

9 Summary fit data for all fitting options, all models, all data groups

fit_summary_bydatagrp <- function(this_error_model,
                             this_dose_norm,
                             this_log10_trans,
                             this_time_scale) {
  dose_indic <- as.numeric(this_dose_norm)
  log10_indic <- as.numeric(this_log10_trans)
  time_indic <- as.numeric(this_time_scale)
  errmodel_indic <- substr(this_error_model, 1, 1)
  
  file_str <- paste0(Sys.getenv("OD_DIR"),
                     read_date_pk,
                     "mypk_fit_",
                     dose_indic,
                     log10_indic,
                     time_indic,
                     errmodel_indic,
                     ".Rds")
  pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic)
  
  my_pk <- readRDS(file = file_str)
  
    pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic)
  
  message(paste0("Summary by data group for: ", pk_name))

myAFE <- AFE(my_pk,
              use_scale_conc = FALSE,
             sub_pLOQ = TRUE)

myAAFE <- AAFE(my_pk,
              use_scale_conc = FALSE,
             sub_pLOQ = TRUE)

my_r2 <- rsq(my_pk,
             use_scale_conc = FALSE,
             sub_pLOQ = TRUE) 

myAIC <- AIC(my_pk) 

myRMSLE <- rmse(my_pk,
               use_scale_conc = list(dose_norm = FALSE,
                                     log10_trans = TRUE),
               sub_pLOQ = TRUE)


myRMSE <- rmse(my_pk,
               use_scale_conc = FALSE,
               sub_pLOQ = TRUE) 

winmodel <- get_winning_model(my_pk) %>%
  select(Chemical, Species, method, model) %>%
  dplyr::mutate(is_winning_model = TRUE)


 #note convergence codes for each model
  pf <- my_pk$fit %>%
    dplyr::distinct(Chemical, Species, model, method, convcode)
  
  
  convcode_labels <- data.frame(convcode =c(0,
                                               1,
                                               20,
                                               21,
                                               10,
                                               51,
                                               52,
                                               9999,
                                               -9999),
                                convcode_label = c('Successful convergence (see ?optimx::optimx)',
                                                   'The iteration limit maxit had been reached (see ?optimx::optimx)',
                                'the initial set of parameters is inadmissible, that is, that the function cannot be computed or returns an infinite, NULL, or NA value (see ?optimx::optimx))',
  'an intermediate set of parameters is inadmissible (see ?optimx::optimx)',
  'degeneracy of the Nelder–Mead simplex (see ?optimx::optimx)',
  'a warning from the "L-BFGS-B" method (see ?optimx::optimx)',
  'an error from the "L-BFGS-B" method (see ?optimx::optimx)',
  'error in optimx::optimx()',
  'model status was "abort"')
  )
  
  pf <- pf %>%
    dplyr::left_join(convcode_labels)
  
#join absolutely everything together
summ_by_datagroup <- pf %>%
  left_join(winmodel) %>%
  left_join(myAIC) %>%
  left_join(myAFE) %>%
  left_join(myAAFE) %>%
  left_join(my_r2) %>%
  left_join(myRMSLE) %>%
  left_join(myRMSE) %>%
mutate(is_winning_model = case_when(is_winning_model %in% TRUE ~ TRUE,
                                    is.na(is_winning_model) ~ FALSE,
                                    .default = FALSE)) %>%
  arrange(Chemical, Species, method, model)

return(summ_by_datagroup)
}

fit_summary_byexpt <- function(this_error_model,
                             this_dose_norm,
                             this_log10_trans,
                             this_time_scale) {
  dose_indic <- as.numeric(this_dose_norm)
  log10_indic <- as.numeric(this_log10_trans)
  time_indic <- as.numeric(this_time_scale)
  errmodel_indic <- substr(this_error_model, 1, 1)
  
  file_str <- paste0(Sys.getenv("OD_DIR"),
                     read_date_pk,
                     "mypk_fit_",
                     dose_indic,
                     log10_indic,
                     time_indic,
                     errmodel_indic,
                     ".Rds")
  pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic)
  
  my_pk <- readRDS(file = file_str)
  
      pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic)
  
  message(paste0("Summary by experiment for: ", pk_name))

my_tkstats <- eval_tkstats(my_pk,
                           model = NULL)

winmodel <- get_winning_model(my_pk) %>%
  select(Chemical, Species, method, model) %>%
  dplyr::mutate(is_winning_model = TRUE)

pf <- my_pk$fit %>%
    dplyr::distinct(Chemical, Species, model, method, convcode)
  
  
  convcode_labels <- data.frame(convcode =c(0,
                                               1,
                                               20,
                                               21,
                                               10,
                                               51,
                                               52,
                                               9999,
                                               -9999),
                                convcode_label = c('Successful convergence (see ?optimx::optimx)',
                                                   'The iteration limit maxit had been reached (see ?optimx::optimx)',
                                'the initial set of parameters is inadmissible, that is, that the function cannot be computed or returns an infinite, NULL, or NA value (see ?optimx::optimx))',
  'an intermediate set of parameters is inadmissible (see ?optimx::optimx)',
  'degeneracy of the Nelder–Mead simplex (see ?optimx::optimx)',
  'a warning from the "L-BFGS-B" method (see ?optimx::optimx)',
  'an error from the "L-BFGS-B" method (see ?optimx::optimx)',
  'error in optimx::optimx()',
  'model status was "abort"')
  )
  
  pf <- pf %>%
    dplyr::left_join(convcode_labels)
  

summ_by_expt <- pf %>%
  left_join(winmodel) %>%
  left_join(my_tkstats) %>%
mutate(is_winning_model = case_when(is_winning_model %in% TRUE ~ TRUE,
                                    is.na(is_winning_model) ~ FALSE,
                                    .default = FALSE))  %>%
  arrange(Chemical, Species, method, model)

return(summ_by_expt)
}

summary_datagrp_all <- fitopts %>%
              dplyr::rowwise() %>%
  dplyr::mutate(summ = list(
    fit_summary_bydatagrp(this_error_model = error_model,
                                             this_dose_norm = dose_norm,
                                             this_log10_trans = log10_trans,
                                             this_time_scale = time_scale)
    )
    ) %>%
  tidyr::unnest(cols = c(summ))

summary_expt_all <- fitopts %>%
              dplyr::rowwise() %>%
  dplyr::mutate(summ_expt = list(
    fit_summary_byexpt(this_error_model = error_model,
                                             this_dose_norm = dose_norm,
                                             this_log10_trans = log10_trans,
                                             this_time_scale = time_scale)
    )
    ) %>%
  tidyr::unnest(cols = c(summ_expt))

writexl::write_xlsx(list("Summary by data grp" = summary_datagrp_all,
                         "Summary by expt" = summary_expt_all),
                    paste0(Sys.getenv("FIG_DIR"),
              "Manu_Files/",
              today,
              "_Supp_Table4.xlsx"))

10 Individual study level analysis

While there is no convenient external data set for evaluation, one way of evaluating the generalizability of the model fits is to compare the pooled fits to fits done on an individual-study basis, where a “study” is defined as a unique combination of Chemical, Species, and Reference.

This means setting the data_group to vars(Chemical, Species, Reference) and also setting the error_group to vars(Chemical, Species, Reference).

For comparison purposes, we will use the same “best set” of options that was used for the pooled analysis: dose-normalization, log10-transformation, no time transformation, and the “bobyqa” optimization algorithm.

#set up pk analysis for single-study
pk_study <- minimal_pk +
  facet_data(
    facets = vars(Chemical, Species, Reference)
  ) +
  stat_error_model(
    error_group = vars(Chemical, Species, Reference)
  ) +
  settings_preprocess(keep_data_original = FALSE,
                      suppress.messages = TRUE) +
  scale_conc(dose_norm = TRUE,
             log10_trans = TRUE) +
  settings_optimx(method = "bobyqa") +
  stat_model(model = c("model_flat",
                       "model_1comp",
                       "model_2comp"))

#do pk analysis
system.time(
  pk_study <- do_fit(pk_study,
                     n_cores = 11)
)

saveRDS(pk_study,
        paste0(Sys.getenv("OD_DIR"),
               today,
               "studylevel_pk_fit_110p.Rds"))

Get TK statistics for the winning models. Do them dose-normalized so that they won’t vary by dose. Keep only halflife and Vss, which shouldn’t vary by Route and Media.

tkstats_study <- eval_tkstats(pk_study,
                              finite_only = FALSE,
                              dose_norm = TRUE,
                              model = "winning") %>%
  dplyr::select(Chemical,
                Species, 
                Reference,
                method,
                model,
                halflife.tkstats,
                Vss.tkstats,
                `Vss/Fgutabs.tkstats`) %>%
  dplyr::rename(halflife = halflife.tkstats,
                Vss = Vss.tkstats,
                `Vss/Fgutabs` = `Vss/Fgutabs.tkstats`) %>%
  dplyr::distinct()

#Also: get coefficients and try to pull Fgutabs (if it was optimized)
coef_study <- coef_sd(pk_study,
                      method = "bobyqa",
                       include_type = "optimize") %>%
  dplyr::filter(param_name %in% "Fgutabs") %>%  dplyr::select(Chemical,
                Species, 
                Reference,
                method,
                model,
                param_name,
                param_value) %>%
  dplyr::distinct()

#keep only winning model coefs
win_study <- get_winning_model(pk_study) %>%
  dplyr::select(-near_flat, -preds_below_loq)

coef_study <- coef_study %>%
  dplyr::semi_join(win_study)

fgutabs_study <- coef_study %>%
  select(-param_name) %>%
  rename(Fgutabs = param_value)

tkstats_study2 <- full_join(tkstats_study,
                            fgutabs_study)

For comparison, read in the saved data from the best options with pooled data. Get those TK statistics and winning model as well, for the “bobyqa” method.

  file_str <- paste0(Sys.getenv("OD_DIR"),
                     read_date_pk,
                     "mypk_fit_110p.Rds")
  pk_pooled <- readRDS(file = file_str)
  
 tkstats_pooled <- eval_tkstats(pk_pooled,
                              finite_only = FALSE,
                              dose_norm = TRUE,
                              method = "bobyqa",
                              model = "winning") %>%
    dplyr::select(Chemical,
                Species, 
                Reference,
                method,
                model,
                halflife.tkstats,
                Vss.tkstats,
                `Vss/Fgutabs.tkstats`) %>%
  dplyr::rename(halflife = halflife.tkstats,
                Vss = Vss.tkstats,
                `Vss/Fgutabs` = `Vss/Fgutabs.tkstats`) %>%
   dplyr::group_by(Chemical,
                   Species,
                   method,
                   model,
                   halflife,
                   Vss,
                   `Vss/Fgutabs`) %>%
   dplyr::summarise(n_ref = n_distinct(Reference)) %>%
   dplyr::ungroup()

 #Also: get coefficients and try to pull Fgutabs
coef_pooled <- coef_sd(pk_pooled,
                       method = "bobyqa",
                       include_type = "optimize") %>%
  dplyr::filter(param_name %in% "Fgutabs") %>%  dplyr::select(Chemical,
                Species, 
                method,
                model,
                param_name,
                param_value) %>%
  dplyr::distinct()

#keep only winning model coefs
win_pooled <- get_winning_model(pk_pooled,
                                method = "bobyqa") %>%
  dplyr::select(-near_flat, -preds_below_loq)

coef_pooled <- coef_pooled %>%
  dplyr::semi_join(win_pooled)

fgutabs_pooled <- coef_pooled %>%
  select(-param_name) %>%
  rename(Fgutabs = param_value)

tkstats_pooled2 <- full_join(tkstats_pooled,
                            fgutabs_pooled)

For plotting: Pivot the TK stats tables longer, add an identifier column for single-study vs. pooled, and merge.

tkstats_study3 <- tkstats_study2 %>%
  dplyr::rename(model_study = model) %>%
  pivot_longer(cols = c(halflife,
                        Vss,
                        `Vss/Fgutabs`,
                        Fgutabs),
               names_to = "name",
               values_to = "value_study") 

tkstats_pooled3 <- tkstats_pooled2 %>%
  dplyr::rename(model_pooled = model,
                n_ref_pooled = n_ref) %>%
  pivot_longer(cols = c(halflife,
                        Vss,
                        `Vss/Fgutabs`,
                        Fgutabs),
               names_to = "name",
               values_to = "value_pooled") 

tkstats_all <- full_join(tkstats_study3,
                         tkstats_pooled3)

Exclude those for which the winning model was flat. (They will have infinite half-life.)

tkstats_all <- tkstats_all %>%
  filter(!(model_pooled %in% "model_flat") &
           !(model_study %in% "model_flat"))

ggplot(tkstats_all %>%
         filter( !is.na(value_study) &
                  !is.na(value_pooled))) +
  geom_point(aes(x = value_pooled,
                 y = value_study,
                 color = factor(n_ref_pooled))) +
  facet_wrap(facets = vars(name),
             scales = "free") +
  scale_x_log10(guide = "axis_logticks") +
  scale_y_log10(guide = "axis_logticks")

Plot only the ones with more than one reference.

ggplot(tkstats_all %>%
         filter(n_ref_pooled > 1 &
                  !is.na(value_study) &
                  !is.na(value_pooled))) +
  geom_point(aes(x = value_pooled,
                 y = value_study),
             color = "gray50",
             shape = 1) +
  geom_line(aes(x = value_pooled,
                 y = value_study,
                 group = interaction(Chemical, Species)),
             color = "gray50") +
  facet_wrap(facets = vars(name),
             scales = "free") +
  geom_abline(intercept = 0, slope = 1,
              color = "black") +
  geom_smooth(aes(x = value_pooled,
                 y = value_study),
              method = "lm",
              se = FALSE,
              color = "black",
              linetype = 2) +
  scale_x_log10(guide = "axis_logticks") +
  scale_y_log10(guide = "axis_logticks") +
  theme_bw() +
  theme(strip.background = element_blank())

Get R-sq for each of these, including only those with more than one reference:

rsq_df <- tkstats_all %>%
         filter(n_ref_pooled > 1 &
                  !is.na(value_study) &
                  !is.na(value_pooled)) %>%
  mutate(log10_value_pooled = log10(value_pooled),
         log10_value_study = log10(value_study)) %>%
  group_by(name) %>%
  summarise(n = n(),
            n_grp = n_distinct(Chemical, Species),
    rsq_log10 = summary(
      lm(
        log10_value_study ~ log10_value_pooled,
        na.action = "na.omit"
        )
      )[[
    "r.squared"
  ]],
  rsq = summary(lm(value_study ~ value_pooled))[[
    "r.squared"
  ]]) %>%
  ungroup() %>%
  mutate(label = sprintf("R^2 == %.3f",
                         rsq_log10))

rsq_df %>% select(-label) %>% print()

Re-make plot with R-sq information as text annotation, and save.

ggplot(tkstats_all %>%
         filter(n_ref_pooled > 1 &
                  !is.na(value_study) &
                  !is.na(value_pooled))) +
  geom_point(aes(x = value_pooled,
                 y = value_study),
             color = "gray50",
             shape = 1) +
  geom_line(aes(x = value_pooled,
                 y = value_study,
                 group = interaction(Chemical, Species)),
             color = "gray50") +
  geom_text(data = rsq_df,
            aes(x = 0,
                y = Inf,
                label = label),
            parse = TRUE,
            hjust = -0.3,
            vjust = 1.8) +
  facet_wrap(facets = vars(name),
             scales = "free") +
  geom_abline(intercept = 0, slope = 1,
              color = "black") +
  geom_smooth(aes(x = value_pooled,
                 y = value_study),
              method = "lm",
              se = FALSE,
              color = "black",
              linetype = 2) +
  scale_x_log10(guide = "axis_logticks") +
  scale_y_log10(guide = "axis_logticks") +
  xlab("Pooled value") +
  ylab("Single-study value") +
  theme_bw() +
  theme(strip.background = element_blank(),
        strip.text = element_text(size = 12))

ggsave(paste0(Sys.getenv("FIG_DIR"),
              "Manu_Files/",
              today,
              "_Fig8.png"),
       height = 5,
       width = 7)

ggsave(paste0(Sys.getenv("FIG_DIR"),
              today,
              "Manu_Files/",
              "_Fig8.pdf"),
       height = 5,
       width = 7)

Save the data as a supplemental table

tkstats_all_wide <- tkstats_all %>%
  pivot_wider(id_cols = c(Chemical, Species,
                          Reference, method,
                          model_study,
                          model_pooled,
                          n_ref_pooled),
              names_from = name,
              values_from = c(value_study, value_pooled))

tkstats_all_wide %>% filter(n_ref_pooled > 1) %>%
  writexl::write_xlsx(path = paste0(Sys.getenv("FIG_DIR"),
                                    "Manu_Files/",
                                    today,
                                    "Supp_Table5.xlsx"))

Plot the case with a large Vdist:

plot_pooled <- plot(pk_pooled,
                    method = "bobyqa",
                    use_scale_conc = TRUE,
                    best_fit = TRUE)
plot_pooled %>%
  filter(Chemical %in% "DTXSID2020139" & Species %in% "rat") %>%
  pull(final_plot[[1]])

Manuscript Figures and Tables (2024, submitted)

1 Overview

2 Setup

3 Pulling data from CvTdb

4 Minimal PK object

5 Running all possible fitting options