This R Markdown document includes code to reproduce the analyses, figures, and tables from the 2024 manuscript that describes invivoPKfit’s use in a case study with CvT data from over 200 Chemicals.
Load needed packages.
library(tidyverse, quietly = TRUE)
library(cowplot)
library(RColorBrewer)
library(ctxR)
library(flextable)
library(officer)
library(patchwork)
devtools::load_all() #load invivopkfit
#get today's date
today <- format(Sys.Date(), "%d%B%Y")
This file refers to two environment variables, OD_DIR
and FIG_DIR
. OD_DIR
should be set to the path
where you have placed the necessary input files. FIG_DIR
should be set to the path where you want to save the figures and tables.
You can set those environment variables locally inside R like this:
You’ll want to make sure the paths end with a slash.
Additionally, at one point, this code uses package ctxR
to query the CompTox Chemicals Dashboard. In order to run that portion
of the code, you will need to have an API key (free), which you can
request by emailing ccte_api@epa.gov
. Set another
environment variable to contain that code as a string, like this:
Define a date for the pre-fit pk objects to read in:
And a date for the results files to read from:
See pulling_iv_oral_cvtdb.Rmd
.
The following chunk shows the column/field mapping from the
cvt
object and the pk
object needed for
invivoPKfit analysis. The cvt
object is data from the CvTdb
database that includes chemicals with oral or intravenous routes of
administration and takes measurements from blood or plasma. The data has
been processed and cleaned to facilitate its use with
invivoPKfit.
### Minimal PK Object ###----
# Minimum pk object, add options later
# Note that these mappings are now a default mappings, just being verbose here.
# If there are any standard deviations that are zero or NA, but N_Subjects > 1,
# Set the n_subjects to 6 or the current value if lower than 6.
# (In Showa the mode is 6, but this condition is present in CVT_Legacy as well)
minimal_pk <- pk(data = cvt %>%
mutate(n_subjects_normalized = if_else(
n_subjects_normalized > 1 & is.na(invivPK_conc_sd),
min(n_subjects_normalized, 6), n_subjects_normalized)),
mapping = ggplot2::aes(
Chemical = analyte_dtxsid,
Chemical_Name = analyte_name_original,
DTXSID = analyte_dtxsid,
CASRN = analyte_casrn,
Species = species,
Reference = fk_extraction_document_id,
Media = conc_medium_normalized,
Route = administration_route_normalized,
Dose = invivPK_dose_level,
Dose.Units = "mg/kg",
Subject_ID = fk_subject_id,
Series_ID = fk_series_id,
Study_ID = fk_study_id,
ConcTime_ID = conc_time_id,
N_Subjects = n_subjects_normalized,
Weight = weight_kg,
Weight.Units = "kg",
Time = time_hr,
Time.Units = "hours",
Value = invivPK_conc,
Value.Units = "mg/L",
Value_SD = invivPK_conc_sd,
LOQ = invivPK_loq
))
Here we setup the PK object with the various options for fitting and data transformations. As shown in this vignette, it is possible to setup multiple distinct options for fitting and data transformation at once prior to processing and fitting the data.
# Types of Choices:
## Dose Normalized
## Log10 transform
## Scale time
# List of options
fitopts <- expand.grid(error_model = c("pooled",
"hierarchical"),
dose_norm = c(TRUE, FALSE),
log10_trans = c(TRUE, FALSE),
time_scale = c(
TRUE,
FALSE),
stringsAsFactors = FALSE)
# Make function similar to that in test_vignette authored by Caroline Ring
fit_data <- function(this_error_model,
this_dose_norm,
this_log10_trans,
this_time_scale,
n_cores = 12,
file_dir = Sys.getenv("OD_DIR")){
retval <- -9
dose_indic <- as.numeric(this_dose_norm)
log10_indic <- as.numeric(this_log10_trans)
time_indic <- as.numeric(this_time_scale)
errmodel_indic <- substr(this_error_model, 1, 1)
file_str <- paste0("mypk_fit_",
dose_indic,
log10_indic,
time_indic,
errmodel_indic,
".Rds")
cat("Fitting: ", file_str, "\n")
tryCatch(
expr = {
this_pk <- minimal_pk +
facet_data(vars(Chemical, Species)) +
settings_preprocess(keep_data_original = FALSE,
suppress.messages = TRUE) +
scale_conc(dose_norm = this_dose_norm, log10_trans = this_log10_trans) +
settings_optimx(method = c(
"L-BFGS-B",
"bobyqa"
))
if(this_error_model %in% "pooled"){
this_pk <- this_pk +
stat_error_model(error_group = vars(Chemical, Species))
} else if(this_error_model %in% c("hierarchical", "joint")){
this_pk <- this_pk +
stat_error_model(error_group = vars(Chemical, Species, Reference))
} else{
stop("this_error_model must be either 'pooled' or 'hierarchical'/'joint'")
}
if(this_time_scale %in% TRUE){
this_pk <- this_pk +
scale_time(new_units = "auto")
}
#do the fit
this_time <- system.time(this_fit <- do_fit(this_pk, n_cores = n_cores))
#save the result
saveRDS(this_fit,
file = paste0(file_dir,
file_str))
cli::cli_alert_success(text = "Fitting success!")
print(this_time)
retval <- this_time[[3]]
},
error = function(e){
cli::cli_alert_danger(text = paste("Analysis",
file_str,
"failed with error",
e))
retval <- -1
}
)
return(retval)
}
system.time(tmp_out <- mapply(fit_data,
this_error_model = fitopts$error_model,
this_dose_norm = fitopts$dose_norm,
this_log10_trans = fitopts$log10_trans,
this_time_scale = fitopts$time_scale,
n_cores = 14))
mean(tmp_out)/60 # Average runtime (in minutes) per fitting option
median(tmp_out)/60
sum(tmp_out) # Looks like there's 10 extra seconds of overhead for non-fitting methods
Load in one of the saved fit objects; it doesn’t matter which one, since we will be using only the pre-processed data, which is the same regardless of fitting options.
Number of observations:
Number of unique chemicals:
Number of unique references:
Number of unique species:
Number of unique combinations of Chemical and Species:
Number of unique combinations of Chemical, Species, and Reference:
Number of unique combinations of Chemical, Species, Reference, Route, Media:
Number of unique combinations of Chemical, Species, Reference, Route, Media, Dose:
Supp. Figure 2: Concentration and final time-points have wide ranges across CvT datasets.
#compute duration and concentration range across experiments (Chemical, Species, Reference, Route, Media, Dose)
data_range <- my_pk$data %>%
filter(Detect, !exclude) %>%
group_by(Chemical, Species, Reference, Media, Route, Dose, Dose.Units) %>%
summarize(maxT = max(Time)/24,
concRange = log10(max(Conc)/min(Conc))) %>%
ungroup()
#Panel A: histogram of day of final timepoint per experiment
sf_2A <- data_range %>%
ggplot(aes(x = maxT)) +
geom_histogram(fill = "grey5",
bins = 40,
color = "grey5") +
scale_x_log10(labels = scales::number,
breaks = c(0.1, 1, 7, 30, 365)) +
annotation_logticks(sides = "b") +
labs(x = "Day of final Timepoint",
y = "Count") +
theme(aspect.ratio = 1,
text = element_text(size = 10),
panel.border = element_rect(color = "black", fill = NA, size = 1),
panel.background = element_blank(),
panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
axis.ticks = element_blank(),
axis.line = element_blank())
#panel B: histogram of fold concentration range
sf_2B <- data_range %>%
ggplot(aes(x = concRange)) +
geom_histogram(fill = "grey5",
bins = 40,
color = "grey5") +
scale_y_continuous(expand = c(0, 0),
limits = c(0, 50),
breaks = c(0, 25, 50)) +
labs(x = "log10(Concentration Range)",
y = "Count") +
annotation_logticks(sides = "b") +
theme(aspect.ratio = 1,
panel.border = element_rect(color = "black", fill = NA, size = 1),
panel.background = element_blank(),
panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
strip.background = element_rect(fill = "white"),
strip.text = element_text(face = "bold"),
axis.ticks = element_blank(),
axis.line = element_blank(),
axis.text = element_text(size = 10),
axis.title = element_text(size = 11))
sf_2AB <- plot_grid(sf_2A, sf_2B,
labels = c("A", "B"),
nrow = 1)
sf_2AB
#save PDF version
ggsave(paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Supp_Fig2_ConcTimeRanges",
".pdf"),
height = 3.2,
width = 6,
bg = "white",
units = "in")
#save PNG version
ggsave(paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Supp_Fig2_ConcTimeRanges",
".png"),
height = 3.2,
width = 6,
bg = "white",
units = "in",
dpi = 300)
rm(sf_2A, sf_2B, sf_2AB)
data_cvt <- get_data(my_pk)
data_counts <- data_cvt %>%
dplyr::filter(Detect) %>%
dplyr::count(Chemical, Species, Reference, Route, Media, Dose, Time, N_Subjects,
name = "Count")
data_counts <- dplyr::left_join(data_counts, data_cvt,
by = c("Chemical", "Species", "Reference",
"Route", "Media", "Dose", "Time",
"N_Subjects")) %>%
dplyr::mutate(data_descr = dplyr::case_when(
(Count > 1 & N_Subjects == 1) ~ "Individual Data, Multiple Observations",
(Count == 1 & N_Subjects == 1) ~ "Individual Data, Single Observation",
(Count == 1 & N_Subjects > 1 & Conc_SD > 0) ~ "Grouped Data w SD",
(N_Subjects > 1 & Conc_SD == 0) ~ "Grouped Data no SD",
.default = NA_character_
))
indiv_data <- subset(data_counts,
subset = (data_descr %in% "Individual Data, Multiple Observations"))
indiv_data <- indiv_data %>%
dplyr::group_by(Chemical, Species, Reference, Route, Media, Dose, Time) %>%
dplyr::mutate(replicate_mean = mean(Conc, na.rm = TRUE)) %>%
ungroup()
indiv_data_bygroup <- indiv_data %>%
dplyr::group_by(Chemical, Species) %>%
dplyr::summarise(AFE = 10^mean(log10(Conc/replicate_mean)),
AAFE = 10^mean(abs(log10(Conc/replicate_mean)))) %>%
ungroup()
indiv_data_bygroup %>% count(AFE < 0.5)
indiv_data_bygroup %>% count(AFE > 2)
Do twofold-test; use only the information about the observations at this point (predictions will be handled later).
Figure 3 Panel A: Histogram of mean-normalized concentrations
pl3A <- my_tf$individual_data %>%
# Plotting
ggplot(aes(x = foldConc)) +
geom_histogram(binwidth = 0.2,
fill = "grey5",
color = "grey5",
linewidth = 0.4) +
coord_cartesian(xlim = c(0, 3.1),
ylim = c(0, 1200)) +
scale_y_continuous(expand = c(0, 0),
breaks = c(0, 250, 500, 750, 1000),
labels = c("0", "2.5", "5", "7.5", "10")) +
expand_limits(y = 0) +
geom_vline(xintercept = c(0.5, 2),
color = "red3",
linetype = "dashed",
linewidth = 1) +
theme_bw() +
labs(x = "Mean-normalized concentration",
y = "Observations (hundreds))") +
theme(text = element_text(size = 14),
panel.border = element_rect(color = "black", linewidth = 1, fill = NA),
panel.background = element_blank(),
panel.grid.major.y = element_line(color = "grey90", linewidth = 0.4),
axis.title = element_text(face = "bold"),
axis.line = element_blank(),
axis.text = element_text(size = 14),
plot.background = element_blank(),
axis.ticks.y = element_blank())
pl3A
Figure 3 Panel B: Plot of mean-normalized concentrations vs. ADME-Normalized Time
#get time of peak concentration for use in calculated ADME-normalized time
my_nca <- nca(my_pk) %>%
dplyr::filter(param_name %in% "tmax")
pl3B <- my_tf$individual_data %>%
inner_join(my_nca %>%
dplyr::select(Chemical, Species,
Route, Media,
tmax = param_value) %>%
filter(!is.na(tmax))) %>%
group_by(Chemical,
Species,
Reference,
Route,
Media,
Dose) %>%
mutate(
normTime = ifelse(
Route == "iv",
1 + Time/max(Time),
ifelse(
Time > tmax,
((Time - tmax)/max(Time)) + 1,
0.5 + Time/(2*tmax)
)
)) %>%
ggplot(aes(
x = normTime,
y = foldConc
)) +
scale_x_continuous(breaks = c(1,2),
labels = c("tmax", "end")) +
geom_point(alpha = 0.1, size = 0.7) +
geom_hline(yintercept = c(0.5, 2),
color = "red3",
linewidth = 0.8,
linetype = "dashed") +
facet_grid(cols = vars(Route),
scales = "free_x") +
labs(x = "ADME-Normalized Time",
y = "Mean-Normalized\nConcentration") +
theme(text = element_text(size = 14),
panel.border = element_rect(color = "black", fill = NA, size = 1),
panel.background = element_blank(),
panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
strip.background = element_rect(fill = "white"),
strip.text = element_text(face = "bold"),
axis.line = element_blank(),
axis.text = element_text(size = 14),
axis.title = element_text(face = "bold"),
panel.spacing = unit(0.125, units = "in"),
plot.background = element_blank(),
legend.position = "none")
pl3B
Paste these two plots together
pl3 <- cowplot::plot_grid(pl3A,
pl3B,
ncol = 2,
rel_widths = c(1,2),
labels = c("A", "B"))
#save PDF version
ggsave(paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Fig3.pdf"),
pl3,
bg = "white",
width = 8,
height = 4,
units = "in")
#Save PNG version
ggsave(paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Fig3.png"),
pl3,
bg = "white",
width = 8,
height = 4,
units = "in",
dpi = 300)
#remove unneeded objects
rm(pl3A, pl3B, pl3)
plsupp3 <- my_tf$individual_data %>%
inner_join(my_nca %>%
dplyr::select(Chemical, Species,
Route, Media,
tmax = param_value) %>%
filter(!is.na(tmax))) %>%
group_by(Chemical,
Species,
Reference,
Route,
Media,
Dose) %>%
mutate(
normTime = ifelse(
Route == "iv",
1 + Time/max(Time),
ifelse(
Time > tmax,
((Time - tmax)/max(Time)) + 1,
0.5 + Time/(2*tmax)
)
)) %>%
ggplot(aes(
x = normTime
)) +
scale_x_continuous(breaks = c(1,2),
labels = c("tmax", "end")) +
geom_histogram() +
facet_grid(cols = vars(Route),
scales = "free_x") +
labs(x = "ADME-Normalized Time",
"# Observations") +
theme(text = element_text(size = 14),
panel.border = element_rect(color = "black", fill = NA, size = 1),
panel.background = element_blank(),
panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
strip.background = element_rect(fill = "white"),
strip.text = element_text(face = "bold"),
#axis.ticks = element_blank(),
axis.line = element_blank(),
#axis.text.x = element_blank(),
axis.text = element_text(size = 14),
axis.title = element_text(face = "bold"),
panel.spacing = unit(0.125, units = "in"),
plot.background = element_blank(),
legend.position = "none")
#save PDF version
ggsave(paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Supp_Fig3",
".png"),
plsupp3,
bg = "white",
width = 8,
height = 4,
units = "in",
dpi = 300)
ggsave(paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Supp_Fig3",
".pdf"),
plsupp3,
bg = "white",
width = 8,
height = 4,
units = "in")
Here we evaluate the various fitting options by:
Define some helper functions to do the various model evaluations.
# Evaluation
# Winning Model Tally
winmodel_comparison <- function(this_error_model,
this_dose_norm,
this_log10_trans,
this_time_scale) {
dose_indic <- as.numeric(this_dose_norm)
log10_indic <- as.numeric(this_log10_trans)
time_indic <- as.numeric(this_time_scale)
errmodel_indic <- substr(this_error_model, 1, 1)
file_str <- paste0(Sys.getenv("OD_DIR"),
read_date_pk,
"mypk_fit_",
dose_indic,
log10_indic,
time_indic,
errmodel_indic,
".Rds")
pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic)
current_rds <- readRDS(file = file_str)
pf <- current_rds$fit %>%
distinct(Chemical, Species, model, method, convcode) %>%
group_by(method, .drop = FALSE) %>%
count(convcode) %>%
pivot_wider(names_from = convcode,
names_prefix = "convcode.",
values_from = n)
# Wide winning model
suppressMessages({
winning_model <- get_winning_model(obj = current_rds)
wide_winning_model <- winning_model %>%
group_by(method, model) %>%
count() %>%
pivot_wider(names_from = model, values_from = n) %>%
left_join(pf)
})
return(wide_winning_model)
}
First we need to determine how to filter out cases where NCA or tkstats predicted ridiculously large AUCs. To do this, create a function to plot \(AUC_{\infty}\) predicted by the model vs. \(AUC_{\infty}\) estimated by NCA, for each set of fitting options. Show the identity line, and also show lines for a factor of 20 above and below the identity line, since a 20-fold cutoff is one proposed cutoff. Do the same for Cmax, as well.
AUC_plot <- function(
this_error_model,
this_dose_norm,
this_log10_trans,
this_time_scale) {
dose_indic <- as.numeric(this_dose_norm)
log10_indic <- as.numeric(this_log10_trans)
time_indic <- as.numeric(this_time_scale)
errmodel_indic <- substr(this_error_model, 1, 1)
file_str <- paste0(Sys.getenv("OD_DIR"),
read_date_pk,
"mypk_fit_",
dose_indic,
log10_indic,
time_indic,
errmodel_indic,
".Rds")
pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic)
message(paste0("Evaluation for: ", pk_name))
current_rds <- readRDS(file = file_str)
# Cmax and AUC_inf RMSEs
#note that eval_tkstats already filters winning models
#use finite_only = FALSE so that we can count the excluded cases
suppressMessages(RMSLE_eval <- eval_tkstats(obj = current_rds,
dose_norm = FALSE,
finite_only = FALSE,
suppress.messages = TRUE) %>%
mutate(Options = pk_name) %>%
dplyr::select(
Options,
Chemical,
Species,
Route,
Media,
Reference,
method,
model,
starts_with("Cmax"),
starts_with("AUC_")
))
these_opts <- paste0(dose_indic,
log10_indic,
time_indic,
errmodel_indic)
#plot AUC infinity for tkstats vs. AUC infinity for NCA
#show identity line and 20x above and below lines
AUC_plot <- RMSLE_eval %>%
dplyr::filter(is.finite(AUC_infinity.nca) & is.finite(AUC_infinity.tkstats)) %>%
ggplot() +
geom_point(aes(x = AUC_infinity.nca,
y = AUC_infinity.tkstats,
color = model)) +
scale_x_log10() + scale_y_log10() +
annotation_logticks() +
geom_abline(aes(intercept = 0, slope =1)) +
geom_abline(aes(intercept = log10(20), slope = 1),
linetype = 2) +
geom_abline(aes(intercept = log10(1/20), slope = 1),
linetype = 2) +
facet_grid(rows = vars(method)) +
ggtitle(paste(these_opts,
"AUC_inf pred vs. AUC_inf NCA")) +
scale_color_manual(values = c("model_1comp" = "#0398FC",
"model_2comp" = "#D68E09",
"model_flat" = "black"),
name = "Winning model") +
coord_equal()
Cmax_plot <- RMSLE_eval %>%
dplyr::filter(is.finite(Cmax.nca) & is.finite(Cmax.tkstats)) %>%
ggplot() +
geom_point(aes(x = Cmax.nca,
y = Cmax.tkstats,
color = model)) +
scale_x_log10() + scale_y_log10() +
annotation_logticks() +
geom_abline(aes(intercept = 0, slope =1)) +
geom_abline(aes(intercept = log10(20), slope = 1),
linetype = 2) +
geom_abline(aes(intercept = log10(1/20), slope = 1),
linetype = 2) +
facet_grid(rows = vars(method)) +
ggtitle(paste(these_opts,
"Cmax pred vs. Cmax NCA")) +
scale_color_manual(values = c("model_1comp" = "#0398FC",
"model_2comp" = "#D68E09",
"model_flat" = "black"),
name = "Winning model") +
coord_equal()
return(
list(AUC_plot = AUC_plot,
Cmax_plot = Cmax_plot
)
)
}
Run these plots for all fitting opts
all_AUC_plots <- fitopts %>%
dplyr::rowwise() %>%
dplyr::summarise(this_plot = list(AUC_plot(this_error_model = error_model,
this_dose_norm = dose_norm,
this_log10_trans = log10_trans,
this_time_scale = time_scale)
)
)
#save the plots
pdf(file = paste0(
Sys.getenv("FIG_DIR"),
today,
"_AUCinf_nca_vs_tk_plots.pdf"),
onefile = TRUE,
height = 5,
width = 7)
plotlist <- all_AUC_plots %>%
pull(this_plot)
for (x in seq_along(plotlist)) {
print(plotlist[[x]][["AUC_plot"]])
}
dev.off()
pdf(file = paste0(
Sys.getenv("FIG_DIR"),
today,
"_Cmax_nca_vs_tk_plots.pdf"),
onefile = TRUE,
height = 5,
width = 7)
plotlist <- all_AUC_plots %>%
pull(this_plot)
for (x in seq_along(plotlist)) {
print(plotlist[[x]][["Cmax_plot"]])
}
dev.off()
rm(plotlist, all_AUC_plots)
It appears that the best filter, other than filtering out NA, Inf, zero, and negative values, would be to filter out any AUC_inf values greater than 1e7, since that cutoff identifies outliers.
Write the function for RMSLE for Cmax and AUC accordingly.
# RMSLE for Cmax and AUC
rmsles_Cmax_AUC <- function(
this_error_model,
this_dose_norm,
this_log10_trans,
this_time_scale) {
dose_indic <- as.numeric(this_dose_norm)
log10_indic <- as.numeric(this_log10_trans)
time_indic <- as.numeric(this_time_scale)
errmodel_indic <- substr(this_error_model, 1, 1)
file_str <- paste0(Sys.getenv("OD_DIR"),
read_date_pk,
"mypk_fit_",
dose_indic,
log10_indic,
time_indic,
errmodel_indic,
".Rds")
pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic)
message(paste0("Evaluation for: ", pk_name))
current_rds <- readRDS(file = file_str)
# Cmax and AUC_inf RMSEs
#note that eval_tkstats already filters winning models
#use finite_only = FALSE so that we can count the excluded cases
suppressMessages(RMSLE_eval <- eval_tkstats(obj = current_rds,
dose_norm = FALSE,
finite_only = FALSE,
suppress.messages = TRUE) %>%
mutate(Options = pk_name) %>%
dplyr::select(
Options,
Chemical,
Species,
Route,
Media,
Reference,
method,
model,
starts_with("Cmax"),
starts_with("AUC_")
))
#Couunt bad observations:
#zero, NA, infinite
RMSLE_badobs <- RMSLE_eval %>%
group_by(Options, method) %>%
summarise(total_expts = n(),
count_AUCinf_NCA_0 = sum(AUC_infinity.nca %in% 0),
count_AUCinf_pred_0 = sum(AUC_infinity.tkstats %in% 0),
count_AUCinf_NCA_isNA = sum(is.na(AUC_infinity.nca)),
count_AUCinf_pred_isNA = sum(is.na(AUC_infinity.tkstats)),
count_AUCinf_NCA_isInf = sum(is.infinite(AUC_infinity.nca)),
count_AUCinf_pred_isInf = sum(is.infinite(AUC_infinity.tkstats)),
count_Cmax_NCA_0 = sum(Cmax.nca %in% 0),
count_Cmax_pred_0 = sum(Cmax.tkstats %in% 0),
count_Cmax_NCA_isNA = sum(is.na(Cmax.nca)),
count_Cmax_pred_isNA = sum(is.na(Cmax.tkstats)),
count_Cmax_NCA_isInf = sum(is.infinite(Cmax.nca)),
count_Cmax_pred_isInf = sum(is.infinite(Cmax.tkstats))
) %>%
ungroup()
#count cases where AUC or Cmax was negative,
#or where AUC_inf was greater than 1e7
RMSLE_neg <- RMSLE_eval %>%
dplyr::filter(is.finite(AUC_infinity.nca),
is.finite(AUC_infinity.tkstats),
is.finite(Cmax.nca),
is.finite(Cmax.tkstats)) %>%
group_by(Options, method) %>%
summarise(count_AUCinf_NCA_lt0 = sum(AUC_infinity.nca < 0),
count_AUCinf_pred_lt0 = sum(AUC_infinity.tkstats < 0),
count_Cmax_NCA_lt0 = sum(AUC_infinity.nca < 0),
count_Cmax_pred_lt0 = sum(AUC_infinity.tkstats < 0),
count_AUCinf_NCA_gt_1e7 = sum(AUC_infinity.nca > 1e7),
count_AUCinf_pred_gt_1e7 = sum(AUC_infinity.tkstats > 1e7),
)
RMSLE_badobs <- RMSLE_badobs %>%
left_join(RMSLE_neg, by = c("Options",
"method")) %>%
rowwise() %>%
dplyr::mutate(total_excl = sum(
c_across(
starts_with("count")
)
)
)
#compute RMSLEs after excluding the bad observations
RMSLE_eval <- RMSLE_eval %>%
dplyr::filter(is.finite(AUC_infinity.nca),
is.finite(AUC_infinity.tkstats)) %>%
dplyr::filter(
AUC_infinity.nca > 0,
AUC_infinity.tkstats > 0,
AUC_infinity.nca < 1e7,
AUC_infinity.tkstats < 1e7) %>%
rowwise() %>%
mutate(
SLE_Cmax = (log10(Cmax.tkstats) - log10(Cmax.nca)) ^ 2,
SLE_AUC_inf = (log10(AUC_infinity.tkstats) - log10(AUC_infinity.nca)) ^ 2
) %>%
dplyr::filter(!is.na(SLE_Cmax), !is.na(SLE_AUC_inf)) %>%
group_by(Options, method) %>%
summarise(
n_expts = n(),
RMSLE_Cmax = sqrt(mean(SLE_Cmax, na.rm = FALSE)) %>% signif(digits = 4),
RMSLE_AUC = sqrt(mean(SLE_AUC_inf, na.rm = FALSE)) %>% signif(digits = 4)
) %>%
ungroup()
#merge in counts of bad obs
RMSLE_eval <- RMSLE_eval %>%
dplyr::left_join(RMSLE_badobs,
by = c("Options", "method"))
return(RMSLE_eval)
}
Goodness-of-fit table with R-squared, RMSLE and AIC:
# Goodness-of-fit table with R-squared, RMSLE and AIC
get_gof <- function(this_error_model,
this_dose_norm,
this_log10_trans,
this_time_scale) {
dose_indic <- as.numeric(this_dose_norm)
log10_indic <- as.numeric(this_log10_trans)
time_indic <- as.numeric(this_time_scale)
errmodel_indic <- substr(this_error_model, 1, 1)
file_str <- paste0(Sys.getenv("OD_DIR"),
read_date_pk,
"mypk_fit_",
dose_indic,
log10_indic,
time_indic,
errmodel_indic,
".Rds")
pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic)
current_rds <- readRDS(file = file_str)
winning_model <- suppressMessages(get_winning_model(obj = current_rds))
suppressMessages({
this_rsq <- rsq.pk(current_rds,
use_scale_conc = FALSE,
rsq_group = ggplot2::vars(Chemical, Species),
sub_pLOQ = TRUE) %>%
semi_join(winning_model)
this_AIC <- AIC(current_rds) %>%
semi_join(winning_model)
this_rmsle <- rmse.pk(current_rds,
rmse_group = vars(Chemical, Species),
use_scale_conc = list(dose_norm = FALSE,
log10_trans = TRUE),
sub_pLOQ = TRUE) %>%
semi_join(winning_model)
})
# Since they are all joined by winmodel
# all three must have same NROW
message(
paste0("For ", pk_name, "... outputs below must have same number of rows")
)
message(paste0("rsq: ", NROW(this_rsq)))
message(paste0("aic: ", NROW(this_AIC)))
message(paste0("rmsle: ", NROW(this_rmsle)))
gof_df <- this_rsq %>%
inner_join(this_AIC, by = join_by(Chemical, Species, method, model)) %>%
inner_join(this_rmsle, by = join_by(Chemical, Species, method, model)) %>%
mutate(Options = pk_name, .before = Chemical)
return(gof_df)
}
# All predictions
get_all_preds <- function(this_error_model,
this_dose_norm,
this_log10_trans,
this_time_scale) {
dose_indic <- as.numeric(this_dose_norm)
log10_indic <- as.numeric(this_log10_trans)
time_indic <- as.numeric(this_time_scale)
errmodel_indic <- substr(this_error_model, 1, 1)
file_str <- paste0(Sys.getenv("OD_DIR"),
read_date_pk,
"mypk_fit_",
dose_indic,
log10_indic,
time_indic,
errmodel_indic,
".Rds")
pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic)
current_rds <- readRDS(file = file_str)
# Wide winning model
suppressMessages({
winning_model <- get_winning_model(obj = current_rds) %>%
select(-c(near_flat, preds_below_loq))
this_preds <- predict(current_rds,
use_scale_conc = FALSE) %>%
semi_join(winning_model) %>%
mutate(Options = pk_name, .before = Chemical)
})
return(this_preds)
}
# Factor of two model error
tf_tests <- function(this_error_model,
this_dose_norm,
this_log10_trans,
this_time_scale) {
dose_indic <- as.numeric(this_dose_norm)
log10_indic <- as.numeric(this_log10_trans)
time_indic <- as.numeric(this_time_scale)
errmodel_indic <- substr(this_error_model, 1, 1)
file_str <- paste0(Sys.getenv("OD_DIR"),
read_date_pk,
"mypk_fit_",
dose_indic,
log10_indic,
time_indic,
errmodel_indic,
".Rds")
pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic)
current_rds <- readRDS(file = file_str)
print(paste0("Now analyzing: ", pk_name))
out <- twofold_test(current_rds,
sub_pLOQ = TRUE,
suppress_messages = TRUE)
out <- out$model_error_summary %>%
mutate(Options = pk_name)
return(out)
}
Now call these functions and do the evaluations.
Winning model comparisons:
#winning model
system.time(wm_comp <- fitopts %>%
dplyr::rowwise() %>%
dplyr::mutate(winmodel_comp = list(
winmodel_comparison(this_error_model = error_model,
this_dose_norm = dose_norm,
this_log10_trans = log10_trans,
this_time_scale = time_scale))) %>%
tidyr::unnest(winmodel_comp))
RMSLE for Cmax and AUC:
#RMSLE for Cmax and AUC
system.time(cmax_auc_rmsle_tbl <- fitopts %>%
dplyr::rowwise() %>%
dplyr::mutate(nca_comp = list(
rmsles_Cmax_AUC(this_error_model = error_model,
this_dose_norm = dose_norm,
this_log10_trans = log10_trans,
this_time_scale = time_scale))) %>%
tidyr::unnest(nca_comp))
Goodness-of-fit metrics table:
#GOF table
system.time(gof_tbl <- fitopts %>%
dplyr::rowwise() %>%
dplyr::mutate(gof_win = list(
get_gof(this_error_model = error_model,
this_dose_norm = dose_norm,
this_log10_trans = log10_trans,
this_time_scale = time_scale))) %>%
tidyr::unnest(gof_win))
Factor of two:
#factor of two
system.time(all_tf_tests <- fitopts %>%
dplyr::rowwise() %>%
dplyr::mutate(TF = list(
tf_tests(error_model,
this_dose_norm = dose_norm,
this_log10_trans = log10_trans,
this_time_scale = time_scale))) %>%
tidyr::unnest(TF))
Get all predictions:
#all predictions
system.time(all_preds <- fitopts %>%
dplyr::rowwise() %>%
dplyr::mutate(these_preds = list(
get_all_preds(this_error_model = error_model,
this_dose_norm = dose_norm,
this_log10_trans = log10_trans,
this_time_scale = time_scale))) %>%
tidyr::unnest(these_preds))
#Save preds to a file as well
saveRDS(all_preds, paste0(Sys.getenv("FIG_DIR"),
today,
"_all_preds.Rds"))
rm(all_preds)
Produce and save tables for these evaluation metrics.
Rank fitting options by count of Chemical-Species data groups where RMSLE < 1
gof_tbl %>%
group_by(Options, method) %>%
count(RMSLE < 1) %>%
filter(`RMSLE < 1`) %>%
arrange(desc(n)) %>%
print(n = 8)
# bobyqa-optimized options 010h, 110h, 110p
What about R-squared greater than 0.75?
gof_tbl %>%
group_by(Options, method) %>%
count(Rsq > 0.75) %>%
filter(`Rsq > 0.75`) %>%
arrange(desc(n)) %>%
print(n = 8)
What about both RMSLE < 1 and R-squared greater than 0.75?
rmsle_rsq_rank <- gof_tbl %>%
group_by(Options, method) %>%
count(RMSLE < 1 & Rsq > 0.75) %>%
filter(`RMSLE < 1 & Rsq > 0.75`) %>%
arrange(desc(n)) %>%
dplyr::rename(`# Chemical-Species data groups` = n)
rmsle_rsq_rank %>% print(n=8)
Looking at both criteria together, it appears that the best set of fitting options is actually the simplest: 000p (no data transformation and pooled error model), bobyqa-optimized.
But also, let’s add in the Cmax/AUC RMSLE calculations: Sort the fitting options from smallest to greatest RMSLE for Cmax and AUC, and see which set of fitting options yields the lowest RMSLE.
cmax_auc_rmsle_rank <- cmax_auc_rmsle_tbl %>%
arrange(RMSLE_Cmax, RMSLE_AUC) %>%
select(Options, method, RMSLE_Cmax, RMSLE_AUC)
cmax_auc_rmsle_rank %>%
print(n = 8)
This ranking strongly suggests 110p, with 100p as a second choice (both bobyqa).
110p was ranked number 3 in the list of RMSLE and Rsq criteria 9and 100p was ranked number 5). So we select 110p bobyqa as the best when all of these metrics are considered.
Write the evaluation results to a spreadsheet, including the rankings.
# Write the results to file
writexl::write_xlsx(x = list(
Fit_Counts = wm_comp,
AUC_and_Cmax_RMSLE_summary = cmax_auc_rmsle_tbl,
Prediction_Evaluations = gof_tbl,
Twofold_Tests = all_tf_tests,
RMSLE_Rsq_rank = rmsle_rsq_rank,
Cmax_AUC_RMSLE_rank = cmax_auc_rmsle_rank),
path = paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Supp_Table2_eval_results",
".xlsx"))
rm(wm_comp, cmax_auc_rmsle_tbl, gof_tbl, all_tf_tests)
gc()
Plot R-sq vs. RMSLE for the various fitting options, color-coding by winning model.
gof_tbl <- readxl::read_excel(path = paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
read_date_results,
"_Supp_Table2_eval_results",
".xlsx"),
sheet = "Prediction_Evaluations")
gof_tbl <- gof_tbl %>%
dplyr::mutate(opts = substr(Options, 1, 3),
model_text = dplyr::case_when(model %in% "model_1comp" ~ "1comp",
model %in% "model_2comp" ~ "2comp",
model %in% "model_flat" ~ "flat",
.default = NA_character_))
fig_rsq_rmsle <- ggplot(gof_tbl) +
geom_point(aes(x = RMSLE,
y = Rsq,
color = model_text)) +
facet_grid(rows = vars(opts),
cols = vars(interaction(error_model, method)
)
) +
scale_color_manual(values = c("1comp" = "#0398FC",
"2comp" = "#D68E09",
"flat" = "black"),
name = "Winning model") +
theme_bw()
fig_rsq_rmsle
This figure reveals two things:
First, when L-BFGS-B is the optimizer, RMSLEs are much larger for a subset of data groups compared to bobyqa.
Second, when L-BFGS-B is the optimizer, the flat model is the winning model for a much greater number of data groups, indicating that for some data groups, L-BFGS-B could not locate a reasonable set of model parameters for 1- or 2-comapartment models, but bobyqa could.
If we zoom in to RMSLE < 2:
fig_rsq_rmsle_zoom <- ggplot(gof_tbl) +
geom_point(aes(x = RMSLE,
y = Rsq,
color = model_text)) +
facet_grid(rows = vars(opts),
cols = vars(interaction(error_model, method)
)
) +
scale_color_manual(values = c("1comp" = "#0398FC",
"2comp" = "#D68E09",
"flat" = "black"),
name = "Winning model") +
coord_cartesian(xlim = c(0,1)) +
theme_bw()
fig_rsq_rmsle_zoom
When each time-scaling fit (when “1” is in the last place of the fit options shorthand) is compared to the fit in the row immediately above it (the same dose-normalization and log10-transformations options, but without time scaling), it can be seen that time scaling either has little effect on goodness of fit, or it makes RMSLE and/or R-squared slightly worse.
Save these figures.
#PDF versions
ggsave(paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Supp_Fig4",
".pdf"),
fig_rsq_rmsle,
width = 11,
height = 8.5)
ggsave(paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Supp_Fig5",
".pdf"),
fig_rsq_rmsle_zoom,
width = 11,
height = 8.5)
#PNG versions
ggsave(paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Supp_Fig4",
".png"),
fig_rsq_rmsle,
width = 11,
height = 8.5)
ggsave(paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Supp_Fig5",
".png"),
fig_rsq_rmsle_zoom,
width = 11,
height = 8.5)
rm(fig_rsq_rmsle_zoom, fig_rsq_rmsle, gof_tbl)
Plot Cmax RMSLE vs. AUC RMSLE.
#read in table
cmax_auc_rmsle_tbl <- readxl::read_xlsx(path = paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
read_date_results,
"_Supp_Table2_eval_results.xlsx"),
sheet = "AUC_and_Cmax_RMSLE_summary")
#make plot
cmax_auc_rmsle_tbl <- cmax_auc_rmsle_tbl %>%
dplyr::mutate(opts = substr(Options, 1, 3))
cmax_auc_rmsle_fig <- ggplot(cmax_auc_rmsle_tbl) +
geom_point(aes(x = RMSLE_AUC,
y = RMSLE_Cmax,
color = opts),
size = 4) +
facet_grid(rows = vars(error_model),
cols = vars(method)) +
scale_color_brewer(palette = "Paired",
name = "Fit options") +
theme_bw()
cmax_auc_rmsle_fig
#save plot
#PDF version
ggsave(paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Supp_Fig6",
".pdf"),
cmax_auc_rmsle_fig,
width = 7,
height = 5)
#PNG version
ggsave(paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Supp_Fig6",
".png"),
cmax_auc_rmsle_fig,
width = 7,
height = 5,
units = "in",
dpi = 300)
rm(cmax_auc_rmsle_fig, cmax_auc_rmsle_tbl)
This figure reveals that overall, L-BFGS-B produced fits that did not match NCA-estimated Cmax or AUC as well as bobyqa fits. Considering only the bobyqa fits, the pooled error model achieved the smallest differences between NCA and model-predicted Cmax and AUC, and it did so with the two sets of fitting options that included both dose-normalization and log10-transformation.
Plot predicted vs. observed concentrations for everything, using the winning fit options and the winning model for each data group.
all_preds <- readRDS(paste0(Sys.getenv("FIG_DIR"),
read_date_results,
"_all_preds.Rds"))
#select winning fit opts: 110p bobyqa
all_preds_prep <- all_preds %>%
filter(Options %in% c("110p"),
method %in% "bobyqa") %>%
mutate(Conc_est_sub = dplyr::if_else(Conc_est < pLOQ,
pLOQ,
Conc_est),
Conc_est_belowLOQ = dplyr::if_else(Conc_est < pLOQ,
TRUE,
FALSE),
ID = as.factor(paste(Chemical, Chemical_Name, Species)),
.keep = "unused") %>%
select(ID, Options,
dose_norm, log10_trans, time_scale,
Route, Media, Dose,
Conc, Conc_est_sub, Conc_est_belowLOQ,
Time, Reference)
split_preds <- split(all_preds_prep, all_preds_prep$ID)
sp_plots <- lapply(names(split_preds), function(i) {
ggplot(data = split_preds[[i]],
mapping = aes(x = Conc,
y = Conc_est_sub,
color = as.factor(Dose),
shape = Reference)) +
xlab("Observed conc, mg/L") +
ylab("Predicted conc, mg/L") +
geom_point(size = 2) +
geom_abline(slope = 1) +
facet_grid(Route ~ Media,
scales = "free_y") +
scale_y_log10() +
scale_x_log10() +
annotation_logticks(sides = "bl") +
scale_color_viridis_d(name = "Dose, mg/kg") +
theme_bw() +
labs(title = i) +
theme(legend.position = "bottom",
plot.title = element_text(hjust = 0.5, face = "bold"))
}
)
# Very BIG FILE
pdf(file = paste0(
Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_log10_predvsobs_plots_110pbobyqa_all.pdf"),
onefile = TRUE, height = 8, width = 8)
for (x in seq_along(sp_plots)) {
print(sp_plots[[x]])
}
dev.off()
rm(sp_plots, split_preds, all_preds_prep, all_preds)
Begin by collating the data needed for plots. These data are
generally accessible using functions/methods for evaluating fitted
pk
objects.
my_pk <- readRDS(paste0(Sys.getenv("OD_DIR"),
read_date_pk,
"mypk_fit_110p.Rds"))
# Need the following data for any of the subsequent plots
# after fitting pk object for all cvt objects or reading the fitted pk object in `setup`
winmodel <- get_winning_model(my_pk, method = "bobyqa")
my_preds <- semi_join(predict(my_pk, use_scale_conc = FALSE, method = "bobyqa"), winmodel)
my_residuals <- residuals(my_pk, use_scale_conc = FALSE, method = "bobyqa") %>%
semi_join(winmodel)
my_tkstats <- eval_tkstats(my_pk, method = "bobyqa") %>% semi_join(winmodel)
my_nca <- get_nca(my_pk)
all_my_data <- get_data(my_pk)
#merge together long form coefs and long form coef SDs
my_tf <- twofold_test(my_pk, method = "bobyqa")
NROW(all_my_data) > NROW(my_preds) # THis must be true... or else try distinct(my_preds)
my_coef_sd <- coef_sd(my_pk, method = "bobyqa") %>% #this will take a few minutes to run
semi_join(winmodel) #keep only coefs & SDs for winning model
# Writing file to xlsx
writexl::write_xlsx(x = list(predictions = my_preds,
tkstats = my_tkstats,
coefs_sds = my_coef_sd),
path = paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Supp_Table3",
".xlsx"))
Compute fold errors, r-squared, AIC, RMSLE, RMSE.
fe_df <- fold_error(my_pk,
sub_pLOQ = TRUE,
method = "bobyqa") %>% semi_join(winmodel)
# May need to change the use_scale_conc defaults
r2_df <- rsq(my_pk, use_scale_conc = FALSE,
sub_pLOQ = TRUE,
method = "bobyqa") %>%
semi_join(winmodel)
myAIC <- AIC(my_pk, method = "bobyqa") %>%
semi_join(winmodel)
myRMSLE <- rmse(my_pk,
use_scale_conc = list(dose_norm = FALSE,
log10_trans = TRUE),
sub_pLOQ = TRUE,
method = "bobyqa") %>%
semi_join(winmodel)
myRMSE <- rmse(my_pk,
rmse_group = ggplot2::vars(Chemical,
Species,
Route,
Media,
Dose),
sub_pLOQ = TRUE,
method = "bobyqa") %>%
semi_join(winmodel)
chem_name_translate <- all_my_data %>%
dplyr::select(Chemical, Chemical_Name) %>%
dplyr::filter(!(Chemical_Name %in% c("2,4-d",
"methylene chloride",
"benzo[a]pyrene"))) %>%
dplyr::distinct()
#these chemical names are filtered out because they are alternate/secondary versions of names for the same DTXSID that is already identified by another name
Overall fold error distributions.
my_tf$model_error_all %>%
filter(model %in% c("model_1comp", "model_2comp")) %>%
mutate(Route.model = factor(paste(Route, model),
levels = c("iv model_1comp",
"iv model_2comp",
"oral model_1comp",
"oral model_2comp"))) %>%
ggplot(aes(x = Fold_Error,
fill = Route.model)) +
geom_histogram(color = NA,
position = position_stack(),
bins = 50) +
scale_x_log10(labels = scales::label_math(format = log10)) +
annotation_logticks(sides = "b") +
scale_fill_brewer(palette = "Paired",
name = "Route/winning model") +
expand_limits(y = 0) +
geom_vline(xintercept = c(0.5, 2), color = "black", linetype = "dashed") +
labs(x = "Predicted/Observed",
y = "Number of Observations") +
theme_classic() +
theme(aspect.ratio = 1,
panel.border = element_rect(color = "black", fill = NA),
panel.grid.major = element_line(color = "grey95", linewidth = 0.4)) +
coord_cartesian(xlim = c(10^(-1.5), 100))
ggsave(paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Supp_Fig7_PredictedObserved_compartmentModels.png"),
height = 5, width = 7)
ggsave(paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Supp_Fig7_PredictedObserved_compartmentModels.pdf"),
height = 5, width = 7)
Here we examine the performance of “best fit” models with relation to data variability. Here, because we are trying to specifically compare data variability with model performance, we look at data with replicated timepoint observations, which are largely data with individual animal data.
my_tf$indiv_data_test_fold_errors %>%
select(model, starts_with("percent_")) %>%
glimpse()
pl_4A <- my_tf$indiv_data_fold_errors %>%
# For Plotting
ggplot(aes(
y = Fold_Error,
x = foldConc
)) +
geom_bin2d(bins = 40, color = NA) +
geom_hline(yintercept = c(0.5, 2), linetype = "dashed") +
geom_vline(xintercept = c(0.5, 2), linetype = "dashed") +
scale_x_log10(labels = scales::label_math(format = log10),
limits = c(0.001, 20)) +
scale_y_log10(labels = scales::label_math(format = log10),
limits = c(0.0001, 10000)) +
annotation_logticks(sides = "bl") +
scale_fill_viridis_c(option = "cividis",
limits = c(1, 100),
oob = scales::oob_squish) +
labs(y = "Model Error",
x = "Data Variability",
fill = "Count") +
theme(aspect.ratio = 1,
panel.border = element_rect(color = "black", fill = NA, size = 1.5),
panel.background = element_blank(),
panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
strip.background = element_rect(fill = "white"),
strip.text = element_text(face = "bold"),
axis.ticks = element_blank(),
axis.line = element_blank(),
axis.text = element_text(size = 14),
axis.title = element_text(size = 14))
pl_4A
ggsave(paste0(Sys.getenv("FIG_DIR"),
today,
"_Fig4A_ModelPerformance_vDataVariability.png"),
height = 5,
width = 7,
device = "png",
dpi = 300,
units = "in")
Create table.
my_table <- my_tf$indiv_data_test_fold_errors %>%
ungroup() %>%
filter(model %in% "All") %>%
select(starts_with("percent_")) %>% t() %>%
round(digits = 2) %>%
as.data.frame() %>%
rownames_to_column(var = "percentages") %>%
mutate(percentages = case_when(
percentages == "percent_both_within" ~ "Both within a factor of 2",
percentages == "percent_both_outside" ~ "Both outside a factor of 2",
percentages == "percent_model_outside" ~ "Model too variable",
percentages == "percent_data_outside" ~ "Data too variable"
))
names(my_table) <- c(" ", "Overall (%)")
flextable(my_table) %>%
border_inner() %>%
fontsize(part = "all", size = 11) %>%
bold(part = "all", j = 2) %>%
autofit()
table_plot <- gen_grob(flextable(my_table) %>%
border_inner() %>%
fontsize(part = "all", size = 10) %>%
bold(part = "all", j = 2) %>%
autofit(),
autowidths = TRUE,
fit = "width")
dual_plot <- plot_grid(pl_4A,
table_plot,
ncol = 1,
align = "hv",
rel_heights = c(1, 0.5),
rel_widths = c(1, 0.5)
)
dual_plot
ggsave(paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Fig4.png"),
plot = dual_plot,
height = 4.5,
width = 6,
device = "png",
bg = "white",
dpi = 300,
units = "in")
Here we analyze and validate model performance per data group using R-squared, RMSE, and fraction of predictions that are within 2-fold for each Chemical, Species, Route, Media, and Dose groupings.
combined_gof_df <- my_tf$model_error_all %>%
group_by(Chemical, Species, model, method) %>%
summarize(within_2fold = sum(between(Fold_Error, 0.5, 2))/n()) %>%
left_join(r2_df) %>%
left_join(myAIC) %>%
left_join(myRMSLE) %>%
semi_join(winmodel)
mypl <- ggplot(data = combined_gof_df,
aes(
x = within_2fold,
y = Rsq,
color = model,
)) +
geom_point() +
geom_abline(slope = 1, color = "black", linetype = "longdash") +
scale_color_manual(values = c("#0398FC", "#D68E09", "black")) +
#coord_cartesian(xlim = c(0,1), ylim = c(0,1)) +
labs(x = "Fraction of predictions within 2-fold",
y = "R-squared value") +
theme(#aspect.ratio = 1,
panel.border = element_rect(color = "black", fill = NA, size = 1),
panel.background = element_blank(),
panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
axis.ticks = element_blank(),
axis.line = element_blank(),
legend.position = "none",
legend.title = element_blank(),
legend.key = element_blank()
)
panelA_plot <- ggExtra::ggMarginal(mypl,
groupFill = TRUE,
type = "histogram",
xparams = list(binwidth = 0.05),
yparams = list(binwidth = 0.05))
panelA_plot
panelB_plot <- ggplot(data = myRMSE,
aes(
x = RMSE,
fill = model
)) +
scale_x_log10() +
annotation_logticks(sides = "b") +
geom_histogram(bins = 50,
position = position_stack(),
color = NA) +
labs(y = "Count") +
scale_fill_manual(values = c("#0398FC", "#D68E09", "grey10")) +
scale_color_manual(values = c("#0398FC", "#D68E09", "grey10"), guide = "none") +
theme(text = element_text(size = 10),
#aspect.ratio = 1,
panel.border = element_rect(color = "black", fill = NA, size = 1),
panel.background = element_blank(),
panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
axis.ticks = element_blank(),
axis.line = element_blank(),
strip.background = element_blank(),
panel.spacing.y = unit(0.125, units = "in"),
legend.title = element_blank(),
legend.position = "bottom")
panelB_plot
plAB <- patchwork::wrap_elements(panelA_plot) + panelB_plot +
plot_annotation(tag_levels = "A") + plot_layout(guides = 'collect', widths = c(1,0.9)) & theme(legend.position = "bottom") & guides(color = "none")
#save PNG version
ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Fig5.png"),
plAB,
height = 4,
width = 6.5,
bg = "white",
units = "in")
#save PDF version
ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Fig5.pdf"),
plAB,
height = 4,
width = 6.5,
bg = "white")
pl <- plot(my_pk,
method = "bobyqa",
best_fit = TRUE,
use_scale_conc = list(dose_norm = TRUE,
log10_trans = FALSE),
drop_nonDetect = FALSE)
cvt %>% filter(curation_set_tag == "CVT_Showa") %>%
pull(analyte_dtxsid) -> Showa_chems
my_rsq <- rsq(my_pk, method = "bobyqa")
my_rsq %>% semi_join(winmodel) %>% filter(Chemical %in% c("DTXSID2020139",
"DTXSID4023917",
"DTXSID3061635",
"DTXSID8030760"))
#High rsq: DTXSID2020139 and DTXSID4023917
#Low rsq: DTXSID3061635 and DTXSID8030760
fe_df %>% filter(Chemical %in% c("DTXSID2020139",
"DTXSID4023917",
"DTXSID3061635",
"DTXSID8030760"), Species %in% "rat") %>% group_by(Chemical, Species) %>% summarise(frac_2fold = sum(Fold_Error < 2 & Fold_Error > 0.5)/n())
# # A tibble: 4 × 3
# # Groups: Chemical [4]
# Chemical Species frac_2fold
# <chr> <chr> <dbl>
# 1 DTXSID2020139 rat 0.271
# 2 DTXSID3061635 rat 0.125
# 3 DTXSID4023917 rat 0.953
# 4 DTXSID8030760 rat 0.778
#High frac 2fold: DTXSID4023917 and DTXSID8030760
#Low frac 2fold: DTXSID2020139 and DTXSID3061635
pl_sub <- pl %>%
filter(Chemical %in% c("DTXSID2020139",
"DTXSID4023917",
"DTXSID3061635",
"DTXSID8030760"),
Species %in% "rat")
ex_fits <- pl_sub %>%
pull(final_plot)
ex_fits <- set_names(ex_fits,
pl_sub$Chemical)
cowplot::plot_grid(plotlist = list(
#Low frac 2fold and high Rsq
ex_fits[["DTXSID2020139"]] +
scale_color_manual(values = c("#a6bddb",
"#74a9cf",
"#41bbc4",
"#2b8cbe",
"#045a8d")) +
theme(legend.position = "none") +
xlim(0,30),
#high frac 2fold and high Rsq
ex_fits[["DTXSID4023917"]] +
scale_color_manual(values = c("black",
"grey40")) +
theme(legend.position = "none"),
#low frac 2fold and low rsq
ex_fits[["DTXSID3061635"]] +
scale_color_manual(values = c("#006837")) +
theme(legend.position = "none"),
#high frac2fold and low rsq
ex_fits[["DTXSID8030760"]] +
scale_color_manual(values = "magenta3") +
theme(legend.position = "none")
),
scale = 0.85)
ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Supp_Fig8.png"),
height = 12,
width = 12)
ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Supp_Fig8.pdf"),
height = 12,
width = 12)
flat_chems <- winmodel %>%
filter(model %in% "model_flat") %>%
distinct(Chemical, Species)
print(flat_chems)
pl <- plot(my_pk,
method = "bobyqa",
best_fit = FALSE,
use_scale_conc = list(dose_norm = TRUE,
log10_trans = FALSE),
drop_nonDetect = FALSE)
flat_fits <- pl %>%
inner_join(flat_chems) %>%
pull(final_plot)
cowplot::plot_grid(plotlist = flat_fits)
ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Supp_Fig9.pdf"),
height = 8,
width = 16,
units = "in")
ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Supp_Fig9.png"),
bg = "white",
height =8,
width = 16,
dpi = 300,
units = "in")
This meta-analysis compares the TK stats from model fits with other published studies.
cvt_DTXSIDs <- unique(my_pk$data$Chemical)
# First things first, let's load the data from Lombardo et al.
lombardo <- readxl::read_xlsx(
paste0(Sys.getenv("OD_DIR"),
"Lombardo2018-Supplemental_82966_revised_corrected.xlsx"),
skip = 8)
#use ctxR to search the Dashboard for DTXSIDs corresponding to these CASRNs
lombardo_dtxsid <- ctxR::chemical_equal_batch(
word_list = unique(lombardo$`CAS #`),
API_key = Sys.getenv("ccd_api_key")
)
#merge in the DTXSIDs
lombardo <- lombardo %>%
left_join(lombardo_dtxsid %>% select(searchValue, dtxsid, preferredName),
by = c("CAS #" = "searchValue"))
#any left unidentified: search by name
lombardo_missing <- lombardo %>%
filter(is.na(dtxsid))
missing_names <- lombardo_missing %>%
pull(Name)
lombardo_dtxsid_name <- ctxR::chemical_equal_batch(
word_list = unique(missing_names),
API_key = Sys.getenv("ccd_api_key")
)
lombardo_missing <- lombardo_missing %>%
left_join(lombardo_dtxsid_name %>%
select(searchValue, dtxsid, preferredName),
by = c("Name" = "searchValue") )
lombardo <- bind_rows(lombardo %>% filter(!is.na(dtxsid)),
lombardo_missing)
lombard_abbr <- lombardo %>%
dplyr::select(dtxsid,
preferredName,
Vss = `human VDss (L/kg)`,
CLtot = `human CL (mL/min/kg)`,
halflife = `terminal t1/2 (h)` ) %>%
dplyr::mutate(Species = 'human',
source = "Lombardo et al.",
CLtot = CLtot*60/1000 # Lombardo reports this as mL/min/kg, need L/h/kg
) %>%
arrange(halflife)
my_pk_vals <- my_tkstats %>%
dplyr::select(dtxsid = Chemical,
Species,
model,
halflife = halflife.tkstats,
Vss = Vss.tkstats,
CLtot = CLtot.tkstats) %>%
dplyr::distinct() %>%
dplyr::mutate(source = "invivoPKfit") %>%
dplyr::filter(model != "model_flat",
!is.na(halflife))
lombardo_inner <- my_pk_vals %>%
inner_join(lombard_abbr, by = "dtxsid")
#79 chemicals in common
lombardo_comparison <- bind_rows(lombard_abbr %>%
filter(
dtxsid %in%
lombardo_inner$dtxsid
),
my_pk_vals %>%
filter(
dtxsid %in%
lombardo_inner$dtxsid
)
)
#merge in chemical names
lombardo_comparison <- lombardo_comparison %>%
left_join(chem_name_translate,
by = c("dtxsid" = "Chemical"))
#create a factor variable for chemical names that sorts chemicals from highest to lowest Lombardo halflife
chemnames_levels <- lombardo_comparison %>%
filter(source %in% "Lombardo et al.") %>%
arrange(halflife) %>%
pull(Chemical_Name)
lombardo_comparison <- lombardo_comparison %>%
mutate(Chemical_Name = factor(Chemical_Name,
levels = chemnames_levels))
#reshape longer and plot
lombardo_comparison %>%
pivot_longer(cols = c("Vss", "CLtot", "halflife")) %>%
mutate(name = factor(name, levels = c("halflife", "Vss", "CLtot"))) %>%
ggplot(mapping = aes(
x = value,
y = Chemical_Name
)) +
geom_point(aes(color = source,
shape = Species),
#position = position_dodge(0.7),
size = 5/.pt, stroke = 2.5/.pt) +
facet_grid(cols = vars(name),
scales = "free_x") +
scale_x_log10( labels = scales::label_math(format = log10)) +
annotation_logticks(sides = "b") +
scale_shape_manual(values = c("human" = 3, "rat" = 21)) +
scale_color_manual(values = c("black", "#1064c9")) +
guides(color = guide_legend(override.aes = list(shape = 21),
nrow = 2,
title.position = "top",
title.hjust = 0.5),
shape = guide_legend(nrow = 2,
title.position = "top",
title.hjust = 0.5)) +
theme(panel.border = element_rect(color = "black", fill = NA, linewidth = 1.5),
panel.background = element_blank(),
panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
strip.background = element_rect(fill = "white"),
strip.text = element_text(face = "bold", size = 12),
text = element_text(size = 10),
axis.ticks = element_blank(),
axis.line = element_blank(),
axis.text.y = element_text(face = "bold", size = 6),
axis.text.x = element_text(size = 10),
axis.title.y = element_blank(),
axis.title.x = element_blank(),
legend.position = "bottom",
legend.key = element_blank(),
legend.title = element_text(face = "bold"),
legend.text = element_text())
ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Fig6.png"),
height = 7,
width = 6.5,
device = "png", dpi = 300)
ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Fig6.pdf"),
height = 7,
width = 6.5)
Compare with literature values of Fgutabs from the supplemental material of Musther et al. 2014.
Read in these literature values and reshape them to prepare for fitting.
species_cols <- c(paste("Mouse",
c("Dosing Formulation",
"Strain",
"Sex",
"F% Mean",
"F% Range"),
sep = "."),
paste("Rat",
c("Dosing Formulation",
"Strain",
"Sex",
"F% Mean",
"WSD",
"F% Range"),
sep = "."),
paste("Dog",
c("Dosing Formulation",
"Strain",
"Sex",
"F% Mean",
"WSD",
"F% Range"),
sep = "."),
paste("Non-Human Primate",
c("Dosing Formulation",
"Strain",
"Sex",
"F% Mean",
"WSD",
"F% Range"),
sep = "."),
paste("Human",
c("Dosing Formulation",
"Sex",
"F% Mean",
"WSD",
"F% Range"),
sep = ".")
)
musther2014 <- readxl::read_excel(
path = paste0(Sys.getenv("OD_DIR"),
"SupTableMusther2014.xlsx"),
sheet = "Sheet1",
skip = 1,
col_names = c( "Compound",
"CAS",
"MW", "Ion Class",
species_cols,
"References.mouse",
"References.rat",
"References.dog",
"References.NHP",
"References.human"
),
col_types = "text"
)
#pivot longer
musther2014_long <- musther2014 %>%
select(c(Compound,
CAS,
contains("F% Mean"))) %>%
pivot_longer(cols = !c(Compound, CAS),
names_to = c("Species", "stat"),
names_sep = "\\.") %>%
mutate(value = gsub(x=value,
pattern = "<",
replacement = ""),
value_num = as.numeric(value))
Use ctxR
to get DTXSIDs from CAS number.
musther2014_dtxsid <- ctxR::chemical_equal_batch(word_list = unique(musther2014_long$CAS), API_key = Sys.getenv("ccd_api_key"))
musther2014_long <- musther2014_long %>%
left_join(musther2014_dtxsid %>% select(searchValue, dtxsid, preferredName),
by = c("CAS" = "searchValue"))
Get Fgutabs estimates from the fitted pk
object.
fgutabs <- coef(my_pk, method = "bobyqa") %>%
distinct() %>%
semi_join(winmodel) %>% #keep only winning models
rowwise() %>%
mutate(Fgutabs = coefs_vector["Fgutabs"]) %>%
filter(!is.na(Fgutabs))
How many chemicals overlapped at all
Join with the literature estimates, keeping only cases with data in both tables
Make a plot:
#capitalize invivopkfit species to harmonize with Musther
fgutabs_pk_musther <- fgutabs_pk_musther %>%
dplyr::mutate(Species.x = stringr::str_to_sentence(Species.x))
ggplot(fgutabs_pk_musther) +
geom_point(aes(y = preferredName,
x = Fgutabs,
shape = Species.x,
color = "invivopkfit"),
size =4,
stroke = 2) +
geom_point(aes(y = preferredName,
x = value_num/100,
shape = Species.y,
color = "Musther et al. 2014"),
size = 4,
stroke = 2) +
scale_shape_manual(values = c("Rat" = 21,
"Mouse" = 22,
"Dog" = 23,
"Non-Human Primate" = 24,
"Human" = 3),
breaks = c("Rat", "Mouse", "Dog", "Non-Human Primate", "Human")) +
theme_bw() +
theme(axis.title.y = element_blank(),
legend.title = element_blank(),
axis.title.x = element_text(size = 12),
axis.text = element_text(size = 12),
legend.text = element_text(size = 12)
)
ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Supp_Fig10.pdf"),
height = 5,
width = 8)
ggsave(filename = paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Supp_Fig10.png"),
height = 5,
width = 8,
device = "png", dpi = 300)
Benchmarking for parallelization vs. non-parallelization
# Randomize the order of chemical species
chem_spec <- cvt %>%
dplyr::distinct(analyte_dtxsid, species) %>%
dplyr::slice_sample(prop = 1)
test_group_size <- seq(25, 105, by = 10)
fit_options <- expand.grid(error_model = c("pooled",
"hierarchical"),
dose_norm = FALSE,
log10_trans = TRUE,
time_scale = c(
TRUE,
FALSE),
stringsAsFactors = FALSE)
fit_options <- tidyr::expand_grid(fit_options, test_group_size)
fit_options <- fit_options %>% rowwise() %>%
mutate(this_data = list({
tmp <- chem_spec %>% slice_head(n = test_group_size) %>%
left_join(cvt, by = join_by(analyte_dtxsid, species)) %>%
pk() +
scale_time(new_units = ifelse(!time_scale,
"identity",
"auto")) +
scale_conc(dose_norm = dose_norm, log10_trans = log10_trans) +
settings_preprocess(suppress.messages = TRUE)
do_prefit(tmp)
}
),
data_size = chem_spec %>% slice_head(n = test_group_size) %>%
left_join(cvt, by = join_by(analyte_dtxsid, species)) %>%
NROW())
# Organization of the benchmarking
# For each fit_option, return the four values of system.time() that we want
df_out <- fit_options %>%
rowwise() %>%
mutate(tim_1core = system.time(do_fit(this_data))[["elapsed"]],
tim_4core = system.time(do_fit(this_data, n_cores = 4))[["elapsed"]],
tim_7core = system.time(do_fit(this_data, n_cores = 7))[["elapsed"]],
tim_12core = system.time(do_fit(this_data, n_cores = 12))[["elapsed"]]
)
df_out <- df_out %>% select(-this_data)
gc()
full_long <- df_out %>%
pivot_longer(cols = c(tim_1core:tim_12core),
names_to = "N_cores",
values_to = "Time_s") %>%
group_by(N_cores, test_group_size, data_size) %>%
summarize(mean_time = mean(Time_s, na.rm = TRUE)/60,
max_time = max(Time_s, na.rm = TRUE)/60,
min_time = min(Time_s, na.rm = TRUE)/60
) %>%
mutate(N_cores = as.numeric(str_extract(N_cores, "[:digit:]+")))
df_out %>% clipr::write_clip()
ggplot(full_long,
aes(x = test_group_size,
y = mean_time,
color = as.factor(N_cores))) +
geom_point() +
geom_linerange(aes(ymin = min_time,
ymax = max_time)) +
geom_line(aes(group = N_cores)) +
labs(x = "Number of Data Groups",
y = "Runtime (minutes)",
title = "Number of Processing Cores",
subtitle = "(with min/max runtime range)") +
facet_grid(cols = vars(N_cores)) +
scale_color_manual(values = c("black", "#40c679", "#31a354", "#006837")) +
coord_fixed(ratio = 8) +
theme(panel.border = element_rect(color = "black", fill = NA, size = 1.5),
panel.background = element_blank(),
panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
legend.position = "none",
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
axis.ticks = element_blank(),
axis.line = element_blank(),
strip.background = element_blank())
ggsave(paste0(Sys.getenv("FIG_DIR"),
"Manu_Files/",
today,
"_Supp_Fig11.png"),
width = 5,
height = 2.5,
units = "in")