---
title: "manuscript2023"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{manuscript2023}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r init, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

# Overview

This vignette outlines the 2023 manuscript that describes _invivoPKfit_'s use in a case study with CvT data from over 200 chemicals. The main question is whether we can estimate parameters for one- and two-compartment models such that the majority of predicted values fall "within a factor of two" of the observations -- a common metric used to evaluate physiologically based pharmacokinetic (PBPK) models.

```{r setup}
set.seed(2023)
library(tidyverse, quietly = TRUE)
library(cowplot)
library(RColorBrewer)
devtools::load_all()

today <- format(Sys.Date(), "%d%B%Y")
```

To speed up the analysis in sections 4 and beyond, we can load an `.rds` file that includes the fitted models used in much of the publication.

```{r file_load, eval=FALSE}
my_pk <- readRDS("data-raw/all_cvt_pk.rds")
```

# Table of Contents

1. Pulling Data from CvTdb
   - Minimal `pk` object
2. Running all possible fitting options
   - Evaluating fitting options
   - Head-to-head comparison of pooled vs joint models
3. Running dose-normalized, log-transformed, joint fits
4. Analysis Plots
   - CvTdb replicate variation (Figure 4A & Supp. Figure 2A)
   - CvTdb replicate variation over ADME-normalized time (Figure 4B)
   - CvTdb concentration and final time-point distributions (Supp. Figure 2B)
   - invivoPKfit fit option selection (Supp. Table 1)
   - invivoPKfit model performance (Supp. Figure 2)
   - Model performance vs data variability (Figure 5 and alternative)
   - Goodness-of-fit plots (Figure 6)
   - Fits that may be improved (Supplementary Figure 3)
5. Lombardo Analysis
   - Chemicals included
   - Comparison of derived TK stats (Figure 7A)
   - Histograms for steady-state volume of distribution and total clearance (Supplemental Figure 4)
6. Benchmarking invivoPKfit
   - Parallelization speeds up the process of fitting (Supplementary Figure 5)

## Pulling data from CvTdb

See `pulling_iv_oral_cvtdb.Rmd`.

### Minimal PK object

The following chunk shows the column/field mapping between the `cvt` object and the `pk` object needed for invivoPKfit analysis. The `cvt` object holds data from the CvTdb database, restricted to chemicals with oral or intravenous routes of administration and concentrations measured in blood or plasma. The data have been processed and cleaned to facilitate their use with _invivoPKfit_.

```{r pk_obj_fromSQL, eval=FALSE}
### Minimal PK Object ###----
# Minimum pk object; options are added later.
# Note that these mappings are now the defaults; they are spelled out here to be verbose.
# If there are any standard deviations that are zero or NA, but N_Subjects > 1,
# set n_subjects to 6, or keep the current value if it is lower than 6.
# (In Showa the mode is 6, but this condition is present in CVT_Legacy as well) minimal_pk <- pk(data = cvt %>% mutate(n_subjects_normalized = if_else( n_subjects_normalized > 1 & is.na(invivPK_conc_sd), min(n_subjects_normalized, 6), n_subjects_normalized)), mapping = ggplot2::aes( Chemical = analyte_dtxsid, Chemical_Name = analyte_name_original, DTXSID = analyte_dtxsid, CASRN = analyte_casrn, Species = species, Reference = fk_extraction_document_id, Media = conc_medium_normalized, Route = administration_route_normalized, Dose = invivPK_dose_level, Dose.Units = "mg/kg", Subject_ID = fk_subject_id, Series_ID = fk_series_id, Study_ID = fk_study_id, ConcTime_ID = conc_time_id, N_Subjects = n_subjects_normalized, Weight = weight_kg, Weight.Units = "kg", Time = time_hr, Time.Units = "hours", Value = invivPK_conc, Value.Units = "mg/L", Value_SD = invivPK_conc_sd, LOQ = invivPK_loq )) ``` ## Running all possible fitting options Here we setup the PK object with the various options for fitting and data transformations. As shown in this vignette, it is possible to setup multiple distinct options for fitting and data transformation at once prior to processing and fitting the data. ```{r outlining_all_options, eval=FALSE} # Types of Choices: ## Dose Normalized ## Log10 transform ## Scale time # List of options fitopts <- expand.grid(error_model = c("pooled", "hierarchical"), dose_norm = c(TRUE, FALSE), log10_trans = c(TRUE, FALSE), time_scale = c( TRUE, FALSE), stringsAsFactors = FALSE) ``` ```{r fitting_choices, eval=FALSE} # Make function similar to that in test_vignette authored by Caroline Ring fit_data <- function(this_error_model, this_dose_norm, this_log10_trans, this_time_scale, n_cores = 12, file_dir = Sys.getenv("OD_DIR")){ retval <- -9 dose_indic <- as.numeric(this_dose_norm) log10_indic <- as.numeric(this_log10_trans) time_indic <- as.numeric(this_time_scale) errmodel_indic <- substr(this_error_model, 1, 1) file_str <- paste0(today, "mypk_fit_", dose_indic, log10_indic, time_indic, errmodel_indic, ".Rds") cat("Fitting: ", file_str, "\n") tryCatch( expr = { this_pk <- minimal_pk + facet_data(vars(Chemical, Species)) + settings_preprocess(keep_data_original = FALSE, suppress.messages = TRUE) + scale_conc(dose_norm = this_dose_norm, log10_trans = this_log10_trans) + settings_optimx(method = c( "L-BFGS-B", "bobyqa" )) if(this_error_model %in% "pooled"){ this_pk <- this_pk + stat_error_model(error_group = vars(Chemical, Species)) } else if(this_error_model %in% c("hierarchical", "joint")){ this_pk <- this_pk + stat_error_model(error_group = vars(Chemical, Species, Reference)) } else{ stop("this_error_model must be either 'pooled' or 'hierarchical'/'joint'") } if(this_time_scale %in% TRUE){ this_pk <- this_pk + scale_time(new_units = "auto") } #do the fit this_time <- system.time(this_fit <- do_fit(this_pk, n_cores = n_cores)) #save the result saveRDS(this_fit, file = paste0(file_dir, file_str)) cli::cli_alert_success(text = "Fitting success!") print(this_time) rm(this_fit) retval <- this_time[[3]] }, error = function(e){ cli::cli_alert_danger(text = paste("Analysis", file_str, "failed with error", e)) retval <- -1 } ) return(retval) } ``` ```{r running_all_options, eval=FALSE} system.time(tmp_out <- mapply(fit_data, this_error_model = fitopts$error_model, this_dose_norm = fitopts$dose_norm, this_log10_trans = fitopts$log10_trans, this_time_scale = fitopts$time_scale, n_cores = 13)) mean(tmp_out)/60 # Average runtime (in minutes) per fitting option median(tmp_out)/60 sum(tmp_out) # Looks like 
there's 10 extra seconds of overhead for non-fitting methods ``` # Evaluating fitting options Here I evaluate the various fitting options by: + Counting the number of 1-, 2-compartment and flat model fits that had lowest AIC (winning model). + Summarizing the root mean squared log10 error between model fit and NCA approximation of: + Cmax + AUC_infinity + (Counts and filters extreme values of log10(model/NCA) > log10(20) for AUC_infinity) + (Counts and filters non-finite and zero values of AUC_infinity) + A summary goodness-of-fit table for each data group's winning model and fitting options that includes: + R-squared + RMSLE + AIC + All predictions for each data group and fitting option (winning models only) + Model errors within a factor of two for each fitting option ```{r Evaluation-helpers, eval=FALSE} # Evaluation # Winning Model Tally winmodel_comparison <- function(this_error_model, this_dose_norm, this_log10_trans, this_time_scale) { dose_indic <- as.numeric(this_dose_norm) log10_indic <- as.numeric(this_log10_trans) time_indic <- as.numeric(this_time_scale) errmodel_indic <- substr(this_error_model, 1, 1) file_str <- paste0(Sys.getenv("OD_DIR"), today, "mypk_fit_", dose_indic, log10_indic, time_indic, errmodel_indic, ".Rds") pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic) current_rds <- readRDS(file = file_str) pf <- current_rds$fit %>% distinct(Chemical, Species, model, method, convcode) %>% group_by(method, .drop = FALSE) %>% count(convcode) %>% pivot_wider(names_from = convcode, names_prefix = "convcode.", values_from = n) # Wide winning model suppressMessages({ winning_model <- get_winning_model(obj = current_rds) wide_winning_model <- winning_model %>% group_by(method, model) %>% count() %>% pivot_wider(names_from = model, values_from = n) %>% left_join(pf) }) return(wide_winning_model) } # RMSLE for Cmax and AUC (with some tallys for extreme values) rmsles_Cmax_AUC <- function( this_error_model, this_dose_norm, this_log10_trans, this_time_scale) { dose_indic <- as.numeric(this_dose_norm) log10_indic <- as.numeric(this_log10_trans) time_indic <- as.numeric(this_time_scale) errmodel_indic <- substr(this_error_model, 1, 1) file_str <- paste0(Sys.getenv("OD_DIR"), today, "mypk_fit_", dose_indic, log10_indic, time_indic, errmodel_indic, ".Rds") pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic) message(paste0("Evaluation for: ", pk_name)) current_rds <- readRDS(file = file_str) # Cmax and AUC_inf RMSEs, eval_tkstats already filters winning models RMSLE_eval <- suppressMessages(eval_tkstats(obj = current_rds, dose_norm = FALSE, finite_only = TRUE) %>% mutate(Options = pk_name) %>% dplyr::select( Options, Chemical, Species, Route, Media, Reference, method, model, starts_with("Cmax"), starts_with("AUC_") )) message(paste0("How many finite, non-zero comparisons had ", "AUC_infinity.tkstats over/under 20X of AUC_infinity.nca?")) print( RMSLE_eval %>% filter( AUC_infinity.nca > 0, AUC_infinity.tkstats > 0) %>% group_by(Options, method) %>% summarize( `AUC_folderror>20X` = sum( log10(AUC_infinity.tkstats/AUC_infinity.nca) > log10(20) ) ) ) message("How many comparisons had AUC_infinity equal to zero?") print( RMSLE_eval %>% filter( AUC_infinity.nca > 0, AUC_infinity.tkstats > 0) %>% group_by(Options, method) %>% summarize(NCA_zeroAUC = sum(AUC_infinity.nca == 0), tkstats_zeroAUC =sum(AUC_infinity.tkstats == 0)) ) message("Removing those observations...") RMSLE_eval <- RMSLE_eval %>% filter( AUC_infinity.nca > 0, AUC_infinity.tkstats > 0, 
log10(AUC_infinity.tkstats/AUC_infinity.nca) <= log10(20) ) %>% rowwise() %>% mutate( SLE_Cmax = (log10(Cmax.tkstats) - log10(Cmax.nca)) ^ 2, SLE_AUC_inf = (log10(AUC_infinity.tkstats) - log10(AUC_infinity.nca)) ^ 2 ) %>% filter(!is.na(SLE_Cmax), !is.na(SLE_AUC_inf)) %>% group_by(Options, method) %>% summarize( RMSLE_Cmax = sqrt(mean(SLE_Cmax, na.rm = FALSE)) %>% signif(digits = 4), RMSLE_AUC = sqrt(mean(SLE_AUC_inf, na.rm = FALSE)) %>% signif(digits = 4) ) print(RMSLE_eval) return(RMSLE_eval) } # Goodness-of-fit table with R-squared, RMSLE and AIC get_gof <- function(this_error_model, this_dose_norm, this_log10_trans, this_time_scale) { dose_indic <- as.numeric(this_dose_norm) log10_indic <- as.numeric(this_log10_trans) time_indic <- as.numeric(this_time_scale) errmodel_indic <- substr(this_error_model, 1, 1) file_str <- paste0(Sys.getenv("OD_DIR"), today, "mypk_fit_", dose_indic, log10_indic, time_indic, errmodel_indic, ".Rds") pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic) current_rds <- readRDS(file = file_str) winning_model <- suppressMessages(get_winning_model(obj = current_rds)) suppressMessages({ this_rsq <- rsq(current_rds, use_scale_conc = FALSE, rsq_group = ggplot2::vars(Chemical, Species)) %>% semi_join(winning_model) this_AIC <- AIC(current_rds) %>% semi_join(winning_model) this_rmsle <- rmse.pk(current_rds, rmse_group = vars(Chemical, Species), use_scale_conc = list(dose_norm = FALSE, log10_trans = TRUE)) %>% semi_join(winning_model) }) # Since they are all joined by winmodel # all three must have same NROW message( paste0("For ", pk_name, "... outputs below must have same number of rows") ) message(paste0("rsq: ", NROW(this_rsq))) message(paste0("aic: ", NROW(this_AIC))) message(paste0("rmsle: ", NROW(this_rmsle))) gof_df <- this_rsq %>% inner_join(this_AIC, by = join_by(Chemical, Species, method, model)) %>% inner_join(this_rmsle, by = join_by(Chemical, Species, method, model)) %>% mutate(Options = pk_name, .before = Chemical) return(gof_df) } # All predictions get_all_preds <- function(this_error_model, this_dose_norm, this_log10_trans, this_time_scale) { dose_indic <- as.numeric(this_dose_norm) log10_indic <- as.numeric(this_log10_trans) time_indic <- as.numeric(this_time_scale) errmodel_indic <- substr(this_error_model, 1, 1) file_str <- paste0(Sys.getenv("OD_DIR"), today, "mypk_fit_", dose_indic, log10_indic, time_indic, errmodel_indic, ".Rds") pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic) current_rds <- readRDS(file = file_str) # Wide winning model suppressMessages({ winning_model <- get_winning_model(obj = current_rds) %>% select(-c(near_flat, preds_below_loq)) this_preds <- predict(current_rds, use_scale_conc = FALSE) %>% semi_join(winning_model) %>% mutate(Options = pk_name, .before = Chemical) }) return(this_preds) } # Factor of two model error tf_tests <- function(this_error_model, this_dose_norm, this_log10_trans, this_time_scale) { dose_indic <- as.numeric(this_dose_norm) log10_indic <- as.numeric(this_log10_trans) time_indic <- as.numeric(this_time_scale) errmodel_indic <- substr(this_error_model, 1, 1) file_str <- paste0(Sys.getenv("OD_DIR"), today, "mypk_fit_", dose_indic, log10_indic, time_indic, errmodel_indic, ".Rds") pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic) current_rds <- readRDS(file = file_str) print(paste0("Now analyzing: ", pk_name)) out <- twofold_test(current_rds, suppress_messages = TRUE) out <- out$model_error_summary %>% mutate(Options = pk_name) } ``` ```{r 
do_evalutations, eval=FALSE} system.time(wm_comp <- fitopts %>% dplyr::rowwise() %>% dplyr::mutate(winmodel_comp = list( winmodel_comparison(this_error_model = error_model, this_dose_norm = dose_norm, this_log10_trans = log10_trans, this_time_scale = time_scale))) %>% tidyr::unnest(winmodel_comp)) system.time(cmax_auc_rmsle_tbl <- fitopts %>% dplyr::rowwise() %>% dplyr::mutate(nca_comp = list( rmsles_Cmax_AUC(this_error_model = error_model, this_dose_norm = dose_norm, this_log10_trans = log10_trans, this_time_scale = time_scale))) %>% tidyr::unnest(nca_comp)) system.time(gof_tbl <- fitopts %>% dplyr::rowwise() %>% dplyr::mutate(gof_win = list( get_gof(this_error_model = error_model, this_dose_norm = dose_norm, this_log10_trans = log10_trans, this_time_scale = time_scale))) %>% tidyr::unnest(gof_win)) system.time(all_tf_tests <- fitopts %>% dplyr::rowwise() %>% dplyr::mutate(TF = list( tf_tests(error_model, this_dose_norm = dose_norm, this_log10_trans = log10_trans, this_time_scale = time_scale))) %>% tidyr::unnest(TF)) system.time(all_preds <- fitopts %>% dplyr::rowwise() %>% dplyr::mutate(these_preds = list( get_all_preds(this_error_model = error_model, this_dose_norm = dose_norm, this_log10_trans = log10_trans, this_time_scale = time_scale))) %>% tidyr::unnest(these_preds)) gc() ``` ### General Observations + The time_scaling generally makes things worse. + L-BFGS-B had a difficult time fitting with new restriction, generally worse estimates. + log10_trans generally improves RMSLE, at a small expense of R-squared. + All log10_trans options have at least 10 more data_groups with RMSLE < 10 than the non-log10 transformed. + There are some cases where fits are bad due to a combination of factors, most interestingly error model, dose_norm, and optimizer. 
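The evaluation tables and plots below refer to each combination of options by a four-character code (for example, `110p`), built from the indicator variables used in the file names above: 0/1 flags for `dose_norm`, `log10_trans`, and `time_scale`, followed by the first letter of the error model (`p` = pooled, `h` = hierarchical/joint). A small helper along these lines (hypothetical, not part of _invivoPKfit_) can be used to decode a code back into the options it represents:

```{r decode_options, eval=FALSE}
# Hypothetical helper: decode a 4-character option code (e.g. "110p") back
# into the fitting options it encodes. The first three characters are 0/1
# flags for dose_norm, log10_trans, and time_scale; the final letter is the
# first letter of the error model ("p" = pooled, "h" = hierarchical/joint).
decode_options <- function(pk_name) {
  flags <- as.integer(strsplit(substr(pk_name, 1, 3), "")[[1]])
  data.frame(
    Options = pk_name,
    dose_norm = as.logical(flags[1]),
    log10_trans = as.logical(flags[2]),
    time_scale = as.logical(flags[3]),
    error_model = if (substr(pk_name, 4, 4) == "p") "pooled" else "hierarchical"
  )
}

decode_options("110p")
#> dose_norm = TRUE, log10_trans = TRUE, time_scale = FALSE, error_model = "pooled"
```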
```{r Evaluating choices, eval=FALSE} # Writing to file writexl::write_xlsx(x = list( Fit_Counts = wm_comp, AUC_and_Cmax_RMSE_summary = cmax_auc_rmsle_tbl, Prediction_Evaluations = gof_tbl, Twofold_Tests = all_tf_tests), path = paste0(Sys.getenv("FIG_DIR"), "Manu_Files/", today, "_Supp_Table1", ".xlsx")) gof_tbl <- readxl::read_xlsx(path = paste0(Sys.getenv("FIG_DIR"), "Manu_Files/", today, "_Supp_Table1.xlsx"), sheet = "Prediction_Evaluations") %>% mutate(RMSLE = as.numeric(RMSLE)) wm_comp <- readxl::read_xlsx(path = paste0(Sys.getenv("FIG_DIR"), "Manu_Files/", today, "_Supp_Table1.xlsx"), sheet = "Fit_Counts") cmax_auc_rmsle_tbl <- readxl::read_xlsx(path = paste0(Sys.getenv("FIG_DIR"), "Manu_Files/", today, "_Supp_Table1.xlsx"), sheet = "AUC_and_Cmax_RMSE_summary") gof_tbl %>% group_by(Options, method) %>% count(RMSLE < 1) %>% filter(`RMSLE < 1`) %>% arrange(desc(n)) %>% print(n = 8) # bobyqa-optimized options 010h, 110h, 110p wm_comp %>% filter(method %in% "bobyqa", !time_scale, log10_trans) # Off these, there is one more data group that is model_flat # I will remove this so it doesn't affect downstream analyses gof_tbl %>% filter(Options == "110p", model == "model_flat", method == "bobyqa") %>% distinct(Chemical, Species) gof_tbl %>% filter(Options == "110h", model == "model_flat", method == "bobyqa") %>% distinct(Chemical, Species) gof_tbl %>% filter(Options == "010h", model == "model_flat", method == "bobyqa") %>% distinct(Chemical, Species) gof_tbl %>% filter(Options == "010p", model == "model_flat", method == "bobyqa") %>% distinct(Chemical, Species) gof_tbl %>% filter(Options == "010h", method == "bobyqa", Chemical %in% "DTXSID1021116") %>% distinct(Chemical, Species) # Filter out DTXSID1021116 gof_tbl %>% anti_join(data.frame(Chemical = "DTXSID1021116", Species = "rat")) %>% filter(!(model %in% "model_flat")) %>% group_by(Options, method) %>% count(RMSLE < 0.7) %>% filter(`RMSLE < 0.7`) %>% arrange(desc(n)) %>% print(n = 8) # 110p seems to win out here. gof_tbl %>% filter(method %in% "bobyqa", Options %in% c("110p", "010h", "110h") ) %>% anti_join(data.frame(Chemical = "DTXSID1021116", Species = "rat")) %>% group_by(Options) %>% summarize(medianR2 = median(Rsq, na.rm = TRUE)) gof_tbl %>% filter(method %in% "bobyqa", Options %in% c("110p", "010h", "110h")) %>% anti_join(data.frame(Chemical = "DTXSID1021116", Species = "rat")) %>% group_by(Options) %>% summarize(medianR2 = mean(Rsq, na.rm = TRUE)) # Here we see that the highest median Rsq is for 110p # I use median because the distribution is skewed towards 1. 
gof_tbl %>% filter(method %in% "bobyqa", Options %in% c("110p", "010h")) %>% anti_join(data.frame(Chemical = "DTXSID1021116", Species = "rat")) %>% distinct(Chemical, Species, Options, Rsq) %>% pivot_wider(names_from = Options, values_from = Rsq) %>% ggplot(aes(x = `110p`, y = `010h`)) + geom_point() #110h is most even between models cmax_auc_rmsle_tbl %>% filter(Options %in% c("010h", "110p", "110h"), method %in% "bobyqa") # Here it is clear that 110p results in the all_tf_tests %>% filter(Options %in% c("110p", "110h", "010h"), model %in% "All non-flat", method %in% "bobyqa") %>% select(Options, model, starts_with("percent_")) # Relatively similar here cmaxAUCpl <- cmax_auc_rmsle_tbl %>% filter(RMSLE_AUC < 0.8, RMSLE_Cmax < 0.5) %>% ggplot(aes(x = RMSLE_AUC, y = RMSLE_Cmax, shape = method, fill = Options)) + scale_shape_manual(values = c(21,22))+ scale_color_brewer(palette = "Paired")+ geom_point(color = "black", size = 2, stroke = 0.5) + guides(fill = guide_legend(override.aes = list(shape = 21))) + # coord_equal(xlim = c(0.3, 0.45), ylim = c(0.3, 0.4), expand = TRUE) + labs(x = "AUC RMSLE", y = "Cmax RMSLE") + theme_bw() + theme(panel.border = element_rect(fill = NA, color = "black", linewidth = 2)) cmaxAUCpl ggsave(paste0(Sys.getenv("FIG_DIR"), today, "CmaxAUC_RMSLE_plotFinalists.png"), bg = "white", height = 3.25, width = 3.5, units = "in", device = "png", dpi = 300) # Plot Rsq violin_rmsle <- gof_tbl %>% filter(!(model %in% "model_flat"), Options %in% c("010h", "010p", "111h", "111p", "110h", "110p"), RMSLE <= 10) %>% ggplot() + geom_violin( aes(x = "", y = RMSLE, color = interaction(error_model, method)), draw_quantiles = c(0.25,0.5,0.75)) + facet_grid(cols = vars(Options)) + scale_color_brewer(palette = "Paired") + labs(title = "RMSLE", color = "Error Model.Optimizer", subtitle = "(values 10 or under)", x = "") + coord_cartesian(ylim = c(0, 10)) + theme_bw() + theme(legend.position = "none", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), axis.title.x = element_blank(), axis.ticks.x = element_blank(), axis.text.x = element_blank() ) violin_rsq <- gof_tbl %>% filter(!(model %in% "model_flat"), Options %in% c("010h", "010p", "111h", "111p", "110h", "110p")) %>% ggplot() + geom_violin(aes(x = "", y = Rsq, color = interaction(error_model, method)), draw_quantiles = c(0.25,0.5,0.75)) + facet_grid(cols = vars(Options)) + scale_color_brewer(palette = "Paired") + labs(title = "R-squared", color = "Error Model.Optimizer", x = "") + coord_cartesian(ylim = c(0, 1)) + theme_bw() + guides(color = guide_legend(nrow = 2, byrow = TRUE)) + theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), axis.title.x = element_blank(), axis.ticks.x = element_blank(), axis.text.x = element_blank() ) cowplot::plot_grid(plotlist = list(violin_rmsle, violin_rsq), nrow = 2, align = "v") ggsave( filename = paste0( Sys.getenv("FIG_DIR"), today, "_RMSLE_Rsquared_ViolinPlots_selectOptions.png" ), height = 5, width = 6, units = "in" ) gof_tbl %>% filter(!(model %in% "model_flat"), Options %in% c("010h", "010p", "111h", "111p", "110h", "110p")) %>% ggplot() + geom_point(aes(x = Rsq, y = RMSLE, color = interaction(method, model)), shape = 1) + facet_wrap(~Options) + scale_color_brewer(palette = "Paired") + theme_bw() + coord_cartesian( xlim = c(0, 1), expand = TRUE ) + theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5)) ggsave( filename = paste0( Sys.getenv("FIG_DIR"), today, "_RMSLE_Rsquared_ScatterPlot.png" ), height = 7, width = 10, units = 
"in" ) rmsle_hist <- gof_tbl %>% filter(model != "model_flat", Options %in% c("100p", "010h", "110p", "110h"), method == "bobyqa") %>% ggplot(aes(x = RMSLE)) + geom_histogram(fill = "grey5", binwidth = 0.1) + scale_x_log10(label = scales::label_math()) + geom_vline(xintercept = 1, color = "red3", linetype = "dotted") + facet_grid(rows = vars(Options), cols = vars(method)) + theme_bw() rmsle_hist ggsave( filename = paste0( Sys.getenv("FIG_DIR"), today, "_RMSLE_histogram_topChoices.png" ), height = 4, width = 4, units = "in" ) cowplot::plot_grid( plotlist = list(rmsle_hist, cmaxAUCpl + guides(fill = guide_legend(ncol = 2, override.aes = list(shape = 21)), shape = guide_legend(nrow = 2)) + theme(legend.position = "bottom", legend.title.position = "top") ), rel_widths = c(1, 1.75), labels = "AUTO") ggsave( filename = paste0( Sys.getenv("FIG_DIR"), today, "_RMSLE_histogram_CmaxAUC.png" ), height = 4.5, width = 6, units = "in" ) all_preds_prep <- all_preds %>% filter(Conc_est >= LOQ, Options %in% c("110p", "110h", "010h"), method == "bobyqa") %>% mutate(ID = as.factor(paste(Chemical, Species, method)), .keep = "unused") %>% select(ID, Options, dose_norm, log10_trans, time_scale, Route, Media, Dose, Conc, Conc_est, Time, Reference) split_preds <- split(all_preds_prep, all_preds_prep$ID) sp_plots <- lapply(names(split_preds), function(i) { ggplot(data = split_preds[[i]], mapping = aes(x = Conc, y = Conc_est, color = Options, shape = Reference)) + geom_point(size = 2) + geom_abline(slope = 1) + facet_grid(dose_norm~Route+Media+Dose, scales = "free_y", labeller = labeller(dose_norm = label_both)) + scale_y_log10() + scale_x_log10() + scale_color_brewer(palette = "Accent") + theme_bw() + labs(title = i) + theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5, face = "bold")) } ) # Very BIG FILE pdf(file = paste0( Sys.getenv("FIG_DIR"), today, "log10_GOFplots_select.pdf"), onefile = TRUE, height = 8, width = 8) for (x in seq_along(sp_plots)) { print(sp_plots[[x]]) } dev.off() ``` ## Head-to-head comparison of pooled vs joint models Now I need to go back and re-analyze the data only for 100, 101, and 000 Pooled and Joint, L-BFGS-B only. I need to filter the cvt to only include Chem + Species combinations with multiple references. Then I will join them with the excel sheet of already collected data. ```{r load_multirefs, eval=FALSE} # Reference Multiples multiRef_ChemSpec <- all_my_data %>% ungroup() %>% distinct(Chemical, Species, Reference) %>% group_by(Chemical, Species) %>% count() %>% arrange(desc(n)) %>% filter(n > 1) %>% dplyr::select(!n) multiRef_ChemSpec %>% nrow() # Surprisingly only 15 Chem + Species have multiple references multiRef_ChemSpec ``` ### Evaluating error for the NCA estimations Some chemicals have one observation for a timepoint but not for others, which then each timepoint gets a random error value so that `PK::nca()` can run. Here I evaluate 100 random seeds and the differences in values of NCA estimated parameters. ```{r NCA_error_eval, eval=FALSE} # Which Chemical Species Route Media Dose groups have the randomness? 
my_pk$data %>% group_by(Chemical, Species, Reference, Route, Media, Dose, Time) %>% summarize(Count = n()) %>% group_by(Chemical, Species, Reference, Route, Media, Dose) %>% filter(any(Count == 1) && any(Count > 1)) %>% distinct(Chemical, Species, Reference, Route, Media, Dose) -> problem_set # Evaluate 100 times n_evals <- 100 evalNCA_df <- NULL my_pk_new <- my_pk + settings_preprocess(suppress.message = TRUE) for (i in 1:100) { seed_num <- 100 + i set.seed(seed_num) message(paste("Current seed:", seed_num)) my_pk_new <- do_preprocess(my_pk_new) my_pk_new <- do_data_info(my_pk_new) this_nca <- my_pk_new$data_info$nca %>% select(-param_sd_z) %>% mutate(seed = as.factor(seed_num)) evalNCA_df <- bind_rows(evalNCA_df, this_nca) } names(evalNCA_df) evalNCA_df %>% group_by(Chemical, Species, Route, Media, Dose, param_name) %>% summarize(Average = mean(param_value[which(!is.infinite(param_value))], na.rm = TRUE), Median = median(param_value[which(!is.infinite(param_value))], na.rm = TRUE), StDev = sd(param_value[which(!is.infinite(param_value))], na.rm = TRUE), Min = min(param_value[which(!is.infinite(param_value))], na.rm = TRUE), Max = max(param_value[which(!is.infinite(param_value))], na.rm = TRUE), Unique_Values = length(unique(signif(param_value, 4))), Inf_Count = sum(is.infinite(param_value))) %>% filter( # Route %in% "iv", param_name %in% "Cmax" ) %>% View() evalNCA_df %>% filter(param_name %in% c("tmax"), Route %in% "iv", is.finite(param_value), Species %in% c("rat", "human", "hamster")) %>% ggplot(aes(x = param_value, y = Chemical)) + geom_point() + facet_grid(rows = vars(Route)) + theme_bw() + theme(axis.text.y = element_text(size = 5), axis.ticks.y = element_blank(), panel.grid.major = element_line(color = "grey70", linewidth = 0.4, linetype = "solid"), panel.grid.major.y = element_blank(), panel.grid.minor = element_blank()) set.seed(2023) ``` ## Running log-transformed, pooled fits log10 transformation and a hierarchical model were the options that produced the least amount of error in predictions and derived TK stats. _L-BFGS-B_ ran significantly faster, but _bobyqa_ produced predictions with more lower RMSLE. ```{r Final_Options, eval=FALSE} my_pk <- minimal_pk + facet_data(vars(Chemical, Species)) + settings_preprocess(keep_data_original = TRUE, suppress.messages = FALSE) + settings_optimx(method = c("bobyqa")) + scale_conc(dose_norm = TRUE, log10_trans = TRUE) + scale_time(new_units = "auto") + stat_error_model(error_group = vars(Chemical, Species)) my_pk <- do_preprocess(my_pk) cvt_DTXSIDs <- unique(my_pk$data$Chemical) length(cvt_DTXSIDs) my_pk <- do_prefit(my_pk) system.time(my_pk <- do_fit(my_pk, n_cores = 14)) # saveRDS(my_pk, "data-raw/all_cvt_pk.rds") ``` ## Analysis Plots Next I will generate the figures for the paper. I will begin by collating the data that I need. This data are generally accessible using functions/methods for evaluating fitted `pk` objects. 
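Most of the quantities collected below follow the same pattern: call an evaluation method on the fitted `pk` object, then keep only the rows belonging to each data group's winning (lowest-AIC) model. A minimal sketch of that recurring pattern (the next chunk applies it to predictions, residuals, TK stats, and several goodness-of-fit metrics):

```{r winning_model_pattern, eval=FALSE}
# Recurring pattern: evaluate the fitted pk object, then restrict results to
# each data group's winning model via a semi_join() on get_winning_model().
winmodel <- get_winning_model(my_pk)

rsq(my_pk, use_scale_conc = FALSE) %>%
  semi_join(winmodel)
```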
```{r data_aggregation, eval=FALSE} # Need the following data for any of the subsequent plots # after fitting pk object for all cvt objects or reading the fitted pk object in `setup` winmodel <- get_winning_model(my_pk) my_preds <- semi_join(predict(my_pk, use_scale_conc = FALSE), winmodel) my_residuals <- residuals(my_pk, use_scale_conc = FALSE) %>% semi_join(winmodel) my_tkstats <- eval_tkstats(my_pk) %>% semi_join(winmodel) my_nca <- get_nca(my_pk) all_my_data <- get_data(my_pk) formatted_coefs <- my_pk$fit %>% filter(str_detect(param_name, "sigma_", negate = TRUE)) %>% select(Chemical, Species, model, model, param_name, estimate, param_name, estimate, convcode) %>% mutate(param_name = paste(param_name, "tkstats", sep = ".")) |> pivot_wider(names_from = param_name, values_from = estimate) my_tf <- twofold_test(my_pk) NROW(all_my_data) > NROW(my_preds) # THis must be true... or else try distinct(my_preds) # Writing file to xlsx writexl::write_xlsx(x = list(predictions = my_preds, tkstats = my_tkstats), path = paste0(Sys.getenv("FIG_DIR"), "Manu_Files/", today, "_Supp_Table2", ".xlsx")) fe_df <- fold_error(my_pk) %>% semi_join(winmodel) # May need to change the use_scale_conc defaults r2_df <- rsq(my_pk, use_scale_conc = FALSE) %>% semi_join(winmodel) myAIC <- AIC(my_pk) %>% semi_join(winmodel) myRMSLE <- rmse(my_pk, use_scale_conc = list(dose_norm = FALSE, log10_trans = TRUE)) %>% semi_join(winmodel) myRMSE <- rmse(my_pk, rmse_group = ggplot2::vars(Chemical, Species, Route, Media, Dose)) %>% semi_join(winmodel) chem_name_translate <- all_my_data %>% dplyr::select(Chemical, Chemical_Name) %>% dplyr::filter(!(Chemical_Name %in% c("2,4-d", "methylene chloride", "benzo[a]pyrene"))) %>% dplyr::distinct() ``` ### Figure 3A: Replicated timepoint variation in CvTdb The purpose of Figure 3 is to illustrate the variability among replicate time points per Chemical/Species/Reference/Route/Media/Dose group. This characterization is important as it can serve as a pre-fitting data check and if the data are too variable it is likely `invivoPKfit` will struggle to find an appropriate fit. One thing to consider is that there may be various factors that contribute to data variability including but not limited to: - Experimental design - Technical precision - Biological variation between subjects in different studies Additionally, this also contributes to supplemental figure 2. 
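The histograms below are built from the `individual_data` element returned by `twofold_test()`, which expresses each replicate observation as a mean-normalized concentration (`foldConc`). As a rough, illustrative sketch of that quantity (an assumption based on the axis label rather than the package internals), it amounts to dividing each observation by the mean of its replicate group:

```{r foldConc_sketch, eval=FALSE}
# Illustrative only: approximate the mean-normalized concentration by hand.
# Replicate groups are assumed to be unique combinations of
# Chemical/Species/Reference/Route/Media/Dose/Time.
all_my_data %>%
  group_by(Chemical, Species, Reference, Route, Media, Dose, Time) %>%
  mutate(foldConc = Conc / mean(Conc, na.rm = TRUE)) %>%
  ungroup()
```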
```{r fig3A, eval=FALSE} my_tf$indiv_data_test %>% select(Route, starts_with("percent_")) pl3A <- my_tf$individual_data %>% # Plotting ggplot(aes(x = foldConc)) + geom_histogram(binwidth = 0.2, fill = "grey5", color = "grey5", linewidth = 0.4) + coord_cartesian(xlim = c(0, 4), ylim = c(0, 4000)) + scale_y_continuous(expand = c(0, 0), breaks = c(0, 1000, 2000, 3000), labels = c("0", "1", "2", "3")) + expand_limits(y = 0) + geom_vline(xintercept = c(0.5, 2), color = "red3", linetype = "dashed") + theme_bw() + labs(x = "Mean-normalized\nconcentration", y = "Observations\n(thousands)") + theme(text = element_text(size = 10), panel.border = element_rect(color = "black", linewidth = 1, fill = NA), panel.background = element_blank(), panel.grid.major.y = element_line(color = "grey90", linewidth = 0.4), axis.title = element_text(face = "bold"), axis.line = element_blank(), plot.background = element_blank(), axis.ticks.y = element_blank()) pl3A ggsave(paste0(Sys.getenv("FIG_DIR"), today, "NormalizedConcentrations",".png"), plot = pl3A, bg = "white", height = 2.2, width = 2.5, units = "in", device = "png", dpi = 300) # Not included in manuscript my_tf$data_descriptors %>% select(Chemical, Species, Reference, Route, Media, Time, data_descr) %>% distinct() %>% ggplot(aes(fill = data_descr, x = "")) + geom_bar(position = "stack", color = "black") + labs(x = "", y = "Occurence by Chemical + Species group") + scale_y_continuous(expand = c(0, NA)) + theme_minimal() ``` ### Figure 3A-2: Alternate version of Figure 3A This alternate version of Figure 3A aims to simplify the assessment of replication of datasets in invivoPKfit. In contrast to the original 4A, this will also result in the inclusion of datasets with recorded standard deviation values (i.e. non-indivudual animal data/summarized data). The way this will work is by calculating the average and standard deviations of individual animal data (eliminating individual animal data that does not have replicates) then testing the following: $$ \frac{\mu + 2\sigma}{\mu} \leq 2 $$ Which essentially tests whether, at least 95% of data is within a factor of 2. The result is a bar chart that shows how many timepoints with replicate values passed this test. It minimizes the double (or more) counting of timepoints with multiple replicates from individual animal data. ```{r fig4A-2-alt, eval=FALSE} # With the mapped column names my_tf$summarized_data_test pl3A_alt <- my_tf$summarized_data_test %>% filter(Route == "All") %>% select(Route, within, outside) %>% pivot_longer(cols = c(within, outside), names_to = "status", values_to = "value") %>% ggplot(aes(x = status, fill = status, y = value)) + geom_col(position = position_stack(), color = "black", linewidth = 1) + coord_cartesian(ylim = c(0, 2000)) + scale_y_continuous(expand = c(0, NA), breaks = c(0, 1000, 2000), labels = c("0", "1", "2")) + scale_fill_manual(values = c("black", "white")) + theme_classic() + labs(x = "95% of observations\nwithin factor of 2", y = "Observations\n(thousands)") + theme(text = element_text(size = 10), panel.border = element_rect(color = "black", linewidth = 1, fill = NA), panel.background = element_blank(), panel.grid.major.y = element_line(color = "grey90", linewidth = 0.4), axis.title = element_text(face = "bold"), axis.line = element_blank(), plot.background = element_blank(), axis.ticks.y = element_blank(), legend.position = "none") pl3A_alt # Not included in final manuscript ``` ### Figure 3B: Variation over time in CvTdb data. 
Here the goal is to evaluate whether particular timepoints throughout CvT data are particularly susceptible to high variability. To do this, we create a relative time scale anchored by $t_{max}$ and $t_{last}$. ```{r fig4B-cowplot, eval=FALSE} pl3B <- my_tf$individual_data %>% inner_join(my_tkstats %>% dplyr::select(Chemical, Species, Route, Media, tmax = tmax.nca) %>% filter(!is.na(tmax))) %>% group_by(Chemical, Species, Reference, Route, Media, Dose) %>% mutate( normTime = ifelse( Route == "iv", 1 + Time/max(Time), ifelse( Time > tmax, ((Time - tmax)/max(Time)) + 1, 0.5 + Time/(2*tmax) ) )) %>% ggplot(aes( x = normTime, y = foldConc )) + geom_point(alpha = 0.1, size = 0.7) + geom_vline(xintercept = 1, color = "blue", linewidth = 0.8, linetype = "dashed") + geom_vline(xintercept = 2, color = "green4", linewidth = 0.8, linetype = "dashed") + geom_hline(yintercept = c(0.5, 2), color = "red3", linewidth = 0.8, linetype = "dashed") + facet_grid(cols = vars(Route), scales = "free_x") + labs(x = "ADME-Normalized Time", y = "Mean-Normalized\nConcentration") + theme(text = element_text(size = 10), panel.border = element_rect(color = "black", fill = NA, size = 1), panel.background = element_blank(), panel.grid.major = element_line(color = "grey90", linewidth = 0.5), strip.background = element_rect(fill = "white"), strip.text = element_text(face = "bold"), axis.ticks = element_blank(), axis.line = element_blank(), axis.text.x = element_blank(), axis.title = element_text(face = "bold"), panel.spacing = unit(0.125, units = "in"), plot.background = element_blank(), legend.position = "none") pl3B ggsave(paste0(Sys.getenv("FIG_DIR"), today, "ADME_NormTime-NormConcs_onlyDetects.png"), height = 3, bg = "white", width = 5, units = "in", dpi = 300) # Make the Combination Plot---- plot_grid(pl3A, pl3B, labels = c("A", "B"), align = "h", axis = "bt", rel_widths = c(1, 1.7)) figure3 <- plot_grid(pl3A, pl3B, labels = c("A", "B"), align = "h", axis = "bt", rel_widths = c(1, 1.7)) ggsave(paste0(Sys.getenv("FIG_DIR"), today, "Figure3_onlyDetects.png"), plot = figure3, bg = "white", height = 3, width = 6.5, units = "in", dpi = 300) ``` ### Supp. 
Figure 2: Concentration and final time-points have wide range ```{r sFig2-cvt_analysis, eval=FALSE} data_range <- all_my_data %>% filter(Detect, !exclude) %>% group_by(Chemical, Species, Reference, Media, Route, Dose, Dose.Units) %>% summarize(maxT = max(Time)/24, concRange = log10(max(Conc)/min(Conc))) %>% ungroup() sf_2A <- data_range %>% ggplot(aes(x = maxT)) + geom_histogram(fill = "grey5", bins = 40, color = "grey5") + scale_x_log10(labels = scales::number, breaks = c(0.1, 1, 7, 30, 365)) + # scale_y_continuous(expand = c(0, 0), # limits = c(0, 60), # breaks = c(0, 10, 20, 30, 40, 50)) + labs(x = "Day of final Timepoint", y = "Count") + theme(aspect.ratio = 1, text = element_text(size = 10), panel.border = element_rect(color = "black", fill = NA, size = 1), panel.background = element_blank(), panel.grid.major = element_line(color = "grey90", linewidth = 0.5), axis.ticks = element_blank(), axis.line = element_blank()) sf_2A ggsave(paste0(Sys.getenv("FIG_DIR"), today, "SFig2A_TimePoints.png"), plot = sf_2C, height = 4, width = 4, units = "in", dpi = 300) sf_2B <- data_range %>% ggplot(aes(x = concRange)) + geom_histogram(fill = "grey5", bins = 40, color = "grey5") + scale_y_continuous(expand = c(0, 0), limits = c(0, 50), breaks = c(0, 25, 50)) + labs(x = "log10(Concentration Range)", y = "Count") + theme(aspect.ratio = 1, panel.border = element_rect(color = "black", fill = NA, size = 1), panel.background = element_blank(), panel.grid.major = element_line(color = "grey90", linewidth = 0.5), strip.background = element_rect(fill = "white"), strip.text = element_text(face = "bold"), axis.ticks = element_blank(), axis.line = element_blank(), axis.text = element_text(size = 10), axis.title = element_text(size = 11)) sf_2B ggsave(paste0(Sys.getenv("FIG_DIR"), today, "SFig2B_ConcRange.png"), plot = sf_2B, height = 4, width = 4, units = "in", dpi = 300) sf_2AB <- plot_grid(sf_2A, sf_2B, labels = c("A", "B"), nrow = 1) sf_2AB ggsave(paste0(Sys.getenv("FIG_DIR"), today, "SFig2_ConcTimeRanges.png"), height = 3.2, width = 6, bg = "white", units = "in", dpi = 300) ``` ### Supp Figure 4: invivoPKfit model performance ```{r sFig4-model_performance, eval=FALSE} my_tf$model_error_summary %>% select(model, method, starts_with("percent_")) my_tf$model_error_all %>% filter(model %in% c("model_1comp", "model_2comp")) %>% ggplot(aes(x = Fold_Error)) + geom_histogram( fill = "grey5") + scale_x_log10(limits = c(1E-3, 1E3), labels = scales::label_math(format = log10)) + expand_limits(y = 0) + geom_vline(xintercept = c(0.5, 2), color = "red3", linetype = "dashed") + facet_grid(cols = vars(model), rows = vars(Route)) + labs(x = "Predicted/Observed", y = "Number of Observations") + theme_classic() + theme(aspect.ratio = 1, panel.border = element_rect(color = "black", fill = NA), panel.grid.major = element_line(color = "grey95", linewidth = 0.4)) ggsave(paste0(Sys.getenv("FIG_DIR"), today, "PredictedObserved_compartmentModels.png"), height = 5, width = 5) ``` ### Figures 4 and Supplemental Figures 4 & 5: Model performance vs Data Variability Here we examine the performance of "best fit" models with relation to data variability. Please note that in supplemental Figure 3, the data with summarized observations (mean and sd) were included. Here, because we are trying to specifically compare data variability with model performance, we look at data with replicated timepoint observations, which are largely data with individual animal data. 
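Both axes in the figure below are fold-type quantities: model error is the fold error of each prediction (predicted concentration over observed concentration, as in the Predicted/Observed histograms above), and data variability is the mean-normalized concentration of replicate observations. "Within a factor of two" means a value between 0.5 and 2. A minimal hand-rolled version of the model-error side (illustrative only; `twofold_test()` supplies these columns directly):

```{r fold_error_sketch, eval=FALSE}
# Illustrative only: fold error of predictions and the factor-of-two check,
# computed from the winning-model predictions collected earlier.
my_preds %>%
  mutate(Fold_Error = Conc_est / Conc,
         within_2fold = between(Fold_Error, 0.5, 2)) %>%
  summarize(percent_within_2fold = 100 * mean(within_2fold, na.rm = TRUE))
```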
```{r fig4-modelPerform-v-dataVar, eval=FALSE} my_tf$indiv_data_test_fold_errors %>% select(model, starts_with("percent_")) %>% glimpse() pl_4A <- my_tf$indiv_data_fold_errors %>% # For Plotting ggplot(aes( y = Fold_Error, x = foldConc )) + geom_bin2d(bins = 40, color = NA) + geom_hline(yintercept = c(0.5, 2), linetype = "dashed") + geom_vline(xintercept = c(0.5, 2), linetype = "dashed") + facet_grid(cols = vars(model)) + scale_x_log10(labels = scales::label_math(format = log10), limits = c(0.0001, 1000)) + scale_y_log10(labels = scales::label_math(format = log10), limits = c(0.0001, 10000)) + scale_fill_viridis_c(option = "cividis", limits = c(1, 100), oob = scales::oob_squish) + labs(y = "Model Error", x = "Data Variability", fill = "Count") + theme(aspect.ratio = 1, panel.border = element_rect(color = "black", fill = NA, size = 1.5), panel.background = element_blank(), panel.grid.major = element_line(color = "grey90", linewidth = 0.5), strip.background = element_rect(fill = "white"), strip.text = element_text(face = "bold"), axis.ticks = element_blank(), axis.line = element_blank()) pl_4A ggsave(paste0(Sys.getenv("FIG_DIR"), today, "ModelPerformance_vDataVariability.png"), height = 2, width = 4.2, device = "png", dpi = 300, units = "in") my_table <- my_tf$indiv_data_test_fold_errors %>% ungroup() %>% select(starts_with("percent_")) %>% t() %>% round(digits = 2) %>% as.data.frame() %>% rownames_to_column(var = "percentages") %>% mutate(percentages = case_when( percentages == "percent_both_within" ~ "Both within a factor of 2", percentages == "percent_both_outside" ~ "Both outside a factor of 2", percentages == "percent_model_outside" ~ "Model too variable", percentages == "percent_data_outside" ~ "Data too variable" )) names(my_table) <- c(" ","1-compartment (%)", "2-compartment (%)", "Overall (%)") library(flextable) library(officer) flextable(my_table)%>% border_inner() %>% fontsize(part = "all", size = 11) %>% bold(part = "all", j = 4) %>% autofit() table_plot <- gen_grob(flextable(my_table) %>% border_inner() %>% fontsize(part = "all", size = 10) %>% bold(part = "all", j = 4) %>% autofit()) dual_plot <- plot_grid(pl_4A, table_plot, ncol = 1, align = "v", rel_heights = c(1, 0.5)) dual_plot ggsave(paste0(Sys.getenv("FIG_DIR"), today, "ModelPerformance_vDataVariability_wTable.png"), plot = dual_plot, height = 4.5, width = 6, device = "png", bg = "white", dpi = 300, units = "in") ``` ### Figure 5Alt ```{r fig5Alt, eval=FALSE} # Ultimately this will be removed from manuscript my_tf$summarized_data_model_error %>% select(model, starts_with("percent_")) ``` ### Figure 5: Multiple goodness-of-fit metrics validate model performance Here we analyze and validate model performance per data group using R-squared, RMSE, and fraction of predictions that are within 2-fold for each Chemical, Species, Route, Media, and Dose groupings. 
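As a reminder, the RMSLE reported throughout this section is simply a root mean squared error computed on log10-transformed values, the same form used for the Cmax and AUC comparisons in the evaluation helpers above and the quantity `rmse()` is used for here with `log10_trans = TRUE`:

$$ \mathrm{RMSLE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(\log_{10}\hat{y}_i - \log_{10}y_i\right)^{2}} $$

where $y_i$ are the observed values and $\hat{y}_i$ the corresponding predictions (or, for the TK-stat comparison, the NCA-derived and model-derived values).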
```{r fig5-goodness-of-fit, eval=FALSE} combined_gof_df <- my_tf$model_error_all %>% group_by(Chemical, Species, model, method) %>% summarize(within_2fold = sum(between(Fold_Error, 0.5, 2))/n()) %>% left_join(r2_df) %>% left_join(myAIC) %>% left_join(myRMSLE) %>% semi_join(winmodel) mypl <- ggplot(data = combined_gof_df, aes( x = within_2fold, y = Rsq, color = model )) + geom_point() + geom_abline(slope = 1, color = "red4", linetype = "longdash") + scale_color_manual(values = c("#0398FC", "#D68E09", "black")) + coord_cartesian(xlim = c(0,1), ylim = c(0,1)) + labs(x = "Fraction of predictions within 2-fold", y = "R-squared value") + theme(aspect.ratio = 1, panel.border = element_rect(color = "black", fill = NA, size = 1), panel.background = element_blank(), panel.grid.major = element_line(color = "grey90", linewidth = 0.5), axis.ticks = element_blank(), axis.line = element_blank(), legend.position = "none", legend.title = element_blank(), legend.key = element_blank()) panelA_plot <- ggExtra::ggMarginal(mypl, groupFill = TRUE, type = "histogram", xparams = list(binwidth = 0.1), yparams = list(binwidth = 0.1)) panelA_plot ggsave(filename = paste0(Sys.getenv("FIG_DIR"), today, "R2_within2fold_comparison.png"), plot = ggExtra::ggMarginal(mypl, groupFill = TRUE, type = "histogram", xparams = list(binwidth = 0.05), yparams = list(binwidth = 0.05)), height = 5, width = 5, units = "in") panelB_plot <- ggplot(data = myRMSE, aes( x = log10(RMSE), color = model, fill = model )) + geom_histogram(bins = 50) + labs(y = "Count") + scale_color_manual(values = c("#0398FC", "#D68E09", "grey10")) + scale_fill_manual(values = c("#0398FC", "#D68E09", "grey10")) + facet_grid(rows = vars(model), cols = vars(Route)) + theme(text = element_text(size = 10), aspect.ratio = 0.6, panel.border = element_rect(color = "black", fill = NA, size = 1), panel.background = element_blank(), panel.grid.major = element_line(color = "grey90", linewidth = 0.5), axis.ticks = element_blank(), axis.line = element_blank(), strip.background = element_blank(), panel.spacing.y = unit(0.125, units = "in"), legend.position = "bottom", legend.title = element_blank()) panelB_plot ggsave(filename = paste0(Sys.getenv("FIG_DIR"), today, "RMSEs_winningmodel_RouteFacet.png"), height = 4, width = 6, units = "in") legend <- get_legend(panelB_plot) plot_grid(plot_grid(panelA_plot, panelB_plot + theme(legend.position = "none"), labels = c("A", "B"), axis = "tb", align = "h", rel_heights = c(1,1)), legend, ncol = 1, rel_heights = c(1, 0.1)) ggsave(filename = paste0(Sys.getenv("FIG_DIR"), today, "CombinedPlot_Rsquared_RMSE.png"), height = 4, width = 6.5, bg = "white", units = "in") ``` ### Supp. Fig. 
6 Examples fits for chemicals with R-squared and within 2-fold ```{r goodness-of-fit_exampleFits, eval = FALSE} pl <- plot(my_pk, best_fit = TRUE, use_scale_conc = TRUE, drop_nonDetect = FALSE) cvt %>% filter(curation_set_tag == "CVT_Showa") %>% pull(analyte_dtxsid) -> Showa_chems combined_gof_df %>% filter(!(Chemical %in% Showa_chems)) %>% View() ex_fits <- pl %>% filter(Chemical %in% c("DTXSID2020139", "DTXSID4023917", "DTXSID3061635", "DTXSID1034187"), Species %in% "rat") %>% pull(final_plot) cowplot::plot_grid(plotlist = list(ex_fits[[2]] + scale_color_manual(values = c("#a6bddb", "#74a9cf", "#41bbc4", "#2b8cbe", "#045a8d")) + theme(legend.position = "none") + xlim(0,30), ex_fits[[3]] + scale_color_manual(values = c("black", "grey40")) + theme(legend.position = "none"), ex_fits[[1]] + scale_color_manual(values = c("magenta3")) + theme(legend.position = "none"), ex_fits[[4]] + scale_color_manual(values = c("#006837")) + theme(legend.position = "none")), scale = 0.85) ggsave(filename = paste0(Sys.getenv("FIG_DIR"), today, "exampleFits_Rsq_within2fold.png"), height = 12, width = 12) ``` ### Supp. Fig. 7: Distribution of Sigmas in CvTdb data ```{r sFig7-sigmaDistribution, eval=FALSE} my_pk$fit %>% select(Chemical, Species, model, method, param_name, estimate) %>% filter(str_detect(param_name, "sigma_")) %>% semi_join(winmodel) %>% left_join(my_pk$prefit$stat_error_model$sigma_DF) %>% inner_join(get_data_summary(my_pk) %>% distinct(Chemical, Species, n_obs)) -> my_coefs my_coefs %>% mutate(model = case_when( model == "model_1comp" ~ "1-compartment", model == "model_2comp" ~ "2-compartment", model == "model_flat" ~ "Null" )) %>% ggplot() + geom_histogram(aes(x = estimate/n_obs), bins = 15, color = NA, fill = "grey5") + # geom_histogram(aes(x = start/n_obs), fill = NA, color = "grey20", bins = 15) + scale_x_log10(labels = scales::label_math(format = log10)) + facet_grid(cols = vars(model)) + labs(x = "Size-normalized Sigma Value", y = "Number of Observations") + theme(panel.border = element_rect(color = "black", fill = NA, size = 1.5), panel.background = element_blank(), panel.grid.major = element_line(color = "grey90", linewidth = 0.5), strip.background = element_rect(fill = "white"), strip.text = element_text(face = "bold"), plot.title = element_text(hjust = 0.5), axis.ticks = element_blank(), axis.line = element_blank()) ggsave(paste0(Sys.getenv("FIG_DIR"), today, "NormalizedSigma_Distributions.png"), bg = "white", height = 3, width = 5) ``` ### Figure 6: Comparing derived TK stats with human TK stats from Lombardo et al. This meta-analysis compares the TK stats from model fits with other published studies. ```{r Lombardo_Comparison, eval=FALSE} cvt_DTXSIDs <- unique(my_pk$data$Chemical) # First things first, let's load the data from Lombardo et al. 
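# Note: the steps below assume several external files under Sys.getenv("FIG_DIR"):
# the Lombardo et al. (2018) supplemental spreadsheet plus batch-search exports
# (one by chemical name, one by CASRN) and a curated ID list, which together map
# the Lombardo chemical names/CASRNs to DTXSIDs so they can be joined against the
# CvT-derived TK stats. The INPUT/CASRN/DTXSID columns suggest these exports came
# from a CompTox Chemicals Dashboard batch search, but that is an assumption.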
lombardo <- readxl::read_xlsx( paste0(Sys.getenv("FIG_DIR"), "Lombardo2018-Supplemental_82966_revised_corrected.xlsx"), skip = 8) l_batch_search <- readxl::read_xlsx( paste0(Sys.getenv("FIG_DIR"), "LombardoBatchSearch_Name.xlsx"), sheet = "Main Data" ) l_batch_search <- l_batch_search[c("INPUT", "CASRN","DTXSID")] %>% distinct() %>% right_join(lombardo[c("Name", "CAS #")], by = c("CASRN" = "CAS #", "INPUT" = "Name")) l_noNamesCASRN <- readxl::read_xlsx( paste0(Sys.getenv("FIG_DIR"), "LombardoBatchSearch_CASRNnoName.xlsx"), sheet = "Main Data" ) %>% dplyr::select(CASRN = INPUT, DTXSID) l_curatedDTX <- read_tsv(paste0(Sys.getenv("FIG_DIR"), "CuratedIDs_lombardo.txt")) %>% rename(INPUT = Name) l_batch_search <- l_batch_search %>% left_join(l_noNamesCASRN, by = "CASRN") %>% mutate(DTXSID = coalesce(DTXSID.x, DTXSID.y)) %>% dplyr::select(-DTXSID.x, -DTXSID.y) # Now all but two of the IDs are unidentified lombardo <- lombardo %>% right_join(l_batch_search, by = c("Name" = "INPUT")) #### lombard_abbr <- lombardo %>% dplyr::select(DTXSID, hVss = `human VDss (L/kg)`, hCltot = `human CL (mL/min/kg)`, halflife = `terminal t1/2 (h)` ) %>% filter(DTXSID %in% cvt_DTXSIDs) %>% left_join(chem_name_translate, by = c("DTXSID" = "Chemical")) # 15 chemicals in common with values for both lombardo_comparison <- my_tkstats %>% dplyr::select(DTXSID = Chemical, Species, model, Vss.tkstats, CLtot.tkstats) %>% dplyr::filter(model != "model_flat") %>% distinct() %>% inner_join(lombard_abbr, by = "DTXSID") %>% # filter(!is.na(Vss.tkstats)) %>% arrange(halflife) comparable_ids <- lombardo_comparison %>% pull(DTXSID) lombard_abbr <- lombard_abbr %>% mutate(Species = "human", model = "Lombardo et al.") %>% rename(Vss = hVss, CLtot = hCltot) %>% mutate(CLtot = CLtot*60/1000) %>% # Lombardo reports this as mL/min/kg, need L/h/kg filter(DTXSID %in% comparable_ids) %>% mutate(Chemical_Name = reorder(Chemical_Name, halflife)) lombardo_comparison <- my_tkstats %>% dplyr::select(DTXSID = Chemical, Species, model, Vss.tkstats, CLtot.tkstats, halflife.tkstats) %>% filter(DTXSID %in% comparable_ids) %>% left_join(chem_name_translate, by = join_by(DTXSID == Chemical)) %>% distinct() %>% rename(Vss = Vss.tkstats, CLtot = CLtot.tkstats, halflife = halflife.tkstats) %>% bind_rows(lombard_abbr) %>% mutate(model = ifelse(str_detect(model, "^model"), "invivoPKfit", model), Chemical_Name = factor(Chemical_Name, levels = levels(lombard_abbr$Chemical_Name))) lombardo_comparison %>% pivot_longer(cols = c("Vss", "CLtot", "halflife")) %>% mutate(name = factor(name, levels = c("halflife", "Vss", "CLtot"))) %>% ggplot(mapping = aes( x = value, y = Chemical_Name )) + geom_point(aes(color = model, shape = Species), position = position_dodge(0.7), size = 5/.pt, stroke = 2.5/.pt) + facet_grid(cols = vars(name), scales = "free_x") + scale_x_log10( labels = scales::label_math(format = log10)) + scale_shape_manual(values = c(21, 22, 24), breaks = c("human", "dog", "rat")) + scale_color_manual(values = c("black", "#1064c9")) + guides(color = guide_legend(override.aes = list(shape = 21), nrow = 2, title.position = "top", title.hjust = 0.5), shape = guide_legend(nrow = 1, title.position = "top", title.hjust = 0.5)) + theme(panel.border = element_rect(color = "black", fill = NA, linewidth = 1.5), panel.background = element_blank(), panel.grid.major = element_line(color = "grey90", linewidth = 0.5), strip.background = element_rect(fill = "white"), strip.text = element_text(face = "bold", size = 12), text = element_text(size = 10), axis.ticks = 
element_blank(), axis.line = element_blank(), axis.text = element_text(face = "bold", size = 6), axis.title.y = element_blank(), axis.title.x = element_blank(), legend.position = "bottom", legend.key = element_blank(), legend.title = element_text(face = "bold"), legend.text = element_text()) ggsave(filename = paste0(Sys.getenv("FIG_DIR"), today, "lombardoComparison_named.png"), height = 7, width = 6.5, device = "png", dpi = 300) ``` ### Supp. Fig. 7 Histogram of Lombardo Vss and InvivoPKfit & another meta-analysis ```{r otherMeta-analysis, eval=FALSE} # Histograms lombardo_hist <- data.frame(Chemical = c(lombardo$DTXSID, my_tkstats$Chemical), Vss = c(lombardo$`human VDss (L/kg)`, my_tkstats$Vss.tkstats), halflife = c(lombardo$`terminal t1/2 (h)`, my_tkstats$halflife.tkstats), Source = c(rep_len("lombardo", length.out = nrow(lombardo)), rep_len("invivoPKfit", length.out = nrow(my_tkstats)))) sfig7A <- lombardo_hist %>% ggplot(aes(x = log2(Vss), color = Source, group = Source)) + geom_freqpoly(aes(y = after_stat(ndensity)), linewidth = 2/.pt, bins = 20) + labs(y = "Scaled Frequency", x = "Vss") + scale_x_continuous(labels = scales::label_math(expr = 2^.x)) + scale_color_manual(values = c("black", "#1064c9")) + guides(color = guide_legend(nrow = 2, linewidth = 1)) + theme(panel.border = element_rect(color = "black", fill = NA, linewidth = 1.5), panel.background = element_blank(), panel.grid.major = element_line(color = "grey90", linewidth = 0.5), strip.background = element_rect(fill = "white"), strip.text = element_text(face = "bold", size = 12), text = element_text(size = 10), axis.ticks = element_blank(), axis.line = element_blank(), axis.text = element_text(face = "bold"), legend.position = "bottom", legend.key = element_blank(), legend.title = element_text(face = "bold"), legend.text = element_text()) sfig7A other_meta <- read_csv( file = paste0(Sys.getenv("FIG_DIR"), "SupTableInVivoDatandPreds.csv")) %>% dplyr::select(DTXSID, fbioh, fbior) %>% filter(DTXSID %in% my_tkstats$Chemical) fgutabs <- my_pk$fit %>% filter(param_name %in% "Fgutabs", convcode == 0) %>% distinct(Chemical, Species, model, method, estimate) %>% semi_join(winmodel) %>% filter(model != "model_flat") %>% rename(Fgutabs.tkstats = "estimate") sfig7B <- other_meta %>% left_join(fgutabs, by = c("DTXSID" = "Chemical")) %>% filter(Species %in% c("rat")) %>% filter(!is.na(Fgutabs.tkstats), !is.na(fbior)) %>% distinct(DTXSID, Species, Fgutabs.tkstats, fbior) %>% pivot_longer(cols = c(Fgutabs.tkstats, fbior), names_to = "Source", values_to = "Fgutabs") %>% mutate(Source = ifelse(str_detect(Source, "tkstats"), "invivoPKfit", "Honda et al.")) %>% left_join(chem_name_translate, by = join_by(DTXSID == Chemical)) %>% mutate(Chemical_Name) %>% ggplot(mapping = aes( x = Fgutabs, y = reorder(Chemical_Name, Fgutabs) )) + geom_point(aes(color = Source), shape = 24, position = position_dodge(0.7), size = 4/.pt, stroke = 1.5/.pt) + scale_color_manual(values = c("black", "green4"), breaks = c("invivoPKfit", "Honda et al.")) + labs(x = "Oral Bioavailability") + guides(color = guide_legend(nrow = 2)) + theme(panel.border = element_rect(color = "black", fill = NA, linewidth = 1.5), panel.background = element_blank(), panel.grid.major = element_line(color = "grey90", linewidth = 0.5), strip.background = element_rect(fill = "white"), strip.text = element_text(face = "bold", size = 12), text = element_text(size = 10), axis.ticks = element_blank(), axis.line = element_blank(), axis.text = element_text(face = "bold"), axis.title.y = element_blank(), 
legend.position = "bottom", legend.key = element_blank(), legend.title = element_text(face = "bold"), legend.text = element_text()) plot_grid(sfig7A, sfig7B, align = "v", ncol = 1, axis = "lr", rel_heights = c(1,1.25), labels = c("A", "B")) ggsave(filename = paste0(Sys.getenv("FIG_DIR"), today, "metaAnalysis_Comparisons_SuppFigure7_named.png"), height = 5.5, width = 4, device = "png", dpi = 300) ``` ### Supp. Fig. 9: Parallelization decreases runtime of invivoPKfit Here I do a bit of benchmarking for the parallel-ized version of do_fit ```{r do_fit-benchmarking, eval=FALSE} # Randomize the order of chemical species chem_spec <- cvt %>% dplyr::distinct(analyte_dtxsid, species) %>% dplyr::slice_sample(prop = 1) test_group_size <- seq(25, 105, by = 10) fit_options <- expand.grid(error_model = c("pooled", "hierarchical"), dose_norm = FALSE, log10_trans = TRUE, time_scale = c( TRUE, FALSE), stringsAsFactors = FALSE) fit_options <- tidyr::expand_grid(fit_options, test_group_size) fit_options <- fit_options %>% rowwise() %>% mutate(this_data = list({ tmp <- chem_spec %>% slice_head(n = test_group_size) %>% left_join(cvt, by = join_by(analyte_dtxsid, species)) %>% pk() + scale_time(new_units = ifelse(!time_scale, "identity", "auto")) + scale_conc(dose_norm = dose_norm, log10_trans = log10_trans) + settings_preprocess(suppress.messages = TRUE) do_prefit(tmp) } ), data_size = chem_spec %>% slice_head(n = test_group_size) %>% left_join(cvt, by = join_by(analyte_dtxsid, species)) %>% NROW()) # Organization of the benchmarking # For each fit_option, return the four values of system.time() that we want df_out <- fit_options %>% rowwise() %>% mutate(tim_1core = system.time(do_fit(this_data))[["elapsed"]], tim_4core = system.time(do_fit(this_data, n_cores = 4))[["elapsed"]], tim_7core = system.time(do_fit(this_data, n_cores = 7))[["elapsed"]], tim_12core = system.time(do_fit(this_data, n_cores = 12))[["elapsed"]] ) df_out <- df_out %>% select(-this_data) gc() full_long <- df_out %>% pivot_longer(cols = c(tim_1core:tim_12core), names_to = "N_cores", values_to = "Time_s") %>% group_by(N_cores, test_group_size, data_size) %>% summarize(mean_time = mean(Time_s, na.rm = TRUE)/60, max_time = max(Time_s, na.rm = TRUE)/60, min_time = min(Time_s, na.rm = TRUE)/60 ) %>% mutate(N_cores = as.numeric(str_extract(N_cores, "[:digit:]+"))) df_out %>% clipr::write_clip() ggplot(full_long, aes(x = test_group_size, y = mean_time, color = as.factor(N_cores))) + geom_point() + geom_linerange(aes(ymin = min_time, ymax = max_time)) + geom_line(aes(group = N_cores)) + labs(x = "Number of Data Groups", y = "Runtime (minutes)", title = "Number of Processing Cores", subtitle = "(with min/max runtime range)") + facet_grid(cols = vars(N_cores)) + scale_color_manual(values = c("black", "#40c679", "#31a354", "#006837")) + coord_fixed(ratio = 8) + theme(panel.border = element_rect(color = "black", fill = NA, size = 1.5), panel.background = element_blank(), panel.grid.major = element_line(color = "grey90", linewidth = 0.5), legend.position = "none", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), axis.ticks = element_blank(), axis.line = element_blank(), strip.background = element_blank()) ggsave(paste0(Sys.getenv("FIG_DIR"), today, "Parallelization_Benchmarking.png"), width = 5, height = 2.5, units = "in") ``` ### Supp. Fig. 
### Supp. Fig. 5: Example plots for data fits

```{r sfig5-examples-of-fits, eval=FALSE}
pl <- plot(my_pk, use_scale_conc = TRUE, n_interp = 10)
names(pl)
# Chemical Species observations observation_plot predicted predicted_plot final_plot

s6_plA <- pl %>%
  filter(Chemical %in% "DTXSID3031860") %>%
  pull(observation_plot) %>%
  .[[1]]
s6_plB_mouse <- pl %>%
  filter(Chemical %in% "DTXSID4020533", Species %in% "mouse") %>%
  pull(final_plot) %>%
  .[[1]]
s6_plB_rat <- pl %>%
  filter(Chemical %in% "DTXSID4020533", Species %in% "rat") %>%
  pull(final_plot) %>%
  .[[1]]
s6_plC <- pl %>%
  filter(Chemical %in% "DTXSID00216205") %>%
  pull(final_plot) %>%
  .[[1]]
s6_plD <- pl %>%
  filter(Chemical %in% "DTXSID8025337") %>%
  pull(final_plot) %>%
  .[[1]]

# Set color palettes for each panel
# panel A : Oranges
panelA_cols <- brewer.pal(8, "Oranges")[3:8]
# panel B : Greens
panelB_cols <- brewer.pal(9, "Greens")[2:9]
# panel C : Blues (set manually below)
# panel D : Purples (6 values)
panelD_cols <- brewer.pal(9, "Purples")[4:9]

s6_plA <- s6_plA +
  labs(title = "perfluorodecanoic acid") +
  scale_color_manual(values = panelA_cols, aesthetics = c("color", "fill")) +
  theme(text = element_text(size = 20),
        legend.position = "none",
        strip.background = element_rect(fill = "white"))

s6_plB_mouse <- s6_plB_mouse +
  labs(title = "1,4-dioxane\nMouse") +
  scale_color_manual(values = panelB_cols[c(5, 8)],
                     aesthetics = c("color", "fill")) +
  theme(text = element_text(size = 20),
        legend.position = "none",
        strip.background = element_rect(fill = "white"))

s6_plB_rat <- s6_plB_rat +
  labs(title = "1,4-dioxane\nRat") +
  scale_color_manual(values = panelB_cols[1:8],
                     aesthetics = c("color", "fill")) +
  theme(text = element_text(size = 20),
        legend.position = "none",
        strip.background = element_rect(fill = "white"))

s6_plC <- s6_plC +
  labs(title = "psoralen") +
  scale_color_manual(values = c("#6BAED6", "#08519C", "darkblue"),
                     aesthetics = c("color", "fill")) +
  theme(text = element_text(size = 20),
        legend.position = "none",
        strip.background = element_rect(fill = "white"))

s6_plD <- s6_plD +
  labs(title = "formamide") +
  scale_color_manual(values = panelD_cols, aesthetics = c("color", "fill")) +
  theme(text = element_text(size = 20),
        legend.position = "none",
        strip.background = element_rect(fill = "white"))

# Put it all together
s6_plB <- plot_grid(s6_plB_mouse, s6_plB_rat,
                    align = "h", axis = "bt", rel_widths = c(1.2, 2))
s6_plCD <- plot_grid(s6_plC, s6_plD, labels = c("C", "D"), label_size = 24)
plot_grid(s6_plA, s6_plB, s6_plCD,
          align = "v", ncol = 1,
          rel_heights = c(1.25, 1.2, 1.25),
          labels = c("A", "B", ""), label_size = 24)

# Making the figure 2X the size needed so the points remain a reasonable size
# when it is scaled down
ggsave(paste0(Sys.getenv("FIG_DIR"), "ExampleFitsPlots_06.12.2024.png"),
       width = 13, height = 14, units = "in")
```
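Because the panel code above hard-codes specific DTXSIDs, it can help to first confirm which chemical-species groups actually have plots available in `pl`; a minimal sketch:

```{r available-fit-plots, eval=FALSE}
# List the chemical-species groups available in the plot table `pl`
# (useful when choosing DTXSIDs for the example panels above)
pl %>%
  distinct(Chemical, Species) %>%
  arrange(Chemical, Species)
```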
## Other Plots

### Plotting all fits

Here is the code used to generate the CvT data and _invivoPKfit_ fits for each chemical-species data group.

```{r plotting_all_Fits, eval=FALSE}
### Plotting for all (2.5 MB file)
pl <- plot(my_pk, n_interp = 12, use_scale_conc = TRUE)

pdf(file = paste0(Sys.getenv("FIG_DIR"),
                  "Transformed_Joint_AllFits_Final_2024_05_16.pdf"),
    height = 6, width = 10)
for (i in 1:nrow(pl)) {
  print(pl$final_plot[[i]])
}
dev.off()

# From the new SQL pull
pl <- plot(my_pk, n_interp = 12, use_scale_conc = FALSE, best_fit = TRUE)

pdf(file = paste0(Sys.getenv("FIG_DIR"),
                  "Joint_bobyqa_AllFits_Final_20240514.pdf"),
    height = 10, width = 10)
for (i in 1:nrow(pl)) {
  print(pl$final_plot[[i]])
}
dev.off()
```

It is also possible to extract and display a single plot.

```{r Single-Plot, eval=FALSE}
pl <- plot(my_pk, n_interp = 10, use_scale_conc = FALSE)
pl %>%
  filter(Chemical %in% "DTXSID5020029") %>%
  pull(final_plot) %>%
  .[[1]]
```
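If a standalone image of that single fit is needed, the extracted ggplot object can be written out with `ggsave()` in the same way as the other figures in this vignette. A minimal sketch, with an illustrative object and file name:

```{r single-plot-save, eval=FALSE}
# Save one extracted fit plot to file (file name is illustrative)
single_plot <- pl %>%
  filter(Chemical %in% "DTXSID5020029") %>%
  pull(final_plot) %>%
  .[[1]]

ggsave(filename = paste0(Sys.getenv("FIG_DIR"), today,
                         "SingleFit_DTXSID5020029.png"),
       plot = single_plot,
       height = 6, width = 10, device = "png", dpi = 300)
```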
## Preparing results for CvTdb

The following chunk reshapes the fitted TK statistics and parameter estimates into a long format for upload back to CvTdb, and records flags describing how concentrations, LOQs, and doses were processed.

```{r cvtdb-prep, eval=FALSE}
my_cvt <- cvt
my_wm <- get_winning_model(my_pk)

my_tkstats <- get_tkstats(my_pk) %>%
  rename2_cvt() %>%
  select(-c(invivpk_dose_level, dose_units, conc_units, time_units,
            data_sigma_group, fk_extraction_document_id)) %>%
  distinct() %>%
  pivot_longer(cols = cltot:vss_fgutabs,
               names_to = "param_name", values_to = "estimate") %>%
  filter(!is.na(estimate)) %>%
  mutate(param_units = case_when(
    param_name %in% c("cltot", "cltot_fgutabs") ~ "L/hours",
    param_name %in% c("css", "cmax") ~ "mg/L",
    param_name %in% c("halflife", "tmax") ~ "hours",
    param_name %in% c("vss", "vss_fgutabs") ~ "L",
    param_name %in% "auc_infinity" ~ "mg/L * hours",
    .default = NA_character_
  )) %>%
  distinct() %>%
  mutate(dose_normalization = conc_scale_use(TRUE, my_pk)$dose_norm,
         log10_transformation = conc_scale_use(TRUE, my_pk)$log10_trans,
         time_scaling = "identity",
         invivoPKfit_version = "2.0.0",
         update_date = today) %>%
  filter(model %in% c("model_1comp", "model_2comp"))

my_params <- my_pk$fit %>%
  rename2_cvt() %>%
  distinct(analyte_dtxsid, species, model, method,
           param_name, estimate, param_units) %>%
  mutate(dose_normalization = conc_scale_use(TRUE, my_pk)$dose_norm,
         log10_transformation = conc_scale_use(TRUE, my_pk)$log10_trans,
         time_scaling = "identity",
         invivoPKfit_version = "2.0.0",
         update_date = today) %>%
  filter(model %in% c("model_1comp", "model_2comp"))

my_cvt_unique <- my_cvt %>%
  dplyr::mutate(
    conc_processing_flags = case_when(
      invivPK_conc == conc ~ glue::glue("Used 'conc' value."),
      invivPK_conc == conc_original ~ glue::glue("Used 'conc_original' value."),
      invivPK_conc == conc_original * conc_unit_norm_factor ~ glue::glue(
        "Multiplied 'conc_original' by {conc_unit_norm_factor}."
      ),
      invivPK_conc != conc_original * conc_unit_norm_factor ~ glue::glue(
        "Multiplied 'conc_original' by {signif(invivPK_conc/conc_original, 2)}."
      ),
      is.na(conc) ~ glue::glue(
        "Multiplied 'conc_original' by {signif(invivPK_conc/conc_original, 2)}."
      ),
      .default = ""
    )
  ) %>%
  dplyr::mutate(
    loq_processing_flags = if_else(
      invivPK_loq == loq,
      glue::glue("Used 'loq' value."),
      glue::glue("Multiplied 'loq' value by {signif(invivPK_loq/loq, 2)}."),
      missing = glue::glue("'loq' was imputed as 90% of lowest concentration.")
    )
  ) %>%
  dplyr::mutate(
    dose_processing_flags = if_else(
      invivPK_dose_level == dose_level_normalized,
      glue::glue("Used 'dose_level_normalized'."),
      glue::glue("Multiplied 'dose_level_normalized' by",
                 " {signif(invivPK_dose_level/dose_level_normalized, 3)}.")
    )
  ) %>%
  dplyr::select(fk_series_id, analyte_dtxsid, species,
                administration_route_normalized, conc_medium_normalized,
                conc_processing_flags, dose_processing_flags,
                loq_processing_flags) %>%
  group_by(analyte_dtxsid, species,
           administration_route_normalized, conc_medium_normalized) %>%
  summarize(all_series_ids = paste(unique(fk_series_id), collapse = ", "),
            conc_processing_flags = paste(unique(conc_processing_flags),
                                          collapse = " "),
            dose_processing_flags = paste(unique(dose_processing_flags),
                                          collapse = " "),
            loq_processing_flags = paste(unique(loq_processing_flags),
                                         collapse = " ")) %>%
  dplyr::distinct()

tkstats_cvtdb <- inner_join(my_cvt_unique, my_tkstats,
                            by = join_by(analyte_dtxsid, species,
                                         administration_route_normalized,
                                         conc_medium_normalized)) %>%
  distinct()

params_cvtdb <- inner_join(my_cvt_unique, my_params,
                           by = join_by(analyte_dtxsid, species),
                           relationship = "many-to-many") %>%
  distinct()

# Writing to file
tkparams_cvtdb <- bind_rows(params_cvtdb, tkstats_cvtdb) %>%
  distinct()
tkparams_cvtdb_winmodels <- tkparams_cvtdb %>%
  semi_join(my_wm)

save(tkparams_cvtdb, tkstats_cvtdb, params_cvtdb, tkparams_cvtdb_winmodels,
     file = "data-raw/tkparams_CvTdb.Rdata")
```

```{r information}
sessionInfo()
```