## ----init, include = FALSE---------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----setup-------------------------------------------------------------------- set.seed(2023) library(tidyverse, quietly = TRUE) library(cowplot) library(RColorBrewer) devtools::load_all() today <- format(Sys.Date(), "%d%B%Y") ## ----file_load, eval=FALSE---------------------------------------------------- # my_pk <- readRDS("data-raw/all_cvt_pk.rds") ## ----pk_obj_fromSQL, eval=FALSE----------------------------------------------- # # ### Minimal PK Object ###---- # # Minimum pk object, add options later # # Note that these mappings are now a default mappings, just being verbose here. # # If there are any standard deviations that are zero or NA, but N_Subjects > 1, # # Set the n_subjects to 6 or the current value if lower than 6. # # (In Showa the mode is 6, but this condition is present in CVT_Legacy as well) # minimal_pk <- pk(data = cvt %>% # mutate(n_subjects_normalized = if_else( # n_subjects_normalized > 1 & is.na(invivPK_conc_sd), # min(n_subjects_normalized, 6), n_subjects_normalized)), # mapping = ggplot2::aes( # Chemical = analyte_dtxsid, # Chemical_Name = analyte_name_original, # DTXSID = analyte_dtxsid, # CASRN = analyte_casrn, # Species = species, # Reference = fk_extraction_document_id, # Media = conc_medium_normalized, # Route = administration_route_normalized, # Dose = invivPK_dose_level, # Dose.Units = "mg/kg", # Subject_ID = fk_subject_id, # Series_ID = fk_series_id, # Study_ID = fk_study_id, # ConcTime_ID = conc_time_id, # N_Subjects = n_subjects_normalized, # Weight = weight_kg, # Weight.Units = "kg", # Time = time_hr, # Time.Units = "hours", # Value = invivPK_conc, # Value.Units = "mg/L", # Value_SD = invivPK_conc_sd, # LOQ = invivPK_loq # )) ## ----outlining_all_options, eval=FALSE---------------------------------------- # # Types of Choices: # ## Dose Normalized # ## Log10 transform # ## Scale time # # List of options # fitopts <- expand.grid(error_model = c("pooled", # "hierarchical"), # dose_norm = c(TRUE, FALSE), # log10_trans = c(TRUE, FALSE), # time_scale = c( # TRUE, # FALSE), # stringsAsFactors = FALSE) ## ----fitting_choices, eval=FALSE---------------------------------------------- # # Make function similar to that in test_vignette authored by Caroline Ring # fit_data <- function(this_error_model, # this_dose_norm, # this_log10_trans, # this_time_scale, # n_cores = 12, # file_dir = Sys.getenv("OD_DIR")){ # retval <- -9 # dose_indic <- as.numeric(this_dose_norm) # log10_indic <- as.numeric(this_log10_trans) # time_indic <- as.numeric(this_time_scale) # errmodel_indic <- substr(this_error_model, 1, 1) # # file_str <- paste0(today, # "mypk_fit_", # dose_indic, # log10_indic, # time_indic, # errmodel_indic, # ".Rds") # # cat("Fitting: ", file_str, "\n") # # tryCatch( # expr = { # this_pk <- minimal_pk + # facet_data(vars(Chemical, Species)) + # settings_preprocess(keep_data_original = FALSE, # suppress.messages = TRUE) + # scale_conc(dose_norm = this_dose_norm, log10_trans = this_log10_trans) + # settings_optimx(method = c( # "L-BFGS-B", # "bobyqa" # )) # # if(this_error_model %in% "pooled"){ # this_pk <- this_pk + # stat_error_model(error_group = vars(Chemical, Species)) # } else if(this_error_model %in% c("hierarchical", "joint")){ # this_pk <- this_pk + # stat_error_model(error_group = vars(Chemical, Species, Reference)) # } else{ # stop("this_error_model must be either 'pooled' or 'hierarchical'/'joint'") # } # # if(this_time_scale %in% TRUE){ # this_pk <- this_pk + # scale_time(new_units = "auto") # } # # #do the fit # this_time <- system.time(this_fit <- do_fit(this_pk, n_cores = n_cores)) # # # # #save the result # saveRDS(this_fit, # file = paste0(file_dir, # file_str)) # # cli::cli_alert_success(text = "Fitting success!") # print(this_time) # rm(this_fit) # # retval <- this_time[[3]] # }, # error = function(e){ # cli::cli_alert_danger(text = paste("Analysis", # file_str, # "failed with error", # e)) # retval <- -1 # } # ) # # return(retval) # } ## ----running_all_options, eval=FALSE------------------------------------------ # # system.time(tmp_out <- mapply(fit_data, # this_error_model = fitopts$error_model, # this_dose_norm = fitopts$dose_norm, # this_log10_trans = fitopts$log10_trans, # this_time_scale = fitopts$time_scale, # n_cores = 13)) # # mean(tmp_out)/60 # Average runtime (in minutes) per fitting option # median(tmp_out)/60 # sum(tmp_out) # Looks like there's 10 extra seconds of overhead for non-fitting methods ## ----Evaluation-helpers, eval=FALSE------------------------------------------- # # Evaluation # # Winning Model Tally # winmodel_comparison <- function(this_error_model, # this_dose_norm, # this_log10_trans, # this_time_scale) { # dose_indic <- as.numeric(this_dose_norm) # log10_indic <- as.numeric(this_log10_trans) # time_indic <- as.numeric(this_time_scale) # errmodel_indic <- substr(this_error_model, 1, 1) # # file_str <- paste0(Sys.getenv("OD_DIR"), # today, # "mypk_fit_", # dose_indic, # log10_indic, # time_indic, # errmodel_indic, # ".Rds") # pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic) # # current_rds <- readRDS(file = file_str) # # pf <- current_rds$fit %>% # distinct(Chemical, Species, model, method, convcode) %>% # group_by(method, .drop = FALSE) %>% # count(convcode) %>% # pivot_wider(names_from = convcode, # names_prefix = "convcode.", # values_from = n) # # # # # Wide winning model # suppressMessages({ # winning_model <- get_winning_model(obj = current_rds) # wide_winning_model <- winning_model %>% # group_by(method, model) %>% # count() %>% # pivot_wider(names_from = model, values_from = n) %>% # left_join(pf) # }) # return(wide_winning_model) # } # # # RMSLE for Cmax and AUC (with some tallys for extreme values) # rmsles_Cmax_AUC <- function( # this_error_model, # this_dose_norm, # this_log10_trans, # this_time_scale) { # dose_indic <- as.numeric(this_dose_norm) # log10_indic <- as.numeric(this_log10_trans) # time_indic <- as.numeric(this_time_scale) # errmodel_indic <- substr(this_error_model, 1, 1) # # file_str <- paste0(Sys.getenv("OD_DIR"), # today, # "mypk_fit_", # dose_indic, # log10_indic, # time_indic, # errmodel_indic, # ".Rds") # pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic) # # message(paste0("Evaluation for: ", pk_name)) # current_rds <- readRDS(file = file_str) # # Cmax and AUC_inf RMSEs, eval_tkstats already filters winning models # RMSLE_eval <- suppressMessages(eval_tkstats(obj = current_rds, # dose_norm = FALSE, # finite_only = TRUE) %>% # mutate(Options = pk_name) %>% # dplyr::select( # Options, # Chemical, # Species, # Route, # Media, # Reference, # method, # model, # starts_with("Cmax"), # starts_with("AUC_") # )) # message(paste0("How many finite, non-zero comparisons had ", # "AUC_infinity.tkstats over/under 20X of AUC_infinity.nca?")) # print( # RMSLE_eval %>% # filter( # AUC_infinity.nca > 0, # AUC_infinity.tkstats > 0) %>% # group_by(Options, method) %>% # summarize( # `AUC_folderror>20X` = sum( # log10(AUC_infinity.tkstats/AUC_infinity.nca) > log10(20) # ) # ) # ) # message("How many comparisons had AUC_infinity equal to zero?") # print( # RMSLE_eval %>% # filter( # AUC_infinity.nca > 0, # AUC_infinity.tkstats > 0) %>% # group_by(Options, method) %>% # summarize(NCA_zeroAUC = sum(AUC_infinity.nca == 0), # tkstats_zeroAUC =sum(AUC_infinity.tkstats == 0)) # ) # # message("Removing those observations...") # RMSLE_eval <- RMSLE_eval %>% # filter( # AUC_infinity.nca > 0, # AUC_infinity.tkstats > 0, # log10(AUC_infinity.tkstats/AUC_infinity.nca) <= log10(20) # ) %>% # rowwise() %>% # mutate( # SLE_Cmax = (log10(Cmax.tkstats) - log10(Cmax.nca)) ^ 2, # SLE_AUC_inf = (log10(AUC_infinity.tkstats) - log10(AUC_infinity.nca)) ^ 2 # ) %>% # filter(!is.na(SLE_Cmax), !is.na(SLE_AUC_inf)) %>% # group_by(Options, method) %>% # summarize( # RMSLE_Cmax = sqrt(mean(SLE_Cmax, na.rm = FALSE)) %>% signif(digits = 4), # RMSLE_AUC = sqrt(mean(SLE_AUC_inf, na.rm = FALSE)) %>% signif(digits = 4) # ) # # print(RMSLE_eval) # # return(RMSLE_eval) # } # # # Goodness-of-fit table with R-squared, RMSLE and AIC # get_gof <- function(this_error_model, # this_dose_norm, # this_log10_trans, # this_time_scale) { # dose_indic <- as.numeric(this_dose_norm) # log10_indic <- as.numeric(this_log10_trans) # time_indic <- as.numeric(this_time_scale) # errmodel_indic <- substr(this_error_model, 1, 1) # # file_str <- paste0(Sys.getenv("OD_DIR"), # today, # "mypk_fit_", # dose_indic, # log10_indic, # time_indic, # errmodel_indic, # ".Rds") # pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic) # # current_rds <- readRDS(file = file_str) # winning_model <- suppressMessages(get_winning_model(obj = current_rds)) # suppressMessages({ # this_rsq <- rsq(current_rds, # use_scale_conc = FALSE, # rsq_group = ggplot2::vars(Chemical, Species)) %>% # semi_join(winning_model) # this_AIC <- AIC(current_rds) %>% # semi_join(winning_model) # # this_rmsle <- rmse.pk(current_rds, # rmse_group = vars(Chemical, Species), # use_scale_conc = list(dose_norm = FALSE, # log10_trans = TRUE)) %>% # semi_join(winning_model) # }) # # # Since they are all joined by winmodel # # all three must have same NROW # message( # paste0("For ", pk_name, "... outputs below must have same number of rows") # ) # # message(paste0("rsq: ", NROW(this_rsq))) # message(paste0("aic: ", NROW(this_AIC))) # message(paste0("rmsle: ", NROW(this_rmsle))) # # gof_df <- this_rsq %>% # inner_join(this_AIC, by = join_by(Chemical, Species, method, model)) %>% # inner_join(this_rmsle, by = join_by(Chemical, Species, method, model)) %>% # mutate(Options = pk_name, .before = Chemical) # # # # return(gof_df) # } # # # # All predictions # get_all_preds <- function(this_error_model, # this_dose_norm, # this_log10_trans, # this_time_scale) { # dose_indic <- as.numeric(this_dose_norm) # log10_indic <- as.numeric(this_log10_trans) # time_indic <- as.numeric(this_time_scale) # errmodel_indic <- substr(this_error_model, 1, 1) # # file_str <- paste0(Sys.getenv("OD_DIR"), # today, # "mypk_fit_", # dose_indic, # log10_indic, # time_indic, # errmodel_indic, # ".Rds") # pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic) # # # current_rds <- readRDS(file = file_str) # # # Wide winning model # suppressMessages({ # winning_model <- get_winning_model(obj = current_rds) %>% # select(-c(near_flat, preds_below_loq)) # this_preds <- predict(current_rds, # use_scale_conc = FALSE) %>% # semi_join(winning_model) %>% # mutate(Options = pk_name, .before = Chemical) # }) # return(this_preds) # } # # # Factor of two model error # tf_tests <- function(this_error_model, # this_dose_norm, # this_log10_trans, # this_time_scale) { # dose_indic <- as.numeric(this_dose_norm) # log10_indic <- as.numeric(this_log10_trans) # time_indic <- as.numeric(this_time_scale) # errmodel_indic <- substr(this_error_model, 1, 1) # # file_str <- paste0(Sys.getenv("OD_DIR"), # today, # "mypk_fit_", # dose_indic, # log10_indic, # time_indic, # errmodel_indic, # ".Rds") # pk_name <- paste0(dose_indic, log10_indic, time_indic, errmodel_indic) # current_rds <- readRDS(file = file_str) # print(paste0("Now analyzing: ", pk_name)) # # out <- twofold_test(current_rds, suppress_messages = TRUE) # # out <- out$model_error_summary %>% # mutate(Options = pk_name) # } # ## ----do_evalutations, eval=FALSE---------------------------------------------- # system.time(wm_comp <- fitopts %>% # dplyr::rowwise() %>% # dplyr::mutate(winmodel_comp = list( # winmodel_comparison(this_error_model = error_model, # this_dose_norm = dose_norm, # this_log10_trans = log10_trans, # this_time_scale = time_scale))) %>% # tidyr::unnest(winmodel_comp)) # # # system.time(cmax_auc_rmsle_tbl <- fitopts %>% # dplyr::rowwise() %>% # dplyr::mutate(nca_comp = list( # rmsles_Cmax_AUC(this_error_model = error_model, # this_dose_norm = dose_norm, # this_log10_trans = log10_trans, # this_time_scale = time_scale))) %>% # tidyr::unnest(nca_comp)) # # system.time(gof_tbl <- fitopts %>% # dplyr::rowwise() %>% # dplyr::mutate(gof_win = list( # get_gof(this_error_model = error_model, # this_dose_norm = dose_norm, # this_log10_trans = log10_trans, # this_time_scale = time_scale))) %>% # tidyr::unnest(gof_win)) # # # system.time(all_tf_tests <- fitopts %>% # dplyr::rowwise() %>% # dplyr::mutate(TF = list( # tf_tests(error_model, # this_dose_norm = dose_norm, # this_log10_trans = log10_trans, # this_time_scale = time_scale))) %>% # tidyr::unnest(TF)) # # system.time(all_preds <- fitopts %>% # dplyr::rowwise() %>% # dplyr::mutate(these_preds = list( # get_all_preds(this_error_model = error_model, # this_dose_norm = dose_norm, # this_log10_trans = log10_trans, # this_time_scale = time_scale))) %>% # tidyr::unnest(these_preds)) # # # # gc() ## ----Evaluating choices, eval=FALSE------------------------------------------- # # Writing to file # # writexl::write_xlsx(x = list( # Fit_Counts = wm_comp, # AUC_and_Cmax_RMSE_summary = cmax_auc_rmsle_tbl, # Prediction_Evaluations = gof_tbl, # Twofold_Tests = all_tf_tests), # path = paste0(Sys.getenv("FIG_DIR"), # "Manu_Files/", # today, # "_Supp_Table1", # ".xlsx")) # gof_tbl <- readxl::read_xlsx(path = paste0(Sys.getenv("FIG_DIR"), # "Manu_Files/", # today, # "_Supp_Table1.xlsx"), # sheet = "Prediction_Evaluations") %>% # mutate(RMSLE = as.numeric(RMSLE)) # wm_comp <- readxl::read_xlsx(path = paste0(Sys.getenv("FIG_DIR"), # "Manu_Files/", # today, # "_Supp_Table1.xlsx"), # sheet = "Fit_Counts") # cmax_auc_rmsle_tbl <- readxl::read_xlsx(path = paste0(Sys.getenv("FIG_DIR"), # "Manu_Files/", # today, # "_Supp_Table1.xlsx"), # sheet = "AUC_and_Cmax_RMSE_summary") # gof_tbl %>% # group_by(Options, method) %>% # count(RMSLE < 1) %>% # filter(`RMSLE < 1`) %>% # arrange(desc(n)) %>% # print(n = 8) # # bobyqa-optimized options 010h, 110h, 110p # # # wm_comp %>% filter(method %in% "bobyqa", # !time_scale, log10_trans) # # Off these, there is one more data group that is model_flat # # I will remove this so it doesn't affect downstream analyses # gof_tbl %>% filter(Options == "110p", model == "model_flat", # method == "bobyqa") %>% distinct(Chemical, Species) # gof_tbl %>% filter(Options == "110h", model == "model_flat", # method == "bobyqa") %>% distinct(Chemical, Species) # gof_tbl %>% filter(Options == "010h", model == "model_flat", # method == "bobyqa") %>% distinct(Chemical, Species) # gof_tbl %>% filter(Options == "010p", model == "model_flat", # method == "bobyqa") %>% distinct(Chemical, Species) # # # gof_tbl %>% filter(Options == "010h", # method == "bobyqa", # Chemical %in% "DTXSID1021116") %>% # distinct(Chemical, Species) # # Filter out DTXSID1021116 # # gof_tbl %>% # anti_join(data.frame(Chemical = "DTXSID1021116", Species = "rat")) %>% # filter(!(model %in% "model_flat")) %>% # group_by(Options, method) %>% # count(RMSLE < 0.7) %>% # filter(`RMSLE < 0.7`) %>% # arrange(desc(n)) %>% # print(n = 8) # # 110p seems to win out here. # # gof_tbl %>% # filter(method %in% "bobyqa", # Options %in% c("110p", "010h", "110h") # ) %>% # anti_join(data.frame(Chemical = "DTXSID1021116", Species = "rat")) %>% # group_by(Options) %>% # summarize(medianR2 = median(Rsq, na.rm = TRUE)) # # gof_tbl %>% # filter(method %in% "bobyqa", # Options %in% c("110p", "010h", "110h")) %>% # anti_join(data.frame(Chemical = "DTXSID1021116", Species = "rat")) %>% # group_by(Options) %>% # summarize(medianR2 = mean(Rsq, na.rm = TRUE)) # # Here we see that the highest median Rsq is for 110p # # I use median because the distribution is skewed towards 1. # gof_tbl %>% # filter(method %in% "bobyqa", # Options %in% c("110p", "010h")) %>% # anti_join(data.frame(Chemical = "DTXSID1021116", Species = "rat")) %>% # distinct(Chemical, Species, Options, Rsq) %>% # pivot_wider(names_from = Options, values_from = Rsq) %>% # ggplot(aes(x = `110p`, y = `010h`)) + geom_point() # #110h is most even between models # # # cmax_auc_rmsle_tbl %>% # filter(Options %in% c("010h", "110p", "110h"), # method %in% "bobyqa") # # Here it is clear that 110p results in the # # all_tf_tests %>% # filter(Options %in% c("110p", "110h", "010h"), # model %in% "All non-flat", # method %in% "bobyqa") %>% # select(Options, model, starts_with("percent_")) # # Relatively similar here # # # cmaxAUCpl <- cmax_auc_rmsle_tbl %>% # filter(RMSLE_AUC < 0.8, RMSLE_Cmax < 0.5) %>% # ggplot(aes(x = RMSLE_AUC, y = RMSLE_Cmax, # shape = method, fill = Options)) + # scale_shape_manual(values = c(21,22))+ # scale_color_brewer(palette = "Paired")+ # geom_point(color = "black", size = 2, stroke = 0.5) + # guides(fill = guide_legend(override.aes = list(shape = 21))) + # # coord_equal(xlim = c(0.3, 0.45), ylim = c(0.3, 0.4), expand = TRUE) + # labs(x = "AUC RMSLE", y = "Cmax RMSLE") + # theme_bw() + # theme(panel.border = element_rect(fill = NA, # color = "black", # linewidth = 2)) # cmaxAUCpl # ggsave(paste0(Sys.getenv("FIG_DIR"), # today, # "CmaxAUC_RMSLE_plotFinalists.png"), # bg = "white", # height = 3.25, # width = 3.5, # units = "in", # device = "png", # dpi = 300) # # # # Plot Rsq # violin_rmsle <- gof_tbl %>% # filter(!(model %in% "model_flat"), # Options %in% c("010h", "010p", # "111h", "111p", # "110h", "110p"), # RMSLE <= 10) %>% # ggplot() + # geom_violin( # aes(x = "", # y = RMSLE, # color = interaction(error_model, method)), # draw_quantiles = c(0.25,0.5,0.75)) + # facet_grid(cols = vars(Options)) + # scale_color_brewer(palette = "Paired") + # labs(title = "RMSLE", # color = "Error Model.Optimizer", # subtitle = "(values 10 or under)", # x = "") + # coord_cartesian(ylim = c(0, 10)) + # theme_bw() + # theme(legend.position = "none", # plot.title = element_text(hjust = 0.5), # plot.subtitle = element_text(hjust = 0.5), # axis.title.x = element_blank(), # axis.ticks.x = element_blank(), # axis.text.x = element_blank() # ) # # violin_rsq <- gof_tbl %>% # filter(!(model %in% "model_flat"), # Options %in% c("010h", "010p", # "111h", "111p", # "110h", "110p")) %>% # ggplot() + # geom_violin(aes(x = "", # y = Rsq, # color = interaction(error_model, method)), # draw_quantiles = c(0.25,0.5,0.75)) + # facet_grid(cols = vars(Options)) + # scale_color_brewer(palette = "Paired") + # labs(title = "R-squared", # color = "Error Model.Optimizer", # x = "") + # coord_cartesian(ylim = c(0, 1)) + # theme_bw() + # guides(color = guide_legend(nrow = 2, byrow = TRUE)) + # theme(legend.position = "bottom", # plot.title = element_text(hjust = 0.5), # axis.title.x = element_blank(), # axis.ticks.x = element_blank(), # axis.text.x = element_blank() # ) # # cowplot::plot_grid(plotlist = list(violin_rmsle, violin_rsq), # nrow = 2, align = "v") # ggsave( # filename = paste0( # Sys.getenv("FIG_DIR"), # today, # "_RMSLE_Rsquared_ViolinPlots_selectOptions.png" # ), # height = 5, # width = 6, # units = "in" # ) # # gof_tbl %>% # filter(!(model %in% "model_flat"), # Options %in% c("010h", "010p", # "111h", "111p", # "110h", "110p")) %>% # ggplot() + # geom_point(aes(x = Rsq, y = RMSLE, # color = interaction(method, model)), # shape = 1) + # facet_wrap(~Options) + # scale_color_brewer(palette = "Paired") + # theme_bw() + # coord_cartesian( # xlim = c(0, 1), # expand = TRUE # ) + # theme(legend.position = "bottom", # plot.title = element_text(hjust = 0.5)) # # ggsave( # filename = paste0( # Sys.getenv("FIG_DIR"), # today, # "_RMSLE_Rsquared_ScatterPlot.png" # ), # height = 7, # width = 10, # units = "in" # ) # # rmsle_hist <- gof_tbl %>% filter(model != "model_flat", # Options %in% c("100p", "010h", "110p", "110h"), # method == "bobyqa") %>% # ggplot(aes(x = RMSLE)) + # geom_histogram(fill = "grey5", # binwidth = 0.1) + # scale_x_log10(label = scales::label_math()) + # geom_vline(xintercept = 1, color = "red3", linetype = "dotted") + # facet_grid(rows = vars(Options), # cols = vars(method)) + # theme_bw() # # rmsle_hist # # ggsave( # filename = paste0( # Sys.getenv("FIG_DIR"), # today, # "_RMSLE_histogram_topChoices.png" # ), # height = 4, # width = 4, # units = "in" # ) # # cowplot::plot_grid( # plotlist = list(rmsle_hist, # cmaxAUCpl + # guides(fill = guide_legend(ncol = 2, # override.aes = list(shape = 21)), # shape = guide_legend(nrow = 2)) + # theme(legend.position = "bottom", # legend.title.position = "top") # ), # rel_widths = c(1, 1.75), # labels = "AUTO") # ggsave( # filename = paste0( # Sys.getenv("FIG_DIR"), # today, # "_RMSLE_histogram_CmaxAUC.png" # ), # height = 4.5, # width = 6, # units = "in" # ) # # # all_preds_prep <- all_preds %>% # filter(Conc_est >= LOQ, # Options %in% c("110p", "110h", "010h"), # method == "bobyqa") %>% # mutate(ID = as.factor(paste(Chemical, Species, # method)), # .keep = "unused") %>% # select(ID, Options, dose_norm, log10_trans, time_scale, # Route, Media, Dose, Conc, Conc_est, Time, Reference) # # split_preds <- split(all_preds_prep, all_preds_prep$ID) # # sp_plots <- lapply(names(split_preds), function(i) { # ggplot(data = split_preds[[i]], # mapping = aes(x = Conc, y = Conc_est, # color = Options, shape = Reference)) + # geom_point(size = 2) + # geom_abline(slope = 1) + # facet_grid(dose_norm~Route+Media+Dose, # scales = "free_y", # labeller = labeller(dose_norm = label_both)) + # scale_y_log10() + scale_x_log10() + # scale_color_brewer(palette = "Accent") + # theme_bw() + # labs(title = i) + # theme(legend.position = "bottom", # plot.title = element_text(hjust = 0.5, face = "bold")) # } # ) # # # Very BIG FILE # pdf(file = paste0( # Sys.getenv("FIG_DIR"), # today, # "log10_GOFplots_select.pdf"), # onefile = TRUE, height = 8, width = 8) # for (x in seq_along(sp_plots)) { # print(sp_plots[[x]]) # } # dev.off() ## ----load_multirefs, eval=FALSE----------------------------------------------- # # Reference Multiples # multiRef_ChemSpec <- all_my_data %>% # ungroup() %>% # distinct(Chemical, Species, Reference) %>% # group_by(Chemical, Species) %>% # count() %>% # arrange(desc(n)) %>% # filter(n > 1) %>% # dplyr::select(!n) # multiRef_ChemSpec %>% nrow() # Surprisingly only 15 Chem + Species have multiple references # multiRef_ChemSpec ## ----NCA_error_eval, eval=FALSE----------------------------------------------- # # # Which Chemical Species Route Media Dose groups have the randomness? # my_pk$data %>% # group_by(Chemical, Species, Reference, Route, Media, Dose, Time) %>% # summarize(Count = n()) %>% # group_by(Chemical, Species, Reference, Route, Media, Dose) %>% # filter(any(Count == 1) && any(Count > 1)) %>% # distinct(Chemical, Species, Reference, Route, Media, Dose) -> problem_set # # # Evaluate 100 times # n_evals <- 100 # evalNCA_df <- NULL # my_pk_new <- my_pk + # settings_preprocess(suppress.message = TRUE) # # for (i in 1:100) { # seed_num <- 100 + i # set.seed(seed_num) # message(paste("Current seed:", seed_num)) # my_pk_new <- do_preprocess(my_pk_new) # my_pk_new <- do_data_info(my_pk_new) # # this_nca <- my_pk_new$data_info$nca %>% # select(-param_sd_z) %>% # mutate(seed = as.factor(seed_num)) # evalNCA_df <- bind_rows(evalNCA_df, this_nca) # } # # names(evalNCA_df) # # evalNCA_df %>% # group_by(Chemical, Species, Route, Media, Dose, param_name) %>% # summarize(Average = mean(param_value[which(!is.infinite(param_value))], na.rm = TRUE), # Median = median(param_value[which(!is.infinite(param_value))], na.rm = TRUE), # StDev = sd(param_value[which(!is.infinite(param_value))], na.rm = TRUE), # Min = min(param_value[which(!is.infinite(param_value))], na.rm = TRUE), # Max = max(param_value[which(!is.infinite(param_value))], na.rm = TRUE), # Unique_Values = length(unique(signif(param_value, 4))), # Inf_Count = sum(is.infinite(param_value))) %>% # filter( # # Route %in% "iv", # param_name %in% "Cmax" # ) %>% # View() # # # evalNCA_df %>% # filter(param_name %in% c("tmax"), # Route %in% "iv", # is.finite(param_value), # Species %in% c("rat", "human", "hamster")) %>% # ggplot(aes(x = param_value, y = Chemical)) + # geom_point() + # facet_grid(rows = vars(Route)) + # theme_bw() + # theme(axis.text.y = element_text(size = 5), # axis.ticks.y = element_blank(), # panel.grid.major = element_line(color = "grey70", # linewidth = 0.4, # linetype = "solid"), # panel.grid.major.y = element_blank(), # panel.grid.minor = element_blank()) # # # # # set.seed(2023) ## ----Final_Options, eval=FALSE------------------------------------------------ # my_pk <- minimal_pk + # facet_data(vars(Chemical, Species)) + # settings_preprocess(keep_data_original = TRUE, # suppress.messages = FALSE) + # settings_optimx(method = c("bobyqa")) + # scale_conc(dose_norm = TRUE, log10_trans = TRUE) + # scale_time(new_units = "auto") + # stat_error_model(error_group = vars(Chemical, Species)) # # my_pk <- do_preprocess(my_pk) # cvt_DTXSIDs <- unique(my_pk$data$Chemical) # length(cvt_DTXSIDs) # # my_pk <- do_prefit(my_pk) # # # system.time(my_pk <- do_fit(my_pk, # n_cores = 14)) # # # saveRDS(my_pk, "data-raw/all_cvt_pk.rds") ## ----data_aggregation, eval=FALSE--------------------------------------------- # # Need the following data for any of the subsequent plots # # # after fitting pk object for all cvt objects or reading the fitted pk object in `setup` # winmodel <- get_winning_model(my_pk) # my_preds <- semi_join(predict(my_pk, use_scale_conc = FALSE), winmodel) # my_residuals <- residuals(my_pk, use_scale_conc = FALSE) %>% # semi_join(winmodel) # my_tkstats <- eval_tkstats(my_pk) %>% semi_join(winmodel) # my_nca <- get_nca(my_pk) # all_my_data <- get_data(my_pk) # # formatted_coefs <- my_pk$fit %>% # filter(str_detect(param_name, "sigma_", negate = TRUE)) %>% # select(Chemical, Species, model, model, param_name, estimate, # param_name, estimate, convcode) %>% # mutate(param_name = paste(param_name, "tkstats", sep = ".")) |> # pivot_wider(names_from = param_name, # values_from = estimate) # # my_tf <- twofold_test(my_pk) # # NROW(all_my_data) > NROW(my_preds) # THis must be true... or else try distinct(my_preds) # # # Writing file to xlsx # writexl::write_xlsx(x = list(predictions = my_preds, # tkstats = my_tkstats), # path = paste0(Sys.getenv("FIG_DIR"), # "Manu_Files/", # today, # "_Supp_Table2", # ".xlsx")) # # fe_df <- fold_error(my_pk) %>% semi_join(winmodel) # # May need to change the use_scale_conc defaults # r2_df <- rsq(my_pk, use_scale_conc = FALSE) %>% # semi_join(winmodel) # myAIC <- AIC(my_pk) %>% # semi_join(winmodel) # myRMSLE <- rmse(my_pk, # use_scale_conc = list(dose_norm = FALSE, log10_trans = TRUE)) %>% # semi_join(winmodel) # # myRMSE <- rmse(my_pk, # rmse_group = ggplot2::vars(Chemical, Species, Route, Media, Dose)) %>% # semi_join(winmodel) # # chem_name_translate <- all_my_data %>% # dplyr::select(Chemical, Chemical_Name) %>% # dplyr::filter(!(Chemical_Name %in% c("2,4-d", # "methylene chloride", # "benzo[a]pyrene"))) %>% # dplyr::distinct() # ## ----fig3A, eval=FALSE-------------------------------------------------------- # my_tf$indiv_data_test %>% select(Route, starts_with("percent_")) # # # pl3A <- my_tf$individual_data %>% # # Plotting # ggplot(aes(x = foldConc)) + # geom_histogram(binwidth = 0.2, # fill = "grey5", # color = "grey5", # linewidth = 0.4) + # coord_cartesian(xlim = c(0, 4), # ylim = c(0, 4000)) + # scale_y_continuous(expand = c(0, 0), # breaks = c(0, 1000, 2000, 3000), # labels = c("0", "1", "2", "3")) + # expand_limits(y = 0) + # geom_vline(xintercept = c(0.5, 2), color = "red3", linetype = "dashed") + # theme_bw() + # labs(x = "Mean-normalized\nconcentration", # y = "Observations\n(thousands)") + # theme(text = element_text(size = 10), # panel.border = element_rect(color = "black", linewidth = 1, fill = NA), # panel.background = element_blank(), # panel.grid.major.y = element_line(color = "grey90", linewidth = 0.4), # axis.title = element_text(face = "bold"), # axis.line = element_blank(), # plot.background = element_blank(), # axis.ticks.y = element_blank()) # # pl3A # # ggsave(paste0(Sys.getenv("FIG_DIR"), # today, # "NormalizedConcentrations",".png"), # plot = pl3A, # bg = "white", # height = 2.2, # width = 2.5, # units = "in", # device = "png", # dpi = 300) # # # Not included in manuscript # my_tf$data_descriptors %>% # select(Chemical, Species, Reference, Route, Media, Time, data_descr) %>% # distinct() %>% # ggplot(aes(fill = data_descr, x = "")) + # geom_bar(position = "stack", # color = "black") + # labs(x = "", y = "Occurence by Chemical + Species group") + # scale_y_continuous(expand = c(0, NA)) + # theme_minimal() # ## ----fig4A-2-alt, eval=FALSE-------------------------------------------------- # # With the mapped column names # my_tf$summarized_data_test # # pl3A_alt <- my_tf$summarized_data_test %>% # filter(Route == "All") %>% # select(Route, within, outside) %>% # pivot_longer(cols = c(within, outside), # names_to = "status", # values_to = "value") %>% # ggplot(aes(x = status, fill = status, y = value)) + # geom_col(position = position_stack(), # color = "black", linewidth = 1) + # coord_cartesian(ylim = c(0, 2000)) + # scale_y_continuous(expand = c(0, NA), # breaks = c(0, 1000, 2000), # labels = c("0", "1", "2")) + # scale_fill_manual(values = c("black", "white")) + # theme_classic() + # labs(x = "95% of observations\nwithin factor of 2", # y = "Observations\n(thousands)") + # theme(text = element_text(size = 10), # panel.border = element_rect(color = "black", linewidth = 1, fill = NA), # panel.background = element_blank(), # panel.grid.major.y = element_line(color = "grey90", linewidth = 0.4), # axis.title = element_text(face = "bold"), # axis.line = element_blank(), # plot.background = element_blank(), # axis.ticks.y = element_blank(), # legend.position = "none") # # pl3A_alt # Not included in final manuscript # ## ----fig4B-cowplot, eval=FALSE------------------------------------------------ # pl3B <- my_tf$individual_data %>% # inner_join(my_tkstats %>% # dplyr::select(Chemical, Species, # Route, Media, # tmax = tmax.nca) %>% # filter(!is.na(tmax))) %>% # group_by(Chemical, # Species, # Reference, # Route, # Media, # Dose) %>% # mutate( # normTime = ifelse( # Route == "iv", # 1 + Time/max(Time), # ifelse( # Time > tmax, # ((Time - tmax)/max(Time)) + 1, # 0.5 + Time/(2*tmax) # ) # )) %>% # ggplot(aes( # x = normTime, # y = foldConc # )) + # geom_point(alpha = 0.1, size = 0.7) + # geom_vline(xintercept = 1, color = "blue", # linewidth = 0.8, linetype = "dashed") + # geom_vline(xintercept = 2, color = "green4", # linewidth = 0.8, linetype = "dashed") + # geom_hline(yintercept = c(0.5, 2), color = "red3", # linewidth = 0.8, linetype = "dashed") + # facet_grid(cols = vars(Route), # scales = "free_x") + # labs(x = "ADME-Normalized Time", # y = "Mean-Normalized\nConcentration") + # theme(text = element_text(size = 10), # panel.border = element_rect(color = "black", fill = NA, size = 1), # panel.background = element_blank(), # panel.grid.major = element_line(color = "grey90", linewidth = 0.5), # strip.background = element_rect(fill = "white"), # strip.text = element_text(face = "bold"), # axis.ticks = element_blank(), # axis.line = element_blank(), # axis.text.x = element_blank(), # axis.title = element_text(face = "bold"), # panel.spacing = unit(0.125, units = "in"), # plot.background = element_blank(), # legend.position = "none") # # pl3B # # ggsave(paste0(Sys.getenv("FIG_DIR"), # today, # "ADME_NormTime-NormConcs_onlyDetects.png"), # height = 3, # bg = "white", # width = 5, # units = "in", # dpi = 300) # # # # Make the Combination Plot---- # # plot_grid(pl3A, pl3B, labels = c("A", "B"), # align = "h", axis = "bt", rel_widths = c(1, 1.7)) # # # figure3 <- plot_grid(pl3A, pl3B, labels = c("A", "B"), # align = "h", axis = "bt", rel_widths = c(1, 1.7)) # # ggsave(paste0(Sys.getenv("FIG_DIR"), # today, # "Figure3_onlyDetects.png"), # plot = figure3, # bg = "white", # height = 3, # width = 6.5, # units = "in", # dpi = 300) # ## ----sFig2-cvt_analysis, eval=FALSE------------------------------------------- # data_range <- all_my_data %>% # filter(Detect, !exclude) %>% # group_by(Chemical, Species, Reference, Media, Route, Dose, Dose.Units) %>% # summarize(maxT = max(Time)/24, # concRange = log10(max(Conc)/min(Conc))) %>% # ungroup() # # # sf_2A <- data_range %>% # ggplot(aes(x = maxT)) + # geom_histogram(fill = "grey5", # bins = 40, # color = "grey5") + # scale_x_log10(labels = scales::number, # breaks = c(0.1, 1, 7, 30, 365)) + # # scale_y_continuous(expand = c(0, 0), # # limits = c(0, 60), # # breaks = c(0, 10, 20, 30, 40, 50)) + # labs(x = "Day of final Timepoint", # y = "Count") + # theme(aspect.ratio = 1, # text = element_text(size = 10), # panel.border = element_rect(color = "black", fill = NA, size = 1), # panel.background = element_blank(), # panel.grid.major = element_line(color = "grey90", linewidth = 0.5), # axis.ticks = element_blank(), # axis.line = element_blank()) # # sf_2A # # ggsave(paste0(Sys.getenv("FIG_DIR"), # today, # "SFig2A_TimePoints.png"), # plot = sf_2C, # height = 4, # width = 4, # units = "in", # dpi = 300) # # # sf_2B <- data_range %>% # ggplot(aes(x = concRange)) + # geom_histogram(fill = "grey5", # bins = 40, # color = "grey5") + # scale_y_continuous(expand = c(0, 0), # limits = c(0, 50), # breaks = c(0, 25, 50)) + # labs(x = "log10(Concentration Range)", # y = "Count") + # theme(aspect.ratio = 1, # panel.border = element_rect(color = "black", fill = NA, size = 1), # panel.background = element_blank(), # panel.grid.major = element_line(color = "grey90", linewidth = 0.5), # strip.background = element_rect(fill = "white"), # strip.text = element_text(face = "bold"), # axis.ticks = element_blank(), # axis.line = element_blank(), # axis.text = element_text(size = 10), # axis.title = element_text(size = 11)) # # sf_2B # # ggsave(paste0(Sys.getenv("FIG_DIR"), # today, # "SFig2B_ConcRange.png"), # plot = sf_2B, # height = 4, # width = 4, # units = "in", # dpi = 300) # # sf_2AB <- plot_grid(sf_2A, sf_2B, # labels = c("A", "B"), # nrow = 1) # # # sf_2AB # # ggsave(paste0(Sys.getenv("FIG_DIR"), # today, # "SFig2_ConcTimeRanges.png"), # height = 3.2, # width = 6, # bg = "white", # units = "in", # dpi = 300) # ## ----sFig4-model_performance, eval=FALSE-------------------------------------- # my_tf$model_error_summary %>% # select(model, method, starts_with("percent_")) # # my_tf$model_error_all %>% # filter(model %in% c("model_1comp", "model_2comp")) %>% # ggplot(aes(x = Fold_Error)) + # geom_histogram( # fill = "grey5") + # scale_x_log10(limits = c(1E-3, 1E3), labels = scales::label_math(format = log10)) + # expand_limits(y = 0) + # geom_vline(xintercept = c(0.5, 2), color = "red3", linetype = "dashed") + # facet_grid(cols = vars(model), # rows = vars(Route)) + # labs(x = "Predicted/Observed", # y = "Number of Observations") + # theme_classic() + # theme(aspect.ratio = 1, # panel.border = element_rect(color = "black", fill = NA), # panel.grid.major = element_line(color = "grey95", linewidth = 0.4)) # # # ggsave(paste0(Sys.getenv("FIG_DIR"), # today, # "PredictedObserved_compartmentModels.png"), # height = 5, width = 5) # # ## ----fig4-modelPerform-v-dataVar, eval=FALSE---------------------------------- # my_tf$indiv_data_test_fold_errors %>% # select(model, starts_with("percent_")) %>% # glimpse() # # pl_4A <- my_tf$indiv_data_fold_errors %>% # # For Plotting # ggplot(aes( # y = Fold_Error, # x = foldConc # )) + # geom_bin2d(bins = 40, color = NA) + # geom_hline(yintercept = c(0.5, 2), linetype = "dashed") + # geom_vline(xintercept = c(0.5, 2), linetype = "dashed") + # facet_grid(cols = vars(model)) + # scale_x_log10(labels = scales::label_math(format = log10), # limits = c(0.0001, 1000)) + # scale_y_log10(labels = scales::label_math(format = log10), # limits = c(0.0001, 10000)) + # scale_fill_viridis_c(option = "cividis", # limits = c(1, 100), # oob = scales::oob_squish) + # labs(y = "Model Error", # x = "Data Variability", # fill = "Count") + # theme(aspect.ratio = 1, # panel.border = element_rect(color = "black", fill = NA, size = 1.5), # panel.background = element_blank(), # panel.grid.major = element_line(color = "grey90", linewidth = 0.5), # strip.background = element_rect(fill = "white"), # strip.text = element_text(face = "bold"), # axis.ticks = element_blank(), # axis.line = element_blank()) # # pl_4A # ggsave(paste0(Sys.getenv("FIG_DIR"), # today, # "ModelPerformance_vDataVariability.png"), # height = 2, # width = 4.2, # device = "png", # dpi = 300, # units = "in") # # my_table <- my_tf$indiv_data_test_fold_errors %>% # ungroup() %>% # select(starts_with("percent_")) %>% t() %>% # round(digits = 2) %>% # as.data.frame() %>% # rownames_to_column(var = "percentages") %>% # mutate(percentages = case_when( # percentages == "percent_both_within" ~ "Both within a factor of 2", # percentages == "percent_both_outside" ~ "Both outside a factor of 2", # percentages == "percent_model_outside" ~ "Model too variable", # percentages == "percent_data_outside" ~ "Data too variable" # )) # # # # names(my_table) <- c(" ","1-compartment (%)", "2-compartment (%)", "Overall (%)") # # library(flextable) # library(officer) # # flextable(my_table)%>% # border_inner() %>% # fontsize(part = "all", size = 11) %>% # bold(part = "all", j = 4) %>% autofit() # # table_plot <- gen_grob(flextable(my_table) %>% # border_inner() %>% # fontsize(part = "all", size = 10) %>% # bold(part = "all", j = 4) %>% # autofit()) # # # dual_plot <- plot_grid(pl_4A, table_plot, ncol = 1, align = "v", # rel_heights = c(1, 0.5)) # # dual_plot # # ggsave(paste0(Sys.getenv("FIG_DIR"), # today, # "ModelPerformance_vDataVariability_wTable.png"), # plot = dual_plot, # height = 4.5, # width = 6, # device = "png", # bg = "white", # dpi = 300, # units = "in") ## ----fig5Alt, eval=FALSE------------------------------------------------------ # # Ultimately this will be removed from manuscript # my_tf$summarized_data_model_error %>% # select(model, starts_with("percent_")) ## ----fig5-goodness-of-fit, eval=FALSE----------------------------------------- # combined_gof_df <- my_tf$model_error_all %>% # group_by(Chemical, Species, model, method) %>% # summarize(within_2fold = sum(between(Fold_Error, 0.5, 2))/n()) %>% # left_join(r2_df) %>% # left_join(myAIC) %>% # left_join(myRMSLE) %>% # semi_join(winmodel) # # # mypl <- ggplot(data = combined_gof_df, # aes( # x = within_2fold, # y = Rsq, # color = model # )) + # geom_point() + # geom_abline(slope = 1, color = "red4", linetype = "longdash") + # scale_color_manual(values = c("#0398FC", "#D68E09", "black")) + # coord_cartesian(xlim = c(0,1), ylim = c(0,1)) + # labs(x = "Fraction of predictions within 2-fold", # y = "R-squared value") + # theme(aspect.ratio = 1, # panel.border = element_rect(color = "black", fill = NA, size = 1), # panel.background = element_blank(), # panel.grid.major = element_line(color = "grey90", linewidth = 0.5), # axis.ticks = element_blank(), # axis.line = element_blank(), # legend.position = "none", # legend.title = element_blank(), # legend.key = element_blank()) # # panelA_plot <- ggExtra::ggMarginal(mypl, groupFill = TRUE, # type = "histogram", # xparams = list(binwidth = 0.1), # yparams = list(binwidth = 0.1)) # panelA_plot # ggsave(filename = paste0(Sys.getenv("FIG_DIR"), # today, # "R2_within2fold_comparison.png"), # plot = ggExtra::ggMarginal(mypl, groupFill = TRUE, # type = "histogram", # xparams = list(binwidth = 0.05), # yparams = list(binwidth = 0.05)), # height = 5, # width = 5, # units = "in") # # panelB_plot <- ggplot(data = myRMSE, # aes( # x = log10(RMSE), # color = model, # fill = model # )) + # geom_histogram(bins = 50) + # labs(y = "Count") + # scale_color_manual(values = c("#0398FC", "#D68E09", "grey10")) + # scale_fill_manual(values = c("#0398FC", "#D68E09", "grey10")) + # facet_grid(rows = vars(model), # cols = vars(Route)) + # theme(text = element_text(size = 10), # aspect.ratio = 0.6, # panel.border = element_rect(color = "black", fill = NA, size = 1), # panel.background = element_blank(), # panel.grid.major = element_line(color = "grey90", linewidth = 0.5), # axis.ticks = element_blank(), # axis.line = element_blank(), # strip.background = element_blank(), # panel.spacing.y = unit(0.125, units = "in"), # legend.position = "bottom", # legend.title = element_blank()) # panelB_plot # # ggsave(filename = paste0(Sys.getenv("FIG_DIR"), # today, # "RMSEs_winningmodel_RouteFacet.png"), # height = 4, # width = 6, # units = "in") # # legend <- get_legend(panelB_plot) # # plot_grid(plot_grid(panelA_plot, # panelB_plot + theme(legend.position = "none"), labels = c("A", "B"), # axis = "tb", align = "h", rel_heights = c(1,1)), legend, # ncol = 1, # rel_heights = c(1, 0.1)) # ggsave(filename = paste0(Sys.getenv("FIG_DIR"), # today, # "CombinedPlot_Rsquared_RMSE.png"), # height = 4, # width = 6.5, # bg = "white", # units = "in") ## ----goodness-of-fit_exampleFits, eval = FALSE-------------------------------- # pl <- plot(my_pk, best_fit = TRUE, # use_scale_conc = TRUE, # drop_nonDetect = FALSE) # # cvt %>% filter(curation_set_tag == "CVT_Showa") %>% # pull(analyte_dtxsid) -> Showa_chems # # combined_gof_df %>% filter(!(Chemical %in% Showa_chems)) %>% View() # # ex_fits <- pl %>% # filter(Chemical %in% c("DTXSID2020139", # "DTXSID4023917", # "DTXSID3061635", # "DTXSID1034187"), # Species %in% "rat") %>% # pull(final_plot) # # # cowplot::plot_grid(plotlist = list(ex_fits[[2]] + # scale_color_manual(values = c("#a6bddb", # "#74a9cf", # "#41bbc4", # "#2b8cbe", # "#045a8d")) + # theme(legend.position = "none") + # xlim(0,30), # ex_fits[[3]] + # scale_color_manual(values = c("black", # "grey40")) + # theme(legend.position = "none"), # ex_fits[[1]] + # scale_color_manual(values = c("magenta3")) + # theme(legend.position = "none"), # ex_fits[[4]] + # scale_color_manual(values = c("#006837")) + # theme(legend.position = "none")), # scale = 0.85) # # ggsave(filename = paste0(Sys.getenv("FIG_DIR"), # today, # "exampleFits_Rsq_within2fold.png"), # height = 12, # width = 12) # ## ----sFig7-sigmaDistribution, eval=FALSE-------------------------------------- # # my_pk$fit %>% # select(Chemical, Species, model, method, param_name, estimate) %>% # filter(str_detect(param_name, "sigma_")) %>% # semi_join(winmodel) %>% # left_join(my_pk$prefit$stat_error_model$sigma_DF) %>% # inner_join(get_data_summary(my_pk) %>% # distinct(Chemical, Species, n_obs)) -> my_coefs # # # my_coefs %>% # mutate(model = # case_when( # model == "model_1comp" ~ "1-compartment", # model == "model_2comp" ~ "2-compartment", # model == "model_flat" ~ "Null" # )) %>% # ggplot() + # geom_histogram(aes(x = estimate/n_obs), # bins = 15, # color = NA, fill = "grey5") + # # geom_histogram(aes(x = start/n_obs), fill = NA, color = "grey20", bins = 15) + # scale_x_log10(labels = scales::label_math(format = log10)) + # facet_grid(cols = vars(model)) + # labs(x = "Size-normalized Sigma Value", # y = "Number of Observations") + # theme(panel.border = element_rect(color = "black", fill = NA, size = 1.5), # panel.background = element_blank(), # panel.grid.major = element_line(color = "grey90", linewidth = 0.5), # strip.background = element_rect(fill = "white"), # strip.text = element_text(face = "bold"), # plot.title = element_text(hjust = 0.5), # axis.ticks = element_blank(), # axis.line = element_blank()) # # ggsave(paste0(Sys.getenv("FIG_DIR"), # today, # "NormalizedSigma_Distributions.png"), # bg = "white", # height = 3, width = 5) # ## ----Lombardo_Comparison, eval=FALSE------------------------------------------ # cvt_DTXSIDs <- unique(my_pk$data$Chemical) # # First things first, let's load the data from Lombardo et al. # lombardo <- readxl::read_xlsx( # paste0(Sys.getenv("FIG_DIR"), # "Lombardo2018-Supplemental_82966_revised_corrected.xlsx"), # skip = 8) # # # l_batch_search <- readxl::read_xlsx( # paste0(Sys.getenv("FIG_DIR"), # "LombardoBatchSearch_Name.xlsx"), # sheet = "Main Data" # ) # # # l_batch_search <- l_batch_search[c("INPUT", "CASRN","DTXSID")] %>% # distinct() %>% # right_join(lombardo[c("Name", "CAS #")], by = c("CASRN" = "CAS #", "INPUT" = "Name")) # # # # l_noNamesCASRN <- readxl::read_xlsx( # paste0(Sys.getenv("FIG_DIR"), # "LombardoBatchSearch_CASRNnoName.xlsx"), # sheet = "Main Data" # ) %>% # dplyr::select(CASRN = INPUT, DTXSID) # # l_curatedDTX <- read_tsv(paste0(Sys.getenv("FIG_DIR"), # "CuratedIDs_lombardo.txt")) %>% # rename(INPUT = Name) # # # l_batch_search <- l_batch_search %>% # left_join(l_noNamesCASRN, by = "CASRN") %>% # mutate(DTXSID = coalesce(DTXSID.x, DTXSID.y)) %>% # dplyr::select(-DTXSID.x, -DTXSID.y) # # Now all but two of the IDs are unidentified # # lombardo <- lombardo %>% # right_join(l_batch_search, by = c("Name" = "INPUT")) # # # #### # lombard_abbr <- lombardo %>% dplyr::select(DTXSID, # hVss = `human VDss (L/kg)`, # hCltot = `human CL (mL/min/kg)`, # halflife = `terminal t1/2 (h)` ) %>% # filter(DTXSID %in% cvt_DTXSIDs) %>% # left_join(chem_name_translate, by = c("DTXSID" = "Chemical")) # # # 15 chemicals in common with values for both # lombardo_comparison <- my_tkstats %>% # dplyr::select(DTXSID = Chemical, Species, # model, Vss.tkstats, CLtot.tkstats) %>% # dplyr::filter(model != "model_flat") %>% # distinct() %>% # inner_join(lombard_abbr, by = "DTXSID") %>% # # filter(!is.na(Vss.tkstats)) %>% # arrange(halflife) # # comparable_ids <- lombardo_comparison %>% pull(DTXSID) # # lombard_abbr <- lombard_abbr %>% # mutate(Species = "human", model = "Lombardo et al.") %>% # rename(Vss = hVss, CLtot = hCltot) %>% # mutate(CLtot = CLtot*60/1000) %>% # Lombardo reports this as mL/min/kg, need L/h/kg # filter(DTXSID %in% comparable_ids) %>% # mutate(Chemical_Name = reorder(Chemical_Name, halflife)) # # lombardo_comparison <- my_tkstats %>% # dplyr::select(DTXSID = Chemical, Species, # model, Vss.tkstats, CLtot.tkstats, halflife.tkstats) %>% # filter(DTXSID %in% comparable_ids) %>% # left_join(chem_name_translate, by = join_by(DTXSID == Chemical)) %>% # distinct() %>% # rename(Vss = Vss.tkstats, CLtot = CLtot.tkstats, halflife = halflife.tkstats) %>% # bind_rows(lombard_abbr) %>% # mutate(model = ifelse(str_detect(model, "^model"), "invivoPKfit", model), # Chemical_Name = factor(Chemical_Name, # levels = levels(lombard_abbr$Chemical_Name))) # # lombardo_comparison %>% # pivot_longer(cols = c("Vss", "CLtot", "halflife")) %>% # mutate(name = factor(name, levels = c("halflife", "Vss", "CLtot"))) %>% # ggplot(mapping = aes( # x = value, # y = Chemical_Name # )) + # geom_point(aes(color = model, # shape = Species), # position = position_dodge(0.7), # size = 5/.pt, stroke = 2.5/.pt) + # facet_grid(cols = vars(name), # scales = "free_x") + # scale_x_log10( labels = scales::label_math(format = log10)) + # scale_shape_manual(values = c(21, 22, 24), # breaks = c("human", "dog", "rat")) + # scale_color_manual(values = c("black", "#1064c9")) + # guides(color = guide_legend(override.aes = list(shape = 21), # nrow = 2, # title.position = "top", # title.hjust = 0.5), # shape = guide_legend(nrow = 1, # title.position = "top", # title.hjust = 0.5)) + # theme(panel.border = element_rect(color = "black", fill = NA, linewidth = 1.5), # panel.background = element_blank(), # panel.grid.major = element_line(color = "grey90", linewidth = 0.5), # strip.background = element_rect(fill = "white"), # strip.text = element_text(face = "bold", size = 12), # text = element_text(size = 10), # axis.ticks = element_blank(), # axis.line = element_blank(), # axis.text = element_text(face = "bold", size = 6), # axis.title.y = element_blank(), # axis.title.x = element_blank(), # legend.position = "bottom", # legend.key = element_blank(), # legend.title = element_text(face = "bold"), # legend.text = element_text()) # # ggsave(filename = paste0(Sys.getenv("FIG_DIR"), # today, # "lombardoComparison_named.png"), # height = 7, # width = 6.5, # device = "png", dpi = 300) # ## ----otherMeta-analysis, eval=FALSE------------------------------------------- # # # Histograms # lombardo_hist <- data.frame(Chemical = c(lombardo$DTXSID, # my_tkstats$Chemical), # Vss = c(lombardo$`human VDss (L/kg)`, # my_tkstats$Vss.tkstats), # halflife = c(lombardo$`terminal t1/2 (h)`, # my_tkstats$halflife.tkstats), # Source = c(rep_len("lombardo", # length.out = nrow(lombardo)), # rep_len("invivoPKfit", # length.out = nrow(my_tkstats)))) # # # sfig7A <- lombardo_hist %>% # ggplot(aes(x = log2(Vss), color = Source, # group = Source)) + # geom_freqpoly(aes(y = after_stat(ndensity)), # linewidth = 2/.pt, # bins = 20) + # labs(y = "Scaled Frequency", # x = "Vss") + # scale_x_continuous(labels = scales::label_math(expr = 2^.x)) + # scale_color_manual(values = c("black", "#1064c9")) + # guides(color = guide_legend(nrow = 2, linewidth = 1)) + # theme(panel.border = element_rect(color = "black", fill = NA, linewidth = 1.5), # panel.background = element_blank(), # panel.grid.major = element_line(color = "grey90", linewidth = 0.5), # strip.background = element_rect(fill = "white"), # strip.text = element_text(face = "bold", size = 12), # text = element_text(size = 10), # axis.ticks = element_blank(), # axis.line = element_blank(), # axis.text = element_text(face = "bold"), # legend.position = "bottom", # legend.key = element_blank(), # legend.title = element_text(face = "bold"), # legend.text = element_text()) # # sfig7A # other_meta <- read_csv( # file = paste0(Sys.getenv("FIG_DIR"), "SupTableInVivoDatandPreds.csv")) %>% # dplyr::select(DTXSID, fbioh, fbior) %>% # filter(DTXSID %in% my_tkstats$Chemical) # # fgutabs <- my_pk$fit %>% filter(param_name %in% "Fgutabs", convcode == 0) %>% # distinct(Chemical, Species, model, method, estimate) %>% # semi_join(winmodel) %>% # filter(model != "model_flat") %>% # rename(Fgutabs.tkstats = "estimate") # # # sfig7B <- other_meta %>% left_join(fgutabs, by = c("DTXSID" = "Chemical")) %>% # filter(Species %in% c("rat")) %>% # filter(!is.na(Fgutabs.tkstats), # !is.na(fbior)) %>% # distinct(DTXSID, Species, # Fgutabs.tkstats, fbior) %>% # pivot_longer(cols = c(Fgutabs.tkstats, fbior), # names_to = "Source", # values_to = "Fgutabs") %>% # mutate(Source = ifelse(str_detect(Source, "tkstats"), "invivoPKfit", # "Honda et al.")) %>% # left_join(chem_name_translate, by = join_by(DTXSID == Chemical)) %>% # mutate(Chemical_Name) %>% # ggplot(mapping = aes( # x = Fgutabs, # y = reorder(Chemical_Name, Fgutabs) # )) + # geom_point(aes(color = Source), # shape = 24, # position = position_dodge(0.7), # size = 4/.pt, stroke = 1.5/.pt) + # scale_color_manual(values = c("black", "green4"), # breaks = c("invivoPKfit", "Honda et al.")) + # labs(x = "Oral Bioavailability") + # guides(color = guide_legend(nrow = 2)) + # theme(panel.border = element_rect(color = "black", fill = NA, linewidth = 1.5), # panel.background = element_blank(), # panel.grid.major = element_line(color = "grey90", linewidth = 0.5), # strip.background = element_rect(fill = "white"), # strip.text = element_text(face = "bold", size = 12), # text = element_text(size = 10), # axis.ticks = element_blank(), # axis.line = element_blank(), # axis.text = element_text(face = "bold"), # axis.title.y = element_blank(), # legend.position = "bottom", # legend.key = element_blank(), # legend.title = element_text(face = "bold"), # legend.text = element_text()) # # # plot_grid(sfig7A, sfig7B, # align = "v", # ncol = 1, # axis = "lr", # rel_heights = c(1,1.25), # labels = c("A", "B")) # ggsave(filename = paste0(Sys.getenv("FIG_DIR"), # today, # "metaAnalysis_Comparisons_SuppFigure7_named.png"), # height = 5.5, # width = 4, # device = "png", dpi = 300) # ## ----do_fit-benchmarking, eval=FALSE------------------------------------------ # # Randomize the order of chemical species # chem_spec <- cvt %>% # dplyr::distinct(analyte_dtxsid, species) %>% # dplyr::slice_sample(prop = 1) # # test_group_size <- seq(25, 105, by = 10) # fit_options <- expand.grid(error_model = c("pooled", # "hierarchical"), # dose_norm = FALSE, # log10_trans = TRUE, # time_scale = c( # TRUE, # FALSE), # stringsAsFactors = FALSE) # # fit_options <- tidyr::expand_grid(fit_options, test_group_size) # # fit_options <- fit_options %>% rowwise() %>% # mutate(this_data = list({ # tmp <- chem_spec %>% slice_head(n = test_group_size) %>% # left_join(cvt, by = join_by(analyte_dtxsid, species)) %>% # pk() + # scale_time(new_units = ifelse(!time_scale, # "identity", # "auto")) + # scale_conc(dose_norm = dose_norm, log10_trans = log10_trans) + # settings_preprocess(suppress.messages = TRUE) # do_prefit(tmp) # } # ), # data_size = chem_spec %>% slice_head(n = test_group_size) %>% # left_join(cvt, by = join_by(analyte_dtxsid, species)) %>% # NROW()) # # # Organization of the benchmarking # # For each fit_option, return the four values of system.time() that we want # df_out <- fit_options %>% # rowwise() %>% # mutate(tim_1core = system.time(do_fit(this_data))[["elapsed"]], # tim_4core = system.time(do_fit(this_data, n_cores = 4))[["elapsed"]], # tim_7core = system.time(do_fit(this_data, n_cores = 7))[["elapsed"]], # tim_12core = system.time(do_fit(this_data, n_cores = 12))[["elapsed"]] # ) # # df_out <- df_out %>% select(-this_data) # gc() # # full_long <- df_out %>% # pivot_longer(cols = c(tim_1core:tim_12core), # names_to = "N_cores", # values_to = "Time_s") %>% # group_by(N_cores, test_group_size, data_size) %>% # summarize(mean_time = mean(Time_s, na.rm = TRUE)/60, # max_time = max(Time_s, na.rm = TRUE)/60, # min_time = min(Time_s, na.rm = TRUE)/60 # ) %>% # mutate(N_cores = as.numeric(str_extract(N_cores, "[:digit:]+"))) # # df_out %>% clipr::write_clip() # ggplot(full_long, # aes(x = test_group_size, # y = mean_time, # color = as.factor(N_cores))) + # geom_point() + # geom_linerange(aes(ymin = min_time, # ymax = max_time)) + # geom_line(aes(group = N_cores)) + # labs(x = "Number of Data Groups", # y = "Runtime (minutes)", # title = "Number of Processing Cores", # subtitle = "(with min/max runtime range)") + # facet_grid(cols = vars(N_cores)) + # scale_color_manual(values = c("black", "#40c679", "#31a354", "#006837")) + # coord_fixed(ratio = 8) + # theme(panel.border = element_rect(color = "black", fill = NA, size = 1.5), # panel.background = element_blank(), # panel.grid.major = element_line(color = "grey90", linewidth = 0.5), # legend.position = "none", # plot.title = element_text(hjust = 0.5), # plot.subtitle = element_text(hjust = 0.5), # axis.ticks = element_blank(), # axis.line = element_blank(), # strip.background = element_blank()) # # # # # ggsave(paste0(Sys.getenv("FIG_DIR"), # today, # "Parallelization_Benchmarking.png"), # width = 5, # height = 2.5, # units = "in") # ## ----sfig5-examples-of-fits, eval=FALSE--------------------------------------- # pl <- plot(my_pk, use_scale_conc = TRUE, n_interp = 10) # names(pl) # Chemical Species observations observation_plot predicted predicted_plot final_plot # # s6_plA <- pl %>% filter(Chemical %in% "DTXSID3031860") %>% # pull(observation_plot) %>% .[[1]] # s6_plB_mouse <- pl %>% filter(Chemical %in% "DTXSID4020533", # Species %in% "mouse") %>% pull(final_plot) %>% .[[1]] # s6_plB_rat <- pl %>% filter(Chemical %in% "DTXSID4020533", # Species %in% "rat") %>% pull(final_plot) %>% .[[1]] # s6_plC <- pl %>% filter(Chemical %in% "DTXSID00216205") %>% # pull(final_plot) %>% .[[1]] # s6_plD <- pl %>% filter(Chemical %in% "DTXSID8025337") %>% # pull(final_plot) %>% .[[1]] # # # Set colors palettes for each # # panel A : Oranges # panelA_cols <- brewer.pal(8, "Oranges")[3:8] # # panel B : Greens # panelB_cols <- brewer.pal(9, "Greens")[2:10] # # panel C : Blue (only 2 values) # # panel D : Purple (6 values) # panelD_cols <- brewer.pal(9, "Purples")[4:9] # # s6_plA <- s6_plA + # labs(title = "perfluorodecanoic acid") + # scale_color_manual(values = panelA_cols, aesthetics = c("color", "fill")) + # theme(text = element_text(size = 20), # legend.position = "none", # strip.background = element_rect(fill = "white")) # # s6_plB_mouse <- s6_plB_mouse + # labs(title = "1,4-dioxane\nMouse") + # scale_color_manual(values = panelB_cols[c(5,8)], # aesthetics = c("color", "fill")) + # theme(text = element_text(size = 20), # legend.position = "none", # strip.background = element_rect(fill = "white")) # # s6_plB_rat <- s6_plB_rat + # labs(title = "1,4-dioxane\nRat") + # scale_color_manual(values = panelB_cols[c(1, 2, 3, 4, 5, 6,7, 8)], # aesthetics = c("color", "fill")) + # theme(text = element_text(size = 20), # legend.position = "none", # strip.background = element_rect(fill = "white")) # # s6_plC <- s6_plC + # labs(title = "psoralen") + # scale_color_manual(values = c("#6BAED6", "#08519C", "darkblue"), # aesthetics = c("color", "fill")) + # theme(text = element_text(size = 20), # legend.position = "none", # strip.background = element_rect(fill = "white")) # # s6_plD <- s6_plD + # labs(title = "formamide") + # scale_color_manual(values = panelD_cols, # aesthetics = c("color", "fill")) + # theme(text = element_text(size = 20), # legend.position = "none", # strip.background = element_rect(fill = "white")) # # # Put it all together # s6_plB <- plot_grid(s6_plB_mouse, s6_plB_rat, # align = "h", axis = "bt", # rel_widths = c(1.2,2)) # # # s6_plCD <- plot_grid(s6_plC, s6_plD, labels = c("C", "D"), label_size = 24) # # # plot_grid( # s6_plA, # s6_plB, # s6_plCD, # align = "v", # ncol = 1, # rel_heights = c(1.25, 1.2, 1.25), # labels = c("A", "B", ""), # label_size = 24 # ) # # # Am making this 2X the size I need it to be # # so the points are a reasonable size when I scale it down # ggsave(paste0(Sys.getenv("FIG_DIR"), # "ExampleFitsPlots_06.12.2024.png"), # width = 13, # height = 14, # units = "in") # ## ----plotting_all_Fits, eval=FALSE-------------------------------------------- # ### Plotting for all (2.5MB file) # pl <- plot(my_pk, n_interp = 12, use_scale_conc = TRUE) # pdf(file = paste0(Sys.getenv("FIG_DIR"), # "Transformed_Joint_AllFits_Final_2024_05_16.pdf"), # height = 6, width = 10) # for (i in 1:nrow(pl)) { # print(pl$final_plot[[i]]) # } # dev.off() # # # From my new SQL pull # pl <- plot(my_pk, n_interp = 12, use_scale_conc = FALSE, best_fit = TRUE) # pdf(file = paste0(Sys.getenv("FIG_DIR"), # "Joint_bobyqa_AllFits_Final_20240514.pdf"), # height = 10, width = 10) # for (i in 1:nrow(pl)) { # print(pl$final_plot[[i]]) # } # dev.off() # ## ----Single-Plot, eval=FALSE-------------------------------------------------- # pl <- plot(my_pk, n_interp = 10, use_scale_conc = FALSE) # pl %>% # filter(Chemical %in% "DTXSID5020029") %>% # pull(final_plot) %>% .[[1]] # ## ----cvtdb-prep, eval=FALSE--------------------------------------------------- # my_cvt <- cvt # my_wm <- get_winning_model(my_pk) # # my_tkstats <- get_tkstats(my_pk) %>% # rename2_cvt() %>% # select(-c(invivpk_dose_level, dose_units, conc_units, time_units, # data_sigma_group, fk_extraction_document_id)) %>% # distinct() %>% # pivot_longer(cols = cltot:vss_fgutabs, # names_to = "param_name", # values_to = "estimate") %>% # filter(!is.na(estimate)) %>% # mutate(param_units = case_when( # param_name %in% c("cltot", "cltot_fgutabs") ~ "L/hours", # param_name %in% c("css", "cmax") ~ "mg/L", # param_name %in% c("halflife", "tmax") ~ "hours", # param_name %in% c("vss", "vss_fgutabs") ~ "L", # param_name %in% "auc_infinity" ~ "mg/L * hours", # .default = NA_character_ # )) %>% # distinct()%>% # mutate(dose_normalization = conc_scale_use(TRUE, my_pk)$dose_norm, # log10_transformation = conc_scale_use(TRUE,my_pk)$log10_trans, # time_scaling = "identity", # invivoPKfit_version = "2.0.0", # update_date = today) %>% # filter(model %in% c("model_1comp", "model_2comp")) # # my_params <- my_pk$fit %>% # rename2_cvt() %>% distinct( # analyte_dtxsid, species, model, method, param_name, estimate, param_units # ) %>% # mutate(dose_normalization = conc_scale_use(TRUE, my_pk)$dose_norm, # log10_transformation = conc_scale_use(TRUE,my_pk)$log10_trans, # time_scaling = "identity", # invivoPKfit_version = "2.0.0", # update_date = today) %>% # filter(model %in% c("model_1comp", "model_2comp")) # # my_cvt_unique <- my_cvt %>% # dplyr::mutate( # conc_processing_flags = # case_when( # invivPK_conc == conc ~ glue::glue("Used 'conc' value."), # invivPK_conc == conc_original ~ glue::glue("Used 'conc_original' value."), # invivPK_conc == conc_original * conc_unit_norm_factor ~ glue::glue( # "Mulitplied 'conc_original' by {conc_unit_norm_factor}." # ), # invivPK_conc != conc_original * conc_unit_norm_factor ~ glue::glue( # "Multiplied 'conc_original' by {signif(invivPK_conc/conc_original, 2)}." # ), # is.na(conc) ~ glue::glue( # "Multiplied 'conc_original' by {signif(invivPK_conc/conc_original, 2)}" # ), # .default = "" # ) # ) %>% # dplyr::mutate( # loq_processing_flags = # if_else( # invivPK_loq == loq, # glue::glue("Used 'loq' value."), # glue::glue("Multiplied 'loq' value by {signif(invivPK_loq/loq,2)}."), # missing = glue::glue("'loq' was imputed as 90% of lowest concentration.") # ) # ) %>% # dplyr::mutate( # dose_processing_flags = # if_else( # invivPK_dose_level == dose_level_normalized, # glue::glue("Used 'dose_level_normalized'."), # glue::glue("Multiplied 'dose_level_normalized' by", # " {signif(invivPK_dose_level/dose_level_normalized, 3)}.") # ) # ) %>% # dplyr::select(fk_series_id, # analyte_dtxsid, # species, # administration_route_normalized, # conc_medium_normalized, # conc_processing_flags, # dose_processing_flags, # loq_processing_flags, # ) %>% # group_by(analyte_dtxsid, species, # administration_route_normalized, # conc_medium_normalized) %>% # summarize(all_series_ids = paste(unique(fk_series_id), collapse = ", "), # conc_processing_flags = paste(unique(conc_processing_flags), collapse = " "), # dose_processing_flags = paste(unique(dose_processing_flags), collapse = " "), # loq_processing_flags = paste(unique(loq_processing_flags), collapse = " ")) %>% # dplyr::distinct() # # tkstats_cvtdb <- inner_join(my_cvt_unique, my_tkstats, # by = join_by(analyte_dtxsid, species, # administration_route_normalized, # conc_medium_normalized)) %>% # distinct() # params_cvtdb <- inner_join(my_cvt_unique, my_params, # by = join_by(analyte_dtxsid, species), # relationship = "many-to-many") %>% # distinct() # # # Writing to file # bind_rows(params_cvtdb, # tkstats_cvtdb) %>% # distinct() -> tkparams_cvtdb # tkparams_cvtdb_winmodels <- tkparams_cvtdb %>% semi_join(my_wm) # save(tkparams_cvtdb, tkstats_cvtdb, params_cvtdb, tkparams_cvtdb_winmodels, # file = "data-raw/tkparams_CvTdb.Rdata") ## ----information-------------------------------------------------------------- sessionInfo()