## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----setup-------------------------------------------------------------------- library(pcpr) suppressPackageStartupMessages({ library(dplyr) # for manipulation of queens library(GGally) # for the ggcorr fn library(ggplot2) # for plotting library(lubridate) # for manipulating dates library(magrittr) # for the pipe %>% library(tidyr) # for pivoting queens for better plotting }) plot_matrix <- function(D, ..., lp = "none", title = NULL) { D <- t(D) if (is.null(colnames(D))) colnames(D) <- paste0("C", 1:ncol(D)) data.frame(D) %>% dplyr::mutate(x = paste0("R", 1:nrow(.))) %>% tidyr::pivot_longer( tidyselect::all_of(colnames(D)), names_to = "y", values_to = "value" ) %>% dplyr::mutate( x = factor(x, levels = unique(x)), y = factor(y, levels = unique(y)) ) %>% ggplot(aes(x = x, y = y, fill = value)) + geom_raster() + scale_y_discrete(limits = rev) + coord_equal() + scale_fill_viridis_c(na.value = "white", ...) + theme_minimal() + theme( axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.title.x = element_blank(), axis.title.y = element_blank(), legend.position = lp, plot.margin = margin(0, 0, 0, 0), aspect.ratio = 1 ) + ggtitle(title) } ## ----queens raw--------------------------------------------------------------- queens ## ----plot queens-------------------------------------------------------------- plot_matrix(queens[, 2:ncol(queens)]) ## ----trends------------------------------------------------------------------- queens %>% pivot_longer( colnames(queens)[-1], names_to = "chem", values_to = "concentration" ) %>% filter(!is.na(concentration)) %>% ggplot(aes(x = Date, y = concentration)) + geom_line() + geom_smooth(color = "red", formula = "y ~ x", method = "loess", span = 0.05) + facet_wrap(~chem, scales = "free_y") + labs(x = "Date", y = "Concentration (µg/m^3)") + theme_bw() ## ----dates-------------------------------------------------------------------- start_date <- "2015-01-01" queens_small <- queens %>% filter(Date >= as.Date(start_date)) queens_small ## ----raw correlation---------------------------------------------------------- ggcorr( queens_small[, 2:ncol(queens_small)], method = "pairwise.complete.obs", limits = FALSE, label = FALSE, size = 5 ) ## ----distributions------------------------------------------------------------ queens_small %>% pivot_longer( colnames(queens_small)[-1], names_to = "chem", values_to = "concentration" ) %>% filter(!is.na(concentration)) %>% ggplot(aes(x = concentration)) + geom_histogram(bins = 50) + theme_bw() + facet_wrap(~chem, scales = "free") ## ----preprocessing------------------------------------------------------------ queens_scaled <- queens_small %>% select(-Date) queens_scaled[queens_scaled < 0] <- 0 queens_scaled <- data.frame(scale(queens_scaled, center = FALSE)) queens_scaled$Date <- queens_small$Date ## ----new dists---------------------------------------------------------------- queens_scaled %>% pivot_longer( colnames(queens_small)[-1], names_to = "chem", values_to = "concentration" ) %>% filter(!is.na(concentration)) %>% ggplot(aes(x = concentration)) + geom_histogram(bins = 50) + theme_bw() + facet_wrap(~chem, scales = "free") ## ----sing--------------------------------------------------------------------- D <- as.matrix(queens_scaled %>% select(-Date)) D_imputed <- impute_matrix(D, apply(D, 2, mean, na.rm = TRUE)) singular_values <- sing(D_imputed) plot(singular_values, type = "b") ## ----gs, eval=FALSE, echo=TRUE------------------------------------------------ # eta_0 <- get_pcp_defaults(D)$eta # # to get progress bar, could wrap this # # in a call to progressr::with_progress({ gs <- grid_search_cv(...) }) # gs <- grid_search_cv( # D, # pcp_fn = rrmc, # grid = data.frame(eta = 10^seq(-1, 2, 1) * round(eta_0, 3)), # r = 8, # parallel_strategy = "multisession", # num_workers = 16, # verbose = FALSE # ) # r_star <- gs$summary_stats$r[1] # eta_star <- round(gs$summary_stats$eta[1], 3) # gs$summary_stats ## ----gs results, eval=TRUE, echo=FALSE---------------------------------------- gs <- readRDS("rds_files/applied-gs.rds") r_star <- gs$summary_stats$r[1] eta_star <- round(gs$summary_stats$eta[1], 3) gs$summary_stats ## ----pcp---------------------------------------------------------------------- pcp_model <- rrmc(D, r = r_star, eta = eta_star) L <- pcp_model$L S <- pcp_model$S ## ----obj---------------------------------------------------------------------- plot(pcp_model$objective, type = "l") ## ----output L----------------------------------------------------------------- plot_matrix(L) L_rank <- matrix_rank(L) L_rank ## ----orig corr---------------------------------------------------------------- ggcorr( queens_small[, 2:ncol(queens_small)], method = "pairwise.complete.obs", limits = FALSE, label = FALSE, size = 5 ) ## ----new corr----------------------------------------------------------------- L_df <- data.frame(L) colnames(L_df) <- colnames(queens[, 2:ncol(queens)]) ggcorr( L_df, method = "pairwise.complete.obs", limits = FALSE, label = FALSE, size = 5 ) ## ----sparse------------------------------------------------------------------- sparsity(S) hist(S) plot_matrix(S) ## ----sparse by col------------------------------------------------------------ S_df <- data.frame(S) colnames(S_df) <- colnames(queens_small[, 2:ncol(queens_small)]) chems_w_most_extreme_events <- sort(apply(S_df, 2, function(x) sparsity(as.matrix(x)))) chems_w_most_extreme_events ## ----K sparse events---------------------------------------------------------- S_df %>% select(K) %>% mutate(Date = queens_small$Date) %>% filter(K > 0) ## ----NMF, eval=FALSE, echo=TRUE----------------------------------------------- # library(NMF) # # nmf_mat <- L # nmf_mat[nmf_mat < 0] <- 0 # # res <- nmf( # nmf_mat, # rank = L_rank, # method = "offset", nrun = 30, # seed = 0, .opt = "vp16" # ) # # raw_scores <- basis(res) # raw_loadings <- coef(res) ## ----load NMF, eval=TRUE, echo=FALSE------------------------------------------ raw_scores <- readRDS("rds_files/applied-nmf-raw-scores-W.rds") raw_loadings <- readRDS("rds_files/applied-nmf-raw-loadings-H.rds") ## ----loadings----------------------------------------------------------------- loadings <- data.frame(raw_loadings) colnames(loadings) <- colnames(L_df) loadings[["Pattern"]] <- paste("Pattern", 1:L_rank) loadings %>% pivot_longer(-Pattern, names_to = "Chemical", values_to = "Loading") %>% ggplot(aes(x = Chemical, y = Loading)) + geom_point(size = 2) + geom_segment(aes(yend = 0, xend = Chemical), linewidth = 1) + facet_wrap(~Pattern, scales = "free_x") + theme_bw() + theme( strip.text.x = element_text(size = 12), title = element_text(size = 16), legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1, size = 10), strip.background = element_rect(fill = "white"), axis.title.x = element_blank(), axis.title.y = element_blank() ) + geom_hline(yintercept = 0, linewidth = 0.2) + ggtitle("PCP-NMF Loadings") ## ----var exp------------------------------------------------------------------ nmf_scores <- data.frame(raw_scores) colnames(nmf_scores) <- c( "Traffic", "Crustal Dust", "Salt", "Regional/Secondary & Tailpipe Emissions" ) nmf_sources <- nmf_scores %>% mutate(Date = queens_scaled$Date) %>% pivot_longer(-Date, names_to = "Pattern", values_to = "Score") var_explained <- nmf_sources %>% group_by(Pattern) %>% summarize(patsum = sum(Score)) %>% mutate(VarianceExplained = round(100 * patsum / sum(patsum), 1)) %>% select(-patsum) %>% arrange(desc(VarianceExplained)) %>% mutate(PatName = paste(Pattern, paste0("(", VarianceExplained, "% Var Exp)"))) nmf_sources <- nmf_sources %>% mutate( Pattern = factor( Pattern, levels = as.character(var_explained$Pattern), labels = as.character(var_explained$PatName) ) ) var_explained %>% select(-PatName) ## ----rename patterns---------------------------------------------------------- loadings %>% mutate(Pattern = factor( colnames(nmf_scores), levels = as.character(var_explained$Pattern), labels = as.character(var_explained$PatName) )) %>% pivot_longer(-Pattern, names_to = "Chemical", values_to = "Loading") %>% ggplot(aes(x = Chemical, y = Loading, color = Pattern)) + scale_color_brewer(palette = "Set1") + geom_point(size = 2) + geom_segment(aes(yend = 0, xend = Chemical), linewidth = 1) + facet_wrap(~Pattern, scales = "free_x") + theme_bw() + theme( strip.text.x = element_text(size = 12), title = element_text(size = 16), legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1, size = 10), strip.background = element_rect(fill = "white"), axis.title.x = element_blank(), axis.title.y = element_blank() ) + geom_hline(yintercept = 0, linewidth = 0.2) + ggtitle("PCP-NMF Loadings") ## ----NMF scores--------------------------------------------------------------- yr_num <- length(unique(year(nmf_sources$Date))) ggplot(nmf_sources, aes(Date, Score, color = Pattern)) + scale_color_brewer(palette = "Set1") + geom_smooth(method = "loess", span = 0.05) + facet_wrap(~Pattern, scales = "free", ncol = 1) + xlab("") + ylab("Pattern Score") + theme_classic() + theme( legend.position = "none", title = element_text(size = 18), axis.title.y = element_text(size = 18), strip.text = element_text(size = 14), axis.text.x = element_text(size = 13), axis.text.y = element_text(size = 11) ) + scale_x_date(date_labels = "%Y", date_breaks = "1 year") + ggtitle("PCP-NMF Scores over Time")