--- title: "Examples for the Matrix Minimum Covariance Determinant estimators" author: "Mayrhofer, M. and Radojičić, U. and Filzmoser, P." output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{MMCD_examples} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} bibliography: '`r system.file("REFERENCES.bib", package="robustmatrix")`' --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ```{r, results='hide', message = FALSE, warning=FALSE} library(robustmatrix) library(ggplot2) library(dplyr) library(ggnewscale) library(ggrepel) library(tidyr) library(tibble) library(forcats) ``` This file documents how to use the methods introduced by @mayrhofer2024 and provides an overview of the methods of the `robustmatrix` package. All examples discussed in @mayrhofer2024 can be reproduced using this file. This document aims to illustrate how to use the functions of the `robustmatrix` package. The `mmcd` function implements the FastMMCD algorithm and computes the robust matrix-variate Matrix Minimum Covariance Determinant (MMCD) estimators. For outlier explanation, the `matrixShapley` function decomposes the squared Mahalanobis distance into outlyingness contributions form a matrix's rows, columns, or cells. The real-world data examples discussed here are the *mountain weather*, *darwin*, and *video* data sets. The former two data sets are included in the package and can be loaded using `data(weather)` and `data(darwin)`, respectively. Related publications and data sources are listed in the help files. The *video* data set is from the Perception Test Images Sequences from @li2004statistical and the preprocessed video is available at [https://wis.kuleuven.be/statdatascience/robust/software](https://wis.kuleuven.be/statdatascience/robust/software). ## Glacier weather data – Sonnblick observatory Weather data from Austria's highest weather station, situated in the Austrian Central Alps on the glaciated mountain "Hoher Sonnblick", standing at an elevation of 3106 meters above sea level. An array of dimension $(p,q,n)$, comprising $n = 136$ observations, each represented by a $p = 5$ times $q = 12$ dimensional matrix. Observed parameters are monthly averages of - air pressure (AP) - precipitation (P) - sunshine hours (SH) - temperature (T) - proportion of solid precipitation (SP) from 1891 to 2022. ```{r} data(weather) X <- weather[,,!apply(weather,3, function(x) any(is.na(x)))] n<- dim(X)[3] p<- dim(X)[1] q<- dim(X)[2] ``` Parameter estimation of mean, rowwise and columnwise covariance matrix using maximum likelihood estimation and robust parameter estimation based on MMCD. ```{r} par_MMLE <- mmle(X, lambda = 0) set.seed(1) par_MMCD <- mmcd(X, alpha = 0.5 ,lambda = 0, nsamp = 5000, nthreads = 1) ``` Computation of (robust) Mahalanbosi distances. ```{r} MD <- as.numeric(mmd(X = X, mu = par_MMLE$mu, cov_row = par_MMLE$cov_row_inv, cov_col = par_MMLE$cov_col_inv, inverted = TRUE)) MD_rob <- as.numeric(mmd(X = X, mu = par_MMCD$mu, cov_row = par_MMCD$cov_row_inv, cov_col = par_MMCD$cov_col_inv, inverted = TRUE)) out_quant <- qchisq(0.95, p*q) outliers <- which(MD_rob > out_quant) ``` Distance-distance of yearly outlyingness scores, using the robust procedure, we can unmask several outliers. ```{r, fig.dim = c(7,4)} names(MD_rob) <- dimnames(X)[[3]] subs <- MD_rob < out_quant plt_dd <- ggplot(data = NULL, aes(x = sqrt(MD), y = sqrt(MD_rob))) + geom_point() + geom_hline(yintercept = sqrt(out_quant)) + geom_vline(xintercept = sqrt(out_quant)) + geom_label_repel(aes(x = sqrt(MD[!subs]), y = sqrt(MD_rob[!subs]), label = names(MD_rob[!subs])), size = 2.5, max.overlaps = 200, nudge_y = 0.5) + theme_classic() + xlim(c(min(sqrt(MD)),max(sqrt(MD))+1)) + labs(x = "MMD", y = "Robust MMD") plt_dd ``` Computation of cellwise Shapley values. ```{r} shv_cell <- array(dim = dim(X)) shv_cell[] <- matrixShapley(X, mu = par_MMCD$mu, cov_row = par_MMCD$cov_row_inv, cov_col = par_MMCD$cov_col_inv, inverted = TRUE, type = "cell") dimnames(shv_cell) <- dimnames(X) ``` Preparing data for plots. ```{r} long_weather <- function(x, val_to, var_levels = NULL) { y <- x %>% rownames_to_column(var = "var") %>% pivot_longer(cols = -var, names_to = "parameter", values_to = val_to) if(!is.null(var_levels)){ y <- y %>% mutate(var = factor(var, levels = var_levels)) } y } var_names <- c("T [°C]" = "T", "SH [h]" = "SH", "P [mm/m²]" = "P", "AP [hPa]" = "AP", "SP [%]" = "SP") set <- cbind("var" = 1891:2022, "Temperature [°C]" = 0, "Preciptiation [mm/m²]" = 0, "Solid Preciptiation [%]" = 0, "Sunshine hours [h]" = 0, "Air pressure [hPa]" = 0) set[!(set[,1] %in% dimnames(X)[[3]]),-1] <- 1 set <- data.frame(set) colnames(set) <- c("var", "T [°C]", "P [mm/m²]", "SP [%]", "SH [h]", "AP [hPa]") set_long <- set %>% pivot_longer(-c(var)) ``` ```{r, message = FALSE, warning=FALSE} shv_year <- apply(shv_cell, c(1,3),sum) X_center <- array(dim = dim(X), dimnames = dimnames(X)) X_center[] <- (apply(X,3,function(x) x - par_MMCD$mu)) X_center_sign <- array(dim = dim(X), dimnames = dimnames(X)) X_center_sign[] <- sign(X_center) sign_mat <- apply(X_center,c(1,3),function(x) sign(mean(x))) sign_mat[,-outliers] <- 0 shv_rel <- apply(shv_year, 2, function(x) x/sum(x)) shv_rel_outliers <- shv_rel shv_rel_outliers[,-outliers] <- 0 shv_rel_outliers[shv_rel_outliers < 0] <- 0 # ONLY PLOT POSITIVE SHAPLEY VALUES shv_years_long <- long_weather(data.frame(t(shv_rel_outliers)), val_to = "shv_rel") sign_years_long <- long_weather(data.frame(t(sign_mat)), val_to = "sign") plt_data_years <- inner_join(shv_years_long,sign_years_long) %>% mutate(val = sign*abs(shv_rel), sign = as.character(sign), var = as.numeric(var), parameter = fct_recode(factor(parameter),!!! var_names)) ``` ```{r, message = FALSE, warning=FALSE} shv_long_1895 <- long_weather(data.frame(t(shv_cell[,,outliers[1]])), val_to = "shv", var_levels = dimnames(X)[[2]]) X_long_1895 <- long_weather(data.frame(t(X[,,outliers[1]])) %>% mutate(SP = SP*100), val_to = "x", var_levels = dimnames(X)[[2]]) sign_long_1895 <- long_weather(data.frame(t(sign(X[,,outliers[1]] - par_MMCD$mu))), val_to = "sign", var_levels = dimnames(X)[[2]]) plt_data_1895 <- inner_join(shv_long_1895,sign_long_1895) %>% mutate(val = sign*abs(shv), sign = as.character(sign)) %>% inner_join(X_long_1895) %>% mutate(parameter = fct_recode(factor(parameter),!!! var_names)) shv_long_2022 <- long_weather(data.frame(t(shv_cell[,,n])), val_to = "shv", var_levels = dimnames(X)[[2]]) X_long_2022 <- long_weather(data.frame(t(X[,,n])) %>% mutate(SP = SP*100), val_to = "x", var_levels = dimnames(X)[[2]]) sign_long_2022 <- long_weather(data.frame(t(sign(X[,,n] - par_MMCD$mu))), val_to = "sign", var_levels = dimnames(X)[[2]]) plt_data_2022 <- inner_join(shv_long_2022,sign_long_2022) %>% mutate(val = sign*abs(shv), sign = as.character(sign)) %>% inner_join(X_long_2022) %>% mutate(parameter = fct_recode(factor(parameter),!!! var_names)) ``` Yearly outlyingess contributions based on rowwise Shapley values. In the plot, non-outlying years are colored white, while years with missing data are gray. For the outliers, blue indicates "above average", red indicates "below average", and color intensity is proportional to the rowwise Shapley value. ```{r, fig.dim = c(7,4), message = FALSE, warning=FALSE} ggplot(plt_data_years, aes(x = var, y = parameter, fill = val)) + geom_tile(color = "lightgray") + theme_minimal() + scale_fill_gradient2(low = "#1F78B4", mid = "white", high = "#E31A1C", midpoint = 0, guide = "none") + scale_x_continuous(breaks = seq(from = 1880, to = 2020, by = 5), expand = c(0,0)) + theme(axis.text.x = element_text(angle = 45, vjust = 1.2, hjust=1), axis.title = element_blank(), title = element_blank()) + geom_tile(data = set_long, aes(x = var, y = name, alpha = factor(value)), fill = "gray") + scale_alpha_manual(values = c(0,1)) + guides(alpha = FALSE, fill = FALSE) ``` Outlyingess contributions based on cellwise Shapley values are computed for 1895 and 2022 using the same color scheme as in the previous plot. ```{r, fig.dim = c(7,4), message = FALSE, warning=FALSE} gridExtra::grid.arrange(ggplot(plt_data_1895, aes(x = var, y = parameter, fill = val, label = round(x))) + geom_tile(color = "lightgray") + theme_minimal() + geom_text(size = 3) + theme(axis.text.x = element_text(angle = 45, vjust = 1.4, hjust=1.4), axis.title = element_blank())+ labs(title = "1895") + scale_fill_gradient2(low = "#1F78B4", mid = "white", high = "#E31A1C", midpoint = 0, guide = "none"), ggplot(plt_data_2022, aes(x = var, y = parameter, fill = val, label = round(x))) + geom_tile(color = "lightgray") + theme_minimal() + geom_text(size = 3) + theme(axis.text.x = element_text(angle = 45, vjust = 1.4, hjust=1.4), axis.title = element_blank())+ labs(title = "2022") + scale_fill_gradient2(low = "#1F78B4", mid = "white", high = "#E31A1C", midpoint = 0, guide = "none"), nrow = 1) ``` ## Darwin The DARWIN (Diagnosis AlzheimeR WIth haNdwriting) data set comprises handwriting samples from 174 individuals. Among them, 89 have been diagnosed with Alzheimer's disease (AD), while the remaining 85 are considered healthy subjects (H). Each participant completed 25 handwriting tasks on paper, and their pen movements were recorded using a graphic tablet. A set of 18 features was extracted from the raw handwriting data. An array of dimension $(p,q,n)$, comprising $n = 174$ observations, each represented by a $p = 18$ times $q = 25$ dimensional matrix. The observed parameters are: - Total Time - Air Time - Paper Time - Mean Speed on paper - Mean Acceleration on paper - Mean Acceleration in air - Mean Jerk on paper - Pressure Mean - Pressure Variance - Generalization of the Mean Relative Tremor (GMRT) on paper - GMTR in air - Mean GMRT - Pendowns Number - Max X Extension - Max Y Extension - Dispersion Index ```{r} data(darwin) darwin_class <- dimnames(darwin)[[3]] sub_H <- which(darwin_class == "H") sub_P <- which(darwin_class == "P") ``` The classes are stored in `dimnames(darwin)[[3]]`; the subjects with class *H* are the healthy, and the subjects with class *P* have AD. Since the parameters Total Time (= Air Time + Paper Time) and Mean GMRT (= (GMRT on paper + GMRT in air)/2) are linearly dependent on other variables, they are excluded from the following analysis. Moreover, the variable Air Time is skewed due to several large univariate outliers in both the H and AD subjects. Across various tasks, some of the observed values of this variable are more than 25 median absolute deviations away from the median. Since the Air Time was measured using the graphic tablet, it could be that some individuals placed the pen too close to the device before the task was performed. While Shapley values based on the MMCD estimators allow us to unveil those large outlyingness contributions, they dominate and mask outlyingess contributions from other variables. Therefore, we also excluded the Air Time variable, and used the $p = 15$ remaining variables for our analysis. ```{r} darwin_array_clean <- darwin[-c(1,9,18),,] X_H <- darwin_array_clean[,,sub_H] X_P <- darwin_array_clean[,,sub_P] dimnames(X_H)[[3]] <- sub_H dimnames(X_P)[[3]] <- sub_P p <- dim(X_H)[1] q <- dim(X_H)[2] n <- dim(X_H)[3] ``` Parameter estimation of mean, rowwise and columnwise covariance matrix using maximum likelihood estimation and robust parameter estimation based on MMCD. ```{r} set.seed(1) par_mmle <- robustmatrix::mmle(X_H) par_mmcd <- robustmatrix::mmcd(X_H, nsamp = 500, alpha = 0.5, nthreads = 1, scale_consistency = "quant") ``` Plotting the MMDs based on the parameters estimated from the *H* group. ```{r, fig.dim = c(7,4)} plt_data <- data.frame(patient_id = 1:dim(darwin_array_clean)[[3]], class = c(rep("AD", length(sub_P)), rep("H", length(sub_H))), mmd = sqrt(mmd(darwin_array_clean, par_mmcd$mu, par_mmcd$cov_row, par_mmcd$cov_col))) p_mmd <- ggplot(data = plt_data, mapping = aes(x = patient_id, y = mmd, color = class)) + geom_point(show.legend = FALSE) + geom_hline(yintercept = sqrt(qchisq(0.975, p*q))) + labs(x = "Subject ID", y = "MMD", color = element_blank()) + theme_classic() + facet_grid(~ class, scales = "free") p_mmd ``` Computing Shapley values and *average proportional contributions* of the variables to the MMDs for the *H* and *P* groups. ```{r, message = FALSE, warning=FALSE} shv_cell_P <- array(dim = dim(X_P), dimnames = dimnames(X_P)) shv_cell_P[] <- matrixShapley(X_P, mu = par_mmcd$mu, cov_row = par_mmcd$cov_row_inv, cov_col = par_mmcd$cov_col_inv, inverted = TRUE, type = "cell") shv_cell_H <- array(dim = dim(X_H), dimnames = dimnames(X_H)) shv_cell_H[] <- matrixShapley(X_H, mu = par_mmcd$mu, cov_row = par_mmcd$cov_row_inv, cov_col = par_mmcd$cov_col_inv, inverted = TRUE, type = "cell") shv_row_mean_P <- data.frame("val" = apply(shv_cell_P,c(1), function(x)mean(x))) %>% rownames_to_column() shv_row_mean_H <- data.frame("val" = apply(shv_cell_H,c(1), function(x)mean(x))) %>% rownames_to_column() shv_row_mean <- rbind(cbind("class" = "AD", shv_row_mean_P), cbind("class" = "H", shv_row_mean_H)) shv_row_mean_P_prop <- data.frame("val" = apply(shv_cell_P,c(1), function(x)sum(x)/sum(shv_cell_P))) %>% rownames_to_column() shv_row_mean_H_prop <- data.frame("val" = apply(shv_cell_H,c(1), function(x)sum(x)/sum(shv_cell_H))) %>% rownames_to_column() shv_row_mean_prop<- rbind(cbind("class" = "AD", shv_row_mean_P_prop), cbind("class" = "H", shv_row_mean_H_prop)) fct_order <- order(shv_row_mean_P_prop$val) shv_row_mean_prop <- shv_row_mean_prop %>% mutate(rowname = factor(rowname, levels = shv_row_mean_P_prop$rowname[fct_order])) ``` Average proportional contributions for the *H* and *P* groups. ```{r, fig.dim = c(7,4)} p_shv_features <- ggplot(data = shv_row_mean_prop, aes(x = val, y = rowname, fill = class)) + geom_bar(stat = "identity", position=position_dodge(), color = "black") + labs(x = "Proportional contribution to MMD²", y = element_blank(), fill = element_blank()) + scale_x_continuous(expand = c(0, 0)) + theme_classic() + theme(legend.position = c(0.95,0.5)) p_shv_features ``` ## Video data The following code can load the video data and reproduce the results outlined in @mayrhofer2024. Due to the dimensionality of the data set, the following code is not executed. ```{r, eval=FALSE} # Loading the data load(url("https://wis.kuleuven.be/stat/robust/Programs/DO/do-video-data-rdata")) # Creating the overview plot ind_overview <- c(1,487,491,495,500) plts_overview <- list() for(i in seq_along(ind_overview)){ dim_Video <- dim(Video) video_grid <- expand.grid("x" = 1:dim_Video[2], "y" = 1:dim_Video[3]) video_long <- data.frame(cbind(video_grid, "r" = as.vector(Video[ind_overview[i],,,1]/255), "g" = as.vector(Video[ind_overview[i],,,2]/255), "b" = as.vector(Video[ind_overview[i],,,3]/255))) %>% mutate(rgb.val=rgb(r,g,b)) plts_overview[[i]] <- ggplot(video_long, aes(y,x)) + geom_raster(aes(fill=rgb.val)) + scale_fill_identity() + theme_void()+ coord_fixed() + theme(plot.title = element_text(hjust = 0.5), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_rect(fill = "transparent" ,colour = "black"), panel.ontop = TRUE, plot.margin = margin(1, 1, 1, 1, "pt")) + scale_x_continuous(expand = c(0,0)) + scale_y_continuous(expand = c(0,0), trans = "reverse") + labs(title = paste("Frame", ind_overview[i])) } video_overview <- do.call(gridExtra::grid.arrange, args = list("grobs" = plts_overview, "nrow" = 1)) # Transform to grayscale image video_grayscale <- apply(Video, 1:3, mean) X <- aperm(video_grayscale,c(2,3,1)) n <- dim(X)[3] p <- dim(X)[1] q <- dim(X)[2] # MMLE and MMCD parameter estimation par_MMLE <- mmle(X, lambda = 0) set.seed(1) par_MMCD <- mmcd(X, alpha = 0.5,lambda = 0, nsamp = 500, nthreads = 1) # Compute squared Mahlanobis distances MD <- mmd(X, mu = par_MMLE$mu, cov_row = par_MMLE$cov_row_inv, cov_col = par_MMLE$cov_col_inv, inverted = TRUE) MD_rob <- mmd(X, mu = par_MMCD$mu, cov_row = par_MMCD$cov_row_inv, cov_col = par_MMCD$cov_col_inv, inverted = TRUE) out_quant <- qchisq(0.99, p*q) # Plot MMD of all observations data_plt_md <- data.frame(cbind("Frame" = 1:length(MD_rob), "MD" = sqrt(MD_rob))) ggplot(data_plt_md, aes(x = Frame, y = MD)) + geom_point() + labs(x = "Frame", y = "MMD") + scale_x_continuous(limits = range(1,n), breaks = seq(from = 50, to = n, by = 50), expand = c(0,3)) + theme_classic() # Plot MMD of last observations sub_ind <- (1:length(MD_rob))[-(1:475)] frame_labels <- rep(NA,length(sub_ind)) frame_labels[sub_ind %in% c(487,491,495,500)] <- c("Frame 487: man left of tree", "Frame 491: man behind tree", "Frame 495: man right of tree", "Frame 500: man fully visible") data_plt_md1 <- data.frame(cbind("Frame" = sub_ind, "MD" = sqrt(MD_rob[sub_ind]), "labels" = frame_labels)) ggplot(data_plt_md1, aes(x = as.numeric(Frame), y = as.numeric(MD), label = labels)) + geom_point() + geom_path() + geom_label_repel(nudge_x = 75, nudge_y = -80, color = "darkblue") + labs(x = "Frame", y = "MMD") + scale_x_continuous(limits = range(sub_ind), breaks = seq(from = 480, to = 630, by = 10), expand = c(0,1)) + theme_classic() # Compute Shapley values shv <- array(dim = dim(X)) shv[,,] <- matrixShapley(X[,,], mu = par_MMCD$mu, cov_row = par_MMCD$cov_row_inv, cov_col = par_MMCD$cov_col_inv, inverted = TRUE, type = "cell") # Function to plot Shapley values for image data plot_shapley_image <- function(image,shapley_values, lower = -100, upper = 100, title = NULL, positive_only = FALSE, border = TRUE){ shapley_values[shapley_values < lower] <- lower shapley_values[shapley_values > upper] <- upper if(positive_only){ shapley_values[shapley_values < 0] <- 0 } image_grid <- expand.grid("x" = 1:nrow(shapley_values), "y" = 1:ncol(shapley_values)) image_data <- cbind(image_grid, "z" = as.vector(image), "shv" = as.vector(shapley_values)) plt <- ggplot() if(any(image != 0)){ plt <- plt + geom_raster(data = image_data, aes(x, y, fill = z)) + scale_fill_gradient(low = "black", high = "white",guide=FALSE) } plt <- plt + new_scale("fill") + geom_raster(data = image_data, aes(x, y, fill = shv), na.rm = TRUE) + scale_fill_gradientn(colours= c("blue", "transparent", "red"),limits=c(lower, upper), guide=FALSE, na.value = "transparent") + labs(title = title) + theme_void()+ coord_fixed() + theme(plot.title = element_text(hjust = 0.5)) if(border){ plt <- plt + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_rect(fill = "transparent" ,colour = "black"), panel.ontop = TRUE, plot.margin = margin(1, 1, 1, 1, "pt")) + scale_y_continuous(expand = c(0,0)) + scale_x_continuous(expand = c(0,0)) } plt } # Plot Shapley values for selected frames lims <- 100 gridExtra::grid.arrange( plot_shapley_image(image = t(apply(X[,,487],2,rev)), shapley_values = (t(apply(shv[,,487],2,rev))), lower = -lims, upper = lims, title = "Frame 487", positive_only = TRUE), plot_shapley_image(image = t(apply(X[,,491],2,rev)), shapley_values = (t(apply(shv[,,491],2,rev))), lower = -lims, upper = lims, title = "Frame 491", positive_only = TRUE), plot_shapley_image(image = t(apply(X[,,495],2,rev)), shapley_values = (t(apply(shv[,,495],2,rev))), lower = -lims, upper = lims, title = "Frame 495", positive_only = TRUE), plot_shapley_image(image = t(apply(X[,,500],2,rev)), shapley_values = (t(apply(shv[,,500],2,rev))), lower = -lims, upper = lims, title = "Frame 500", positive_only = TRUE), nrow = 1 ) ```