Correspondence analysis examples

Raymaekers, J. and Rousseeuw, P.J.

2023-10-25

library(cellWise)

Introduction

In this vignette we illustrate the use of MacroPCA with fixed center in the context of correspondence analysis. It reproduces the examples in the report Raymaekers and Rousseeuw (2022), Challenges of cellwise outliers.

Clothes data

In this section we analyze the clothes data through correspondence analysis. We start by loading the data, and constructing the matrix S:

data("data_clothes")
X <- data_clothes
dim(X)
## [1] 28  5
# create matrix S as in paper:
sum(X) # total count is 4373
## [1] 4373
P    <- X / sum(X) # relative frequencies that add up to 1
rvec <- rowSums(P) # vector of row totals
cvec <- colSums(P) # vector of column totals
R    <- X / rowSums(X) # row profiles
rowSums(R) # all 1,  OK
## GB SK BG IE BE ES PL FI GR HU SI NL IT RO AT FR HR SE CZ DK DE LT PT EE LU MT 
##  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 
## LV CY 
##  1  1
S <- diag(sqrt(rvec)) %*% (R - rep(1, nrow(X)) %*% 
                             t(cvec)) %*% diag(1/sqrt(cvec))
dimnames(S) <- dimnames(X)
round(S, 3)
##        X1     X2     X3     X4     X5
## GB  0.100  0.006 -0.042 -0.030 -0.039
## SK  0.208  0.002 -0.071 -0.066 -0.085
## BG  0.027  0.051  0.003 -0.025 -0.058
## IE -0.057 -0.028  0.008  0.021  0.059
## BE -0.054 -0.035  0.031  0.037  0.025
## ES -0.045 -0.020 -0.014  0.045  0.036
## PL -0.041  0.001 -0.001  0.025  0.019
## FI -0.036 -0.013  0.018  0.017  0.016
## GR  0.047 -0.015 -0.004 -0.003 -0.026
## HU -0.019  0.009 -0.004  0.000  0.015
## SI -0.028 -0.022 -0.001  0.023  0.030
## NL  0.003 -0.012 -0.013  0.006  0.015
## IT -0.033  0.010  0.011  0.002  0.012
## RO  0.054  0.038  0.011 -0.041 -0.066
## AT -0.042 -0.019  0.004  0.003  0.056
## FR -0.025 -0.009  0.018 -0.006  0.024
## HR -0.042 -0.014  0.014 -0.001  0.045
## SE -0.048  0.000  0.031  0.006  0.013
## CZ -0.041 -0.003 -0.001  0.008  0.039
## DK -0.048 -0.003  0.014  0.024  0.015
## DE -0.034 -0.005 -0.002  0.009  0.034
## LT -0.056 -0.012  0.020  0.030  0.022
## PT -0.003  0.028  0.003 -0.021 -0.008
## EE -0.021 -0.010  0.004  0.008  0.021
## LU -0.004 -0.013 -0.001  0.010  0.008
## MT  0.052 -0.019  0.011 -0.023 -0.022
## LV  0.033  0.048 -0.001 -0.030 -0.053
## CY -0.038  0.008  0.019  0.041 -0.027
d <- ncol(S)

# We verify that the points of S are on a hyperplane by construction:
(eigvals <- eigen(t(S) %*% S)$values)
## [1] 1.446489e-01 1.855747e-02 6.326238e-03 4.134890e-03 1.009893e-17

Now we analyze the data using DDC to identify suspicious cells:

DDC.out <- DDC(S)
##  
##  The input data has 28 rows and 5 columns.
ggp <- cellMap(R = DDC.out$stdResid,
               mTitle = "clothes data",
               rowtitle = "countries",
               columntitle = "brackets")

# pdf("clothes_cellmap.pdf", width = 3, height = 6)
plot(ggp)
plot of chunk unnamed-chunk-4

plot of chunk unnamed-chunk-4

# dev.off()

United Kingdom (GB), Slovakia, and Romania stand out. The outlier removal in Riani et al (2022) takes out GB, SK, BG, GR, RO, MT, LV which are precisely the countries with flagged cells here! Their analysis doesn’t say much about why these countries are outlying, but here we see which brackets they import unusually much or little from.

For instance, the cellmap suggests that GB imports a lot of cheap clothing. The webpage

says that much is imported from Bangladesh, China, and India.

countryInds <- which(rownames(S) %in% 
                       c("GB", "SK", "MT", "GR", "BG", "LV", "RO"))
matplot(t(S[countryInds, ]), pch = 16, col = 1:7)
lines(apply(S, 2, median), lwd = 3) # median profile
plot of chunk unnamed-chunk-5

plot of chunk unnamed-chunk-5

# The profile of these countries deviate in a similar way: 
# they trade a lot of cheap clothes, and fewer expensive ones. 

We now continue with classical correspondence analysis.

svd.out <- svd(S)
svd.out$d# S is indeed singular:
## [1] 3.803274e-01 1.362258e-01 7.953765e-02 6.430311e-02 2.723455e-17
(svd.out$d)^2 - eigen(t(S) %*% S)$values # ~0
## [1]  1.665335e-16 -1.040834e-17  7.806256e-18  6.071532e-18 -1.009893e-17
diff(c(0,cumsum(svd.out$d^2)/sum(svd.out$d^2)))
## [1] 0.83290720 0.10685628 0.03642729 0.02380923 0.00000000
# This can be shown in a scree plot.

# For plotting the rows:
rowproj <- diag(1/sqrt(rvec)) %*% svd.out$u %*% diag(svd.out$d)
# For plotting the columns = variables:
colproj <- diag(1/sqrt(cvec)) %*% svd.out$v %*% diag(svd.out$d)

# pdf(file="clothes_ClassCorrespA.pdf", width=7, height=6)
plot(rowproj[, 1:2], col = "white", xlab="", ylab="",
     xlim = c(-1, 0.6), ylim = c(-0.6, 0.6))
title(main="Classical correspondence analysis of clothes data", 
      line=1)
text(rowproj[, 1:2], labels = rownames(S))
abline(v=0); abline(h=0)
text(colproj[, 1:2], labels = colnames(S), col="red")
facs = c(0.93,0.85,0.78,0.84,0.87)
shape::Arrows(0, 0, facs*colproj[, 1], facs*colproj[, 2], 
              col = "red", arr.type="simple", arr.width=0.1,
              arr.length = 0.1) # , arr.adj = 0)
## NULL
title(xlab="Dimension 1", line=2.3)
title(ylab="Dimension 2", line=2.3)
plot of chunk unnamed-chunk-6

plot of chunk unnamed-chunk-6

# dev.off()

Now we perform a cellwise robust correspondence analysis.

# We apply MacroPCA with center fixed at zero.
# As in classical CA, we do not prescale S.
MacroPCApar0 <- list(scale = FALSE, center = rep(0,d))
MacroPCA.out <- MacroPCA(S, k=0, MacroPCApars = MacroPCApar0)
##  
##  The input data has 28 rows and 5 columns.
## 
## Initial eigenvalues:
##  0.001858658 0.0002242119 0.000170005 7.881939e-05 5.808785e-05
plot of chunk unnamed-chunk-7

plot of chunk unnamed-chunk-7

## 
## The cumulative percentage of explained variability 
## by the first 5 components is:
##   PC1   PC2   PC3   PC4   PC5 
##  77.8  87.2  94.3  97.6 100.0 
## 
## Based on explained variability >= 80% one would select k = 2.
## 
## Please use this information and the scree plot to select a value of k
## and rerun MacroPCA with it.
MacroPCA.out <- MacroPCA(S, k=2, MacroPCApars = MacroPCApar0)
##  
##  The input data has 28 rows and 5 columns.
## 
## Initial eigenvalues:
##  0.001608485 0.0002046096 0.0001747361 7.98259e-05 5.355761e-05 
## 
## XciSVD$rank, Xcih.SVD$rank, k and kmax:  5 5 2 5 
## 
## Performed an extra reweighting step because k = 2 < rank = 5 .
## 
## The final step used eigenvectors of reweighted covariance.
(eigvals <- MacroPCA.out$eigenvalues)
## [1] 0.0027284503 0.0002045247
diff(c(0,cumsum(eigvals)/sum(eigvals)))
## [1] 0.93026715 0.06973285
# Compute the principal coordinates for the biplot.
# To make the plot match the orientation in Riani et al:
V <- -MacroPCA.out$loadings
Tscores <- -MacroPCA.out$scores
sVals <- sqrt(nrow(S)*MacroPCA.out$eigenvalues)
U <- Tscores %*% diag(1/sVals)
rowproj <- diag(1/sqrt(rvec)) %*% U %*% diag(sVals)
colproj <- diag(1/sqrt(cvec)) %*% V %*% diag(sVals)

# pdf(file="clothes_MacroCA_biplot.pdf", width=7, height=6)
plot(rowproj[, 1:2], col = "white", xlim=c(-0.95,0.65),
     ylim=c(-0.6,0.6), xlab="", ylab="")
title(main="Cellwise robust correspondence analysis of clothes data", 
      line=1)
text(rowproj[,1:2], labels = rownames(S))
abline(h=0); abline(v=0)
text(colproj[, 1:2], labels = colnames(S), col="red")
facs = c(0.9,0.8,0.5,0.75,0.85)
shape::Arrows(0, 0, facs*colproj[, 1], facs*colproj[, 2], 
              col = "red", arr.type="simple", arr.width=0.1,
              arr.length = 0.1)
## NULL
title(xlab="Dimension 1", line=2.3)
title(ylab="Dimension 2", line=2.3)
plot of chunk unnamed-chunk-7

plot of chunk unnamed-chunk-7

# dev.off()
# Matches Fig 4 of Riani quite well.

Brand perception example

We start by loading the data and constructing the matrix S.

data("data_brands")
X <- data_brands
dim(X) 
## [1] 39  7
sum(X) # total count is 11713
## [1] 11713
P    <- X/sum(X) # relative frequencies that add up to 1
rvec <- rowSums(P) # vector of row totals
hist(rvec) # Right tail: Chevrolet, Ford, Honda, Toyota
plot of chunk unnamed-chunk-8

plot of chunk unnamed-chunk-8

# These brands are well known and sold a lot in the US.
cvec <- colSums(P) # vector of column totals
R    <- X / rowSums(X) # row profiles
S    <- diag(sqrt(rvec)) %*% (R - rep(1, nrow(X)) %*% 
                                t(cvec)) %*% diag(1/sqrt(cvec))
dimnames(S) <- dimnames(X)
d <- ncol(S)

Now we continue by executing DDC to identify suspicious cells.

DDC.out <- DDC(S)
##  
##  The input data has 39 rows and 7 columns.
ggp     <- cellMap(R = DDC.out$stdResid,
                   mTitle = "brands data",
                   rowtitle = "brands",
                   columntitle = "perceptions")
# pdf("brands_cellmap.pdf", width = 3.5, height = 8)
plot(ggp)
plot of chunk unnamed-chunk-9

plot of chunk unnamed-chunk-9

# dev.off()

# Volvo is most deviating (3 cells), followed by Hyundai
# (2 cells) and Maserati (2 cells).

Now we analyze the brands data by means of classical correspondence analysis.

svd.out <- svd(S)
svd.out$d # S is singular:
## [1] 3.354798e-01 2.805778e-01 1.707355e-01 1.565453e-01 1.285439e-01
## [6] 1.044026e-01 5.623106e-17
diff(c(0,cumsum(svd.out$d^2)/sum(svd.out$d^2)))
## [1] 0.41324121 0.28905304 0.10703318 0.08998101 0.06067003 0.04002153 0.00000000
# Can be plotted in a scree plot.

# To match the plot in Riani at al:
svd.out$v[, 2] = -svd.out$v[, 2]
svd.out$u[, 2] = -svd.out$u[, 2]
rowproj <- diag(1/sqrt(rvec)) %*% svd.out$u %*% diag(svd.out$d)
colproj <- diag(1/sqrt(cvec)) %*% svd.out$v %*% diag(svd.out$d)


# pdf(file="brands_ClassCorrespA.pdf", width=7, height=6)
plot(rowproj[, 1:2], col = "white", xlim=c(-1.1,1.1),
     ylim=c(-0.8,1.5), xlab="", ylab="")
title(main="Classical correspondence analysis of brands data", 
      line=1)
text(rowproj[,1:2], labels = rownames(S))
abline(v=0); abline(h=0)
text(colproj[, 1:2], labels = colnames(S), col="red")
facs = c(0.8,0.9,0.8,0.65,0.92,0.8,0.65)
shape::Arrows(0, 0, facs*colproj[, 1], facs*colproj[, 2], 
              col = "red", arr.type="simple", arr.width=0.1,
              arr.length = 0.1)
## NULL
title(xlab="Dimension 1", line=2.3)
title(ylab="Dimension 2", line=2.3)
plot of chunk unnamed-chunk-10

plot of chunk unnamed-chunk-10

# dev.off()

Cellwise robust correspondence analysis of the brands data:

MacroPCApar0 <- list(scale = FALSE, center = rep(0, d))
MacroPCA.out <- MacroPCA(S, k = 0, MacroPCApars = MacroPCApar0)
##  
##  The input data has 39 rows and 7 columns.
## 
## Initial eigenvalues:
##  0.001637705 0.0005991985 0.000517501 0.000261448 0.0001165996 9.420168e-05 1.323828e-05
plot of chunk unnamed-chunk-11

plot of chunk unnamed-chunk-11

## 
## The cumulative percentage of explained variability 
## by the first 7 components is:
##   PC1   PC2   PC3   PC4   PC5   PC6   PC7 
##  50.5  69.0  85.0  93.1  96.7  99.6 100.0 
## 
## Based on explained variability >= 80% one would select k = 3.
## 
## Please use this information and the scree plot to select a value of k
## and rerun MacroPCA with it.
MacroPCA.out <- MacroPCA(S, k = 3, MacroPCApars = MacroPCApar0)
##  
##  The input data has 39 rows and 7 columns.
## 
## Initial eigenvalues:
##  0.001310427 0.0006177467 0.0004581388 0.0002548126 0.0001141835 0.0001013869 1.44098e-05 
## 
## XciSVD$rank, Xcih.SVD$rank, k and kmax:  7 7 3 7 
## 
## Performed an extra reweighting step because k = 3 < rank = 7 .
## 
## The final step used eigenvectors of reweighted covariance.
# Computations for the biplot.
V <- MacroPCA.out$loadings
scores <- MacroPCA.out$scores
# To make the plot match the orientation in Riani et al:
V[,2]      <- -V[,2]
scores[,2] <- -scores[,2]
sVals      <- sqrt(nrow(S)*MacroPCA.out$eigenvalues)
U          <- scores %*% diag(1/sVals)
rowproj    <- diag(1/sqrt(rvec)) %*% U %*% diag(sVals)
colproj    <- diag(1/sqrt(cvec)) %*% V %*% diag(sVals)

# pdf(file="brands_MacroCA_biplot.pdf", width=7, height=6)
plot(rowproj[, 1:2], col = "white", xlim=c(-1.1,1.1),
     ylim=c(-0.6,0.6), xlab="", ylab="")
title(main="Cellwise robust correspondence analysis of brands data",
      line=1)
text(rowproj[,1:2], labels = rownames(S))
abline(h=0); abline(v=0)
text(colproj[, 1:2], labels = colnames(S), col="red")
facs = c(0.75,0.76,0.9,0.88,0.65,0.74,0.52)
shape::Arrows(0, 0, facs*colproj[, 1], facs*colproj[, 2], 
              col = "red", arr.type="simple", arr.width=0.1,
              arr.length = 0.1)
## NULL
title(xlab="Dimension 1", line=2.3)
title(ylab="Dimension 2", line=2.3)
plot of chunk unnamed-chunk-11

plot of chunk unnamed-chunk-11

# dev.off()
# Roughly matches Figure 7 of Riani et al (2022).