Introduction
In this vignette we explain how to use some functions of the package to estimate a contingency table. We will work with the eusilc dataset from the laeken package. All the functions presented below are described in the manuscript by Raphaël Jauslin and Yves Tillé (2021), doi:10.1016/j.jspi.2022.12.003.
Contingency table
We will estimate the contingency table obtained by crossing the factor variable pl030, which represents the economic status, with a discretized version of the equivalized household income eqIncome. To discretize the equivalized income, we compute its quantiles at probabilities 0.15, 0.30, 0.45, 0.60, 0.75 and 0.90 and define the categories as the intervals between these values.
library(laeken)
library(sampling)
library(StratifiedSampling)
#> Loading required package: Matrix
data("eusilc")
eusilc <- na.omit(eusilc)
N <- nrow(eusilc)
# Xm contains the matching variables and id the identifiers of the units
Xm <- eusilc[,c("hsize","db040","age","rb090","pb220a")]
Xmcat <- do.call(cbind,apply(Xm[,c(2,4,5)],MARGIN = 2,FUN = disjunctive))
Xm <- cbind(Xmcat,Xm[,-c(2,4,5)])
id <- eusilc$rb030
# categorical income split at the percentiles
c_income <- eusilc$eqIncome
q <- quantile(eusilc$eqIncome, probs = seq(0, 1, 0.15))
c_income[which(eusilc$eqIncome <= q[2])] <- "(0,15]"
c_income[which(q[2] < eusilc$eqIncome & eusilc$eqIncome <= q[3])] <- "(15,30]"
c_income[which(q[3] < eusilc$eqIncome & eusilc$eqIncome <= q[4])] <- "(30,45]"
c_income[which(q[4] < eusilc$eqIncome & eusilc$eqIncome <= q[5])] <- "(45,60]"
c_income[which(q[5] < eusilc$eqIncome & eusilc$eqIncome <= q[6])] <- "(60,75]"
c_income[which(q[6] < eusilc$eqIncome & eusilc$eqIncome <= q[7])] <- "(75,90]"
c_income[which( eusilc$eqIncome > q[7] )] <- "(90,100]"
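# equivalent, more compact discretization using cut() (illustrative sketch only,
# the resulting c_income_alt is not used in the remainder of the vignette)
brk <- quantile(eusilc$eqIncome, probs = c(seq(0, 0.90, 0.15), 1))
c_income_alt <- cut(eusilc$eqIncome, breaks = brk, include.lowest = TRUE,
                    labels = c("(0,15]", "(15,30]", "(30,45]", "(45,60]",
                               "(60,75]", "(75,90]", "(90,100]"))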
# variables of interest
Y <- data.frame(ecostat = eusilc$pl030)
Z <- data.frame(c_income = c_income)
# put same rownames
rownames(Xm) <- rownames(Y) <- rownames(Z) <- id
YZ <- table(cbind(Y,Z))
addmargins(YZ)
#> c_income
#> ecostat (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 409 616 722 807 935 1025 648 5162
#> 2 189 181 205 184 165 154 82 1160
#> 3 137 90 72 75 59 52 33 518
#> 4 210 159 103 95 74 49 46 736
#> 5 470 462 492 477 459 435 351 3146
#> 6 57 25 28 30 17 11 10 178
#> 7 344 283 194 149 106 91 40 1207
#> Sum 1816 1816 1816 1817 1815 1817 1210 12107
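As a small aside (not part of the original workflow), the corresponding table of joint population proportions can be obtained from YZ with prop.table():
# joint proportions of the population-level table
round(addmargins(prop.table(YZ)), 3)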
Sampling schemes
Here we set up the sampling designs and define all the quantities we will need for the rest of the vignette. The samples are selected by simple random sampling without replacement and the weights are equal to the inverse of the inclusion probabilities.
# sample sizes
n1 <- 1000
n2 <- 500
# samples
s1 <- srswor(n1,N)
s2 <- srswor(n2,N)
# extract matching units
X1 <- Xm[s1 == 1,]
X2 <- Xm[s2 == 1,]
# extract variables of interest
Y1 <- data.frame(Y[s1 == 1,])
colnames(Y1) <- colnames(Y)
Z2 <- as.data.frame(Z[s2 == 1,])
colnames(Z2) <- colnames(Z)
# extract correct identities
id1 <- id[s1 == 1]
id2 <- id[s2 == 1]
# put correct rownames
rownames(Y1) <- id1
rownames(Z2) <- id2
# here weights are inverse of inclusion probabilities
d1 <- rep(N/n1,n1)
d2 <- rep(N/n2,n2)
# disjunctive form
Y_dis <- sampling::disjunctive(as.matrix(Y))
Z_dis <- sampling::disjunctive(as.matrix(Z))
Y1_dis <- Y_dis[s1 == 1,]
Z2_dis <- Z_dis[s2 == 1,]
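Before harmonizing, it can be instructive (this check is not part of the original code) to compare the Horvitz-Thompson estimates of the totals of the matching variables, computed from each sample with the design weights, with the true totals: the estimates are unbiased but do not coincide, which is precisely what the harmonization step corrects.
# design-weighted totals of the matching variables versus the true totals
colSums(d1 * X1)
colSums(d2 * X2)
colSums(Xm)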
Harmonization
The harmonization step must then be performed. The harmonize function returns the harmonized weights. If the true population totals are known, they can be used instead of the estimates computed within the function.
re <- harmonize(X1,d1,id1,X2,d2,id2)
# if we want to use the population totals to harmonize we can use
re <- harmonize(X1,d1,id1,X2,d2,id2,totals = c(N,colSums(Xm)))
w1 <- re$w1
w2 <- re$w2
colSums(Xm)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
colSums(w1*X1)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
colSums(w2*X2)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
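As a quick check (not in the original code), the harmonized weights of the two samples should reproduce the same totals, up to numerical tolerance:
# both sets of harmonized weights yield identical totals
all.equal(colSums(w1 * X1), colSums(w2 * X2))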
Optimal transport matching
The statistical matching is performed with the otmatch function. The contingency table is then estimated by extracting the id1 units (respectively the id2 units) and applying the function tapply with the corresponding weights.
# Optimal transport matching
object <- otmatch(X1,id1,X2,id2,w1,w2)
head(object[,1:3])
#> id1 id2 weight
#> 601 601 340602 12.445399
#> 1 1101 1101 11.395848
#> 1601 1601 274501 10.337273
#> 1601.1 1601 280101 3.187415
#> 2001 2001 546501 11.263819
#> 2002 2002 195904 8.688167
Y1_ot <- cbind(X1[as.character(object$id1),],y = Y1[as.character(object$id1),])
Z2_ot <- cbind(X2[as.character(object$id2),],z = Z2[as.character(object$id2),])
YZ_ot <- tapply(object$weight,list(Y1_ot$y,Z2_ot$z),sum)
# transform NA into 0
YZ_ot[is.na(YZ_ot)] <- 0
# result
round(addmargins(YZ_ot),3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 619.379 739.569 949.037 635.626 707.269 894.906 436.951 4982.736
#> 2 196.187 157.957 163.650 172.366 126.967 223.854 107.082 1148.063
#> 3 106.351 77.478 74.653 120.990 54.688 66.171 52.943 553.274
#> 4 190.992 92.113 122.974 172.131 89.565 91.222 59.600 818.597
#> 5 602.082 472.162 327.146 671.722 428.580 356.611 377.550 3235.853
#> 6 12.495 6.426 26.356 17.917 20.908 10.427 23.236 117.765
#> 7 205.995 204.979 177.161 145.040 136.325 300.241 80.971 1250.712
#> Sum 1933.480 1750.684 1840.978 1935.792 1564.301 1943.432 1138.333 12107.000
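As a hedged consistency check (not in the original code), and assuming the transport plan keeps the harmonized weights w1 as its first marginal, the row margins of the estimated table should coincide with the harmonized totals of the economic status categories:
# row margins of the OT estimate versus harmonized totals of ecostat
rowSums(YZ_ot)
tapply(w1, Y1$ecostat, sum)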
Balanced sampling
As you can see from the previous section, the optimal transport solution generally does not produce a one-to-one match: for a given unit in sample 1, several units in sample 2 may receive non-zero weights. The bsmatch function creates a one-to-one match by selecting a stratified balanced sample, yielding a data.frame in which each unit in sample 1 has exactly one imputed unit from sample 2.
# Balanced Sampling
BS <- bsmatch(object)
head(BS$object[,1:3])
#> id1 id2 weight
#> 601 601 340602 12.445399
#> 1 1101 1101 11.395848
#> 1601.1 1601 280101 3.187415
#> 2001 2001 546501 11.263819
#> 2002 2002 195904 8.688167
#> 2403 2403 200203 13.578569
Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs),3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 640.132 746.390 904.406 600.545 777.484 892.987 420.791 4982.736
#> 2 176.032 162.668 128.345 192.548 139.383 225.546 123.541 1148.063
#> 3 110.347 77.376 84.863 125.206 61.044 46.541 47.897 553.274
#> 4 189.497 67.634 107.068 193.825 98.072 98.939 63.563 818.597
#> 5 645.478 517.060 315.529 591.057 437.081 359.918 369.731 3235.853
#> 6 11.771 14.084 22.764 9.315 26.582 10.427 22.823 117.765
#> 7 199.878 179.058 186.835 157.172 128.335 311.981 87.453 1250.712
#> Sum 1973.135 1764.270 1749.809 1869.667 1667.981 1946.339 1135.799 12107.000
# With Z2 as auxiliary information for stratified balanced sampling.
BS <- bsmatch(object,Z2)
Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs),3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 567.928 701.969 991.064 637.342 761.569 897.114 425.750 4982.736
#> 2 237.425 186.601 140.231 141.988 129.349 223.916 88.553 1148.063
#> 3 84.554 74.175 82.577 126.849 59.211 75.535 50.373 553.274
#> 4 198.412 95.906 121.062 168.413 62.642 108.600 63.563 818.597
#> 5 610.651 482.307 334.310 704.901 383.456 350.780 369.448 3235.853
#> 6 11.771 14.084 22.764 9.315 26.582 10.427 22.823 117.765
#> 7 224.178 193.115 154.306 141.845 138.479 287.294 111.496 1250.712
#> Sum 1934.918 1748.157 1846.315 1930.653 1561.287 1953.665 1132.005 12107.000
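A short check (not in the original code): because the balanced sampling step only reallocates weight within each id1 unit, the row margins of the one-to-one estimates coincide with those of the optimal transport estimate, as can be seen in the Sum columns above.
# the one-to-one match preserves the row margins of the OT estimate
rowSums(YZ_ot)
rowSums(YZ_bs)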
Prediction
Finally, the weights of the optimal transport matching can be used to predict, for each unit of sample 1, a distribution over the categories of the variable observed in sample 2: the weights are normalized within each id1 unit and spread over the categories of c_income.
# split the weight by id1
q_l <- split(object$weight,f = object$id1)
# normalize in each id1
q_l <- lapply(q_l, function(x){x/sum(x)})
q <- as.numeric(do.call(c,q_l))
Z_pred <- t(q*disjunctive(object$id1))%*%disjunctive(Z2[as.character(object$id2),])
colnames(Z_pred) <- levels(factor(Z2$c_income))
head(Z_pred)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100]
#> [1,] 0 0 0.0000000 0 0 0.0000000 1
#> [2,] 0 0 0.0000000 0 1 0.0000000 0
#> [3,] 0 0 0.2356738 0 0 0.7643262 0
#> [4,] 0 0 0.0000000 0 0 0.0000000 1
#> [5,] 0 1 0.0000000 0 0 0.0000000 0
#> [6,] 0 0 0.0000000 1 0 0.0000000 0
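If a single predicted category per unit is required, one possibility (a sketch, not part of the package workflow) is to assign each row of Z_pred the category with the largest predicted weight; ties, if any, are resolved by which.max taking the first maximum.
# hypothetical post-processing: most probable category for each row of Z_pred
pred_cat <- colnames(Z_pred)[apply(Z_pred, 1, which.max)]
head(pred_cat)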