## ----include = FALSE---------------------------------------------------------- library(knitr) library(prettydoc) knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----contingency table-------------------------------------------------------- library(laeken) library(sampling) library(StratifiedSampling) data("eusilc") eusilc <- na.omit(eusilc) N <- nrow(eusilc) # Xm are the matching variables and id are identity of the units Xm <- eusilc[,c("hsize","db040","age","rb090","pb220a")] Xmcat <- do.call(cbind,apply(Xm[,c(2,4,5)],MARGIN = 2,FUN = disjunctive)) Xm <- cbind(Xmcat,Xm[,-c(2,4,5)]) id <- eusilc$rb030 # categorial income splitted by the percentile c_income <- eusilc$eqIncome q <- quantile(eusilc$eqIncome, probs = seq(0, 1, 0.15)) c_income[which(eusilc$eqIncome <= q[2])] <- "(0,15]" c_income[which(q[2] < eusilc$eqIncome & eusilc$eqIncome <= q[3])] <- "(15,30]" c_income[which(q[3] < eusilc$eqIncome & eusilc$eqIncome <= q[4])] <- "(30,45]" c_income[which(q[4] < eusilc$eqIncome & eusilc$eqIncome <= q[5])] <- "(45,60]" c_income[which(q[5] < eusilc$eqIncome & eusilc$eqIncome <= q[6])] <- "(60,75]" c_income[which(q[6] < eusilc$eqIncome & eusilc$eqIncome <= q[7])] <- "(75,90]" c_income[which( eusilc$eqIncome > q[7] )] <- "(90,100]" # variable of interests Y <- data.frame(ecostat = eusilc$pl030) Z <- data.frame(c_income = c_income) # put same rownames rownames(Xm) <- rownames(Y) <- rownames(Z)<- id YZ <- table(cbind(Y,Z)) addmargins(YZ) ## ----sample_scheme------------------------------------------------------------ # size of sample n1 <- 1000 n2 <- 500 # samples s1 <- srswor(n1,N) s2 <- srswor(n2,N) # extract matching units X1 <- Xm[s1 == 1,] X2 <- Xm[s2 == 1,] # extract variable of interest Y1 <- data.frame(Y[s1 == 1,]) colnames(Y1) <- colnames(Y) Z2 <- as.data.frame(Z[s2 == 1,]) colnames(Z2) <- colnames(Z) # extract correct identities id1 <- id[s1 == 1] id2 <- id[s2 == 1] # put correct rownames rownames(Y1) <- id1 rownames(Z2) <- id2 # here weights are inverse of inclusion probabilities d1 <- rep(N/n1,n1) d2 <- rep(N/n2,n2) # disjunctive form Y_dis <- sampling::disjunctive(as.matrix(Y)) Z_dis <- sampling::disjunctive(as.matrix(Z)) Y1_dis <- Y_dis[s1 ==1,] Z2_dis <- Z_dis[s2 ==1,] ## ----harm--------------------------------------------------------------------- re <- harmonize(X1,d1,id1,X2,d2,id2) # if we want to use the population totals to harmonize we can use re <- harmonize(X1,d1,id1,X2,d2,id2,totals = c(N,colSums(Xm))) w1 <- re$w1 w2 <- re$w2 colSums(Xm) colSums(w1*X1) colSums(w2*X2) ## ----OT----------------------------------------------------------------------- # Optimal transport matching object <- otmatch(X1,id1,X2,id2,w1,w2) head(object[,1:3]) Y1_ot <- cbind(X1[as.character(object$id1),],y = Y1[as.character(object$id1),]) Z2_ot <- cbind(X2[as.character(object$id2),],z = Z2[as.character(object$id2),]) YZ_ot <- tapply(object$weight,list(Y1_ot$y,Z2_ot$z),sum) # transform NA into 0 YZ_ot[is.na(YZ_ot)] <- 0 # result round(addmargins(YZ_ot),3) ## ----BS----------------------------------------------------------------------- # Balanced Sampling BS <- bsmatch(object) head(BS$object[,1:3]) Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),]) Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),]) YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum) YZ_bs[is.na(YZ_bs)] <- 0 round(addmargins(YZ_bs),3) # With Z2 as auxiliary information for stratified balanced sampling. BS <- bsmatch(object,Z2) Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),]) Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),]) YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum) YZ_bs[is.na(YZ_bs)] <- 0 round(addmargins(YZ_bs),3) ## ----------------------------------------------------------------------------- # split the weight by id1 q_l <- split(object$weight,f = object$id1) # normalize in each id1 q_l <- lapply(q_l, function(x){x/sum(x)}) q <- as.numeric(do.call(c,q_l)) Z_pred <- t(q*disjunctive(object$id1))%*%disjunctive(Z2[as.character(object$id2),]) colnames(Z_pred) <- levels(factor(Z2$c_income)) head(Z_pred)