# An Introduction to HMLasso

#### 2019-08-02

We introduce a simple regression problem, and compare the performance of mean imputation, CoCoLasso, and HMLasso. It takes several minutes to run this vignette because of our simple implementation. To see the details of HMLasso, please refer to the following paper.

Takada, M., Fujisawa, H., & Nishikawa, T. (2019). “HMLasso: Lasso with High Missing Rate.” IJCAI. <arXiv:1811.00255>.

NOTE: Because of the simple implementation, less than 200 variables are desirable from the perspective of computation efficiency.

knitr::opts_chunk$set(echo = TRUE) library(hmlasso) library(MASS) ## Data Preparation Next, we define a simple regression problem. # setting n <- 2000 # number of samples p <- 20 # number of dimensions r <- 0.5 # correlation levels eps <- 1 # noise level # construct covariance matrix mu <- rep(0, length=p) Sigma <- matrix(r, nrow=p, ncol=p) # correlation among variables is 0 diag(Sigma) <- 1 # variance of each variable is 1 # generate training data set.seed(0) X <- mvrnorm(n, mu, Sigma) b <- matrix(0, nrow = p, ncol = 1) b[1:10,] <- (-1)^(1:10) * (1:10) y <- X %*% b + eps * rnorm(n) colnames(X) <- paste0("V", 1:p) rownames(b) <- paste0("V", 1:p) # introduce missing data X_incompl <- X mrc <- runif(p, min=0.1, max=0.9) # missing rate of each column for (j in 1:p) { mr <- sample(1:nrow(X), round(nrow(X) * mrc[j])) X_incompl[mr, j] <- NA } image(t(apply(apply(X_incompl, c(1,2), function(v){ifelse(is.na(v), 1, 0)}), 2, rev)), col=grey(11:0/12), main="missing pattern") # visualize (black:na, white:fill) # generate test data n_ts <- 10000 X_ts <- mvrnorm(n_ts, mu, Sigma) y_ts <- X_ts %*% b + eps * rnorm(n_ts) colnames(X_ts) <- paste0("V", 1:p) # setup cv fold nfolds <- 5 foldid1 <- sample(rep(1:nfolds, (n %/% nfolds)), replace=FALSE) foldid2 <- sample(1:nfolds, (n %% nfolds), replace=FALSE) foldid <- c(foldid1, foldid2) ## Mean Imputation We apply mean imputation method to the problem. # MeanImp cv_fit <- cv.hmlasso(X_incompl, y, nlambda=50, lambda.min.ratio=1e-1, foldid=foldid, impute_method="mean", direct_prediction=FALSE, positify="mean") plot(cv_fit) # plot lambda-MSE plot(cv_fit$fit) # plot solution path

cv_fit$cve[cv_fit$lambda.min.index] # minimum cross-valiation error
## [1] 160.8224
cv_fit$fit$beta[, cv_fit$lambda.min.index] # estimated coefficient ## V1 V2 V3 V4 V5 V6 V7 ## 0.000000 0.000000 -1.014909 3.029420 -1.541143 4.098668 -4.438788 ## V8 V9 V10 V11 V12 V13 V14 ## 3.277524 -7.166381 8.275885 0.000000 0.000000 0.000000 0.000000 ## V15 V16 V17 V18 V19 V20 ## 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 pred <- predict(cv_fit$fit, X_ts) # predict
sqrt(sum((cv_fit$fit$beta[, cv_fit$lambda.min.index] - as.vector(b))^2)) # estimation error ## [1] 7.788304 sqrt(sum((pred[, cv_fit$lambda.min.index] - y_ts)^2) / n_ts) # prediction error
## [1] 5.681397

## CoCoLasso

We apply CoCoLasso to the problem.

# CoCoLasso
cv_fit <- cv.hmlasso(X_incompl, y, nlambda=50, lambda.min.ratio=1e-1,
foldid=foldid, direct_prediction=TRUE,
plot(cv_fit) # plot lambda-MSE

plot(cv_fit$fit) # plot solution path cv_fit$cve[cv_fit$lambda.min.index] # minimum cross-valiation error ## [1] 56.62824 cv_fit$fit$beta[, cv_fit$lambda.min.index] # estimated coefficient
##        V1        V2        V3        V4        V5        V6        V7
##  0.000000  0.000000 -1.093565  3.116700 -1.612793  4.151754 -4.508953
##        V8        V9       V10       V11       V12       V13       V14
##  3.328830 -7.223450  8.355743  0.000000  0.000000  0.000000  0.000000
##       V15       V16       V17       V18       V19       V20
##  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
pred <- predict(cv_fit$fit, X_ts) # predict sqrt(sum((cv_fit$fit$beta[, cv_fit$lambda.min.index] - as.vector(b))^2)) # estimation error
## [1] 7.628155
sqrt(sum((pred[, cv_fit$lambda.min.index] - y_ts)^2) / n_ts) # prediction error ## [1] 5.571061 ## HMLasso We apply HMLasso to the problem. # HMLasso cv_fit <- cv.hmlasso(X_incompl, y, nlambda=50, lambda.min.ratio=1e-1, foldid=foldid, direct_prediction=TRUE, positify="admm_frob", weight_power = 1) plot(cv_fit) # plot lambda-MSE plot(cv_fit$fit) # plot solution path

cv_fit$cve[cv_fit$lambda.min.index] # minimum cross-valiation error
## [1] 52.7777
cv_fit$fit$beta[, cv_fit$lambda.min.index] # estimated coefficient ## V1 V2 V3 V4 V5 V6 ## 0.00000000 0.01459514 -1.24321064 3.27888761 -1.74878873 4.24887374 ## V7 V8 V9 V10 V11 V12 ## -4.64075028 3.42379387 -7.33257437 8.50260041 0.00000000 0.00000000 ## V13 V14 V15 V16 V17 V18 ## 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 ## V19 V20 ## 0.00000000 0.00000000 pred <- predict(cv_fit$fit, X_ts) # predict
sqrt(sum((cv_fit$fit$beta[, cv_fit$lambda.min.index] - as.vector(b))^2)) # estimation error ## [1] 7.329065 sqrt(sum((pred[, cv_fit$lambda.min.index] - y_ts)^2) / n_ts) # prediction error
## [1] 5.36526