We introduce a simple regression problem and compare the performance of mean imputation, CoCoLasso, and HMLasso. Because of our simple implementation, this vignette takes several minutes to run. For the details of HMLasso, please refer to the following paper.
Takada, M., Fujisawa, H., & Nishikawa, T. (2019). “HMLasso: Lasso with High Missing Rate.” IJCAI. <arXiv:1811.00255>.
NOTE: Because of the simple implementation, fewer than 200 variables are desirable for computational efficiency.
We first define the regression problem and generate synthetic data.
library(hmlasso)
library(MASS) # for mvrnorm

# setting
n <- 2000 # number of samples
p <- 20 # number of dimensions
r <- 0.5 # correlation levels
eps <- 1 # noise level
# construct covariance matrix
mu <- rep(0, length=p)
Sigma <- matrix(r, nrow=p, ncol=p) # correlation among variables is r (= 0.5)
diag(Sigma) <- 1 # variance of each variable is 1
# generate training data
set.seed(0)
X <- mvrnorm(n, mu, Sigma)
b <- matrix(0, nrow = p, ncol = 1)
b[1:10,] <- (-1)^(1:10) * (1:10)
y <- X %*% b + eps * rnorm(n)
colnames(X) <- paste0("V", 1:p)
rownames(b) <- paste0("V", 1:p)
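As an optional sanity check (a base-R sketch, not required by hmlasso), we can look at the signal strength of this design; with the b and Sigma above, the population signal variance is t(b) %*% Sigma %*% b = 205, against a noise variance of eps^2 = 1.
t(b) %*% Sigma %*% b # population signal variance (205)
var(as.vector(X %*% b)) # empirical counterpart on the training data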
# introduce missing data
X_incompl <- X
mrc <- runif(p, min=0.1, max=0.9) # missing rate of each column
for (j in 1:p) {
  na_rows <- sample(1:nrow(X), round(nrow(X) * mrc[j])) # rows to set missing in column j
  X_incompl[na_rows, j] <- NA
}
image(t(apply(1 * is.na(X_incompl), 2, rev)),
      col=grey(11:0/12), main="missing pattern") # visualize (black: NA, white: observed)
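Since the per-column missing rates were drawn uniformly from [0.1, 0.9], it is worth confirming the realized rates with a small base-R check.
round(colMeans(is.na(X_incompl)), 2) # realized missing rate per column
mean(is.na(X_incompl)) # overall missing rate, around 0.5 on average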
# generate test data
n_ts <- 10000
X_ts <- mvrnorm(n_ts, mu, Sigma)
y_ts <- X_ts %*% b + eps * rnorm(n_ts)
colnames(X_ts) <- paste0("V", 1:p)
# set up cross-validation folds
nfolds <- 5
foldid1 <- sample(rep(1:nfolds, n %/% nfolds), replace=FALSE) # balanced assignment
foldid2 <- sample(1:nfolds, n %% nfolds, replace=FALSE) # remainder (empty when nfolds divides n)
foldid <- c(foldid1, foldid2)
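The fold assignment can be checked with a quick base-R call; each of the 5 folds should contain 400 of the 2000 samples.
table(foldid) # fold sizes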
First, we apply the mean imputation method: missing entries are replaced by their column means, and the lasso is fitted to the completed data.
# MeanImp
cv_fit <- cv.hmlasso(X_incompl, y, nlambda=50, lambda.min.ratio=1e-1,
                     foldid=foldid, impute_method="mean",
                     direct_prediction=FALSE, positify="mean")
plot(cv_fit) # plot lambda-MSE
min(cv_fit$cve) # minimum cross-validation error
## [1] 160.8224
cv_fit$fit$beta[, cv_fit$lambda.min.index] # estimated coefficients at lambda.min
##        V1        V2        V3        V4        V5        V6        V7
##  0.000000  0.000000 -1.014909  3.029420 -1.541143  4.098668 -4.438788
##        V8        V9       V10       V11       V12       V13       V14
##  3.277524 -7.166381  8.275885  0.000000  0.000000  0.000000  0.000000
##       V15       V16       V17       V18       V19       V20
##  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
pred <- predict(cv_fit$fit, X_ts) # predict
sqrt(sum((cv_fit$fit$beta[, cv_fit$lambda.min.index] - as.vector(b))^2)) # estimation error
## [1] 7.788304
sqrt(mean((pred[, cv_fit$lambda.min.index] - y_ts)^2)) # prediction error
## [1] 5.681397
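The two methods below both start from the covariance matrix estimated on pairwise-complete observations. Such an estimate is generally not positive semidefinite, which is why cv.hmlasso offers the positify argument. The following base-R sketch (not part of the hmlasso API) illustrates the problem:
S_pair <- cov(X_incompl, use="pairwise.complete.obs") # pairwise covariance estimate
min(eigen(S_pair, symmetric=TRUE, only.values=TRUE)$values) # may be negative, i.e. not PSD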
Next, we apply CoCoLasso. It projects the pairwise covariance estimate onto the positive-semidefinite cone under the element-wise maximum norm (positify="admm_max"), with no observation-ratio weighting (weight_power = 0).
# CoCoLasso
cv_fit <- cv.hmlasso(X_incompl, y, nlambda=50, lambda.min.ratio=1e-1,
                     foldid=foldid, direct_prediction=TRUE,
                     positify="admm_max", weight_power = 0)
plot(cv_fit) # plot lambda-MSE
min(cv_fit$cve) # minimum cross-validation error
## [1] 56.62824
cv_fit$fit$beta[, cv_fit$lambda.min.index] # estimated coefficients at lambda.min
##        V1        V2        V3        V4        V5        V6        V7
##  0.000000  0.000000 -1.093565  3.116700 -1.612793  4.151754 -4.508953
##        V8        V9       V10       V11       V12       V13       V14
##  3.328830 -7.223450  8.355743  0.000000  0.000000  0.000000  0.000000
##       V15       V16       V17       V18       V19       V20
##  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
pred <- predict(cv_fit$fit, X_ts) # predict
sqrt(sum((cv_fit$fit$beta[, cv_fit$lambda.min.index] - as.vector(b))^2)) # estimation error
## [1] 7.628155
sqrt(mean((pred[, cv_fit$lambda.min.index] - y_ts)^2)) # prediction error
## [1] 5.571061
Finally, we apply HMLasso. It also starts from the pairwise covariance estimate, but performs the positive-semidefinite projection under a weighted Frobenius norm (positify="admm_frob"), weighting each entry by its pairwise observation ratio (weight_power = 1).
# HMLasso
cv_fit <- cv.hmlasso(X_incompl, y, nlambda=50, lambda.min.ratio=1e-1,
                     foldid=foldid, direct_prediction=TRUE,
                     positify="admm_frob", weight_power = 1)
plot(cv_fit) # plot lambda-MSE
min(cv_fit$cve) # minimum cross-validation error
## [1] 52.7777
cv_fit$fit$beta[, cv_fit$lambda.min.index] # estimated coefficients at lambda.min
##          V1          V2          V3          V4          V5          V6
##  0.00000000  0.01459514 -1.24321064  3.27888761 -1.74878873  4.24887374
##          V7          V8          V9         V10         V11         V12
## -4.64075028  3.42379387 -7.33257437  8.50260041  0.00000000  0.00000000
##         V13         V14         V15         V16         V17         V18
##  0.00000000  0.00000000  0.00000000  0.00000000  0.00000000  0.00000000
##         V19         V20
##  0.00000000  0.00000000
pred <- predict(cv_fit$fit, X_ts) # predict
sqrt(sum((cv_fit$fit$beta[, cv_fit$lambda.min.index] - as.vector(b))^2)) # estimation error
## [1] 7.329065
sqrt(mean((pred[, cv_fit$lambda.min.index] - y_ts)^2)) # prediction error
## [1] 5.36526
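For reference, we collect the errors reported above into a small table (values copied from the outputs above). Both errors decrease from mean imputation to CoCoLasso to HMLasso, in line with the paper.
data.frame(method = c("MeanImp", "CoCoLasso", "HMLasso"),
           estimation_error = c(7.788304, 7.628155, 7.329065),
           prediction_error = c(5.681397, 5.571061, 5.36526))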