library(SplitKnockoff)
Author : Haoxue Wang, Yang Cao, Xinwei Sun, Yuan Yao
Split Knockoff is a data adaptive variable selection framework for controlling the (directional) false discovery rate (FDR) in structural sparsity, where variable selection on linear transformation of parameters is of concern. This proposed scheme relaxes the linear subspace constraint to its neighborhood, often known as variable splitting in optimization, which leads to better incoherence and possible performance improvement in both power and FDR. This vignette illustrates the usage of Splitknockoff with simulation experiments and will help you apply the split knockoff method in a light way. On top of that, an application to Alzheimer’s Disease study with MRI data is given as an example to illustrate that the split knockoff method can disclose important lesion regions in brains associated with the disease and connections between neighboring regions of high contrast variations during disease progression.
install.packages("SplitKnockoff") # just one line code to install our package
This is a R implement on the Matlab version of Split Knockoffs. This R package is more convenient as glmnet can be used directly by
install.packages("glmnet'") # just one line code to install the glmnet tool
Please update your glmnet package to the latest version for smooth usage of this package, the examples illustrated here are tested with glmnet 4.2.
For more information, please see the manual inside this package.
sk.filter(X, D, y, option) : the main function, Split Knockoff filter, for variable selection in structural sparsity problem.
sk.create(X, y, D, nu, option): generate the split knockoff copy for structural sparsity problem.
W_sign(X, D, y, nu, option): generate the W statistics for split knockoff on a split LASSO path.
sk.select(W, q): this function is for variable selection based on W statistics.
install.packages("SplitKnockoff") # install our package
library(latex2exp)
library(ggplot2)
library(Matrix)
library(glmnet)
library(MASS)
library(SplitKnockoff)
<- 20 # sparsity level
k <- 1 # magnitude
A <- 350 # sample size
n <- 100 # dimension of variables
p <- 0.5 # feature correlation
c <-1 # noise level
sigma
<- array(data = NA, dim = length(data), dimnames = NULL)
option
# the target (directional) FDR control
$q <- 0.2
option
# choice on threshold, the other choice is 'knockoff+'
$method <- 'knockoff'
option
# degree of separation between original design and its split knockoff copy
# in the range of [0, 2], the less the more separated
$eta <- 0.1
option
# whether to normalize the dataset
$normalize <- 'true'
option
# choice on the set of regularization parameters for split LASSO path
$lambda <- 10.^seq(0, -6, by=-0.01)
option
# choice of nu for split knockoffs
$nu <- 10
option
# choice on whether to estimate the directional effect, 'disabled'/'enabled'
$sign <- 'enabled' option
<- diag(p)
D <- nrow(D)
m
# generate X
= matrix(0, p, p)
Sigma for( i in 1: p){
for(j in 1: p){
<- c^(abs(i - j))
Sigma[i, j]
} }
library(mvtnorm) # package mvtnorm needed for this generation
set.seed(100)
<- rmvnorm(n,matrix(0, p, 1), Sigma) # generate X X
<- matrix(0, p, 1)
beta_true for( i in 1: k){
1] = A
beta_true[i, if ( i%%3 == 1){
1] = -A
beta_true[i,
}
}<- D %*% beta_true
gamma_true
<- which(gamma_true!=0) S0
# set random seed
set.seed(1)
# generate noise and y
<- rnorm(n) * sqrt(sigma)
varepsilon <- X %*% beta_true + varepsilon y
use the key function filter to get the feature and knockoff significance
## Split Knockoff
<- sk.filter(X, D, y, option)
filter_result <- filter_result$Z
Z_path <- filter_result$t_Z t_Z_path
plot the figure 1
<- sub("TRUE","0",is.element(1:m, S0))
a <- sub("FALSE","1",a)
b <- data.frame(x=Z_path[[1]],y= t_Z_path[[1]], feature=b)
df save.image("C:/Users/v-haoxuewang/Desktop/SplitKnockoff/simu_experiments/figure_1/result/figure_1.RData")
png(file='C:/Users/v-haoxuewang/Desktop/SplitKnockoff/simu_experiments/figure_1/plot/figure_1b.png', height=1000, width=1000)
<- ggplot(data=df,aes(x=x, y=y, color=feature))+geom_point(size=2)
p <-p + labs(title = "Split Knockoff with Lasso Statistic")
p <-p + labs(x = TeX("Value of $Z_i$"))
p <-p + labs(y = TeX("Value of $\\tilde{Z_i}$"))
p <-p + scale_fill_discrete(breaks=c("Null feature","Non-null feature"))
p <-p + geom_abline(intercept=0,slope=1 )
p <-p + scale_colour_discrete(labels=c("Null feature", "Non-null feature"))
p <-p + scale_y_continuous(limits = c(0, Z_path[[1]]))
p print(p)
dev.off()
# 1 June, 2021
# revised 7 July, 2021
# author:Haoxue Wang (haoxwang@student.ethz.ch)
# This script reproduces the figures on FDR and power comparison between
# Split Knockoffs and Knockoffs.
#
# The intermediate result will be automatically saved to
# '../result/temp.RData’ with the proceeding of the calculation. The final
# result will be saved to '../result/examples.mat'. The whole calculation
# may takes hours to finish.
install.packages("SplitKnockoff") # install our package
library(latex2exp)
library(ggplot2)
library(Matrix)
library(glmnet)
library(MASS)
library(SplitKnockoff)
<- 20 # sparsity level
k <- 1 # magnitude
A <- 350 # sample size
n <- 100 # dimension of variables
p <- 0.5 # feature correlation
c
<- array(data=NA, dim = length(data), dimnames = NULL)
option $q <- 0.2
option$eta <- 0.1
option$normalize <- 'true'
option$lambda <- 10.^seq(0, -6, by=-0.01)
option$copy <- 'true'
option$sign <- 'enabled'
option<- option[-1]
option
# settings for nu
<- seq(-1, 3, by=0.2)
expo $nu <- 10.^expo
option<- length(option$nu)
num_nu ## calculation
# generate D1, D2, and D3
= matrix(0,p-1, p)
D_G
for (i in 1:p-1){
<- 1
D_G[i, i] +1] <- -1
D_G[i, i
}
<-diag(p)
D_1 <- D_G
D_2 <- rbind(diag(p), D_G)
D_3 <- array(data = NA, dim = length(data), dimnames = NULL)
D_s $D_1 <- D_1
D_s$D_2 <- D_2
D_s$D_3 <- D_3
D_s<- D_s[-1]
D_s
<- array(0,c(3, num_nu, 2))
fdr_split <- array(0,c(3, num_nu,2))
power_split
# fdr_knock <- matrix(0, 2, 2)
# power_knock <- matrix(0, 2, 2)
<- 2
num_method <- c('knockoff', 'knockoff+')
method_s
# attention! for quick speed, please set the meth manually 1 and 2
# when meth =2, set the ratio calculation offset=1 in function select
= 1
meth for (i in 1: 3){
# choose the respective D for each example
sprintf('Running example for D_%d', i)
<- D_s[[i]]
D <- simu_unit(n, p, D, A, c, k, option)
simu_data <- simu_data$fdr_split
fdr_split[i, , meth ] <- simu_data$power_split
power_split[i, , meth] # if (i < 3){
# fdr_knock[meth, i] <- simu_data$fdr_knock
# power_knock[meth, i] <- simu_data$power_knock
# }
}save.image(file = "C:/Users/v-haoxuewang/Desktop/SplitKnockoff/simu_experiments/examples/result/temp.RData")
# save results
save.image("C:/Users/v-haoxuewang/Desktop/SplitKnockoff/simu_experiments/examples/result/examples.RData")
<- fdr_split
fdr_split_result <- power_split
power_split_result
## for D_1
## plot for FDR
<- expo
x <- fdr_split_result[1, ,1]
fdr_split <- array(fdr_split, dim=c(num_nu, 1))
fdr_split <- fdr_split_result[1,,2]
fdr_split_plus <- array(fdr_split_plus, dim=c(num_nu, 1))
fdr_split_plus
<- data.frame(x,fdr_split,fdr_split_plus)
fdr is.na(fdr)] <- 0
fdr[
png(file='C:/Users/v-haoxuewang/Desktop/SplitKnockoff/simu_experiments/examples/plot/figure_11.png', height=1000, width=1000)
<- ggplot()+geom_line(data=fdr,aes(x=x, y=fdr_split, color='fdr_split'))
p <- p +geom_line(data=fdr,aes(x=x, y=fdr_split_plus, color='fdr_split_plus'))
p <-p + labs(x = TeX("$\\log_{10} (\\nu)$"))
p <-p + labs(y = TeX("FDR"))
p <-p + geom_abline(intercept=0.2,slope=0,linetype="dashed" )
p <-p + scale_fill_discrete(breaks=c("split Knockoff", "split Knockoff+"))
p <-p + scale_colour_discrete(labels=c("split Knockoff", "split Knockoff+"))
p <-p + scale_y_continuous(limits = c(0,1))
p print(p)
dev.off()
## plot for Power
<- expo
x <- power_split_result[1,,1]
power_split <- array(power_split, dim=c(num_nu, 1))
power_split <- power_split_result[1,,2]
power_split_plus <- array(power_split_plus, dim=c(num_nu, 1))
power_split_plus
<- data.frame(x, power_split,power_split_plus)
power png(file='C:/Users/v-haoxuewang/Desktop/SplitKnockoff/simu_experiments/examples/plot/figure_12.png', height=1000, width=1000)
<- ggplot()+geom_line(data=power,aes(x=x, y=power_split, color='split Knockoff'))
q <- q + geom_line(data=power,aes(x=x, y=power_split_plus, color='split Knockoff+'))
q <-q + labs(x = TeX("$\\log_{10} (\\nu)$"))
q <-q + labs(y = TeX("Power"))
q <-q + scale_fill_discrete(breaks=c("split Knockoff", "split Knockoff+"))
q <-q + scale_colour_discrete(labels=c("split Knockoff", "split Knockoff+"))
q <-q+ scale_y_continuous(limits = c(0,1))
q print(q)
dev.off()
## for D_2
## plot for FDR
<- expo
x <- fdr_split_result[2,,1]
fdr_split <- array(fdr_split, dim=c(num_nu, 1))
fdr_split <- fdr_split_result[2, ,2]
fdr_split_plus <- array(fdr_split_plus, dim=c(num_nu, 1))
fdr_split_plus
<- data.frame(x,fdr_split,fdr_split_plus)
fdr is.na(fdr)] <- 0
fdr[# save results
png(file='C:/Users/v-haoxuewang/Desktop/SplitKnockoff/simu_experiments/examples/plot/figure_21.png', height=1000, width=1000)
<- ggplot()+geom_line(data=fdr,aes(x=x, y=fdr_split, color='fdr_split'))
p <- p +geom_line(data=fdr,aes(x=x, y=fdr_split_plus, color='fdr_split_plus'))
p <-p + labs(x = TeX("$\\log_{10} (\\nu)$"))
p <-p + labs(y = TeX("FDR"))
p <-p + geom_abline(intercept=0.2,slope=0,linetype="dashed" )
p <-p + scale_fill_discrete(breaks=c("split Knockoff", "split Knockoff+"))
p <-p + scale_colour_discrete(labels=c("split Knockoff", "split Knockoff+"))
p <-p + scale_y_continuous(limits = c(0,1))
p print(p)
dev.off()
## plot for Power
<- expo
x <- power_split_result[2,,1 ]
power_split <- array(power_split, dim=c(num_nu, 1))
power_split <- power_split_result[2,,2]
power_split_plus <- array(power_split_plus, dim=c(num_nu, 1))
power_split_plus # power_knock_ <- power_knock[1]
# power_knock_plus_ <- power_knock[2]
# power_knock_ <- rep(power_knock_, num_nu)
# power_knock_plus_ <- rep(power_knock_plus_, num_nu)
<- data.frame(x,power_split,power_split_plus)
power png(file='C:/Users/v-haoxuewang/Desktop/SplitKnockoff/simu_experiments/examples/plot/figure_22.png', height=1000, width=1000)
<- ggplot()+geom_line(data=power,aes(x=x, y=power_split, color='power_split'))
q <- q + geom_line(data=power,aes(x=x, y=power_split_plus, color='power_split_plus'))
q <-q + labs(x = TeX("$\\log_{10} (\\nu)$"))
q <-q + labs(y = TeX("Power"))
q <-q + scale_fill_discrete(breaks=c("split Knockoff", "split Knockoff+"))
q <-q+ scale_colour_discrete(labels=c("split Knockoff", "split Knockoff+"))
q <-q+ scale_y_continuous(limits = c(0,1))
q print(q)
dev.off()
## for D_3
## plot for FDR
<- expo
x <- fdr_split_result[3,,1 ]
fdr_split <- array(fdr_split, dim=c(num_nu, 1))
fdr_split <- fdr_split_result[3,,2]
fdr_split_plus <- array(fdr_split_plus, dim=c(num_nu, 1))
fdr_split_plus
<- data.frame(x,fdr_split,fdr_split_plus)
fdr is.na(fdr)] <- 0
fdr[# save results
png(file='C:/Users/v-haoxuewang/Desktop/SplitKnockoff/simu_experiments/examples/plot/figure_31.png', height=1000, width=1000)
<- ggplot()+geom_line(data=fdr,aes(x=x, y=fdr_split, color='fdr_split'))
p <- p +geom_line(data=fdr,aes(x=x, y=fdr_split_plus, color='fdr_split_plus'))
p <-p + labs(x = TeX("$\\log_{10} (\\nu)$"))
p <-p + labs(y = TeX("FDR"))
p <-p + geom_abline(intercept=0.2,slope=0,linetype="dashed" )
p <-p + scale_fill_discrete(breaks=c("split Knockoff", "split Knockoff+"))
p <-p + scale_colour_discrete(labels=c("split Knockoff", "split Knockoff+"))
p <-p + scale_y_continuous(limits = c(0,1))
p print(p)
dev.off()
## plot for Power
<- expo
x <- power_split_result[3,,1 ]
power_split <- array(power_split, dim=c(num_nu, 1))
power_split <- power_split_result[3,,2]
power_split_plus <- array(power_split_plus, dim=c(num_nu, 1))
power_split_plus
<- data.frame(x,power_split,power_split_plus)
power png(file='C:/Users/v-haoxuewang/Desktop/SplitKnockoff/simu_experiments/examples/plot/figure_32.png', height=1000, width=1000)
<- ggplot()+geom_line(data=power,aes(x=x, y=power_split, color='power_split'))
q <- q + geom_line(data=power,aes(x=x, y=power_split_plus, color='power_split_plus'))
q <-q + labs(x = TeX("$\\log_{10} (\\nu)$"))
q <-q + labs(y = TeX("Power"))
q <-q + scale_fill_discrete(breaks=c("split Knockoff", "split Knockoff+"))
q <-q+ scale_colour_discrete(labels=c("split Knockoff ", "split Knockoff+"))
q <-q+ scale_y_continuous(limits = c(0,1))
q print(q)
dev.off()
save.image(file = "C:/Users/v-haoxuewang/Desktop/SplitKnockoff/simu_experiments/examples/result/final.RData")
library(latex2exp)
library(ggplot2)
library(Matrix)
library(glmnet)
library(MASS)
library(SplitKnockoff)
<- array(data=NA, dim = length(data), dimnames = NULL)
option $q <- 0.2
option$eta <- 0.1
option$normalize <- 'true'
option$lambda <- 10.^seq(0, -6, by=-0.01)
option$copy <- 'true'
option$sign <- 'enabled'
option<- option[-1]
option
# settings for nu
<- seq(-1, 1, by=0.1)
expo $nu <- 10.^expo
option<- length(option$nu)
num_nu
= getwd()
root setwd(sprintf('%s/data/', root))
= read.csv('X.csv')
X =as.matrix(X)
X = read.csv('y.csv')
y = as.matrix(y)
y = read.csv('D.csv')
D = as.matrix(D)
D = diag(90)
D
<- splitknockoff.filter(X, D, y, option)
filter_result <- filter_result$results
results <- filter_result$r
r save(results, file="results/region.RData")
save(r, file="results/sign.RData")
library(latex2exp)
library(ggplot2)
library(Matrix)
library(glmnet)
library(MASS)
library(SplitKnockoff)
<- array(data=NA, dim = length(data), dimnames = NULL)
option $q <- 0.2
option$eta <- 0.1
option$normalize <- 'true'
option$lambda <- 10.^seq(0, -6, by=-0.01)
option$copy <- 'true'
option$sign <- 'enabled'
option<- option[-1]
option
# settings for nu
<- seq(-1, 1, by=0.1)
expo $nu <- 10.^expo
option<- length(option$nu)
num_nu
# original .mat data
= getwd()
root setwd(sprintf('%s/data/', root))
= readMat('X.mat')
X = X$X
X = readMat('y.mat')
y = y$y
y = readMat('D.mat')
D = D$D
D #
# root = getwd()
# setwd(sprintf('%s/data/', root))
# X = read.csv('X.csv')
# X =as.matrix(X)
# y = read.csv('y.csv')
# y = as.matrix(y)
# D = read.csv('D.csv')
# D = as.matrix(D)
<- splitknockoff.filter(X, D, y, option)
filter_result <- filter_result$results
results <- filter_result$r
r save(results, file="results/connection.RData")
save(r, file="results/sign.RData")
It usually takes 50min-80min to simulate each experiment. If you want to see the result of the experiment directly, you can reload our generated result by
load("...../SplitKnockoff/simu_experiments/figure_1/result/figure_1.RData")
load("...../SplitKnockoff/simu_experiments/examples/result/final.RData")
If you have any question or idea, please contact the author of this R package Haoxue Wang (haoxwang@student.ethz.ch).