library(sureLDA)
## Loading required package: Matrix
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
Let N denote the number of patients, W the number of EHR features, and K the number of target phenotypes to be predicted. Our input data consist of 1) X, an NxW matrix of EHR feature counts, 2) ICD, an NxK matrix of key ICD code counts for each target phenotype, 3) NLP, an NxK matrix of key NLP feature counts for each target phenotype, 4) HU, an N-dimensional vector of healthcare utilization measurements (i.e. total patient encounters in a patient’s chart), and 5) filter, an NxK matrix of filter indicators for each target phenotype (we assume P(phenotype | filter=0) = 0).
First, we evaluate sureLDA with a PheNorm-generated prior (default) for prediction of 10 target phenotypes using a simulated dataset. We employ 10 ‘empty’ topics (this should generally be set in the range of 10-100).
# Fit sureLDA on the simulated data with 10 'empty' topics, leaving `prior`
# at its default.
# The assignment sits outside with(): assignments made inside with() land in
# the temporary environment built from `simdata` and are discarded on return.
surelda_run_phenorm <- with(
  simdata,
  sureLDA(X, ICD, NLP, HU, filter, nEmpty = 10)
)
## Starting PheNorm
## Starting Guided LDA
## Starting final clustering
Evaluating AUCs of sureLDA scores across 10 phenotypes
# One AUC per phenotype (one per column of simdata$filter), using column k of
# simdata$Y as the response and column k of the sureLDA scores as predictor.
# as.numeric() strips the "auc" class so vapply() returns a plain numeric
# vector with a guaranteed length-one value per phenotype.
surelda_scores_phenorm_aucs <- vapply(
  seq_len(ncol(simdata$filter)),
  function(k) {
    as.numeric(pROC::auc(simdata$Y[, k], surelda_run_phenorm$scores[, k]))
  },
  numeric(1)
)
Evaluating AUCs of predicted probabilities across 10 phenotypes
# One AUC per phenotype (one per column of simdata$filter), using column k of
# simdata$Y as the response and column k of the sureLDA ensemble output as
# predictor. The result is bound at top level (not inside the closure) so the
# later rbind() can see it.
surelda_ensemble_phenorm_aucs <- vapply(
  seq_len(ncol(simdata$filter)),
  function(k) {
    as.numeric(pROC::auc(simdata$Y[, k], surelda_run_phenorm$ensemble[, k]))
  },
  numeric(1)
)
AUCs:
# Stack the two AUC vectors into a 2-row matrix (one column per phenotype),
# label the rows, and print the table.
surelda_result_combined <- rbind(
  surelda_scores_phenorm_aucs,
  surelda_ensemble_phenorm_aucs
)
rownames(surelda_result_combined) <- c("sureLDA Scores", "sureLDA Probs")
print(surelda_result_combined)
## [,1] [,2] [,3] [,4] [,5] [,6]
## sureLDA Scores 0.9106220 0.8673095 0.8748029 0.9114881 0.8786685 0.8759985
## sureLDA Probs 0.9110456 0.8637459 0.8726527 0.9097024 0.8771739 0.8645873
## [,7] [,8] [,9] [,10]
## sureLDA Scores 0.9959016 0.7775945 0.8735303 0.8731959
## sureLDA Probs 0.9873634 0.7690034 0.8666961 0.8962199
Next, we evaluate sureLDA’s predictions of the same 10 target phenotypes using the same data but given the prior, weight, and phi estimators from the previous run.
# Re-run sureLDA on the same data, supplying the prior, weight and phi
# returned by the previous fit.
# The assignment sits outside with(): a `<-` inside with() binds the name in
# the temporary environment built from `simdata`, leaving `surelda_prediction`
# undefined in the workspace.
surelda_prediction <- with(
  simdata,
  sureLDA(
    X, ICD, NLP, HU, filter,
    prior = surelda_run_phenorm$prior,
    nEmpty = 10,
    weight = surelda_run_phenorm$weight,
    phi = surelda_run_phenorm$phi
  )
)
## Inferring theta given provided phi
## Starting final clustering
Evaluating AUCs of sureLDA scores across 10 phenotypes
# One AUC per phenotype (one per column of simdata$filter), using column k of
# simdata$Y as the response and column k of the prediction-run scores as
# predictor. The result is bound at top level (not inside the closure) so the
# later rbind() can see it.
surelda_scores_prediction_aucs <- vapply(
  seq_len(ncol(simdata$filter)),
  function(k) {
    as.numeric(pROC::auc(simdata$Y[, k], surelda_prediction$scores[, k]))
  },
  numeric(1)
)
Evaluating AUCs of predicted probabilities across 10 phenotypes
# One AUC per phenotype (one per column of simdata$filter), using column k of
# simdata$Y as the response and column k of the prediction-run ensemble output
# as predictor. The result is bound at top level (not inside the closure) so
# the later rbind() can see it.
surelda_ensemble_prediction_aucs <- vapply(
  seq_len(ncol(simdata$filter)),
  function(k) {
    as.numeric(pROC::auc(simdata$Y[, k], surelda_prediction$ensemble[, k]))
  },
  numeric(1)
)
AUCs:
# Stack the two prediction-run AUC vectors into a 2-row matrix (one column per
# phenotype), label the rows, and print the table.
surelda_prediction_result_combined <- rbind(
  surelda_scores_prediction_aucs,
  surelda_ensemble_prediction_aucs
)
rownames(surelda_prediction_result_combined) <- c(
  "sureLDA Scores",
  "sureLDA Probs"
)
print(surelda_prediction_result_combined)
## [,1] [,2] [,3] [,4] [,5] [,6]
## sureLDA Scores 0.9112489 0.8667855 0.8717209 0.9122321 0.8727717 0.8697223
## sureLDA Probs 0.9128416 0.8655277 0.8731902 0.9132143 0.8772283 0.8689616
## [,7] [,8] [,9] [,10]
## sureLDA Scores 0.9953893 0.7754639 0.8683128 0.9025430
## sureLDA Probs 0.9889003 0.7758763 0.8668430 0.8988316
Total time spent:
# Report user CPU, system CPU and elapsed time for the running R process.
proc.time()
## user system elapsed
## 77.285 4.192 82.224