Simulated Example

library(sureLDA)

## Loading required package: Matrix

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

Let N denote the number of patients, W the number of EHR features, and K the number of target phenotypes to be predicted. Our input data consists of 1) X, an NxW matrix of EHR feature counts, 2) ICD, an NxK matrix of key ICD code counts for each target phenotype, 3) NLP, an NxK matrix of key NLP feature counts for each target phenotype, 4) HU, an N-dimensional vector of healthcare utilization measurements (i.e. total patient encounters in a patient’s chart), and 5) an NxK matrix of filter indicators for each target phenotype (we assume P(phenotype | filter=0) = 0).

First, we evaluate sureLDA with a PheNorm-generated prior (default) for prediction of 10 target phenotypes using a simulated dataset. We employ 10 ‘empty’ topics (this should generally be set in the range of 10-100).

surelda_run_phenorm <- with(
  simdata, sureLDA(X, ICD, NLP, HU, filter, nEmpty = 10))

## Starting PheNorm

## Starting Guided LDA

## Starting final clustering

Evaluating AUCs of sureLDA scores across 10 phenotypes

surelda_scores_phenorm_aucs <- sapply(1:ncol(simdata$filter),function(k){
  pROC::auc(simdata$Y[,k],surelda_run_phenorm$scores[,k])
})

Evaluating AUCs of predicted probabilities across 10 phenotypes

surelda_ensemble_phenorm_aucs <- sapply(1:ncol(simdata$filter),function(k){
  auc(simdata$Y[,k],surelda_run_phenorm$ensemble[,k])
})

AUCs:

surelda_result_combined <- rbind(surelda_scores_phenorm_aucs,surelda_ensemble_phenorm_aucs)
rownames(surelda_result_combined) <- c('sureLDA Scores','sureLDA Probs')
print(surelda_result_combined)

##                     [,1]      [,2]      [,3]      [,4]      [,5]      [,6]
## sureLDA Scores 0.9106220 0.8673095 0.8748029 0.9114881 0.8786685 0.8759985
## sureLDA Probs  0.9110456 0.8637459 0.8726527 0.9097024 0.8771739 0.8645873
##                     [,7]      [,8]      [,9]     [,10]
## sureLDA Scores 0.9959016 0.7775945 0.8735303 0.8731959
## sureLDA Probs  0.9873634 0.7690034 0.8666961 0.8962199

Next, we evaluate sureLDA’s predictions of the same 10 target phenotypes using the same data but given the prior and phi estimators from the previous run.

surelda_prediction <- with(simdata,
                        sureLDA(X, ICD, NLP, HU, filter, prior = surelda_run_phenorm$prior, nEmpty = 10,
                              weight = surelda_run_phenorm$weight, phi = surelda_run_phenorm$phi))

## Inferring theta given provided phi

## Starting final clustering

Evaluating AUCs of sureLDA scores across 10 phenotypes

surelda_scores_prediction_aucs <- sapply(1:ncol(simdata$filter),function(k){
  auc(simdata$Y[,k],surelda_prediction$scores[,k])
})

Evaluating AUCs of predicted probabilities across 10 phenotypes

surelda_ensemble_prediction_aucs <- sapply(1:ncol(simdata$filter),function(k){
  auc(simdata$Y[,k],surelda_prediction$ensemble[,k])
})

AUCs:

surelda_prediction_result_combined <- rbind(surelda_scores_prediction_aucs,surelda_ensemble_prediction_aucs)
rownames(surelda_prediction_result_combined) <- c('sureLDA Scores','sureLDA Probs')
print(surelda_prediction_result_combined)

##                     [,1]      [,2]      [,3]      [,4]      [,5]      [,6]
## sureLDA Scores 0.9112489 0.8667855 0.8717209 0.9122321 0.8727717 0.8697223
## sureLDA Probs  0.9128416 0.8655277 0.8731902 0.9132143 0.8772283 0.8689616
##                     [,7]      [,8]      [,9]     [,10]
## sureLDA Scores 0.9953893 0.7754639 0.8683128 0.9025430
## sureLDA Probs  0.9889003 0.7758763 0.8668430 0.8988316

Total time spent:

proc.time()

##    user  system elapsed 
##  77.285   4.192  82.224