## Loading required package: Matrix
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

Let N denote the number of patients, W the number of EHR features, and K the number of target phenotypes to be predicted. Our input data consists of 1) X, an NxW matrix of EHR feature counts, 2) ICD, an NxK matrix of key ICD code counts for each target phenotype, 3) NLP, an NxK matrix of key NLP feature counts for each target phenotype, 4) HU, an N-dimensional vector of healthcare utilization measurements (i.e. total patient encounters in a patient’s chart), and 5) an NxK matrix of filter indicators for each target phenotype (we assume P(phenotype | filter=0) = 0).

First, we evaluate sureLDA with a PheNorm-generated prior (default) for prediction of 10 target phenotypes using a simulated dataset. We employ 10 ‘empty’ topics (this should generally be set in the range of 10-100).

surelda_run_phenorm <- with(
  simdata, sureLDA(X, ICD, NLP, HU, filter, nEmpty = 10)
)
## [1] "Starting PheNorm"
## Warning in if (typeof(weight) == "character" & weight == "uniform") {: the
## condition has length > 1 and only the first element will be used
## [1] "Starting Guided LDA"
## [1] "Starting final clustering"

Evaluating AUCs of sureLDA scores across 10 phenotypes

surelda_scores_phenorm_aucs <- sapply(1:ncol(simdata$filter), function(k) {
  pROC::auc(simdata$Y[, k], surelda_run_phenorm$scores[, k])
})

Evaluating AUCs of predicted probabilities across 10 phenotypes

surelda_ensemble_phenorm_aucs <- sapply(1:ncol(simdata$filter), function(k) {
  auc(simdata$Y[, k], surelda_run_phenorm$ensemble[, k])
})

AUCs:

surelda_result_combined <- rbind(surelda_scores_phenorm_aucs, surelda_ensemble_phenorm_aucs)
rownames(surelda_result_combined) <- c("sureLDA Scores", "sureLDA Probs")
print(surelda_result_combined)
##                     [,1]      [,2]      [,3]      [,4]      [,5]      [,6]
## sureLDA Scores 0.9111303 0.8685672 0.8728856 0.9111905 0.8777174 0.8555534
## sureLDA Probs  0.9122825 0.8636673 0.8718642 0.9083631 0.8750543 0.8614492
##                     [,7]      [,8]      [,9]     [,10]
## sureLDA Scores 0.9959016 0.7792440 0.8736038 0.9015808
## sureLDA Probs  0.9868511 0.7727148 0.8665491 0.8958076

Next, we evaluate sureLDA’s predictions of the same 10 target phenotypes using the same data but given the prior and phi estimators from the previous run.

surelda_prediction <- with(
  simdata,
  sureLDA(X, ICD, NLP, HU, filter,
    prior = surelda_run_phenorm$prior, nEmpty = 10,
    weight = surelda_run_phenorm$weight, phi = surelda_run_phenorm$phi
  )
)
## Warning in if (typeof(weight) == "character" & weight == "uniform") {: the
## condition has length > 1 and only the first element will be used
## [1] "Inferring theta given provided phi"
## [1] "Starting final clustering"

Evaluating AUCs of sureLDA scores across 10 phenotypes

surelda_scores_prediction_aucs <- sapply(1:ncol(simdata$filter), function(k) {
  auc(simdata$Y[, k], surelda_prediction$scores[, k])
})

Evaluating AUCs of predicted probabilities across 10 phenotypes

surelda_ensemble_prediction_aucs <- sapply(1:ncol(simdata$filter), function(k) {
  auc(simdata$Y[, k], surelda_prediction$ensemble[, k])
})

AUCs:

surelda_prediction_result_combined <- rbind(surelda_scores_prediction_aucs, surelda_ensemble_prediction_aucs)
rownames(surelda_prediction_result_combined) <- c("sureLDA Scores", "sureLDA Probs")
print(surelda_prediction_result_combined)
##                     [,1]      [,2]      [,3]      [,4]      [,5]      [,6]
## sureLDA Scores 0.9117572 0.8666806 0.8697857 0.9115774 0.8727717 0.8669646
## sureLDA Probs  0.9132991 0.8660256 0.8711833 0.9123810 0.8750543 0.8674401
##                     [,7]      [,8]      [,9]     [,10]
## sureLDA Scores 0.9959016 0.7758763 0.8684597 0.9014433
## sureLDA Probs  0.9880464 0.7758763 0.8666961 0.8956701

Total time spent:

##    user  system elapsed 
##  82.091   0.487  82.708