Load Data.

data(ehr_data)
data <- PhecapData(ehr_data, "healthcare_utilization", "label", 0.4, patient_id = "patient_id")
data
## PheCAP Data
## Feature: 10000 observations of 587 variables
## Label: 119 yes, 62 no, 9819 missing
## Size of training samples: 109
## Size of validation samples: 72

Specify the surrogate used for surrogate-assisted feature extraction (SAFE). The typical way is to specify a main ICD code, a main NLP CUI, as well as their combination. In some cases one may want to define surrogate through lab test. The default lower_cutoff is 1, and the default upper_cutoff is 10. Feel free to change the cutoffs based on domain knowledge.

surrogates <- list(
  PhecapSurrogate(
    variable_names = "main_ICD",
    lower_cutoff = 1, upper_cutoff = 10),
  PhecapSurrogate(
    variable_names = "main_NLP",
    lower_cutoff = 1, upper_cutoff = 10),
  PhecapSurrogate(
    variable_names = c("main_ICD", "main_NLP"),
    lower_cutoff = 1, upper_cutoff = 10))

Run surrogate-assisted feature extraction (SAFE) and show result.

system.time(feature_selected <- phecap_run_feature_extraction(data, surrogates))
##    user  system elapsed 
## 243.326   0.233 244.236
feature_selected
## Feature(s) selected by surrogate-assisted feature extraction (SAFE)
## [1] "main_ICD" "main_NLP" "NLP56"    "NLP93"    "NLP160"   "NLP161"   "NLP306"  
## [8] "NLP403"   "NLP536"

Train phenotyping model and show the fitted model, with the AUC on the training set as well as random splits

suppressWarnings(model <- phecap_train_phenotyping_model(data, surrogates, feature_selected))
model
## Phenotyping model:
## $lasso_bic
##            (Intercept)               main_ICD               main_NLP 
##              0.8165615              2.3789729              2.7761557 
##      main_ICD&main_NLP healthcare_utilization                  NLP56 
##             -1.6390179             -1.3316767              0.0000000 
##                  NLP93                 NLP160                 NLP161 
##             -1.6091941              0.0000000              0.0000000 
##                 NLP306                 NLP403                 NLP536 
##              0.0000000              0.0000000              0.9910775 
## 
## AUC on training data: 0.988 
## Average AUC on random splits: 0.971

Validate phenotyping model using validation label, and show the AUC and ROC

validation <- phecap_validate_phenotyping_model(data, model)
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
validation
## AUC on validation data: 0.867 
## AUC on training data: 0.988 
## Average AUC on random splits: 0.971
round(validation$valid_roc[validation$valid_roc[, "FPR"] <= 0.2, ], 3)
##       cutoff pos.rate   FPR   TPR   PPV   NPV    F1
##  [1,]  1.000    0.007 0.000 0.245 1.000 0.353 0.394
##  [2,]  0.988    0.319 0.000 0.356 1.000 0.390 0.526
##  [3,]  0.975    0.361 0.000 0.468 1.000 0.436 0.637
##  [4,]  0.966    0.375 0.048 0.545 0.965 0.463 0.697
##  [5,]  0.959    0.375 0.048 0.603 0.969 0.497 0.743
##  [6,]  0.936    0.458 0.048 0.661 0.971 0.536 0.786
##  [7,]  0.861    0.542 0.048 0.718 0.973 0.582 0.827
##  [8,]  0.785    0.569 0.048 0.776 0.975 0.637 0.864
##  [9,]  0.772    0.583 0.095 0.784 0.952 0.633 0.860
## [10,]  0.770    0.583 0.095 0.784 0.952 0.633 0.860
## [11,]  0.767    0.583 0.095 0.784 0.952 0.633 0.860
## [12,]  0.764    0.583 0.095 0.784 0.952 0.633 0.860
## [13,]  0.762    0.597 0.143 0.785 0.930 0.621 0.851
## [14,]  0.756    0.597 0.143 0.789 0.931 0.626 0.854
## [15,]  0.750    0.597 0.143 0.793 0.931 0.630 0.856
## [16,]  0.742    0.611 0.143 0.797 0.931 0.635 0.859
## [17,]  0.732    0.611 0.143 0.801 0.932 0.640 0.861
## [18,]  0.718    0.625 0.190 0.804 0.911 0.630 0.854
## [19,]  0.696    0.625 0.190 0.804 0.911 0.630 0.854
## [20,]  0.674    0.625 0.190 0.804 0.911 0.630 0.854
## [21,]  0.652    0.625 0.190 0.804 0.911 0.630 0.854
## [22,]  0.631    0.625 0.190 0.804 0.911 0.630 0.854
## Warning: Use of `df$value_x` is discouraged. Use `value_x` instead.
## Warning: Use of `df$value_y` is discouraged. Use `value_y` instead.

Apply the model to all the patients to obtain predicted phenotype.

phenotype <- phecap_predict_phenotype(data, model)
idx <- which.min(abs(validation$valid_roc[, "FPR"] - 0.05))
cut.fpr95 <- validation$valid_roc[idx, "cutoff"]
case_status <- ifelse(phenotype$prediction >= cut.fpr95, 1, 0)
predict.table <- cbind(phenotype, case_status)
predict.table[1:10, ]
##    patient_id   prediction case_status
## 1           1 1.467910e-01           0
## 2           2 9.999938e-01           1
## 3           3 1.828879e-01           0
## 4           4 9.384499e-05           0
## 5           5 1.649510e-02           0
## 6           6 9.803657e-01           1
## 7           7 8.645029e-03           0
## 8           8 2.602369e-04           0
## 9           9 3.652225e-01           0
## 10         10 7.273298e-04           0