2. Diagnostic Workflow

library(E2E)

Diagnostic Models (Classification)

This track is dedicated to binary classification tasks.

1. Initialization

First, initialize the diagnostic modeling system. This registers all built-in classification models.

initialize_modeling_system_dia()
#> Diagnostic modeling system initialized and default models registered.

2. Training Single Models with `models_dia`

The `models_dia()` function is the entry point for training one or more standard classification models.

Basic Usage

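By default, `models_dia()` runs all registered models. For this demonstration, we’ll run a subset to save time. Note that the summaries printed below are computed on the training data, so perfect scores (e.g. AUROC = 1) show how well a model fits the training set, not how well it generalizes.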

# To run all, use model = "all_dia".
results_all_dia <- models_dia(train_dia, model = c("rf", "lasso", "xb"))
#> Running model: rf
#> Warning in ci.auc.roc(roc_obj, conf.level = 0.95): ci.auc() of a ROC curve with
#> AUC == 1 is always 1-1 and can be misleading.
#> Running model: lasso
#> Running model: xb
#> Warning in ci.auc.roc(roc_obj, conf.level = 0.95): ci.auc() of a ROC curve with
#> AUC == 1 is always 1-1 and can be misleading.

# Print a summary for a specific model (e.g., Random Forest)
print_model_summary_dia("rf", results_all_dia$rf)
#> 
#> --- rf Model (on Training Data) Metrics ---
#> Threshold Strategy: default (0.5000)
#> AUROC: 1.0000 (95% CI: 1.0000 - 1.0000)
#> AUPRC: 1.0000
#> Accuracy: 1.0000
#> F1: 1.0000
#> Precision: 1.0000
#> Recall: 1.0000
#> Specificity: 1.0000
#> --------------------------------------------------

Advanced Usage & Customization

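You can control the modeling process through additional arguments: `tune = TRUE` enables hyperparameter tuning, `seed` fixes the random seed, and `threshold_choices` sets the classification threshold for each model, either as a named strategy (such as `"f1"` or `"youden"`) or as a fixed numeric cutoff (such as `0.6`).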

# Run a specific subset of models with tuning enabled and custom thresholds
results_dia_custom <- models_dia(
  data = train_dia,
  model = c("rf", "lasso", "xb"),
  tune = TRUE,
  seed = 123,
  threshold_choices = list(rf = "f1", lasso = 0.6, xb = "youden")
)
#> Running model: rf
#> Warning in ci.auc.roc(roc_obj, conf.level = 0.95): ci.auc() of a ROC curve with
#> AUC == 1 is always 1-1 and can be misleading.
#> Running model: lasso
#> Running model: xb

# View the custom results
print_model_summary_dia("rf", results_dia_custom$rf)
#> 
#> --- rf Model (on Training Data) Metrics ---
#> Threshold Strategy: f1 (0.6220)
#> AUROC: 1.0000 (95% CI: 1.0000 - 1.0000)
#> AUPRC: 1.0000
#> Accuracy: 1.0000
#> F1: 1.0000
#> Precision: 1.0000
#> Recall: 1.0000
#> Specificity: 1.0000
#> --------------------------------------------------

3. Ensemble Modeling

Bagging (`bagging_dia`)

Builds a Bagging ensemble by training a base model on multiple bootstrap samples.

# Create a Bagging ensemble with RandomForest as the base model
# n_estimators is reduced for faster execution in this example.
bagging_rf_results <- bagging_dia(train_dia, base_model_name = "rf", tune_base_model = FALSE, n_estimators = 5)
#> Running Bagging model: Bagging_dia (base: rf)
print_model_summary_dia("Bagging (RandomForest)", bagging_rf_results)
#> 
#> --- Bagging (RandomForest) Model (on Training Data) Metrics ---
#> Ensemble Type: Bagging (Base: rf, Estimators: 5)
#> Threshold Strategy: default (0.5000)
#> AUROC: 0.9999 (95% CI: 0.9997 - 1.0000)
#> AUPRC: 1.0000
#> Accuracy: 0.9919
#> F1: 0.9955
#> Precision: 0.9911
#> Recall: 1.0000
#> Specificity: 0.9125
#> --------------------------------------------------

Voting (`voting_dia`)

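Combines predictions from multiple pre-trained models. With `type = "soft"`, the models' predicted probabilities are combined rather than their class labels; the summary below reports AUROC as the weight metric.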

# Create a soft voting ensemble from the top models
voting_soft_results <- voting_dia(
  results_all_models = results_all_dia,
  data = train_dia,
  type = "soft"
)
#> Running Voting model: Voting_dia (type: soft)
#> Warning in ci.auc.roc(roc_obj, conf.level = 0.95): ci.auc() of a ROC curve with
#> AUC == 1 is always 1-1 and can be misleading.
print_model_summary_dia("Voting (Soft)", voting_soft_results)
#> 
#> --- Voting (Soft) Model (on Training Data) Metrics ---
#> Ensemble Type: Voting (Type: soft, Weight Metric: AUROC, Base models used: rf, xb, lasso)
#> Threshold Strategy: f1 (0.6027)
#> AUROC: 1.0000 (95% CI: 1.0000 - 1.0000)
#> AUPRC: 1.0000
#> Accuracy: 1.0000
#> F1: 1.0000
#> Precision: 1.0000
#> Recall: 1.0000
#> Specificity: 1.0000
#> --------------------------------------------------

Stacking (`stacking_dia`)

Uses predictions from base models as features to train a final meta-model.

# Create a Stacking ensemble with Lasso as the meta-model
stacking_lasso_results <- stacking_dia(
  results_all_models = results_all_dia,
  data = train_dia,
  meta_model_name = "lasso"
)
#> Running Stacking model: Stacking_dia (meta: lasso)
#> Warning in ci.auc.roc(roc_obj, conf.level = 0.95): ci.auc() of a ROC curve with
#> AUC == 1 is always 1-1 and can be misleading.
print_model_summary_dia("Stacking (Lasso)", stacking_lasso_results)
#> 
#> --- Stacking (Lasso) Model (on Training Data) Metrics ---
#> Ensemble Type: Stacking (Meta: lasso, Base models used: rf, xb, lasso)
#> Threshold Strategy: f1 (0.9794)
#> AUROC: 1.0000 (95% CI: 1.0000 - 1.0000)
#> AUPRC: 1.0000
#> Accuracy: 1.0000
#> F1: 1.0000
#> Precision: 1.0000
#> Recall: 1.0000
#> Specificity: 1.0000
#> --------------------------------------------------

Handling Imbalanced Data (`imbalance_dia`)

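Implements the EasyEnsemble algorithm, which trains each base model on a balanced subset obtained by undersampling the majority class and then aggregates the predictions of all estimators.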

# Create an EasyEnsemble with RandomForest as the base model
# n_estimators is reduced for faster execution.
results_imbalance_dia <- imbalance_dia(train_dia, base_model_name = "rf", n_estimators = 5)
#> Running Imbalance model: EasyEnsemble_dia (base: rf)
print_model_summary_dia("Imbalance (Random Forest)", results_imbalance_dia)
#> 
#> --- Imbalance (Random Forest) Model (on Training Data) Metrics ---
#> Ensemble Type: EasyEnsemble (Base: rf, Estimators: 5)
#> Threshold Strategy: default (0.5000)
#> AUROC: 0.9995 (95% CI: 0.9986 - 1.0000)
#> AUPRC: 1.0000
#> Accuracy: 0.9873
#> F1: 0.9929
#> Precision: 1.0000
#> Recall: 0.9860
#> Specificity: 1.0000
#> --------------------------------------------------

4. Applying Models to New Data (`apply_dia`)

Use a trained model object to make predictions on a new, unseen dataset.

# Apply the trained Bagging model to the test set
bagging_pred_new <- apply_dia(
  trained_model_object = bagging_rf_results$model_object,
  new_data = test_dia,
  label_col_name = "outcome"
)

# Evaluate these new predictions
eval_results_new <- evaluate_predictions_dia(
  prediction_df = bagging_pred_new,
  threshold_choices = "f1")
print(eval_results_new)
#> $Threshold_Strategy
#> [1] "f1"
#> 
#> $Threshold
#> [1] 0.5836
#> 
#> $Accuracy
#>  Accuracy 
#> 0.9945652 
#> 
#> $Precision
#> Precision 
#> 0.9970149 
#> 
#> $Recall
#>    Recall 
#> 0.9970149 
#> 
#> $F1
#>        F1 
#> 0.9970149 
#> 
#> $Specificity
#> Specificity 
#>    0.969697 
#> 
#> $AUROC
#> [1] 0.9991859
#> 
#> $AUROC_95CI_Lower
#> [1] 0.9978357
#> 
#> $AUROC_95CI_Upper
#> [1] 1
#> 
#> $AUPRC
#> [1] 0.9999205

5. Visualization (`figure_dia`)

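Generate high-quality plots to evaluate model performance. The `type` argument selects the plot: `"roc"` for the ROC curve, `"prc"` for the precision-recall curve, and `"matrix"` for the confusion matrix.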

# ROC Curve
p1 <- figure_dia(type = "roc", data = results_imbalance_dia)
#plot(p1)

# Precision-Recall Curve
p2 <- figure_dia(type = "prc", data = results_imbalance_dia)
#plot(p2)

# Confusion Matrix
p3 <- figure_dia(type = "matrix", data = results_imbalance_dia)
#plot(p3)

Diagnostic Models (Classification)

1. Initialization

2. Training Single Models with models_dia

Basic Usage

Advanced Usage & Customization

3. Ensemble Modeling

Bagging (bagging_dia)

Voting (voting_dia)

Stacking (stacking_dia)

Handling Imbalanced Data (imbalance_dia)

4. Applying Models to New Data (apply_dia)