Diagnostic Models (Classification)
This track is dedicated to binary classification tasks.
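The examples that follow assume two data frames, train_dia and test_dia, each with a sample ID column (sample), a 0/1 outcome column (outcome, where 1 is the positive class), and numeric feature columns. As a rough sketch of that shape (with hypothetical feature names; your own data will differ):
set.seed(42)
n <- 200
dat <- data.frame(
  sample  = paste0("S", seq_len(n)),          # sample identifiers
  outcome = rbinom(n, size = 1, prob = 0.7),  # 1 = positive, 0 = negative
  gene1   = rnorm(n),                         # hypothetical features
  gene2   = rnorm(n)
)
idx       <- sample(seq_len(n), size = 0.7 * n)  # 70/30 train/test split
train_dia <- dat[idx, ]
test_dia  <- dat[-idx, ]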
1. Initialization
First, initialize the diagnostic modeling system. This registers all built-in classification models.
initialize_modeling_system_dia()
#> Diagnostic modeling system initialized and default models registered.
2. Training Single Models with models_dia
The models_dia function is the gateway to training one or more standard classification models.
Basic Usage
By default, models_dia runs all registered models. For this demonstration, we’ll run a subset to save time.
# To run all, use model = "all_dia" or omit the parameter.
results_all_dia <- models_dia(train_dia, model = c("rf", "lasso", "xb"))
#> Running model: rf
#> Warning in ci.auc.roc(roc_obj, conf.level = 0.95): ci.auc() of a ROC curve with
#> AUC == 1 is always 1-1 and can be misleading.
#> Running model: lasso
#> Running model: xb
#> Warning in ci.auc.roc(roc_obj, conf.level = 0.95): ci.auc() of a ROC curve with
#> AUC == 1 is always 1-1 and can be misleading.
# Print a summary for a specific model (e.g., Random Forest)
print_model_summary_dia("rf", results_all_dia$rf)
#>
#> --- rf Model (on Training Data) Metrics ---
#>
#> AUROC: 1.0000 (95% CI: 1.0000 - 1.0000)
#> AUPRC: 1.0000
#> Accuracy: 1.0000
#> F1: 1.0000
#> Precision: 1.0000
#> Recall: 1.0000
#> Specificity: 1.0000
#> --------------------------------------------------
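The returned list is keyed by model name, so results can also be handled programmatically. For example, assuming each element stores its metrics in an evaluation_metrics list (the same field name evaluate_model_dia uses in Section 4; this structure is an assumption, not a documented guarantee), the AUROCs could be collected with:
# Collect AUROC per model; `evaluation_metrics` is an assumed field name.
sapply(results_all_dia, function(res) as.numeric(res$evaluation_metrics$AUROC))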
Advanced Usage & Customization
You can control the modeling process precisely: enable hyperparameter tuning, fix the random seed, set a per-model decision threshold (either a named strategy such as "f1" or "youden", or a numeric cutoff), and relabel the outcome classes.
# Run a specific subset of models with tuning enabled and custom thresholds
results_dia_custom <- models_dia(
  data = train_dia,
  model = c("rf", "lasso", "xb"),
  tune = TRUE,
  seed = 123,
  threshold_choices = list(rf = "f1", lasso = 0.6, xb = "youden"),
  positive_label_value = 1,
  negative_label_value = 0,
  new_positive_label = "Case",
  new_negative_label = "Control"
)
#> Running model: rf
#> Warning in ci.auc.roc(roc_obj, conf.level = 0.95): ci.auc() of a ROC curve with
#> AUC == 1 is always 1-1 and can be misleading.
#> Running model: lasso
#> Running model: xb
#> Warning in ci.auc.roc(roc_obj, conf.level = 0.95): ci.auc() of a ROC curve with
#> AUC == 1 is always 1-1 and can be misleading.
# View the custom results
print_model_summary_dia("rf", results_dia_custom$rf)
#>
#> --- rf Model (on Training Data) Metrics ---
#>
#> AUROC: 1.0000 (95% CI: 1.0000 - 1.0000)
#> AUPRC: 1.0000
#> Accuracy: 1.0000
#> F1: 1.0000
#> Precision: 1.0000
#> Recall: 1.0000
#> Specificity: 1.0000
#> --------------------------------------------------
3. Ensemble Modeling
Bagging (bagging_dia)
Builds a Bagging ensemble by training a base model on multiple bootstrap samples and aggregating their predictions.
# Create a Bagging ensemble with XGBoost as the base model
# n_estimators is reduced for faster execution in this example.
bagging_xb_results <- bagging_dia(train_dia, base_model_name = "xb", n_estimators = 5)
#> Running Bagging model: Bagging_dia (base: xb)
print_model_summary_dia("Bagging (XGBoost)", bagging_xb_results)
#>
#> --- Bagging (XGBoost) Model (on Training Data) Metrics ---
#> Ensemble Type: Bagging (Base: xb, Estimators: 5)
#>
#> AUROC: 0.9995 (95% CI: 0.9989 - 1.0000)
#> AUPRC: 0.9999
#> Accuracy: 0.9919
#> F1: 0.9955
#> Precision: 0.9911
#> Recall: 1.0000
#> Specificity: 0.9125
#> --------------------------------------------------
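To make the mechanics concrete, here is a minimal base-R sketch of the bootstrap-and-average idea, with a logistic regression standing in for the base learner. It is a conceptual illustration, not bagging_dia's internals:
# Fit one base learner per bootstrap resample, then average the probabilities.
bagged_prob <- function(formula, data, newdata, B = 5) {
  probs <- replicate(B, {
    boot <- data[sample(nrow(data), replace = TRUE), ]  # bootstrap resample
    fit  <- glm(formula, data = boot, family = binomial)
    predict(fit, newdata = newdata, type = "response")
  })
  rowMeans(probs)  # aggregate across the B estimators
}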
Voting (voting_dia)
Combines the predictions of multiple pre-trained models, either by majority vote (hard voting) or by a weighted average of their predicted probabilities (soft voting).
# Create a soft voting ensemble from the top models
voting_soft_results <- voting_dia(
  results_all_models = results_all_dia,
  data = train_dia,
  type = "soft"
)
#> Running Voting model: Voting_dia (type: soft)
#> Warning in ci.auc.roc(roc_obj, conf.level = 0.95): ci.auc() of a ROC curve with
#> AUC == 1 is always 1-1 and can be misleading.
print_model_summary_dia("Voting (Soft)", voting_soft_results)
#>
#> --- Voting (Soft) Model (on Training Data) Metrics ---
#> Ensemble Type: Voting (Type: soft, Weight Metric: AUROC, Base models used: rf, xb, lasso)
#>
#> AUROC: 1.0000 (95% CI: 1.0000 - 1.0000)
#> AUPRC: 1.0000
#> Accuracy: 1.0000
#> F1: 1.0000
#> Precision: 1.0000
#> Recall: 1.0000
#> Specificity: 1.0000
#> --------------------------------------------------
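Conceptually, soft voting is just a weighted average of the base models' predicted probabilities; the summary above indicates the weights are derived from each model's AUROC. A minimal sketch of that combination step (not voting_dia's internals):
# Weighted average of probability vectors; `weights` could be per-model AUROCs.
soft_vote <- function(prob_list, weights) {
  w <- weights / sum(weights)          # normalize the weights
  Reduce(`+`, Map(`*`, prob_list, w))  # weighted average of probabilities
}
# e.g. soft_vote(list(rf = p_rf, xb = p_xb), weights = c(1.00, 0.99))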
Stacking (stacking_dia)
Uses predictions from base models as features to train a final meta-model.
# Create a Stacking ensemble with Lasso as the meta-model
stacking_lasso_results <- stacking_dia(
  results_all_models = results_all_dia,
  data = train_dia,
  meta_model_name = "lasso"
)
#> Running Stacking model: Stacking_dia (meta: lasso)
#> Warning in ci.auc.roc(roc_obj, conf.level = 0.95): ci.auc() of a ROC curve with
#> AUC == 1 is always 1-1 and can be misleading.
print_model_summary_dia("Stacking (Lasso)", stacking_lasso_results)
#>
#> --- Stacking (Lasso) Model (on Training Data) Metrics ---
#> Ensemble Type: Stacking (Meta: lasso, Base models used: rf, xb, lasso)
#>
#> AUROC: 1.0000 (95% CI: 1.0000 - 1.0000)
#> AUPRC: 1.0000
#> Accuracy: 1.0000
#> F1: 1.0000
#> Precision: 1.0000
#> Recall: 1.0000
#> Specificity: 1.0000
#> --------------------------------------------------
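The idea can be reproduced in a few lines with glmnet: treat each base model's predicted probability as a column of a meta-feature matrix and fit a penalized logistic regression on top. This toy sketch fabricates base-model probabilities purely for illustration; in practice the meta-features should be out-of-fold predictions to avoid leakage:
library(glmnet)
set.seed(1)
y      <- rbinom(100, 1, 0.5)             # toy outcome
p_rf   <- plogis(2 * y - 1 + rnorm(100))  # toy base-model probabilities
p_xb   <- plogis(2 * y - 1 + rnorm(100))
meta_X <- cbind(rf = p_rf, xb = p_xb)     # base predictions become meta-features
meta_fit  <- cv.glmnet(meta_X, y, family = "binomial", alpha = 1)  # lasso meta-model
meta_prob <- predict(meta_fit, newx = meta_X, type = "response", s = "lambda.min")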
Handling Imbalanced Data (imbalance_dia)
Implements the EasyEnsemble algorithm: the majority class is repeatedly undersampled to form several balanced subsets, a base model is trained on each subset, and their predictions are combined. A conceptual sketch of the resampling step follows the example below.
# Create an EasyEnsemble with XGBoost as the base model
# n_estimators is reduced for faster execution.
results_imbalance_dia <- imbalance_dia(train_dia, base_model_name = "xb", n_estimators = 5, seed = 123)
#> Running Imbalance model: EasyEnsemble_dia (base: xb)
print_model_summary_dia("Imbalance (XGBoost)", results_imbalance_dia)
#>
#> --- Imbalance (XGBoost) Model (on Training Data) Metrics ---
#> Ensemble Type: EasyEnsemble (Base: xb, Estimators: 5)
#>
#> AUROC: 0.9999 (95% CI: 0.9998 - 1.0000)
#> AUPRC: 1.0000
#> Accuracy: 0.9838
#> F1: 0.9910
#> Precision: 1.0000
#> Recall: 0.9821
#> Specificity: 1.0000
#> --------------------------------------------------
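As promised above, here is a minimal sketch of EasyEnsemble's resampling step (a conceptual illustration, not imbalance_dia's internals): each balanced subset keeps all minority-class samples plus an equally sized random draw from the majority class.
easy_subsets <- function(data, label_col, n_estimators = 5, seed = 123) {
  set.seed(seed)
  counts   <- table(data[[label_col]])
  min_lab  <- names(counts)[which.min(counts)]  # minority class label
  minority <- data[data[[label_col]] == min_lab, ]
  majority <- data[data[[label_col]] != min_lab, ]
  lapply(seq_len(n_estimators), function(i) {
    # undersample the majority class down to the minority class size
    rbind(minority, majority[sample(nrow(majority), nrow(minority)), ])
  })
}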
4. Applying Models to New Data (apply_dia)
Use a trained model object to make predictions on a new, unseen dataset.
# Apply the trained Bagging model to the test set
bagging_pred_new <- apply_dia(
  trained_model_object = bagging_xb_results$model_object,
  new_data = test_dia,
  label_col_name = "outcome",
  pos_class = "Positive",
  neg_class = "Negative"
)
#> Applying model to new data...
# Evaluate these new predictions
eval_results_new <- evaluate_model_dia(
  precomputed_prob = bagging_pred_new$score,
  y_data = factor(test_dia$outcome, levels = c(0, 1),
                  labels = c("Negative", "Positive")),  # 0 = Negative, 1 = Positive,
                                                        # matching positive_label_value = 1
  sample_ids = test_dia$sample,
  threshold_strategy = "default",
  pos_class = "Positive",
  neg_class = "Negative"
)
print(eval_results_new$evaluation_metrics)
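The scores can also be carried forward for reporting. Assuming score is a numeric vector aligned row-for-row with new_data (an assumption about apply_dia's return value), a tidy prediction table is one data.frame away:
pred_df <- data.frame(
  sample  = test_dia$sample,
  outcome = test_dia$outcome,
  score   = bagging_pred_new$score
)
head(pred_df)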
5. Visualization (figure_dia)
Generate high-quality plots to evaluate model performance.
# ROC Curve
figure_dia(type = "roc", data = results_imbalance_dia)
# Precision-Recall Curve
figure_dia(type = "prc", data = results_imbalance_dia)
# Confusion Matrix
figure_dia(type = "matrix", data = results_imbalance_dia)
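If figure_dia() returns a ggplot object (an assumption here; check the package documentation), the plots can be captured and written to disk with ggplot2::ggsave():
p_roc <- figure_dia(type = "roc", data = results_imbalance_dia)
ggplot2::ggsave("roc_curve.pdf", plot = p_roc, width = 6, height = 5)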