Generating simulated data

The true model in this case is a logistic regression model.

Check: Can you understand the true data generating process by reading the code?

# Setting parameters
set.seed(1)
# Try changing some parameters
n <- 500
p <- 4
sparsity <- 2
nonzero_beta <- 
  rep(1, sparsity) # or e.g. runif(sparsity, min = -1, max = 1)
true_beta <- c(nonzero_beta, rep(0, p - sparsity))
class_shift <- 2.5 # change this
# Generating simulated data
X <- matrix(rnorm(n*p), nrow = n)
mu <- class_shift + X %*% true_beta
px <- exp(mu)/(1+exp(mu))
Y <- rbinom(n, 1, prob = px)
train_ld <- data.frame(y = as.factor(Y), x = X)
train_ld |>
  select(y, num_range("x.", 1:4)) |>
  ggpairs(progress = F)
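
The same data generating process can be written more compactly with plogis(); a minimal sketch (not part of the lab code above, and px_check is just an illustrative name) to make the model explicit:

# Equivalent formulation of the success probabilities:
# P(Y = 1 | x) = plogis(class_shift + x' true_beta)
px_check <- plogis(class_shift + X %*% true_beta)
all.equal(as.numeric(px_check), as.numeric(px))  # should be TRUE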

Check: Do these plots fit with your understanding of the true data generating process?

# Class (im)balance
table(train_ld$y)
## 
##   0   1 
##  74 426
train_ld |> count(y)
##   y   n
## 1 0  74
## 2 1 426

Question: What would the accuracy be for a classifier that always predicts the most common class?

const_yhat_success <- mean(train_ld$y == 1)
max(const_yhat_success, 1 - const_yhat_success)
## [1] 0.852

Logistic regression model

Coefficients

c(class_shift, true_beta)
## [1] 2.5 1.0 1.0 0.0 0.0
fit_glm <- glm(y ~ ., family = binomial(), data = train_ld)
fit_glm |> ggcoef()
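
Before moving to the odds scale, it can help to line the estimates up against the true simulation parameters; a quick sketch (the cbind() below is not part of the lab code):

# Compare the fitted coefficients to the true values used in the simulation
cbind(true = c(class_shift, true_beta), estimated = coef(fit_glm))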

Coefficients on the odds scale:

fit_glm |> 
  tidy(exponentiate = TRUE) |> knitr::kable()
term            estimate   std.error    statistic     p.value
------------  ----------  ----------  -----------  ----------
(Intercept)   11.9518723   0.2067843   11.9974661   0.0000000
x.1            3.2949620   0.1704925    6.9938266   0.0000000
x.2            2.8355109   0.1622098    6.4251500   0.0000000
x.3            1.1254655   0.1456378    0.8115803   0.4170325
x.4            0.9570916   0.1344844   -0.3261060   0.7443442
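
As a sanity check (a sketch, not in the original code), the odds-scale estimates above are just exp() applied to the raw log-odds coefficients:

# Odds ratios by direct exponentiation of the log-odds coefficients
exp(coef(fit_glm))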

Predictions

fit_glm_predictions <- augment(fit_glm, type.predict = "response")
fit_glm_predictions |> pull(.fitted) |> mean()
## [1] 0.852

Question: Does this number look familiar? Why?

Answer: It’s the proportion of the majority class that we saw before. Any other value here would indicate an overall bias in the predictions of the model.
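
One way to see this (a sketch, not part of the lab code): because the model includes an intercept, the maximum likelihood score equations force the response-scale residuals to average to zero, so the fitted probabilities must average to the observed proportion of 1s.

# Response residuals average to (numerically) zero for a glm with an intercept
mean(residuals(fit_glm, type = "response"))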

# Note: the event_level option is currently required because
# of a mismatch between glm() and yardstick
fit_glm_predictions |>
  roc_curve(truth = y, .fitted,
            event_level = "second") |>
  autoplot()

Note: When a classification model outputs class probabilities or numeric scores, these can be converted into classifications by setting thresholds or cutoffs. If we keep the model fixed but change the cutoff, we can achieve different trade-offs between false positives and false negatives (or between specificity and sensitivity, precision and recall, etc.; these are just other names for the same kind of trade-off). Plots like this roc_curve (see also ?pr_curve) show all the possible trade-off values for one model.
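
To see the thresholds explicitly, we can peek at a few rows of the roc_curve() output (a sketch, not in the original code); each row is one cutoff together with the specificity and sensitivity it achieves:

fit_glm_predictions |>
  roc_curve(truth = y, .fitted, event_level = "second") |>
  slice_head(n = 5)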

Question: Using curves like this how can we compare different models? Give at least two suggestions.

Answer: We could compare at a specific point on the curve, for example if that point corresponds to a certain false positive rate that has some practical or domain-specific importance (e.g. regulations). Or we could compute the area under the curves, e.g. using the roc_auc function.
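
For example, the area under the ROC curve can be computed directly (a sketch using the roc_auc function mentioned above):

fit_glm_predictions |>
  roc_auc(truth = y, .fitted, event_level = "second")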

Classify at several different thresholds

# Note: true and predicted classes must use same
# label names, hence the factor(as.numeric()) 
higher_cutoff <- const_yhat_success + .05
confusion_matrix_higher <-
  fit_glm_predictions |>
  mutate(yhat = factor(as.numeric(.fitted > higher_cutoff))) |>
  conf_mat(truth = y, estimate = yhat)
lower_cutoff <- .2
confusion_matrix_lower <-
  fit_glm_predictions |>
  mutate(yhat = factor(as.numeric(.fitted > lower_cutoff))) |>
  conf_mat(truth = y, estimate = yhat)
confusion_matrix_higher
##           Truth
## Prediction   0   1
##          0  65 164
##          1   9 262
confusion_matrix_lower
##           Truth
## Prediction   0   1
##          0   0   2
##          1  74 424
confusion_matrix_higher |> autoplot(type = "heatmap")

confusion_matrix_lower |> autoplot(type = "heatmap")

Comparing across various metrics:

higher_summary <- summary(confusion_matrix_higher) |>
  mutate(higher = .estimate) |>
  select(.metric, higher)
lower_summary <- summary(confusion_matrix_lower) |>
  mutate(lower = .estimate) |>
  select(lower)
cbind(higher_summary, lower_summary) |>
  knitr::kable()
.metric                     higher        lower
---------------------  ----------  -----------
accuracy                 0.6540000    0.8480000
kap                      0.2645058   -0.0078506
sens                     0.8783784    0.0000000
spec                     0.6150235    0.9953052
ppv                      0.2838428    0.0000000
npv                      0.9667897    0.8514056
mcc                      0.3516568   -0.0264126
j_index                  0.4934019   -0.0046948
bal_accuracy             0.7467009    0.4976526
detection_prevalence     0.4580000    0.0040000
precision                0.2838428    0.0000000
recall                   0.8783784    0.0000000
f_meas                   0.4290429    0.0000000

Note the accuracy, for example, which is the proportion of observations where the predicted class and the true class agree.
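
As a check (a sketch, not part of the lab code), the same accuracy can be computed directly with yardstick's accuracy() at the higher cutoff:

fit_glm_predictions |>
  mutate(yhat = factor(as.numeric(.fitted > higher_cutoff))) |>
  accuracy(truth = y, estimate = yhat)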

Sub-sampling for class balance

rare_class_size <- min(table(train_ld$y))
train_subsampled <- train_ld |>
  group_by(y) |>
  sample_n(rare_class_size) |>
  ungroup()
  
fit_glm_subs <- glm(y ~ ., family = binomial(), data = train_subsampled)
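
A quick sanity check (not in the original code) that the subsampled training data is balanced:

# Each class should now appear rare_class_size times
train_subsampled |> count(y)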

Does this change coefficients or inferences about them?

cbind(
  tidy(fit_glm) |> select(term, estimate, std.error),
  tidy(fit_glm_subs) |> select(estimate, std.error)) |>
  knitr::kable()
term            estimate   std.error    estimate   std.error
------------  ----------  ----------  ----------  ----------
(Intercept)    2.4808879   0.2067843   0.8869130   0.2734060
x.1            1.1923946   0.1704925   1.6250092   0.2957344
x.2            1.0422221   0.1622098   1.1683666   0.2551454
x.3            0.1181967   0.1456378   0.2936595   0.2475284
x.4           -0.0438562   0.1344844   0.0915651   0.1980801

Note: With a smaller sample size, the balanced training data gives us larger std.error values for our estimates (wider confidence intervals, larger p-values).
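
To make the wider intervals concrete, we could compute them directly (a sketch; conf.int = TRUE is a standard option to broom's tidy()):

# Confidence intervals from the subsampled fit are wider than from the full-data fit
tidy(fit_glm_subs, conf.int = TRUE) |>
  select(term, estimate, conf.low, conf.high)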

fit_glm_subs_predictions <- augment(fit_glm_subs, type.predict = "response")
fit_glm_subs_predictions |> pull(.fitted) |> mean()
## [1] 0.5

The training data classes are balanced, so the predictions are also balanced.

# Note: the event_level option is currently required because
# of a mismatch between glm() and yardstick
fit_glm_subs_predictions |>
  roc_curve(truth = y, .fitted,
            event_level = "second") |>
  autoplot()

Classify at several different thresholds

# Note: true and predicted classes must use same
# label names, hence the factor(as.numeric()) 
higher_cutoff <- const_yhat_success + .05
confusion_matrix_subs_higher <-
  fit_glm_subs_predictions |>
  mutate(yhat = factor(as.numeric(.fitted > higher_cutoff))) |>
  conf_mat(truth = y, estimate = yhat)
lower_cutoff <- .2
confusion_matrix_subs_lower <-
  fit_glm_subs_predictions |>
  mutate(yhat = factor(as.numeric(.fitted > lower_cutoff))) |>
  conf_mat(truth = y, estimate = yhat)
confusion_matrix_subs_higher
##           Truth
## Prediction  0  1
##          0 74 48
##          1  0 26
confusion_matrix_subs_lower
##           Truth
## Prediction  0  1
##          0 35  4
##          1 39 70
confusion_matrix_subs_higher |> autoplot(type = "heatmap")

confusion_matrix_subs_lower |> autoplot(type = "heatmap")

Comparing across various metrics:

higher_subs_summary <- summary(confusion_matrix_subs_higher) |>
  mutate(higher = .estimate) |>
  select(.metric, higher)
lower_subs_summary <- summary(confusion_matrix_subs_lower) |>
  mutate(lower = .estimate) |>
  select(lower)
metrics <- c("accuracy", "bal_accuracy") #, "precision", "recall")
rbind(
cbind(higher_summary, lower_summary) |>
  filter(.metric %in% metrics) |>
  mutate(subsampled = FALSE),
cbind(higher_subs_summary, lower_subs_summary) |>
  filter(.metric %in% metrics) |>
  mutate(subsampled = TRUE)) |>
  knitr::kable()
.metric            higher       lower  subsampled
-------------  ----------  ----------  ----------
accuracy        0.6540000   0.8480000  FALSE
bal_accuracy    0.7467009   0.4976526  FALSE
accuracy        0.6756757   0.7094595  TRUE
bal_accuracy    0.6756757   0.7094595  TRUE

Question: Suppose we have trained a model on data that was subsampled to create class balance, and we then choose classification cutoffs to achieve a desired trade-off between precision and recall (or false positives and false negatives), still using the subsampled training data. What will happen if we start using that model to classify data from a new or full dataset that hasn't been subsampled?

Answer: Predictions on the imbalanced data will achieve a different trade-off between false positives and false negatives. To calibrate this, we would need to choose a different classification threshold, one that works on the imbalanced data.
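
To illustrate (a sketch; the 0.5 cutoff and the object name full_data_preds are illustrative choices, not part of the lab code), we can apply the model trained on balanced data back to the full, imbalanced training data and classify at a cutoff of 0.5, which was a natural choice on the balanced data:

# Predict on the full (imbalanced) data with the model fit on balanced data
full_data_preds <- train_ld |>
  mutate(
    .fitted = predict(fit_glm_subs, newdata = train_ld, type = "response"),
    yhat = factor(as.numeric(.fitted > 0.5))
  )
# The false positive / false negative trade-off shifts relative to the balanced data
full_data_preds |> conf_mat(truth = y, estimate = yhat)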