set.seed(1) # optional: delete this line
# sample size
n <- 500
# function with zero set that
# defines the perfect (Bayes) decision boundary
true_boundary_function <- function(x1, x2) {
  # experiment with changing this
  # (x1^2 + x2^2 - 1)^3 - x1^2*x2^3
  abs(x1*x2) - x1
}
train <- data.frame(
  # change the distribution
  # or scale as needed
  x1 = 1.5*(1 - 2*runif(n)),
  x2 = 1.5*(1 - 2*runif(n))
) %>%
  mutate(
    # labels if classes were perfectly separated
    separable = true_boundary_function(x1, x2) > 0,
    # labels if classes are "noisy"
    y = factor(rbinom(n, 1, 9/10 - (8/10) * as.numeric(separable)))
  )
train_plot <-
  ggplot(train, aes(x1, x2)) +
  geom_point(aes(shape = y, color = y),
             alpha = .5, show.legend = FALSE) +
  xlab("") + ylab("")
train_plot
decision_surface <-
  data_grid(train,
            x1 = seq_range(x1, 300, expand = .05),
            x2 = seq_range(x2, 300, expand = .05)) %>%
  mutate(z = true_boundary_function(x1, x2))
boundary_plot <-
  train_plot +
  geom_contour(
    data = decision_surface,
    aes(x1, x2, z = z),
    bins = 2,
    size = 1,
    color = "black",
    alpha = .5)
boundary_plot
Also try changing the formula to fit a logistic regression model with non-linear transformations of the predictors.
# Fit model
logit_model <-
glm(y ~ x1 + x2 + poly(x1,2) * poly(x2,2), family = "binomial", data = train)
# try formula + poly(x1,2) * poly(x2,2)
# Generate decision boundary
logit_surface <- logit_model %>%
  augment(type.predict = "response",
          newdata = decision_surface)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
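This warning appears because poly(x1, 2) and poly(x2, 2) already contain the linear terms, so also including x1 + x2 makes the design matrix rank-deficient. A possible variation without the redundant linear terms (logit_model2 is just an illustrative name, not part of the original exercise):
# equivalent fitted probabilities, no redundant columns
logit_model2 <-
  glm(y ~ poly(x1,2) * poly(x2,2), family = "binomial", data = train)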
Plot of the decision boundary of the logistic model
boundary_plot +
  geom_contour(
    data = logit_surface,
    aes(x1, x2, z = .fitted),
    size = 1,
    color = "green",
    bins = 2)
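For comparison, a sketch of a purely linear logistic model, whose decision boundary is a straight line (the purple color and breaks = .5, i.e. the probability-one-half contour, are arbitrary choices):
logit_linear <- glm(y ~ x1 + x2, family = "binomial", data = train)
boundary_plot +
  geom_contour(
    data = augment(logit_linear, type.predict = "response",
                   newdata = decision_surface),
    aes(x1, x2, z = .fitted),
    breaks = .5,
    size = 1,
    color = "purple")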
Use the ksvm function from the kernlab package to fit a non-linear classification model and plot it (install the package if you have not already).
k_fit <- ksvm(y ~ x2 + x1,
              kernel = "rbfdot",
              kpar = list(sigma = 2),
              C = 20,
              #nu = 0.05,
              data = train)
k_fit
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 20
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 2
##
## Number of Support Vectors : 172
##
## Objective Function Value : -2668.884
## Training error : 0.112
The plot below only shows the support vectors, not the full dataset.
plot(k_fit)
Use the nSV() and alphaindex() functions to find the number of support vectors and their indexes in the training data. What proportion of the training data is used by the classifier?
nSV(k_fit)
## [1] 172
alphaindex(k_fit)
## [[1]]
## [1] 3 4 11 13 14 16 27 30 31 32 33 34 45 48 50 52 54 56
## [19] 60 61 62 63 66 67 69 71 73 75 89 100 103 105 108 109 110 111
## [37] 119 122 123 126 127 128 137 140 144 146 153 155 157 160 161 165 167 168
## [55] 180 182 184 185 192 193 195 196 198 202 203 206 207 210 213 216 217 223
## [73] 224 228 229 242 243 245 246 247 248 249 252 254 255 256 258 259 262 263
## [91] 266 268 271 272 279 280 281 282 284 285 286 291 295 297 304 305 314 316
## [109] 322 323 330 333 335 338 339 343 346 355 359 360 362 365 369 372 374 377
## [127] 378 380 383 384 386 390 394 400 403 412 413 416 417 419 421 423 425 426
## [145] 430 433 434 435 438 439 440 441 445 449 450 452 454 459 463 464 468 479
## [163] 483 485 486 489 494 495 496 497 498 499
Fraction of “active” data:
nSV(k_fit) / nrow(train)
## [1] 0.344
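As a visual check, a sketch that overlays the support vectors (via alphaindex) on the training data; the open red circles are an arbitrary styling choice:
sv_index <- alphaindex(k_fit)[[1]]
train_plot +
  geom_point(data = train[sv_index, ],
             shape = 1, size = 3, color = "red", alpha = .6)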
Experiment with changing f, n, s2, and sd below. The beta distribution is closer to uniform if s2 is closer to 1, so increasing s2 creates a region of the predictor space that has relatively less training data.
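To see this, a quick sketch of the Beta(s1, s2) density with s1 = 1 (as in the code below) for a few values of s2; the particular values are arbitrary:
beta_grid <- data.frame(x = seq(0, 1, length.out = 500))
ggplot(beta_grid, aes(x)) +
  geom_line(aes(y = dbeta(x, 1, 1), color = "s2 = 1")) +
  geom_line(aes(y = dbeta(x, 1, 3), color = "s2 = 3")) +
  geom_line(aes(y = dbeta(x, 1, 6), color = "s2 = 6")) +
  ylab("Beta(1, s2) density")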
# change this function
f <- function(x) sin(4*pi*x)
n <- 400
s1 <- 1
s2 <- 3
noise_sd <- .4
train1d <-
  data.frame(
    x = rbeta(n, s1, s2)
  ) %>%
  # change the noise level sd
  mutate(y = f(x) + rnorm(n, sd = noise_sd))
# make sure this is the same as above
test1d_ID <-
  data.frame(
    x = rbeta(n, s1, s2)
  ) %>%
  # change the noise level sd
  mutate(y = f(x) + rnorm(n, sd = noise_sd))
# make this slightly different from above to experiment with OOD
test1d_OOD <-
  data.frame(
    x = rbeta(n, s1, s2/2) # more data with larger x values
  ) %>%
  # change the noise level sd
  mutate(y = f(x) + .1*cos(4*pi*x) + rnorm(n, sd = noise_sd))
# grid for plotting predictions
train1d_grid <-
  data_grid(train1d,
            x = seq_range(c(.05, .95), 2000, expand = .05))
See ?geom_smooth and change the span manually for a better fit.
# plot data and loess curve
ggplot(train1d, aes(x, y)) +
  geom_point() +
  geom_smooth(span = .4)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
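A sketch comparing a few span values in one plot (the particular values and colors are arbitrary; smaller spans give wigglier fits):
ggplot(train1d, aes(x, y)) +
  geom_point(alpha = .3) +
  geom_smooth(span = .1, se = FALSE, color = "red") +
  geom_smooth(span = .4, se = FALSE, color = "blue") +
  geom_smooth(span = 1, se = FALSE, color = "darkgreen")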
model_lm <- lm(y ~ poly(x, 5), data = train1d)
augment(model_lm,
        newdata = train1d_grid) %>%
  ggplot(aes(x, y)) +
  geom_point(data = train1d) +
  geom_line(aes(y = .fitted)) #+ ylim(c(-3,3))
Use the ksvm and kernelMatrix functions to fit non-linear kernel regression models and plot the predictions on train1d_grid (a kernelMatrix sketch follows the ksvm example below).
k_1fit <- ksvm(y ~ x,
               kernel = rbfdot(sigma = 10),
               #kernel = polydot(degree = 5),
               data = train1d)
k_1fit
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 1
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 10
##
## Number of Support Vectors : 341
##
## Objective Function Value : -119.8799
## Training error : 0.235688
svm_predictions <- train1d_grid %>%
  mutate(.fitted = predict(k_1fit, newdata = train1d_grid))
train1d %>%
  ggplot(aes(x, y)) +
  geom_point(data = train1d) +
  geom_line(
    data = svm_predictions,
    aes(y = .fitted)) #+ ylim(c(-2,2))
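The exercise also mentions kernelMatrix. A minimal kernel ridge regression sketch using kernelMatrix directly; the penalty lambda and the names krr_alpha and krr_predictions are arbitrary illustrative choices:
rbf <- rbfdot(sigma = 10)
X <- as.matrix(train1d$x)
X_grid <- as.matrix(train1d_grid$x)
lambda <- 0.1
K <- as.matrix(kernelMatrix(rbf, X))               # n x n Gram matrix
krr_alpha <- solve(K + lambda * diag(nrow(X)), train1d$y)
K_grid <- as.matrix(kernelMatrix(rbf, X_grid, X))  # grid-by-train cross-kernel
krr_predictions <- train1d_grid %>%
  mutate(.fitted = as.numeric(K_grid %*% krr_alpha))
ggplot(train1d, aes(x, y)) +
  geom_point() +
  geom_line(data = krr_predictions, aes(y = .fitted), color = "blue")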
fits <- list(
  augment(model_lm,
          newdata = test1d_ID) %>%
    mutate(generalization = "ID", model = "lm"),
  test1d_ID %>%
    mutate(.fitted = predict(k_1fit, newdata = test1d_ID),
           generalization = "ID", model = "ksvm_rbf"),
  augment(model_lm,
          newdata = test1d_OOD) %>%
    mutate(generalization = "OOD", model = "lm"),
  test1d_OOD %>%
    mutate(.fitted = predict(k_1fit, newdata = test1d_OOD),
           generalization = "OOD", model = "ksvm_rbf"))
map_dfr(fits, rbind) %>%
  mutate(resid = y - .fitted) %>%
  group_by(generalization, model) %>%
  summarize(MSE = mean(resid^2), .groups = "keep")
## # A tibble: 4 x 3
## # Groups: generalization, model [4]
## generalization model MSE
## <chr> <chr> <dbl>
## 1 ID ksvm_rbf 0.158
## 2 ID lm 0.232
## 3 OOD ksvm_rbf 0.190
## 4 OOD lm 0.940
What happens if s2 is increased? Why? There is relatively less training data at larger values of x, so the local accuracy decreases there. This is due to the higher variability of the model in that region (note that this is less of a problem for models which are less flexible).
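One way to check this is to compare the ID test error in the data-rich (small x) and data-poor (large x) regions; the cutoff of .5 is an arbitrary choice:
augment(model_lm, newdata = test1d_ID) %>%
  mutate(region = ifelse(x > .5, "large x", "small x")) %>%
  group_by(region) %>%
  summarize(MSE = mean((y - .fitted)^2), n = n())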
What happens if sd is increased, or if n is decreased? How is this different from the previous question? The variability increases everywhere, so the accuracy is lower everywhere. (Note that our error model has constant variance, a common simplifying assumption that can be importantly wrong in practice. If it is wrong, there may be regions of the predictor space where the error level is relatively higher even if the local sample size is high.)
What happens if you increase sigma in rbfdot? The model becomes more flexible, so its fit has higher variance. If there are regions with low local sample size, a highly flexible fit may have poor local accuracy there.
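A sketch that refits with a much larger sigma and compares ID test MSE with the original fit (the value 200 is an arbitrary choice):
k_1fit_wiggly <- ksvm(y ~ x, kernel = rbfdot(sigma = 200), data = train1d)
# ID test MSE: wigglier fit vs the original k_1fit
mean((test1d_ID$y - predict(k_1fit_wiggly, newdata = test1d_ID))^2)
mean((test1d_ID$y - predict(k_1fit, newdata = test1d_ID))^2)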
ID generalization depends on variability, so everything we previously said about “accuracy” applies directly: lower (local) noise levels or higher (local) sample sizes allow more flexible fits without suffering from too much variability.
OOD generalization will depend (locally) on how the data-generating process is different (in that region). If it is the same function f extending outside the range of the original data (i.e. extrapolation), then OOD generalization might be (relatively) easy on the left side of the plot and difficult on the right side.
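To visualize this, a sketch plotting both fits on a grid that extends slightly beyond the training range (the grid limits are arbitrary):
extended_grid <- data.frame(x = seq(-0.1, 1.1, length.out = 500))
extended_grid %>%
  mutate(lm = as.numeric(predict(model_lm, newdata = extended_grid)),
         ksvm_rbf = as.numeric(predict(k_1fit, newdata = extended_grid))) %>%
  ggplot(aes(x)) +
  geom_point(data = train1d, aes(x, y), alpha = .2) +
  geom_line(aes(y = lm, color = "lm")) +
  geom_line(aes(y = ksvm_rbf, color = "ksvm_rbf")) +
  ylab("prediction")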
Fit a kernel regression model on gapminder data, plot the predictions, and calculate the MSE.
Calculate the MSE using a different year as test data to check OOD performance.
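A sketch for the gapminder exercise, assuming the gapminder package is installed; the choice of years (2002 for training, 2007 as the OOD test year), predictor (gdpPercap), and outcome (lifeExp) is just one possibility:
library(gapminder)
gap_train <- gapminder %>% filter(year == 2002)
gap_test <- gapminder %>% filter(year == 2007)  # a different year, for OOD
gap_fit <- ksvm(lifeExp ~ gdpPercap, kernel = "rbfdot", data = gap_train)
gap_train %>%
  mutate(.fitted = as.numeric(predict(gap_fit, newdata = gap_train))) %>%
  ggplot(aes(gdpPercap, lifeExp)) +
  geom_point(alpha = .5) +
  geom_line(aes(y = .fitted), color = "blue")
# in-sample MSE and OOD MSE (different year)
mean((gap_train$lifeExp - predict(gap_fit, newdata = gap_train))^2)
mean((gap_test$lifeExp - predict(gap_fit, newdata = gap_test))^2)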