# Packages used below: dplyr/ggplot2 (via tidyverse),
# data_grid()/seq_range() from modelr, augment() from broom
library(tidyverse)
library(modelr)
library(broom)

# sample size
n <- 800
# function with zero set that
# defines the perfect (Bayes) decision boundary
true_boundary_function <- function(x1, x2) {
# experiment with changing this term
term <- x1^2 * x2^3
(x1^2 + x2^2 - 1)^3 - term^2
}
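A quick check (added here, not in the original): the sign of the function tells you which side of the boundary a point falls on.
# sign indicates the side of the zero set
true_boundary_function(0, 0) # negative
true_boundary_function(0, 1.2) # positive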
train <- data.frame(
# change the distribution
# or scale as needed
x1 = 1.5*(1 - 2*runif(n)),
x2 = 1.5*(1 - 2*runif(n))
) %>%
mutate(
# labels if the classes were perfectly separated by the boundary
separable = true_boundary_function(x1, x2) > 0,
# noisy labels: the probability that y = 1 changes from 9/10 to 1/10
# across the boundary, so about 10% of labels are "flipped"
y = factor(rbinom(n, 1, 9/10 - (8/10) * as.numeric(separable)))
)
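A quick cross-tabulation (an added check, not in the original) makes the roughly 10% label noise within each region visible.
# how the noisy labels relate to the separable labels
with(train, table(separable, y))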
train_plot <-
ggplot(train, aes(x1, x2)) +
geom_point(aes(shape = y, color = y),
alpha = .5, show.legend = FALSE) +
xlab("") + ylab("")
train_plot
decision_surface <-
data_grid(train,
x1 = seq_range(x1, 300, expand = .05),
x2 = seq_range(x2, 300, expand = .05)) %>%
mutate(z = true_boundary_function(x1, x2))
boundary_plot <-
train_plot +
geom_contour(
data = decision_surface,
aes(x1, x2, z=z),
breaks = 0, # draw exactly the zero level set, i.e. the Bayes boundary
size = 1,
color = "black",
alpha = .5)
boundary_plot
Also try changing the formula to fit a logistic regression model with non-linear transformations of the predictors.
# Fit a logistic regression model with non-linear terms.
# Note: poly(x1, 2) already includes the linear term, so the original
# formula y ~ x1 + x2 + poly(x1,2) * poly(x2,2) duplicated the linear
# columns and triggered R's "rank-deficient fit" warning.
logit_model <-
glm(y ~ poly(x1, 2) * poly(x2, 2), family = "binomial", data = train)
# try other formulas, e.g. just y ~ x1 + x2, and compare
# Generate predicted probabilities over the plotting grid
logit_surface <- logit_model %>%
augment(type.predict = "response",
newdata = decision_surface)
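As an added sanity check (not part of the original), you can threshold the fitted probabilities at 1/2 and compute the in-sample accuracy.
# in-sample accuracy at the p = 1/2 threshold
p_hat <- predict(logit_model, type = "response")
mean((p_hat > .5) == (train$y == "1"))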
Plot the decision boundary of the logistic model.
boundary_plot +
geom_contour(
data = logit_surface,
aes(x1, x2, z = .fitted),
size = 1,
color = "green",
breaks = .5) # the p = 1/2 contour is the model's decision boundary
Write a function to implement k-NN classification. You may use matrix operations and simple built-in functions like sort, but not any more sophisticated built-in R functions (or libraries). You can choose which notion of distance to use.
# Squared Euclidean (L2) distance
Dist <- function(x1, x2) {
sum((x1 - x2)^2) # omitting sqrt doesn't change which points are nearest
}
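Since the distance is your choice, here is one alternative you could swap in for Dist (an added sketch, not in the original).
# Manhattan (L1) distance -- swap in to see how neighborhoods change
DistL1 <- function(x1, x2) {
sum(abs(x1 - x2))
}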
# Requires the distance function Dist()
# defined above
nearest_y_values <- function(x0, k, x, y) {
n <- nrow(x)
distances <- rep(0, n)
for (i in 1:n) {
distances[i] <- Dist(x0, x[i,])
}
# indices of the k smallest distances
k_neighborhood <- order(distances, decreasing = FALSE)[1:k]
y[k_neighborhood]
}
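A quick usage check of the helper (added, not in the original) before wrapping it in a classifier:
# labels of the 5 nearest training points to the origin
nearest_y_values(x0 = c(0, 0), k = 5, x = train[, 1:2], y = train$y)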
knn <- function(x0, k, x, y) {
votes <- nearest_y_values(x0, k, x, y)
# majority vote; ties go to whichever level comes first
levels(y)[which.max(table(votes))]
}
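To connect the classifier back to the earlier plots, one option (an added sketch, assuming the knn() and boundary_plot objects above) is to classify a coarse grid and contour the result; the grid is kept coarse because the loop-based knn() is slow.
# classify a coarse grid with k-NN and draw the implied boundary
knn_grid <- data_grid(train,
x1 = seq_range(x1, 60, expand = .05),
x2 = seq_range(x2, 60, expand = .05))
knn_grid$z <- apply(as.matrix(knn_grid), 1, function(pt)
as.numeric(knn(x0 = pt, k = 5, x = train[, 1:2], y = train$y) == "1"))
boundary_plot +
geom_contour(data = knn_grid, aes(x1, x2, z = z),
breaks = .5, color = "blue", size = 1)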
Try points where the nearest neighbors are clearly majority 0 or 1, and a point where the proportion is relatively even. Run the code below a few times to see if the answers change, then pick a different K and try again.
K <- 1
# resample the training data so repeated runs can give different answers
bootstrap_sample <- sample(1:nrow(train), replace = TRUE)
x <- train[bootstrap_sample, 1:2]
y <- train$y[bootstrap_sample]
In a majority-1 region:
knn(x0 = c(0,0), k = K, x, y)
## [1] "1"
In a majority-0 region:
knn(x0 = c(0,1.2), k = K, x, y)
## [1] "0"
In a relatively even region:
knn(x0 = c(1,1), k = K, x, y)
## [1] "1"
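To see the bootstrap variability more systematically (an added sketch using the objects above), repeat the resample-and-classify step and tabulate the answers.
# repeat the bootstrap-and-classify step at the relatively even point
answers <- replicate(20, {
b <- sample(1:nrow(train), replace = TRUE)
knn(x0 = c(1, 1), k = K, x = train[b, 1:2], y = train$y[b])
})
table(answers)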