knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(broom)
# install.packages("glmnet") # if necessary
library(glmnet)
Generate some data from a high-dimensional model (increase p below)
#set.seed(1)
n <- 100
p <- 20
X <- matrix(rnorm(n*p), nrow = n)
beta <- rpois(p, lambda = 1)
#beta <- c(-1, 1, rep(0, p - 2))
y <- X %*% beta + rnorm(n)
We will study ridge regression in more detail soon. For now you only need to know it’s a method we can use for fitting high-dimensional regression models. It also involves a tuning parameter called “lambda,” and larger values of lambda result in simpler fitted models. (We’ll see in what sense the models are simpler when we study penalized regression.)
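For reference (we will derive this properly later), ridge regression estimates the coefficients by minimizing a penalized least squares criterion of the form

$$\hat\beta_{\text{ridge}} = \arg\min_\beta \; \|y - X\beta\|_2^2 + \lambda \|\beta\|_2^2,$$

so larger values of $\lambda$ shrink the estimated coefficients more strongly toward zero.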
Use the glmnet package to fit a ridge regression model on the same data as in the previous part. Hint: read about the alpha input to the glmnet function in the documentation.
model_ridge <- glmnet(X, y,
                      intercept = FALSE,
                      alpha = 0, # alpha = 0 gives the ridge penalty
                      lambda = 10^(-3:3))
What does plotting the model object show?
# or autoplot(model_ridge, xvar = "lambda") with the ggfortify package
plot(model_ridge, xvar = "lambda")
Compute the mean-squared error of the coefficient estimates at different values of lambda
lambda <- 10^0
beta_hat <- coef(model_ridge, s = lambda)[-1] # leave out intercept
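The code above extracts the estimates at a single lambda; one way to finish the task across the whole grid is sketched below. It reuses the true beta generated earlier and the lambda sequence stored in the fitted object.

lambdas <- model_ridge$lambda
coef_mse <- sapply(lambdas, function(l) {
  beta_hat_l <- coef(model_ridge, s = l)[-1] # drop the intercept
  mean((beta_hat_l - beta)^2)
})
data.frame(lambda = lambdas, coef_mse = coef_mse)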
Compute predictions using the estimated coefficients, and compute the mean-squared prediction error, at different values of lambda
# e.g. predict(model_ridge, newx = X, s = lambda or 10*lambda)
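A sketch of the in-sample version, predicting on the training X at each lambda in the grid (reusing the lambdas vector defined above):

mse_train <- sapply(lambdas, function(l) {
  y_hat <- predict(model_ridge, newx = X, s = l)
  mean((y - y_hat)^2)
})
data.frame(lambda = lambdas, mse_train = mse_train)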
Generate a new sample from the same distribution (which things stay fixed?)
# X_ID <- ... etc
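A minimal sketch, keeping beta (and the noise distribution) fixed while redrawing both X and the errors; the names X_ID and y_ID follow the hint above.

X_ID <- matrix(rnorm(n*p), nrow = n)
y_ID <- X_ID %*% beta + rnorm(n)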
Calculate the prediction error on this new sample at different values of lambda
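For example (a sketch, reusing lambdas and the in-distribution sample generated above):

mse_ID <- sapply(lambdas, function(l) {
  mean((y_ID - predict(model_ridge, newx = X_ID, s = l))^2)
})
data.frame(lambda = lambdas, mse_ID = mse_ID)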
There are many ways to change the distribution for a new sample. Pick one of these and test OOD generalisation (prediction error on a new sample from the changed distribution); one possibility is sketched after the hints below.
# ...
# X_OOD <-
# y_OOD <- X_OOD %*% ...
# ... predict(model_ridge, newx = X_OOD, etc)
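One possible choice (a sketch): increase the spread of the features while keeping beta fixed. Other reasonable shifts include changing the mean of X, the noise level, or beta itself.

X_OOD <- matrix(rnorm(n*p, sd = 2), nrow = n) # features now have twice the standard deviation
y_OOD <- X_OOD %*% beta + rnorm(n)
mse_OOD <- sapply(lambdas, function(l) {
  mean((y_OOD - predict(model_ridge, newx = X_OOD, s = l))^2)
})
data.frame(lambda = lambdas, mse_OOD = mse_OOD)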
Copy/paste your gradient descent (or stochastic version) code for ordinary linear regression here. Does it converge? If so, to what? Check distance (MSE) from the true beta and from the ridge estimate of beta (at different values of lambda, or a very small lambda close to zero)
(Also consider generating data with p larger than n, so the least squares estimator is not uniquely defined)
# Gradient of the least squares loss function
least_squares_gradient <- function(x, y, beta) {
  -2 * t(x) %*% (y - x %*% beta) # add + 2 * lambda * beta here for a ridge penalty
}
# Least squares loss function
least_squares_loss <- function(x, y, beta) {
  sum((y - x %*% beta)^2)
}
beta_prev2 <- rep(0, p) # or a random starting point
grad_prev2 <- least_squares_gradient(X, y, beta_prev2)
# Take one small normalized step to get a second point for the Barzilai-Borwein update
beta_prev1 <- beta_prev2 + 0.1 * grad_prev2 / sqrt(sum(grad_prev2^2))
grad_prev1 <- least_squares_gradient(X, y, beta_prev1)
previous_loss <- least_squares_loss(X, y, beta_prev2)
next_loss <- least_squares_loss(X, y, beta_prev1)
steps <- 1
while (abs(previous_loss - next_loss) > 0.001) {
  # Barzilai-Borwein step size
  grad_diff <- grad_prev1 - grad_prev2
  step_BB <- sum((beta_prev1 - beta_prev2) * grad_diff) / sum(grad_diff^2)
  beta_prev2 <- beta_prev1
  beta_prev1 <- beta_prev1 - step_BB * grad_prev1
  grad_prev2 <- grad_prev1
  grad_prev1 <- least_squares_gradient(X, y, beta_prev1)
  previous_loss <- next_loss
  next_loss <- least_squares_loss(X, y, beta_prev1)
  # if (steps %% 10 == 0) # uncomment to print only every 10th iteration
  print(previous_loss)
  steps <- steps + 1
}
## [1] 2598.179
## [1] 262.9755
## [1] 134.6286
## [1] 106.1237
## [1] 96.64212
## [1] 91.05705
## [1] 88.7851
## [1] 88.80032
## [1] 88.39227
## [1] 88.37448
## [1] 88.35734
## [1] 88.34211
beta_final <- beta_prev1
Check distance (MSE) from the true beta and from the ridge estimate of beta (at different values of lambda, or a very small lambda close to zero)
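One way to do this comparison (a sketch, reusing beta, model_ridge, and the lambdas vector from above):

# Distance (MSE) from the true beta
mean((beta_final - beta)^2)
# Distance (MSE) from the ridge estimates at each lambda in the grid;
# the smallest lambda should be closest to the least squares solution
sapply(lambdas, function(l) {
  mean((beta_final - coef(model_ridge, s = l)[-1])^2)
})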