# Simulation setup: high-dimensional design with p > n predictors.
# The response Y is pure noise -- every true coefficient is zero --
# so any "significant" finding downstream is model-selection bias.
n <- 100
p <- 200
X <- matrix(rnorm(n * p), nrow = n)
Y <- rnorm(n)
# Note: this is an alternative method to cross-validation
# Compute lasso path
# while storing information required for
# conditional (truncated) inference
# (lar() is the least-angle-regression routine from the selectiveInference
# package -- see the larInf() call below that consumes this object)
lasso_path <- lar(X, Y)
# Use AIC/BIC to choose model size
# and compute adjusted p-values
# in higher dimensions change mult
# to log(n) for BIC, log(p) for RIC
lasso_inference <- larInf(lasso_path, mult = 2, type = "aic", ntimes = 1)
# Console output from the call above (sigma is estimated because p > n/2):
## Warning in larInf(lasso_path, mult = 2, type = "aic", ntimes = 1): p > n/
## 2, and sd(y) = 0.969 used as an estimate of sigma; you may want to use the
## estimateSigma function
# Selected predictors:
# Indices of the predictors chosen along the lasso path.
active_set <- lasso_inference$vars
active_set
## [1]  87  53   7 120 139 158   6 196
# (Use summary() or tidy() from the broom package)
# Naive (classical) OLS inference on the selected predictors, ignoring that
# the same data chose the model; "- 1" drops the intercept.
OLS_fit <- lm(Y ~ X[, active_set] - 1)
tidy(OLS_fit)
## # A tibble: 8 x 5
##   term             estimate std.error statistic p.value
##   <chr>               <dbl>     <dbl>     <dbl>   <dbl>
## 1 X[, active_set]1    0.196    0.0857      2.28 0.0247 
## 2 X[, active_set]2   -0.239    0.0819     -2.92 0.00439
## 3 X[, active_set]3   -0.235    0.0788     -2.99 0.00363
## 4 X[, active_set]4    0.196    0.0814      2.40 0.0182 
## 5 X[, active_set]5   -0.191    0.0836     -2.29 0.0244 
## 6 X[, active_set]6   -0.158    0.0827     -1.91 0.0595 
## 7 X[, active_set]7   -0.155    0.0794     -1.96 0.0531 
## 8 X[, active_set]8   -0.151    0.0730     -2.07 0.0417
# These p-values (from the larInf function in the selectiveInference package
# above) are calculated from a conditional distribution, conditional on the
# model selection
# Print the selective (post-selection) inference results: p-values and
# confidence intervals conditioned on the selection event.
lasso_inference
## 
## Call:
## larInf(obj = lasso_path, type = "aic", mult = 2, ntimes = 1)
## 
## Standard deviation of noise (specified or estimated) sigma = 0.969
## 
## Testing results at step = 8, with alpha = 0.100
## 
##  Var   Coef Z-score P-value LowConfPt UpConfPt LowTailArea UpTailArea
##   87  0.193   1.815   0.372    -5.087    8.839        0.05       0.05
##   53 -0.230  -2.251   0.232    -9.297    2.413        0.05       0.05
##    7 -0.233  -2.382   0.243      -Inf    4.446        0.00       0.05
##  120  0.207   2.025   0.282    -5.442      Inf        0.05       0.00
##  139 -0.206  -1.949   0.718   -10.145      Inf        0.05       0.00
##  158 -0.161  -1.570   0.889    -3.539      Inf        0.05       0.00
##    6 -0.142  -1.414   0.659      -Inf      Inf        0.00       0.00
##  196 -0.146  -1.603   0.446      -Inf      Inf        0.00       0.00
## 
## Estimated stopping point from AIC rule = 8
# Classical inference is biased toward significance. Even though the true
# coefficients are all zero we get many significant p-values. This is due to
# model selection bias. (Classical inference methods are based on an
# assumption of the model being chosen in advance; they become invalid if
# the model is chosen using the same data.)
# Data-splitting check: draw fresh data from the same null model and refit
# the previously selected predictors on it.
X_test <- matrix(rnorm(n*p), nrow = n)
Y_test <- rnorm(n)
lm(Y_test ~ X_test[, active_set] - 1) %>% tidy()
## # A tibble: 8 x 5
##   term                  estimate std.error statistic p.value
##   <chr>                    <dbl>     <dbl>     <dbl>   <dbl>
## 1 X_test[, active_set]1 -0.115      0.102    -1.12     0.265
## 2 X_test[, active_set]2  0.0896     0.0939    0.954    0.342
## 3 X_test[, active_set]3 -0.138      0.0930   -1.48     0.142
## 4 X_test[, active_set]4  0.0214     0.104     0.206    0.837
## 5 X_test[, active_set]5 -0.0865     0.0946   -0.915    0.363
## 6 X_test[, active_set]6 -0.0944     0.0892   -1.06     0.293
## 7 X_test[, active_set]7  0.0113     0.0986    0.114    0.909
## 8 X_test[, active_set]8  0.00529    0.104     0.0508   0.960
# Inference calculated using test data does not suffer from the model
# selection bias because this data wasn't used to select the model
# The model changes each time, and so do the p-values, but the qualitative
# conclusion remains the same.
# Exercise: change Y to be generated from a coefficient vector where only
# the first 5 entries are nonzero, and then run the code again a few more
# times.
X <- matrix(rnorm(n * p), nrow = n)
# True signal: first 5 coefficients are 1, the remaining p - 5 are 0.
beta <- c(rep(1, 5), rep(0, p - 5))
Y <- X %*% beta + rnorm(n)
# Independent test data drawn from the same (now non-null) model.
X_test <- matrix(rnorm(n * p), nrow = n)
Y_test <- X_test %*% beta + rnorm(n)
# Refit the previously selected predictors on the fresh data.
lm(Y_test ~ X_test[, active_set] - 1) %>% tidy()
## # A tibble: 8 x 5
##   term                  estimate std.error statistic p.value
##   <chr>                    <dbl>     <dbl>     <dbl>   <dbl>
## 1 X_test[, active_set]1   0.0132     0.252    0.0524  0.958 
## 2 X_test[, active_set]2   0.262      0.263    0.995   0.322 
## 3 X_test[, active_set]3   0.279      0.249    1.12    0.265 
## 4 X_test[, active_set]4  -0.448      0.259   -1.73    0.0876
## 5 X_test[, active_set]5   0.261      0.250    1.04    0.300 
## 6 X_test[, active_set]6   0.107      0.254    0.422   0.674 
## 7 X_test[, active_set]7   0.241      0.260    0.925   0.357 
## 8 X_test[, active_set]8  -0.359      0.273   -1.32    0.192