# Best-subset selection on the Credit data followed by a repeated
# cross-validated fit for every subset size. The subset/model with the lowest
# cross-validated RMSE is kept and its coefficients are printed at the end.

set.seed(2^17 - 1)

# ---- Data ----
Credit <- read.csv("http://www-bcf.usc.edu/~gareth/ISL/Credit.csv", row.names = 1)

response.index <- which(colnames(Credit) == "Balance")
# A missing response column would make the negative index below select
# zero columns instead of failing, so fail loudly here.
stopifnot(length(response.index) == 1L)

# Design matrix without an intercept column; factors are expanded to dummies.
X <- model.matrix(~ 0 + ., data = Credit[, -response.index])
y <- Credit[, response.index]

# ---- 80/20 train/test split ----
n <- nrow(X)
m <- round(0.8 * n)
shuffle <- sample(seq_len(n))

# Logical masks instead of `-which(...)` / `(m + 1):n`: both of those forms
# misbehave silently when the match is empty or when m == n.
in_train <- seq_len(n) <= m
# NOTE(review): presumably dropped to avoid a redundant Gender dummy -- confirm.
keep_cols <- colnames(X) != "Gender Male"

trn_X <- X[shuffle[in_train], keep_cols, drop = FALSE]
trn_y <- y[shuffle[in_train]]
tst_X <- X[shuffle[!in_train], keep_cols, drop = FALSE]
tst_y <- y[shuffle[!in_train]]

p <- ncol(trn_X)

# ---- Best subset of every size (timed) ----
start.time <- Sys.time()
library(leaps)
subsets <- regsubsets(trn_X, trn_y, nvmax = p)
Sys.time() - start.time

# One row per subset size: size, best CV RMSE, and that RMSE's SD.
results <- matrix(0, nrow = p, ncol = 3)
colnames(results) <- c("Variable Count", "RMSE", "RMSE SD")

best.loss <- Inf
best.subset <- NA
best.model <- NA

# ---- Cross-validated fit per subset size (timed) ----
start.time <- Sys.time()
library(caret)
trControl <- trainControl(method = "repeatedcv", number = 5, repeats = 30)
tuneGrid <- expand.grid(
  alpha = c(0, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 1),
  lambda = c(0, 0.1, 1, 10, 100)
)

# The selection matrix does not change inside the loop, so compute it once.
outmat <- summary(subsets)$outmat

for (i in seq_len(p)) {
  selected <- which(outmat[i, ] == "*")
  # drop = FALSE keeps a one-column selection as a named matrix column.
  subset_X <- trn_X[, selected, drop = FALSE]

  if (i == 1) {
    # Single predictor: plain glm (presumably because glmnet needs at least
    # two predictor columns -- confirm).
    model <- train(subset_X, trn_y, method = "glm", trControl = trControl)
  } else {
    model <- train(subset_X, trn_y, method = "glmnet",
                   trControl = trControl, tuneGrid = tuneGrid)
  }

  # Record the best tuning row for this subset size.
  index <- which.min(model$results[, "RMSE"])
  results[i, 1] <- i
  results[i, 2] <- model$results[index, "RMSE"]
  results[i, 3] <- model$results[index, "RMSESD"]

  if (results[i, 2] < best.loss) {
    best.loss <- results[i, 2]
    best.subset <- selected
    best.model <- model
  }
}
Sys.time() - start.time

# ---- Report ----
# Subset sizes ordered from lowest to highest cross-validated RMSE.
results[order(results[, 2]), , drop = FALSE]

# The lines below read a "lambda" tuning column, so they assume the winning
# model came from the glmnet branch (i > 1).
plot(best.model)
plot(best.model, ylim = c(99.2, 99.3))

lambda <- best.model$results[which.min(best.model$results[, "RMSE"]), "lambda"]
coef(best.model$finalModel, s = lambda)