library(purrr) ### map()
library(ISLR2) ### datasets
library(splines) ### bs() for cubic spline
library(glmnet) ### glmnet() for ridge / LASSO
library(class) ### knn()
str(Wage)
'data.frame': 3000 obs. of 11 variables:
$ year : int 2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
$ age : int 18 24 45 43 50 54 44 30 41 52 ...
$ maritl : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
$ race : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
$ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
$ region : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
$ jobclass : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
$ health : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
$ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
$ logwage : num 4.32 4.26 4.88 5.04 4.32 ...
$ wage : num 75 70.5 131 154.7 75 ...
### Method 1
### A function that partitions the dataset into K folds of approximately equal size
KFold <- function(dat, K = 5) {
  ### Number of observations in each of the first (K - 1) folds; take the floor to get an integer
  nFoldK_1 <- floor(nrow(dat) / K)
  ### Number of observations in the Kth fold
  nFoldK <- nrow(dat) - (K - 1) * nFoldK_1
  ### Initialize output
  datKFold <- vector("list", K)
  ### Initialize the pool of rows still to be assigned
  datSelect <- dat
  ### Loop over the first (K - 1) folds
  for (k in seq_len(K - 1)) {
    ### Indices to be selected
    I <- sample(nrow(datSelect), nFoldK_1, replace = FALSE)
    ### Save in the output
    datKFold[[k]] <- datSelect[I, ]
    ### Remove the selected rows from the pool
    datSelect <- datSelect[-I, ]
  }
  ### The Kth fold gets whatever remains
  datKFold[[K]] <- datSelect
  datKFold
}
### Method 2
### A function that partitions the dataset into K folds of approximately equal size
KFold <- function(dat, K = 5) {
  ### Fold labels 1, ..., K recycled to cover every row
  ind <- rep(seq_len(K), length.out = nrow(dat))
  ### Shuffle the labels
  indShuf <- sample(ind)
  ### Partition into K folds
  datKFold <- map(seq_len(K), function(k) {
    dat[indShuf == k, ]
  })
  datKFold
}
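A quick sanity check (not graded) that the partition behaves as intended, whichever version of KFold is used: the fold sizes should be approximately equal and should together account for every row. The name folds below is just a throwaway variable for this check.
### Sanity check of the partition: fold sizes and total row count
folds <- KFold(dat = Wage, K = 5)
sapply(folds, nrow)
sum(sapply(folds, nrow)) == nrow(Wage)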
### Number of folds for CV
KCV <- 5
dat <- Wage
### Get random partition of dat
datKFold <- KFold(dat = dat, K = KCV)
### Candidate numbers of interior knots
K <- seq_len(15)
MSECV <- map_dbl(K, function(k) {
  ### Knot positions: k equally spaced interior knots (endpoints dropped)
  knots <- seq(min(dat$age), max(dat$age), length.out = k + 2)[-c(1, k + 2)]
  ### For every fold
  MSEi <- map_dbl(seq_len(KCV), function(kcv) {
    ### Training dataset and test dataset
    datTrain <- do.call(rbind, datKFold[-kcv])
    datTest <- datKFold[[kcv]]
    ### Fit cubic spline
    CSpline <- lm(wage ~ bs(x = age, knots = knots), data = datTrain)
    ### Prediction error on the held-out fold
    err <- datTest$wage - predict(CSpline, datTest)
    ### MSE
    mean(err^2)
  })
  ### CV error: average test MSE estimate across the folds
  mean(MSEi)
})
### Visualize the CV error as a function of K
plot(K, MSECV, type = "o", xlab = "K", ylab = "CV Error", main = "Scatter Plot of CV Error Against K")
### Select the best model
(KBest <- K[which.min(MSECV)])
[1] 1
### Fit cubic spline with full dataset
knotsBest <- seq(min(dat$age), max(dat$age), length.out = KBest + 2)[-c(1, KBest + 2)]
CSplineBest <- lm(wage ~ bs(x = age, knots = knotsBest), data = dat)
### Set grid
grid <- seq(min(dat$age), max(dat$age), length.out=1000)
### Fitted values
yhatCSplineBest <- predict(CSplineBest, newdata = data.frame(age = grid), se.fit = TRUE)
### Visualize the nonlinear regression effect
### Data
with(dat, plot(age, wage, xlab = "Age", ylab = "Wage", main = "Wage Against Age with Fitted Curve and Confidence Band"))
### Fitted curve
with(yhatCSplineBest, lines(grid, fit, col="red", lwd=2))
### Confidence band
with(yhatCSplineBest, lines(grid, fit + 2 * se.fit, col="blue", lwd=2, lty=2))
with(yhatCSplineBest, lines(grid, fit - 2 * se.fit, col="blue", lwd=2, lty=2))
### Number of predictors (excluding the intercept)
p <- 20
### Training set size
n <- 100
### Test set size
m <- 500
### Number of simulation replications
B <- 200
### lambda values
lam <- 10 ^ ((-3):5)
### A function returning the average test MSE over the lambda grid for a given
### alpha (the glmnet mixing parameter: alpha = 0 is ridge, alpha = 1 is LASSO)
getMSE <- function(alpha) {
  ### Repeat the simulation B times
  MSE <- map_dfr(seq_len(B), function(b) {
    ### Training set
    ### X matrix, including an intercept column
    XTrain <- cbind(1, matrix(rnorm(n * p), n))
    ### Error term
    epsTrain <- matrix(rnorm(n), n)
    ### Response: only the intercept and the first two slopes are nonzero
    YTrain <- XTrain %*% matrix(c(1, 1, -1, rep(0, p - 2)), nrow = p + 1) + epsTrain
    ### Test set
    ### X matrix, including an intercept column
    XTest <- cbind(1, matrix(rnorm(m * p), m))
    ### Error term
    epsTest <- matrix(rnorm(m), m)
    ### Response
    YTest <- XTest %*% matrix(c(1, 1, -1, rep(0, p - 2)), nrow = p + 1) + epsTest
    ### Regression fit (intercept column dropped; glmnet adds its own intercept)
    fit <- glmnet(XTrain[, -1], YTrain, alpha = alpha)
    ### Test MSE at each lambda
    colMeans((matrix(YTest, m, length(lam)) - predict(fit, newx = XTest[, -1], s = lam)) ^ 2)
  })
  ### Average test MSE over the B replications
  colMeans(MSE)
}
ridgeMSE <- getMSE(alpha = 0)
LASSOMSE <- getMSE(alpha = 1)
pch <- c("R", "L")
lty <- 1: 2
col <- 1: 2
matplot(log(lam), cbind(ridgeMSE, LASSOMSE), type = "o", pch = pch, lty = lty, col = col, xlab = "log (lambda)", ylab = "Test MSE", main = "Test MSE Against Logarithm of Lambdas")
legend("topleft", legend = c("Ridge", "LASSO"), pch = pch, lty = lty, col = col)
As shown in the plot, the smallest test MSE is attained by LASSO at around \(\lambda\) = 0.1. This agrees with the data-generating process: only two of the twenty slopes are nonzero, and LASSO can set the remaining coefficients exactly to zero, while ridge only shrinks them toward zero.
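To see the sparsity directly, here is a small illustrative check (same data-generating process as above, arbitrary seed, not graded): on a single simulated training set, the LASSO coefficients at \(\lambda\) = 0.1 should be mostly exact zeros, while the ridge coefficients are merely shrunk.
### Illustrative check: sparsity of LASSO vs ridge at lambda = 0.1
set.seed(1)
Xone <- matrix(rnorm(n * p), n)
Yone <- cbind(1, Xone) %*% matrix(c(1, 1, -1, rep(0, p - 2)), nrow = p + 1) + rnorm(n)
coef(glmnet(Xone, Yone, alpha = 1), s = 0.1)   ### LASSO: most entries should be exactly zero
coef(glmnet(Xone, Yone, alpha = 0), s = 0.1)   ### ridge: all entries nonzero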
str(College)
'data.frame': 777 obs. of 18 variables:
$ Private : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
$ Apps : num 1660 2186 1428 417 193 ...
$ Accept : num 1232 1924 1097 349 146 ...
$ Enroll : num 721 512 336 137 55 158 103 489 227 172 ...
$ Top10perc : num 23 16 22 60 16 38 17 37 30 21 ...
$ Top25perc : num 52 29 50 89 44 62 45 68 63 44 ...
$ F.Undergrad: num 2885 2683 1036 510 249 ...
$ P.Undergrad: num 537 1227 99 63 869 ...
$ Outstate : num 7440 12280 11250 12960 7560 ...
$ Room.Board : num 3300 6450 3750 5450 4120 ...
$ Books : num 450 750 400 450 800 500 500 450 300 660 ...
$ Personal : num 2200 1500 1165 875 1500 ...
$ PhD : num 70 29 53 92 76 67 90 89 79 40 ...
$ Terminal : num 78 30 66 97 72 73 93 100 84 41 ...
$ S.F.Ratio : num 18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
$ perc.alumni: num 12 16 30 37 2 11 26 37 23 15 ...
$ Expend : num 7041 10527 8735 19016 10922 ...
$ Grad.Rate : num 60 56 54 59 15 55 63 73 80 52 ...
(Not graded)
### Model matrix
X <- model.matrix(Apps ~ ., data = College)[, -1]
### Response
y <- College$Apps
### Ridge
ridgeCV <- cv.glmnet(X, y, alpha = 0, nfolds = 10)
### The minimization rule
ridgeCV$lambda.min
[1] 364.8993
### Predicted values
yridgePredTrainMin <- predict(ridgeCV, newx = X, s = ridgeCV$lambda.min, type="response")
### Training MSE at lambda.min
colMeans((yridgePredTrainMin - y)^2)
s1
1358455
### 1 standard error rule
ridgeCV$lambda.1se
[1] 1342.238
### Predicted values
yridgePredTrain1se <- predict(ridgeCV, newx = X, s = ridgeCV$lambda.1se, type="response")
### Training MSE at lambda.1se
colMeans((yridgePredTrain1se - y)^2)
s1
1923926
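For reference, the cross-validation estimates of test MSE computed by cv.glmnet itself can be read off the fitted object (a quick check, not required by the exercise); cvm and lambda are components of the cv.glmnet return value.
### CV estimates stored by cv.glmnet at the two selected lambdas
ridgeCV$cvm[ridgeCV$lambda == ridgeCV$lambda.min]
ridgeCV$cvm[ridgeCV$lambda == ridgeCV$lambda.1se]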
### LASSO
LASSOCV <- cv.glmnet(X, y, alpha = 1, nfolds = 10)
### The minimization rule
LASSOCV$lambda.min
[1] 2.137223
### Predicted values
yLASSOPredTrainMin <- predict(LASSOCV, newx = X, s = LASSOCV$lambda.min, type="response")
### Training MSE at lambda.min
colMeans((yLASSOPredTrainMin - y)^2)
s1
1060064
### Non-zero coefficient estimates
(betaMin <- predict(LASSOCV, s="lambda.min", type="coefficients"))
18 x 1 sparse Matrix of class "dgCMatrix"
lambda.min
(Intercept) -471.39372052
PrivateYes -491.04485137
Accept 1.57033288
Enroll -0.75961467
Top10perc 48.14698892
Top25perc -12.84690695
F.Undergrad 0.04149116
P.Undergrad 0.04438973
Outstate -0.08328388
Room.Board 0.14943472
Books 0.01532293
Personal 0.02909954
PhD -8.39597537
Terminal -3.26800340
S.F.Ratio 14.59298267
perc.alumni -0.04404771
Expend 0.07712632
Grad.Rate 8.28950241
as.matrix(betaMin)[as.vector(betaMin != 0), ]
(Intercept) PrivateYes Accept Enroll Top10perc
-471.39372052 -491.04485137 1.57033288 -0.75961467 48.14698892
Top25perc F.Undergrad P.Undergrad Outstate Room.Board
-12.84690695 0.04149116 0.04438973 -0.08328388 0.14943472
Books Personal PhD Terminal S.F.Ratio
0.01532293 0.02909954 -8.39597537 -3.26800340 14.59298267
perc.alumni Expend Grad.Rate
-0.04404771 0.07712632 8.28950241
### 1 standard error rule
LASSOCV$lambda.1se
[1] 245.7287
### Predicted values
yLASSOPredTrain1se <- predict(LASSOCV, newx = X, s = LASSOCV$lambda.1se, type="response")
### Training MSE at lambda.1se
colMeans((yLASSOPredTrain1se - y)^2)
s1
1335367
### Non-zero coefficient estimates
(beta1se <- predict(LASSOCV, s="lambda.1se", type="coefficients"))
18 x 1 sparse Matrix of class "dgCMatrix"
lambda.1se
(Intercept) -492.41173306
PrivateYes .
Accept 1.35599086
Enroll .
Top10perc 19.98045033
Top25perc .
F.Undergrad .
P.Undergrad .
Outstate .
Room.Board .
Books .
Personal .
PhD .
Terminal .
S.F.Ratio .
perc.alumni .
Expend 0.02131819
Grad.Rate .
as.matrix(beta1se)[as.vector(beta1se != 0), ]
(Intercept) Accept Top10perc Expend
-492.41173306 1.35599086 19.98045033 0.02131819
(Not graded)
### New data: a one-row matrix for a public college with all other predictors at
### their sample means (predict() for glmnet objects expects a matrix for newx)
newx <- rbind(c(PrivateYes = 0, colMeans(College[, -(1:2)])))
### Ridge with minimization rule
predict(ridgeCV, newx = newx, s = ridgeCV$lambda.min, type = "response")
s1
[1,] 3385.488
### Ridge with 1 standard error rule
predict(ridgeCV, newx = newx, s = ridgeCV$lambda.1se, type = "response")
s1
[1,] 3406.073
### LASSO with minimization rule
predict(LASSOCV, newx = newx, s = LASSOCV$lambda.min, type = "response")
s1
[1,] 3358.704
### LASSO with 1 standard error rule
predict(LASSOCV, newx = newx, s = LASSOCV$lambda.1se, type = "response")
s1
[1,] 3001.638
(Not graded)
str(Weekly)
'data.frame': 1089 obs. of 9 variables:
$ Year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
$ Lag1 : num 0.816 -0.27 -2.576 3.514 0.712 ...
$ Lag2 : num 1.572 0.816 -0.27 -2.576 3.514 ...
$ Lag3 : num -3.936 1.572 0.816 -0.27 -2.576 ...
$ Lag4 : num -0.229 -3.936 1.572 0.816 -0.27 ...
$ Lag5 : num -3.484 -0.229 -3.936 1.572 0.816 ...
$ Volume : num 0.155 0.149 0.16 0.162 0.154 ...
$ Today : num -0.27 -2.576 3.514 0.712 1.178 ...
$ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
### Fit logistic regression
logitfit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family=binomial)
summary(logitfit)
Call:
glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
Volume, family = binomial, data = Weekly)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.6949 -1.2565 0.9913 1.0849 1.4579
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.26686 0.08593 3.106 0.0019 **
Lag1 -0.04127 0.02641 -1.563 0.1181
Lag2 0.05844 0.02686 2.175 0.0296 *
Lag3 -0.01606 0.02666 -0.602 0.5469
Lag4 -0.02779 0.02646 -1.050 0.2937
Lag5 -0.01447 0.02638 -0.549 0.5833
Volume -0.02274 0.03690 -0.616 0.5377
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1496.2 on 1088 degrees of freedom
Residual deviance: 1486.4 on 1082 degrees of freedom
AIC: 1500.4
Number of Fisher Scoring iterations: 4
Among the predictors, only Lag2 appears to be statistically significant at the 5% level (p = 0.0296); none of the other lags or Volume is.
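The same reading can be extracted programmatically; a small illustrative check (the 0.05 threshold is a convention, not part of the question):
### Coefficients of logitfit with p-value below 0.05
### (apart from the intercept, only Lag2 passes)
which(summary(logitfit)$coefficients[, "Pr(>|z|)"] < 0.05)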
### Fitted values
p <- predict(logitfit, Weekly, type="response")
### Second level ("Up") treated as success
Yhat <- ifelse(p > 0.5, levels(Weekly$Direction)[2], levels(Weekly$Direction)[1])
### Confusion table
(ConfTable <- table(Yhat, Truth = Weekly$Direction))
Truth
Yhat Down Up
Down 54 48
Up 430 557
### Sensitivity
ConfTable[2, 2] / (ConfTable[1, 2] + ConfTable[2, 2])
[1] 0.9206612
### Specificity
ConfTable[1, 1] / (ConfTable[1, 1] + ConfTable[2, 1])
[1] 0.1115702
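Since sensitivity and specificity are computed again for the later fits, a small helper (illustrative, not part of the assignment) can reproduce the two numbers above from any confusion table laid out the same way.
### Illustrative helper: sensitivity and specificity from a 2 x 2 confusion
### table with predictions in rows, truth in columns, and "Up" as success
sensSpec <- function(ct) {
  c(sensitivity = ct[2, 2] / sum(ct[, 2]),
    specificity = ct[1, 1] / sum(ct[, 1]))
}
sensSpec(ConfTable)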
### Training set
datTrain <- Weekly[Weekly$Year %in% 1990:2008, ]
### Testing set
datTest <- Weekly[!(Weekly$Year %in% 1990:2008), ]
### Fit logistic with training set
logitfitTrain <- glm(Direction ~ Lag2, data = datTrain, family=binomial)
### Fitted values for testing set
pTest <- predict(logitfitTrain, datTest, type="response")
### Second level ("Up") treated as success
YhatTest <- ifelse(pTest > 0.5, levels(datTest$Direction)[2], levels(datTest$Direction)[1])
### Confusion table
(ConfTableLogit <- table(YhatTest, Truth = datTest$Direction))
Truth
YhatTest Down Up
Down 9 5
Up 34 56
### Sensitivity
ConfTableLogit[2, 2] / (ConfTableLogit[1, 2] + ConfTableLogit[2, 2])
[1] 0.9180328
### Specificity
ConfTableLogit[1, 1] / (ConfTableLogit[1, 1] + ConfTableLogit[2, 1])
[1] 0.2093023
### Overall correct predictions fraction
(ConfTableLogit[1, 1] + ConfTableLogit[2, 2]) / sum(ConfTableLogit)
[1] 0.625
### Candidate numbers of neighbors for KNN
K <- seq_len(50)
### Map over the number of neighbors
MisRate <- map_dbl(K, function(k) {
  ### KNN fit with leave-one-out CV on the training set
  YHatKNNCV <- knn.cv(datTrain[, "Lag2", drop = FALSE], datTrain$Direction, k = k)
  ### Misclassification rate
  mean(YHatKNNCV != datTrain$Direction)
})
### Best number of neighbors
(Kbest <- K[which.min(MisRate)])
[1] 37
### KNN fit with best K
YHatKNN <- knn(datTrain[, "Lag2", drop = FALSE], datTest[, "Lag2", drop = FALSE], datTrain$Direction, k = Kbest)
### Confusion table
table(YHatKNN, Truth = datTest$Direction)
Truth
YHatKNN Down Up
Down 22 24
Up 21 37
### Misclassification rate
mean(YHatKNN != datTest$Direction)
[1] 0.4326923