library(purrr) ### map()
library(ISLR2) ### datasets
library(splines) ### bs() for cubic spline
library(glmnet) ### glmnet() for ridge / LASSO
library(class) ### knn()

1.

str(Wage)
'data.frame':   3000 obs. of  11 variables:
 $ year      : int  2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
 $ age       : int  18 24 45 43 50 54 44 30 41 52 ...
 $ maritl    : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
 $ race      : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
 $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
 $ region    : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ jobclass  : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
 $ health    : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
 $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
 $ logwage   : num  4.32 4.26 4.88 5.04 4.32 ...
 $ wage      : num  75 70.5 131 154.7 75 ...

a.

### Method 1
### A function that partitions the data set into K folds of approximately equal size
KFold <- function(dat, K = 5) {
  ### Number of observations in each of the first (K - 1) folds; take the floor to get an integer value
  nFoldK_1 <- floor(nrow(dat) / K)

  ### Number of observations in Kth fold
  nFoldK <- nrow(dat) - (K - 1) * nFoldK_1

  ### Initialize output
  datKFold <- vector("list", K)

  ### Initialize the data to be selected
  datSelect <- dat

  ### Loop for first (K - 1) folds
  for (k in seq_len(K - 1)) {
    ### Indices to be selected
    I <- sample(nrow(datSelect), nFoldK_1, replace = FALSE)

    ### Save in the output
    datKFold[[k]] <- datSelect[I, ]

    ### Update the data to be selected
    datSelect <- datSelect[-I, ]
  }

  ### The Kth fold
  datKFold[[K]] <- datSelect

  datKFold
}

### Method 2
### A function that partitions the data set into K folds of approximately equal size
KFold <- function(dat, K = 5) {
  ### Indices for folds
  ind <- rep(seq_len(K), length.out = nrow(dat))
  
  ### Shuffled indices
  indShuf <- sample(ind)
  
  ### Partition into K folds
  datKFold <- map(seq_len(K), function(k){
    dat[indShuf == k, ]
  })

  datKFold
}
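
As a quick sanity check (a minimal sketch, not required by the question; foldCheck is just an illustrative name), one can confirm that the folds have approximately equal sizes and together recover the full data set:

### Partition Wage into 5 folds and inspect the fold sizes
foldCheck <- KFold(dat = Wage, K = 5)
map_int(foldCheck, nrow)

### The folds should jointly contain every observation exactly once
sum(map_int(foldCheck, nrow)) == nrow(Wage)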

### Number of folds for CV
KCV <- 5

dat <- Wage

### Get random partition of dat
datKFold <- KFold(dat = dat, K = KCV)

### Candidate numbers of interior knots
K <- seq_len(15)

MSECV <- map_dbl(K, function(k) {
  ### Knot positions: k equally spaced interior knots over the range of age
  knots <- seq(min(dat$age), max(dat$age), length.out = k + 2)[-c(1, k + 2)]

  ### For every fold
  MSEi <- map_dbl(seq_len(KCV), function(kcv) {
    ### Training dataset and test dataset
    datTrain <- do.call(rbind, datKFold[-kcv])
    datTest <- datKFold[[kcv]]

    ### Fit cubic spline
    CSpline <- lm(wage ~ bs(x = age, knots = knots), data = datTrain)

    ### Error
    err <- datTest$wage - predict(CSpline, datTest)

    ### MSE
    mean(err^2)
  })

  ### CV error
  ### Test MSE estimate
  mean(MSEi)
})

### Visualize the CV error as a function of K
plot(K, MSECV, type = "o", xlab = "K", ylab = "CV Error", main = "Scatter Plot of CV Error Against K")

b.

### Select the best model
(KBest <- K[which.min(MSECV)])
[1] 1
### Fit cubic spline with full dataset
knotsBest <- seq(min(dat$age), max(dat$age), length.out = KBest + 2)[-c(1, KBest + 2)]
CSplineBest <- lm(wage ~ bs(x = age, knots = knotsBest), data = dat)

### Set grid
grid <- seq(min(dat$age), max(dat$age), length.out=1000)

### Fitted values
yhatCSplineBest <- predict(CSplineBest, newdata = data.frame(age = grid), se.fit = TRUE)

### Visualize the nonlinear regression effect
### Data
with(dat, plot(age, wage, xlab = "Age", ylab = "Wage", main = "Wage Against Age with Fitted Curve and Confidence Band"))

### Fitted curve
with(yhatCSplineBest, lines(grid, fit, col="red", lwd=2))

### Confidence band
with(yhatCSplineBest, lines(grid, fit + 2 * se.fit, col="blue", lwd=2, lty=2))
with(yhatCSplineBest, lines(grid, fit - 2 * se.fit, col="blue", lwd=2, lty=2))
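
For reference, the dashed curves are \(\hat{f}(\text{age}) \pm 2\,\widehat{\mathrm{SE}}\{\hat{f}(\text{age})\}\), i.e. an approximate 95% pointwise confidence band for the fitted regression curve, since \(\pm 2\) standard errors roughly corresponds to a 95% normal interval.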

2.

### Number of predictors (the intercept column is added separately)
p <- 20

### Training set size
n <- 100

### Test set size
m <- 500

### Number of simulation replications
B <- 200

### lambda values
lam <- 10 ^ ((-3):5)

### Compute the average test MSE for each lambda, given alpha (the glmnet mixing parameter: 0 = ridge, 1 = LASSO)
getMSE <- function(alpha) {
  ### Repeat B times
  MSE <- map_dfr(seq_len(B), function(b) {
    ### Training set
    ### X matrix, including intercept
    XTrain <- cbind(1, matrix(rnorm(n * p), n))

    ### epsilon matrix, error
    epsTrain <- matrix(rnorm(n), n)

    ### Y matrix, response
    YTrain <- XTrain %*% matrix(c(1, 1, -1, rep(0, p - 2)), nrow = p + 1) + epsTrain

    ### Test set
    ### X matrix, including intercept
    XTest <- cbind(1, matrix(rnorm(m * p), m))

    ### epsilon matrix, error
    epsTest <- matrix(rnorm(m), m)

    ### Y matrix, response
    YTest <- XTest %*% matrix(c(1, 1, -1, rep(0, p - 2)), nrow = p + 1) + epsTest

    ### Regression fit
    fit <- glmnet(XTrain[, -1], YTrain, alpha = alpha)

    ### Test MSE
    colMeans((matrix(YTest, m, length(lam)) - predict(fit, newx = XTest[, -1], s = lam)) ^ 2)
  })

  ### Average test MSE over the B replications (one value per lambda)
  colMeans(MSE)
}

ridgeMSE <- getMSE(alpha = 0)
LASSOMSE <- getMSE(alpha = 1)

### Plotting parameters: "R" for ridge, "L" for LASSO
pch <- c("R", "L")
lty <- 1:2
col <- 1:2

matplot(log(lam), cbind(ridgeMSE, LASSOMSE), type = "o", pch = pch, lty = lty, col = col, xlab = "log (lambda)", ylab = "Test MSE", main = "Test MSE Against Logarithm of Lambdas")
legend("topleft", legend = c("Ridge", "LASSO"), pch = pch, lty = lty, col = col)

As shown in the plot, the smallest test MSE is obtained at \(\lambda = 0.1\) for LASSO. This agrees with the underlying data-generating process: only two of the twenty slope coefficients are nonzero, and the LASSO penalty can set the remaining coefficients exactly to zero, whereas ridge only shrinks them toward zero.
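
To illustrate this concretely (a minimal sketch using one extra simulated training set; XIll, YIll, betaLASSO, and betaRidge are illustrative names, not part of the replication above), one can count how many slope estimates are exactly zero at \(\lambda = 0.1\) under each penalty:

### One additional training set with the same design as above
XIll <- matrix(rnorm(n * p), n)
YIll <- cbind(1, XIll) %*% matrix(c(1, 1, -1, rep(0, p - 2)), nrow = p + 1) + rnorm(n)

### Slope estimates (intercept dropped) at lambda = 0.1
betaLASSO <- as.vector(coef(glmnet(XIll, YIll, alpha = 1), s = 0.1))[-1]
betaRidge <- as.vector(coef(glmnet(XIll, YIll, alpha = 0), s = 0.1))[-1]

### LASSO typically sets most of the 18 irrelevant slopes exactly to zero
sum(betaLASSO == 0)

### Ridge shrinks the irrelevant slopes but leaves essentially none exactly at zero
sum(betaRidge == 0)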

3.

str(College)
'data.frame':   777 obs. of  18 variables:
 $ Private    : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
 $ Apps       : num  1660 2186 1428 417 193 ...
 $ Accept     : num  1232 1924 1097 349 146 ...
 $ Enroll     : num  721 512 336 137 55 158 103 489 227 172 ...
 $ Top10perc  : num  23 16 22 60 16 38 17 37 30 21 ...
 $ Top25perc  : num  52 29 50 89 44 62 45 68 63 44 ...
 $ F.Undergrad: num  2885 2683 1036 510 249 ...
 $ P.Undergrad: num  537 1227 99 63 869 ...
 $ Outstate   : num  7440 12280 11250 12960 7560 ...
 $ Room.Board : num  3300 6450 3750 5450 4120 ...
 $ Books      : num  450 750 400 450 800 500 500 450 300 660 ...
 $ Personal   : num  2200 1500 1165 875 1500 ...
 $ PhD        : num  70 29 53 92 76 67 90 89 79 40 ...
 $ Terminal   : num  78 30 66 97 72 73 93 100 84 41 ...
 $ S.F.Ratio  : num  18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
 $ perc.alumni: num  12 16 30 37 2 11 26 37 23 15 ...
 $ Expend     : num  7041 10527 8735 19016 10922 ...
 $ Grad.Rate  : num  60 56 54 59 15 55 63 73 80 52 ...

a.

(Not graded)

### Model matrix
X <- model.matrix(Apps ~ ., data = College)[, -1]

### Response
y <- College$Apps

### Ridge
ridgeCV <- cv.glmnet(X, y, alpha = 0, nfolds = 10)

i.

### The minimization rule
ridgeCV$lambda.min
[1] 364.8993
### Predicted values
yridgePredTrainMin <- predict(ridgeCV, newx = X, s = ridgeCV$lambda.min, type="response")

### In-sample MSE at lambda.min (predictions are on the fitting data; the CV error estimate itself is in ridgeCV$cvm)
colMeans((yridgePredTrainMin - y)^2)
     s1 
1358455 

ii.

### 1 standard error rule
ridgeCV$lambda.1se
[1] 1342.238
### Predicted values
yridgePredTrain1se <- predict(ridgeCV, newx = X, s = ridgeCV$lambda.1se, type="response")

### In-sample MSE at lambda.1se
colMeans((yridgePredTrain1se - y)^2)
     s1 
1923926 
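
As a side check (a sketch using the lambda, cvm, and cvsd components returned by cv.glmnet), the 1-standard-error rule picks the largest \(\lambda\) whose CV error lies within one standard error of the minimum, which can be verified directly:

### Largest lambda with CV error within one SE of the minimum CV error
with(ridgeCV, max(lambda[cvm <= min(cvm) + cvsd[which.min(cvm)]]))

### This should coincide with the stored 1-SE choice
ridgeCV$lambda.1se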

b.

### LASSO
LASSOCV <- cv.glmnet(X, y, alpha = 1, nfolds = 10)

i.

### The minimization rule
LASSOCV$lambda.min
[1] 2.137223
### Predicted values
yLASSOPredTrainMin <- predict(LASSOCV, newx = X, s = LASSOCV$lambda.min, type="response")

### In-sample MSE at lambda.min
colMeans((yLASSOPredTrainMin - y)^2)
     s1 
1060064 
### Non-zero coefficient estimates
(betaMin <- predict(LASSOCV, s="lambda.min", type="coefficients"))
18 x 1 sparse Matrix of class "dgCMatrix"
               lambda.min
(Intercept) -471.39372052
PrivateYes  -491.04485137
Accept         1.57033288
Enroll        -0.75961467
Top10perc     48.14698892
Top25perc    -12.84690695
F.Undergrad    0.04149116
P.Undergrad    0.04438973
Outstate      -0.08328388
Room.Board     0.14943472
Books          0.01532293
Personal       0.02909954
PhD           -8.39597537
Terminal      -3.26800340
S.F.Ratio     14.59298267
perc.alumni   -0.04404771
Expend         0.07712632
Grad.Rate      8.28950241
as.matrix(betaMin)[as.vector(betaMin != 0), ]
  (Intercept)    PrivateYes        Accept        Enroll     Top10perc 
-471.39372052 -491.04485137    1.57033288   -0.75961467   48.14698892 
    Top25perc   F.Undergrad   P.Undergrad      Outstate    Room.Board 
 -12.84690695    0.04149116    0.04438973   -0.08328388    0.14943472 
        Books      Personal           PhD      Terminal     S.F.Ratio 
   0.01532293    0.02909954   -8.39597537   -3.26800340   14.59298267 
  perc.alumni        Expend     Grad.Rate 
  -0.04404771    0.07712632    8.28950241 

ii.

### 1 standard error rule
LASSOCV$lambda.1se
[1] 245.7287
### Predicted values
yLASSOPredTrain1se <- predict(LASSOCV, newx = X, s = LASSOCV$lambda.1se, type="response")

### In-sample MSE at lambda.1se
colMeans((yLASSOPredTrain1se - y)^2)
     s1 
1335367 
### Non-zero coefficient estimates
(beta1se <- predict(LASSOCV, s="lambda.1se", type="coefficients"))
18 x 1 sparse Matrix of class "dgCMatrix"
               lambda.1se
(Intercept) -492.41173306
PrivateYes     .         
Accept         1.35599086
Enroll         .         
Top10perc     19.98045033
Top25perc      .         
F.Undergrad    .         
P.Undergrad    .         
Outstate       .         
Room.Board     .         
Books          .         
Personal       .         
PhD            .         
Terminal       .         
S.F.Ratio      .         
perc.alumni    .         
Expend         0.02131819
Grad.Rate      .         
as.matrix(beta1se)[as.vector(beta1se != 0), ]
  (Intercept)        Accept     Top10perc        Expend 
-492.41173306    1.35599086   19.98045033    0.02131819 
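
For comparison (a short sketch; the exact counts depend on the random fold assignment in cv.glmnet), the number of nonzero slope estimates retained under each rule can be tallied directly, 17 versus 3 in the run shown above:

### Number of nonzero slope estimates (intercept excluded) under each rule
sum(as.vector(betaMin)[-1] != 0)
sum(as.vector(beta1se)[-1] != 0)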

c.

(Not graded)

### New data: predictors at their sample means, with PrivateYes = 0
### (built as a one-row matrix, since predict() expects a matrix for newx)
newx <- rbind(c(PrivateYes = 0, colMeans(College[, -(1:2)])))

### Ridge with minimization rule
predict(ridgeCV, newx = newx, s = ridgeCV$lambda.min, type = "response")
           s1
[1,] 3385.488
### Ridge with 1 standard error rule
predict(ridgeCV, newx = newx, s = ridgeCV$lambda.1se, type = "response")
           s1
[1,] 3406.073
### LASSO with minimization rule
predict(LASSOCV, newx = newx, s = LASSOCV$lambda.min, type = "response")
           s1
[1,] 3358.704
### LASSO with 1 standard error rule
predict(LASSOCV, newx = newx, s = LASSOCV$lambda.1se, type = "response")
           s1
[1,] 3001.638

4. (Optional)

(Not graded)

str(Weekly)
'data.frame':   1089 obs. of  9 variables:
 $ Year     : num  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
 $ Lag1     : num  0.816 -0.27 -2.576 3.514 0.712 ...
 $ Lag2     : num  1.572 0.816 -0.27 -2.576 3.514 ...
 $ Lag3     : num  -3.936 1.572 0.816 -0.27 -2.576 ...
 $ Lag4     : num  -0.229 -3.936 1.572 0.816 -0.27 ...
 $ Lag5     : num  -3.484 -0.229 -3.936 1.572 0.816 ...
 $ Volume   : num  0.155 0.149 0.16 0.162 0.154 ...
 $ Today    : num  -0.27 -2.576 3.514 0.712 1.178 ...
 $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...

a.

### Fit logistic regression
logitfit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family=binomial)
summary(logitfit)

Call:
glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
    Volume, family = binomial, data = Weekly)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6949  -1.2565   0.9913   1.0849   1.4579  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)   
(Intercept)  0.26686    0.08593   3.106   0.0019 **
Lag1        -0.04127    0.02641  -1.563   0.1181   
Lag2         0.05844    0.02686   2.175   0.0296 * 
Lag3        -0.01606    0.02666  -0.602   0.5469   
Lag4        -0.02779    0.02646  -1.050   0.2937   
Lag5        -0.01447    0.02638  -0.549   0.5833   
Volume      -0.02274    0.03690  -0.616   0.5377   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1496.2  on 1088  degrees of freedom
Residual deviance: 1486.4  on 1082  degrees of freedom
AIC: 1500.4

Number of Fisher Scoring iterations: 4

Among the predictors, only Lag2 appears to be statistically significant (p ≈ 0.03).
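
As a quick programmatic check (a sketch; coefTab is an illustrative name), the terms with Wald p-values below 0.05 can be extracted from the summary table, which returns the intercept and Lag2 for the fit above:

### Coefficients with p-value below 0.05
coefTab <- summary(logitfit)$coefficients
coefTab[coefTab[, "Pr(>|z|)"] < 0.05, ]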

b.

### Fitted values
p <- predict(logitfit, Weekly, type="response")

### Second level ("Up") treated as the positive class
Yhat <- ifelse(p > 0.5, levels(Weekly$Direction)[2], levels(Weekly$Direction)[1])

### Confusion table
(ConfTable <- table(Yhat, Truth = Weekly$Direction))
      Truth
Yhat   Down  Up
  Down   54  48
  Up    430 557
### Sensitivity
ConfTable[2, 2] / (ConfTable[1, 2] + ConfTable[2, 2])
[1] 0.9206612
### Specificity
ConfTable[1, 1] / (ConfTable[1, 1] + ConfTable[2, 1])
[1] 0.1115702
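
For completeness (a sketch, using the same quantity computed for the test set in part c), the overall fraction of correct in-sample predictions follows from the confusion table:

### Overall fraction of correct predictions: (54 + 557) / 1089, about 0.561
(ConfTable[1, 1] + ConfTable[2, 2]) / sum(ConfTable)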

c.

### Training set
datTrain <- Weekly[Weekly$Year %in% 1990:2008, ]

### Testing set
datTest <- Weekly[!(Weekly$Year %in% 1990:2008), ]

### Fit logistic with training set
logitfitTrain <- glm(Direction ~ Lag2, data = datTrain, family=binomial)

### Fitted values for testing set
pTest <- predict(logitfitTrain, datTest, type="response")

### Second level ("Up") treated as the positive class
### Confusion table
YhatTest <- ifelse(pTest > 0.5, levels(datTest$Direction)[2], levels(datTest$Direction)[1])
(ConfTableLogit <- table(YhatTest, Truth = datTest$Direction))
        Truth
YhatTest Down Up
    Down    9  5
    Up     34 56
### Sensitivity
ConfTableLogit[2, 2] / (ConfTableLogit[1, 2] + ConfTableLogit[2, 2])
[1] 0.9180328
### Specificity
ConfTableLogit[1, 1] / (ConfTableLogit[1, 1] + ConfTableLogit[2, 1])
[1] 0.2093023
### Overall fraction of correct predictions
(ConfTableLogit[1, 1] + ConfTableLogit[2, 2]) / sum(ConfTableLogit)
[1] 0.625

d.

### Candidate numbers of neighbors for KNN
K <- seq_len(50)

### Map over the candidate numbers of neighbors
MisRate <- map_dbl(K, function(k) {
  ### KNN fit with leave-one-out CV
  YHatKNNCV <- knn.cv(datTrain[, "Lag2", drop = FALSE], datTrain$Direction, k = k)
  
  ### Misclassification rate
  mean(YHatKNNCV != datTrain$Direction)
})

### Best number of neighbors
(Kbest <- K[which.min(MisRate)])
[1] 37
### KNN fit with best K
YHatKNN <- knn(datTrain[, "Lag2", drop = FALSE], datTest[, "Lag2", drop = FALSE], datTrain$Direction, k = Kbest)

### Confusion table
table(YHatKNN, Truth = datTest$Direction)
       Truth
YHatKNN Down Up
   Down   22 24
   Up     21 37
### Misclassification rate
mean(YHatKNN != datTest$Direction)
[1] 0.4326923
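
To put this in context (a short sketch reusing the logistic-regression confusion table from part c), the two classifiers' test misclassification rates can be compared directly:

### Logistic regression test misclassification rate: 1 - 0.625 = 0.375
1 - (ConfTableLogit[1, 1] + ConfTableLogit[2, 2]) / sum(ConfTableLogit)

### KNN test misclassification rate from above, about 0.433
mean(YHatKNN != datTest$Direction)

So in this run, logistic regression with Lag2 alone classifies the held-out weeks somewhat better than the CV-selected KNN.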