model-selection.R

mohammad

2021-04-28 library(ggplot2)

# model selection

# overfitting and underfitting
# Simulate 50 noisy observations of a cubic-plus-quadratic trend and overlay a
# degree-20 polynomial least-squares fit on the scatter plot.
set.seed(100)
data_num <- 50
noise <- rnorm(data_num, 0, 500)
x <- sample(c(1:1000), data_num, replace = FALSE)
y <- (x - 520)^3 / 10000 + 4 * (x - 300)^2 / 10000 + noise
deg <- 20
ggplot(data = data.frame(x, y), aes(x, y)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ poly(x, deg))

15000

10000

5000 y 0

−5000

−10000

0 250 500 750 1000 x

# Lines tagged [render residue] are stray text from the rendered report
# (apparently page numbers and plot axis text), kept verbatim as comments.
# [render residue] 1

# Hold out one fifth of the simulated points for validation, fit polynomial
# regressions of degree 1 to 20 on the remaining points, and record the
# training and validation error of each fit.
rand <- sample(c(1:data_num), data_num / 5, replace = FALSE)
x_valid <- x[rand]
x_train <- x[-rand]
y_valid <- y[rand]
y_train <- y[-rand]
train_error <- c(1:20)
valid_error <- c(1:20)
for (i in seq_along(train_error)) {
  model <- lm(data = data.frame(y_train, x_train), y_train ~ poly(x_train, i))
  # The new data frame reuses the predictor name from the formula (x_train).
  pred <- predict(model, newdata = data.frame(x_train = x_valid))
  # NOTE(review): both sums are divided by data_num (all points), not by the
  # size of the training / validation subset -- confirm this scaling is
  # intended before comparing the two curves against each other.
  train_error[i] <- sum((model$residuals)^2) / data_num
  valid_error[i] <- sum((y_valid - pred)^2) / data_num
}
# The plot arguments are kept exactly as written: the rendered axis label
# echoes the argument expression.
plot(c(1:length(train_error)), train_error)
# [render residue] 1500000 train_error 500000 0

# [render residue] 5 10 15 20

# [render residue] c(1:length(train_error))
plot(c(1:length(train_error)), valid_error)

2 2.0e+07 valid_error 1.0e+07 0.0e+00 5 10 15 20

c(1:length(train_error))

# feature selection and extraction
# selection
# best set
library(ISLR)

## Warning: package 'ISLR' was built under R version 4.0.5
library(leaps)

## Warning: package 'leaps' was built under R version 4.0.5
# fix() opens the interactive data editor, so it is only called in an
# interactive session; a non-interactive run may otherwise stall or fail here.
if (interactive()) {
  fix(Hitters)
}
# Drop rows with missing values before the subset search.
Hitters <- na.omit(Hitters)
# Subset search of Salary on all other columns, using regsubsets() defaults.
fit <- regsubsets(Salary ~ ., Hitters)
summary(fit)

## Subset selection object ## Call: regsubsets.formula(Salary ~ ., Hitters) ## 19 Variables (and intercept) ## Forced in Forced ## AtBat FALSE FALSE ## Hits FALSE FALSE ## HmRun FALSE FALSE ## Runs FALSE FALSE ## RBI FALSE FALSE ## Walks FALSE FALSE ## Years FALSE FALSE ## CAtBat FALSE FALSE

3 ## CHits FALSE FALSE ## CHmRun FALSE FALSE ## CRuns FALSE FALSE ## CRBI FALSE FALSE ## CWalks FALSE FALSE ## LeagueN FALSE FALSE ## DivisionW FALSE FALSE ## PutOuts FALSE FALSE ## Assists FALSE FALSE ## Errors FALSE FALSE ## NewLeagueN FALSE FALSE ## 1 subsets of each size up to 8 ## Selection Algorithm: exhaustive ## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI ## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*" ## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*" ## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*" ## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " " " " ## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*" " " ## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN ## 1 ( 1 ) " " " " " " " " " " " " " " ## 2 ( 1 ) " " " " " " " " " " " " " " ## 3 ( 1 ) " " " " " " "*" " " " " " " ## 4 ( 1 ) " " " " "*" "*" " " " " " " ## 5 ( 1 ) " " " " "*" "*" " " " " " " ## 6 ( 1 ) " " " " "*" "*" " " " " " " ## 7 ( 1 ) " " " " "*" "*" " " " " " " ## 8 ( 1 ) "*" " " "*" "*" " " " " " " fit2= regsubsets(Salary~ ., Hitters, nvmax = 15) summ= summary(fit2) summ

## Subset selection object ## Call: regsubsets.formula(Salary ~ ., Hitters, nvmax = 15) ## 19 Variables (and intercept) ## Forced in Forced out ## AtBat FALSE FALSE ## Hits FALSE FALSE ## HmRun FALSE FALSE ## Runs FALSE FALSE ## RBI FALSE FALSE ## Walks FALSE FALSE ## Years FALSE FALSE ## CAtBat FALSE FALSE ## CHits FALSE FALSE ## CHmRun FALSE FALSE ## CRuns FALSE FALSE ## CRBI FALSE FALSE ## CWalks FALSE FALSE ## LeagueN FALSE FALSE ## DivisionW FALSE FALSE

4 ## PutOuts FALSE FALSE ## Assists FALSE FALSE ## Errors FALSE FALSE ## NewLeagueN FALSE FALSE ## 1 subsets of each size up to 15 ## Selection Algorithm: exhaustive ## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI ## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*" ## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*" ## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*" ## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " " " " ## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*" " " ## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" ## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" ## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" ## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*" ## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*" ## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*" ## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*" "*" ## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN ## 1 ( 1 ) " " " " " " " " " " " " " " ## 2 ( 1 ) " " " " " " " " " " " " " " ## 3 ( 1 ) " " " " " " "*" " " " " " " ## 4 ( 1 ) " " " " "*" "*" " " " " " " ## 5 ( 1 ) " " " " "*" "*" " " " " " " ## 6 ( 1 ) " " " " "*" "*" " " " " " " ## 7 ( 1 ) " " " " "*" "*" " " " " " " ## 8 ( 1 ) "*" " " "*" "*" " " " " " " ## 9 ( 1 ) "*" " " "*" "*" " " " " " " ## 10 ( 1 ) "*" " " "*" "*" "*" " " " " ## 11 ( 1 ) "*" "*" "*" "*" "*" " " " " ## 12 ( 1 ) "*" "*" "*" "*" "*" " " " " ## 13 ( 1 ) "*" "*" "*" "*" "*" "*" " " ## 14 ( 1 ) "*" "*" "*" "*" "*" "*" " " ## 15 ( 1 ) "*" "*" "*" "*" "*" "*" " " names(summ)

## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj" summ$which

## (Intercept) AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun ## 1 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE ## 2 TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE ## 3 TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE ## 4 TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE ## 5 TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE ## 6 TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE ## 7 TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE ## 8 TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE ## 9 TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE ## 10 TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE

5 ## 11 TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE ## 12 TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE ## 13 TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE ## 14 TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE FALSE ## 15 TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE FALSE ## CRuns CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN ## 1 FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE ## 2 FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE ## 3 FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE ## 4 FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE ## 5 FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE ## 6 FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE ## 7 FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE ## 8 TRUE FALSE TRUE FALSE TRUE TRUE FALSE FALSE FALSE ## 9 TRUE TRUE TRUE FALSE TRUE TRUE FALSE FALSE FALSE ## 10 TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE ## 11 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE ## 12 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE ## 13 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE ## 14 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE ## 15 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE plot(summ$rsq, type = "l") 0.55 0.50 0.45 summ$rsq 0.40 0.35

2 4 6 8 10 12 14

# Lines tagged [render residue] are stray text from the rendered report
# (apparently page numbers and plot axis text), kept verbatim as comments.
# The plot arguments are kept exactly as written: the rendered axis label
# echoes the argument expression.
# [render residue] Index
# Adjusted R-squared against the row index of the subset summary.
plot(summ$adjr2, type = "l")

# [render residue] 6 0.50 0.45 0.40 summ$adjr2 0.35

# [render residue] 2 4 6 8 10 12 14

# [render residue] Index
# Residual sum of squares.
plot(summ$rss, type = "l")

# [render residue] 7 3.6e+07 3.2e+07 summ$rss 2.8e+07

# [render residue] 2.4e+07 2 4 6 8 10 12 14

# [render residue] Index
# BIC.
plot(summ$bic, type = "l")

# [render residue] 8 −90 −110 summ$bic −130 −150 2 4 6 8 10 12 14

# [render residue] Index
# Mallows' Cp.
plot(summ$cp, type = "l")

# [render residue] 9 100 80 60 summ$cp 40 20

# [render residue] 2 4 6 8 10 12 14

# [render residue] Index
# Coefficients of the 8-variable model stored in fit2.
coef(fit2, 8)

## (Intercept) AtBat Hits Walks CHmRun CRuns ## 130.9691577 -2.1731903 7.3582935 6.0037597 1.2339718 0.9651349 ## CWalks DivisionW PutOuts ## -0.8323788 -117.9657795 0.2908431 # forward and backward selection forward= regsubsets(Salary~., Hitters, nvmax = 15, method = "forward") summary(forward)

## Subset selection object ## Call: regsubsets.formula(Salary ~ ., Hitters, nvmax = 15, method = "forward") ## 19 Variables (and intercept) ## Forced in Forced out ## AtBat FALSE FALSE ## Hits FALSE FALSE ## HmRun FALSE FALSE ## Runs FALSE FALSE ## RBI FALSE FALSE ## Walks FALSE FALSE ## Years FALSE FALSE ## CAtBat FALSE FALSE ## CHits FALSE FALSE ## CHmRun FALSE FALSE ## CRuns FALSE FALSE

10 ## CRBI FALSE FALSE ## CWalks FALSE FALSE ## LeagueN FALSE FALSE ## DivisionW FALSE FALSE ## PutOuts FALSE FALSE ## Assists FALSE FALSE ## Errors FALSE FALSE ## NewLeagueN FALSE FALSE ## 1 subsets of each size up to 15 ## Selection Algorithm: forward ## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI ## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*" ## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*" ## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*" ## 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*" ## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" "*" ## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" ## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" ## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" ## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*" ## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*" ## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*" ## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*" "*" ## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN ## 1 ( 1 ) " " " " " " " " " " " " " " ## 2 ( 1 ) " " " " " " " " " " " " " " ## 3 ( 1 ) " " " " " " "*" " " " " " " ## 4 ( 1 ) " " " " "*" "*" " " " " " " ## 5 ( 1 ) " " " " "*" "*" " " " " " " ## 6 ( 1 ) " " " " "*" "*" " " " " " " ## 7 ( 1 ) "*" " " "*" "*" " " " " " " ## 8 ( 1 ) "*" " " "*" "*" " " " " " " ## 9 ( 1 ) "*" " " "*" "*" " " " " " " ## 10 ( 1 ) "*" " " "*" "*" "*" " " " " ## 11 ( 1 ) "*" "*" "*" "*" "*" " " " " ## 12 ( 1 ) "*" "*" "*" "*" "*" " " " " ## 13 ( 1 ) "*" "*" "*" "*" "*" "*" " " ## 14 ( 1 ) "*" "*" "*" "*" "*" "*" " " ## 15 ( 1 ) "*" "*" "*" "*" "*" "*" " " backward= regsubsets(Salary~., Hitters, nvmax = 15, method = "backward") summary(backward)

## Subset selection object ## Call: regsubsets.formula(Salary ~ ., Hitters, nvmax = 15, method = "backward") ## 19 Variables (and intercept) ## Forced in Forced out ## AtBat FALSE FALSE ## Hits FALSE FALSE ## HmRun FALSE FALSE

11 ## Runs FALSE FALSE ## RBI FALSE FALSE ## Walks FALSE FALSE ## Years FALSE FALSE ## CAtBat FALSE FALSE ## CHits FALSE FALSE ## CHmRun FALSE FALSE ## CRuns FALSE FALSE ## CRBI FALSE FALSE ## CWalks FALSE FALSE ## LeagueN FALSE FALSE ## DivisionW FALSE FALSE ## PutOuts FALSE FALSE ## Assists FALSE FALSE ## Errors FALSE FALSE ## NewLeagueN FALSE FALSE ## 1 subsets of each size up to 15 ## Selection Algorithm: backward ## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI ## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " "*" " " ## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " "*" " " ## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " "*" " " ## 4 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " "*" " " ## 5 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " " ## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " " ## 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " " ## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" "*" ## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" ## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" ## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" ## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*" ## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*" ## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*" ## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*" "*" ## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN ## 1 ( 1 ) " " " " " " " " " " " " " " ## 2 ( 1 ) " " " " " " " " " " " " " " ## 3 ( 1 ) " " " " " " "*" " " " " " " ## 4 ( 1 ) " " " " " " "*" " " " " " " ## 5 ( 1 ) " " " " " " "*" " " " " " " ## 6 ( 1 ) " " " " "*" "*" " " " " " " ## 7 ( 1 ) "*" " " "*" "*" " " " " " " ## 8 ( 1 ) "*" " " "*" "*" " " " " " " ## 9 ( 1 ) "*" " " "*" "*" " " " " " " ## 10 ( 1 ) "*" " " "*" "*" "*" " " " " ## 11 ( 1 ) "*" "*" "*" "*" "*" " " " " ## 12 ( 1 ) "*" "*" "*" "*" "*" " " " " ## 13 ( 1 ) "*" "*" "*" "*" "*" "*" " " ## 14 ( 1 ) "*" "*" "*" "*" "*" "*" " " 
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" " " # regularization # ridge library(glmnet)

# [render residue] 12   (stray number from the rendered report, apparently a
# page number; kept verbatim as a comment)
## Warning: package 'glmnet' was built under R version 4.0.5
## Loading required package: Matrix
## Loaded glmnet 4.1-1
# Build the numeric predictor matrix (factors become dummy variables) and
# drop its first column, the intercept column that model.matrix() adds.
x <- model.matrix(Salary ~ ., Hitters)[, -1] # dummy variables
y <- Hitters$Salary
# 100 penalty values on a log10 grid from 10^10 down to 10^-2.
lambdas <- 10^seq(10, -2, length.out = 100)
# alpha = 0 selects the ridge penalty in glmnet.
ridge_model <- glmnet(x, y, alpha = 0, lambda = lambdas)
# First rows of the coefficient matrix, one column per lambda.
head(coef(ridge_model))

## 6 x 100 sparse Matrix of class "dgCMatrix" ## [[ suppressing 100 column names 's0', 's1', 's2' ... ]] ## ## (Intercept) 5.359257e+02 5.359256e+02 5.359256e+02 5.359254e+02 5.359253e+02 ## AtBat 5.443467e-08 7.195940e-08 9.512609e-08 1.257511e-07 1.662355e-07 ## Hits 1.974589e-07 2.610289e-07 3.450649e-07 4.561554e-07 6.030105e-07 ## HmRun 7.956523e-07 1.051805e-06 1.390424e-06 1.838059e-06 2.429805e-06 ## Runs 3.339178e-07 4.414196e-07 5.835307e-07 7.713931e-07 1.019736e-06 ## RBI 3.527222e-07 4.662778e-07 6.163918e-07 8.148335e-07 1.077162e-06 ## ## (Intercept) 5.359251e+02 5.359249e+02 5.359246e+02 5.359241e+02 5.359236e+02 ## AtBat 2.197535e-07 2.905011e-07 3.840251e-07 5.076583e-07 6.710939e-07 ## Hits 7.971441e-07 1.053777e-06 1.393031e-06 1.841504e-06 2.434358e-06 ## HmRun 3.212059e-06 4.246151e-06 5.613159e-06 7.420260e-06 9.809139e-06 ## Runs 1.348031e-06 1.782017e-06 2.355720e-06 3.114121e-06 4.116682e-06 ## RBI 1.423944e-06 1.882370e-06 2.488380e-06 3.289490e-06 4.348509e-06 ## ## (Intercept) 5.359228e+02 5.359218e+02 5.359205e+02 5.359188e+02 5.359165e+02 ## AtBat 8.871458e-07 1.172753e-06 1.550308e-06 2.049411e-06 2.709192e-06 ## Hits 3.218075e-06 4.254101e-06 5.623662e-06 7.434134e-06 9.827459e-06 ## HmRun 1.296709e-05 1.714170e-05 2.266028e-05 2.995547e-05 3.959923e-05 ## Runs 5.442006e-06 7.194001e-06 9.510029e-06 1.257167e-05 1.661895e-05 ## RBI 5.748467e-06 7.599123e-06 1.004557e-05 1.327962e-05 1.755482e-05 ## ## (Intercept) 5.359135e+02 5.359095e+02 5.359042e+02 5.358972e+02 5.358880e+02 ## AtBat 3.581378e-06 4.734346e-06 6.258482e-06 8.273267e-06 1.093664e-05 ## Hits 1.299127e-05 1.717361e-05 2.270236e-05 3.001092e-05 3.967221e-05 ## HmRun 5.234760e-05 6.920001e-05 9.147759e-05 1.209267e-04 1.598556e-04 ## Runs 2.196918e-05 2.904181e-05 3.839128e-05 5.075051e-05 6.708833e-05 ## RBI 2.320633e-05 3.067722e-05 4.055316e-05 5.360832e-05 7.086606e-05 ## ## (Intercept) 5.358758e+02 5.358597e+02 5.358383e+02 5.358102e+02 5.357729e+02 ## AtBat 
1.445735e-05 1.911136e-05 2.526337e-05 3.339542e-05 4.414457e-05 ## Hits 5.244352e-05 6.932585e-05 9.164225e-05 1.211414e-04 1.601343e-04 ## HmRun 2.113157e-04 2.793399e-04 3.692586e-04 4.881167e-04 6.452240e-04 ## Runs 8.868531e-05 1.172341e-04 1.549720e-04 2.048558e-04 2.707932e-04 ## RBI 9.367905e-05 1.238352e-04 1.636976e-04 2.163893e-04 2.860379e-04 ## ## (Intercept) 5.357237e+02 5.356586e+02 5.355726e+02 5.354590e+02 5.353088e+02 ## AtBat 5.835267e-05 7.713205e-05 1.019523e-04 1.347543e-04 1.781013e-04 ## Hits 2.116751e-04 2.797992e-04 3.698382e-04 4.888348e-04 6.460892e-04 ## HmRun 8.528825e-04 1.127346e-03 1.490084e-03 1.969454e-03 2.602891e-03 ## Runs 3.579481e-04 4.731433e-04 6.253924e-04 8.266006e-04 1.092488e-03

13 ## RBI 3.780973e-04 4.997733e-04 6.605853e-04 8.731052e-04 1.153932e-03 ## ## (Intercept) 5.351104e+02 5.348483e+02 5.345021e+02 5.340450e+02 5.334417e+02 ## AtBat 2.353767e-04 3.110444e-04 4.109909e-04 5.429714e-04 7.171926e-04 ## Hits 8.538795e-04 1.128408e-03 1.491040e-03 1.969937e-03 2.602166e-03 ## HmRun 3.439801e-03 4.545349e-03 6.005426e-03 7.933134e-03 1.047721e-02 ## Runs 1.443804e-03 1.907924e-03 2.520942e-03 3.330403e-03 4.398874e-03 ## RBI 1.524974e-03 2.015129e-03 2.662489e-03 3.517224e-03 4.645322e-03 ## ## (Intercept) 5.326458e+02 5.315966e+02 5.302145e+02 5.283962e+02 5.260283e+02 ## AtBat 9.470681e-04 1.250193e-03 1.649587e-03 2.175269e-03 2.856183e-03 ## Hits 3.436467e-03 4.536802e-03 5.986922e-03 7.896145e-03 1.038009e-02 ## HmRun 1.383295e-02 1.825616e-02 2.408100e-02 3.174215e-02 4.166863e-02 ## Runs 5.808559e-03 7.667251e-03 1.011593e-02 1.333834e-02 1.753936e-02 ## RBI 6.133445e-03 8.095158e-03 1.067886e-02 1.407774e-02 1.850770e-02 ## ## (Intercept) 522.91172696 5.188425e+02 5.135499e+02 5.067007e+02 497.89432739 ## AtBat 0.00375537 4.929271e-03 6.455825e-03 8.430885e-03 0.01096941 ## Hits 0.01365619 1.793929e-02 2.351966e-02 3.075783e-02 0.04009239 ## HmRun 0.05475351 7.181148e-02 9.395171e-02 1.225246e-01 0.15912611 ## Runs 0.02306905 3.029402e-02 3.969965e-02 5.188629e-02 0.06757994 ## RBI 0.02433453 3.194175e-02 4.183469e-02 5.463507e-02 0.07108903 ## ## (Intercept) 486.66538760 472.49837844 454.86126015 433.26825885 407.35605020 ## AtBat 0.01420455 0.01828293 0.02335416 0.02954756 0.03695718 ## Hits 0.05204227 0.06719837 0.08619852 0.10967001 0.13818034 ## HmRun 0.20556391 0.26375808 0.33554396 0.42231272 0.52462998 ## Runs 0.08763193 0.11299845 0.14468833 0.18366163 0.23070152 ## RBI 0.09206175 0.11850817 0.15140705 0.19164195 0.23984146 ## ## (Intercept) 377.00656988 342.45146856 304.36285975 263.84665456 222.37830885 ## AtBat 0.04558973 0.05533408 0.06592341 0.07692026 0.08772909 ## Hits 0.17210886 0.21156280 0.25628237 0.30566272 
0.35881027 ## HmRun 0.64153683 0.77000823 0.90440096 1.03656335 1.15605327 ## Runs 0.28620519 0.34999568 0.42114043 0.49795797 0.57807538 ## RBI 0.29614905 0.36000544 0.42995694 0.50367426 0.57804567 ## ## (Intercept) 181.62066405 143.2209084 108.6007107 78.7760218 54.3251995 ## AtBat 0.09763889 0.1058832 0.1115181 0.1139551 0.1121111 ## Hits 0.41475429 0.4726795 0.5318660 0.5929157 0.6562241 ## HmRun 1.25121946 1.3103300 1.3223521 1.2805113 1.1798091 ## Runs 0.65873313 0.7371023 0.8107935 0.8778978 0.9376971 ## RBI 0.64954348 0.7146205 0.7705096 0.8147962 0.8471855 ## ## (Intercept) 35.4632735 22.07163454 13.69965470 10.08979010 10.368086610 ## AtBat 0.1052545 0.09238344 0.07214526 0.04333856 0.003859899 ## Hits 0.7236362 0.79744297 0.88059238 0.97661396 1.090128700 ## HmRun 1.0201569 0.80505438 0.54367980 0.24336135 -0.074578917 ## Runs 0.9897020 1.03411394 1.07217873 1.10318741 1.129710071 ## RBI 0.8675975 0.87728221 0.87912033 0.87391232 0.866429174 ## ## (Intercept) 14.26002563 21.2474898 30.6934829 42.0855152 54.9738422 68.6872524 ## AtBat -0.04801371 -0.1143526 -0.1969965 -0.2970743 -0.4148060 -0.5489118 ## Hits 1.22574333 1.3888568 1.5858862 1.8230585 2.1053049 2.4364278

14 ## HmRun -0.39844659 -0.7097177 -0.9847328 -1.2028780 -1.3482833 -1.4028069 ## Runs 1.15012799 1.1635394 1.1682319 1.1599670 1.1328125 1.0800738 ## RBI 0.85667724 0.8455760 0.8329787 0.8163507 0.7921941 0.7564450 ## ## (Intercept) 82.6164012 96.3405543 109.2829120 121.0394128 131.3383871 ## AtBat -0.6964360 -0.8532130 -1.0135354 -1.1712169 -1.3203039 ## Hits 2.8163931 3.2390939 3.6940734 4.1655533 4.6348424 ## HmRun -1.3562347 -1.2134681 -0.9796829 -0.6702762 -0.3059515 ## Runs 0.9943610 0.8703921 0.7058863 0.5030149 0.2686396 ## RBI 0.7049849 0.6365634 0.5510051 0.4507415 0.3400248 ## ## (Intercept) 140.02704637 147.0439149 152.52893912 156.6073700 159.6160967 ## AtBat -1.45573912 -1.5737685 -1.67284639 -1.7526436 -1.8153438 ## Hits 5.08339645 5.4958805 5.86106113 6.1739859 6.4337591 ## HmRun 0.09220561 0.5084434 0.92327754 1.3285278 1.7038461 ## Runs 0.01299834 -0.2529044 -0.51634440 -0.7689372 -0.9993212 ## RBI 0.22342867 0.1039164 -0.01416825 -0.1297830 -0.2372834 ## ## (Intercept) 161.6138012 162.9350244 163.6938252 164.1218504 164.3266996 ## AtBat -1.8619922 -1.8962304 -1.9201291 -1.9368385 -1.9482895 ## Hits 6.6446724 6.8126375 6.9445433 7.0473614 7.1270344 ## HmRun 2.0577112 2.3769197 2.6655164 2.9163590 3.1305427 ## Runs -1.2091093 -1.3918382 -1.5508817 -1.6845635 -1.7957415 ## RBI -0.3401913 -0.4343568 -0.5209687 -0.5973026 -0.6632374 ## ## (Intercept) 164.4197999 164.4398691 164.4099707 164.3701706 164.3247947 ## AtBat -1.9562902 -1.9617909 -1.9654629 -1.9680069 -1.9697620 ## Hits 7.1885531 7.2358004 7.2719325 7.2992751 7.3199359 ## HmRun 3.3073722 3.4514572 3.5681841 3.6592305 3.7298870 ## Runs -1.8857654 -1.9581687 -2.0162146 -2.0612924 -2.0963217 ## RBI -0.7181371 -0.7631803 -0.7998993 -0.8286657 -0.8509939 ## ## (Intercept) 164.2813513 164.2457978 164.2135548 164.1869190 164.1646430 ## AtBat -1.9709892 -1.9718803 -1.9725262 -1.9729623 -1.9732951 ## Hits 7.3354142 7.3469033 7.3559222 7.3623124 7.3675688 ## HmRun 3.7837313 3.8238422 3.8561750 
3.8791510 3.8985402 ## Runs -2.1230968 -2.1431262 -2.1590930 -2.1705339 -2.1800496 ## RBI -0.8679825 -0.8805956 -0.8907918 -0.8980045 -0.9041437 ## ## (Intercept) 164.1460911 164.131956 164.120458 164.1132161 ## AtBat -1.9735097 -1.973666 -1.973790 -1.9738615 ## Hits 7.3712955 7.374072 7.376352 7.3777227 ## HmRun 3.9124462 3.922825 3.931473 3.9366022 ## Runs -2.1868797 -2.191986 -2.196207 -2.1987362 ## RBI -0.9085485 -0.911840 -0.914588 -0.9162301 summary(ridge_model)

## Length Class Mode ## a0 100 -none- numeric ## beta 1900 dgCMatrix S4 ## df 100 -none- numeric ## dim 2 -none- numeric ## lambda 100 -none- numeric ## dev.ratio 100 -none- numeric

# Lines tagged [render residue] are stray text from the rendered report
# (apparently page numbers and plot axis text), kept verbatim as comments.
# [render residue] 15
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 5 -none- call
## nobs 1 -none- numeric
set.seed(100)
# NOTE(review): only one fifth of the rows go into `train`; the remaining
# four fifths are addressed through `valid`. Confirm the split is not
# inverted relative to what was intended.
train <- sample(1:nrow(x), nrow(x) / 5)
# Negative indices: x[valid, ] selects every row that is not in `train`.
valid <- (-train)
# Cross-validated ridge fit (alpha = 0) on the training rows only.
cross_valid <- cv.glmnet(x[train, ], y[train], alpha = 0)
plot(cross_valid)

# [render residue] 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 200000 150000 Mean−Squared 100000

# [render residue] 4 6 8 10 12

# [render residue] Log(λ)
# Lambda stored by cv.glmnet as lambda.min.
best_lambda <- cross_valid$lambda.min
best_lambda

## [1] 365.67
# Ridge fit on all rows with glmnet's default lambda path; coefficients are
# then extracted at the cross-validated lambda.
best_ridge <- glmnet(x, y, alpha = 0)
predict(best_ridge, type = "coefficients", s = best_lambda)

## 20 x 1 sparse Matrix of class "dgCMatrix" ## 1 ## (Intercept) 18.71499068 ## AtBat 0.08573542 ## Hits 0.82504996 ## HmRun 0.71087630 ## Runs 1.04816875

# Lines tagged [render residue] are stray text from the rendered report
# (apparently page numbers and plot axis text), kept verbatim as comments.
# [render residue] 16
## RBI 0.87872700
## Walks 1.57872645
## Years 1.61293013
## CAtBat 0.01133665
## CHits 0.05569044
## CHmRun 0.39563875
## CRuns 0.11115987
## CRBI 0.11737947
## CWalks 0.05761240
## LeagueN 20.78946923
## DivisionW -75.47615500
## PutOuts 0.15915642
## Assists 0.02668917
## Errors -1.24955616
## NewLeagueN 9.34373735
# lasso
# Cross-validated fit with alpha = 1 on the training rows.
cross_valid2 <- cv.glmnet(x[train, ], y[train], alpha = 1)
plot(cross_valid2)

# [render residue] 19 19 19 18 18 17 17 15 13 9 11 8 7 6 4 2 0 200000 150000 Mean−Squared Error 100000

# [render residue] −2 0 2 4

# [render residue] Log(λ)
best_lambda2 <- cross_valid2$lambda.min
best_lambda2

## [1] 0.02702565
# feature extraction
# PCA
library(pls)

# [render residue] 17
## Warning: package 'pls' was built under R version 4.0.5
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
# Principal component regression on standardized predictors with
# cross-validation, then plot the MSEP validation curve.
# NOTE(review): no seed is set immediately before this call, so the
# cross-validation curve may differ between runs -- confirm whether that matters.
pca_fit <- pcr(Salary ~ ., data = Hitters, scale = TRUE, validation = "CV")
validationplot(pca_fit, val.type = "MSEP")

Salary 200000 MSEP 160000 120000 0 5 10 15

number of components

# Refit the principal component regression with a fixed number of components
# and print the predicted Salary for the rows selected by `valid`.
comps <- 7
pca_fit2 <- pcr(Salary ~ ., data = Hitters, scale = TRUE, ncomp = comps)
# NOTE(review): pca_fit2 is fitted on all of Hitters, so the rows selected by
# `valid` were also used for fitting -- confirm whether a fit on the training
# rows only was intended.
predict(pca_fit2, x[valid, ], ncomp = comps)

## , , 7 comps ## ## Salary ## -Alan Ashby 568.794000 ## - 921.607658 ## -Alfredo Griffin 560.319821 ## -Al Newman 135.537767 ## -Andres Thomas 82.564364 ## -Andre Thornton 888.504284 ## -Alan Trammell 891.881067 ## -Alex Trevino 270.342442

18 ## -Andy VanSlyke 534.935783 ## -Alan Wiggins 297.043430 ## -Bill Almon 445.349098 ## -Buddy Bell 1044.888994 ## -Bruce Bochy 108.401983 ## -Barry Bonds 501.121402 ## -Bobby Bonilla 347.351204 ## -Bob Brenly 614.895095 ## -Bill Buckner 1318.813149 ## - 786.922803 ## -Bob Dernier 405.368033 ## -Bo Diaz 603.600817 ## -Brian Downing 955.025211 ## - 248.564608 ## -Brook Jacoby 587.187142 ## -Bob Kearney 173.707270 ## -Bill Madlock 674.280370 ## -Bill Schroeder 236.924236 ## -Chris Bando 328.667422 ## -Chris Brown 274.041687 ## -Carmen Castillo 201.672766 ## - 671.485461 ## -Carlton Fisk 827.018056 ## -Curt Ford 221.509562 ## -Carney Lansford 642.309394 ## -Candy Maldonado 300.375444 ## -Craig Reynolds 339.608887 ## -Cory Snyder 339.513379 ## -Chris Speier 650.492268 ## -Dave Anderson 111.207730 ## - 1165.976361 ## -Darnell Coles 460.729629 ## -Dave Concepcion 730.387819 ## -Doug DeCinces 779.401664 ## -Darrell Evans 1499.914528 ## - 1265.721789 ## -Damaso Garcia 544.883432 ## - 294.762999 ## -Dave Henderson 380.650713 ## -Donnie Hill 151.243102 ## -Davey Lopes 825.352205 ## - 1141.996576 ## -Dale Murphy 997.172103 ## -Dwayne Murphy 593.626328 ## -Dave Parker 1070.563039 ## - 332.499806 ## -Darrell Porter 554.745906 ## -Don Slaught 296.552986 ## - 729.986798 ## -Dale Sveum 206.487292 ## -Danny Tartabull 402.345719 ## -Denny Walling 413.461393 ## - 1271.752034 ## - 484.442074

19 ## -Eddie Milner 468.205314 ## -Eddie Murray 1275.053484 ## -Ed Romero 261.142908 ## -Frank White 782.236358 ## -George Bell 737.025389 ## - 117.200307 ## -George Brett 924.551591 ## -Greg Brock 446.823388 ## - 1272.420318 ## - 589.662819 ## -George Hendrick 704.195795 ## -Glenn Hubbard 545.890957 ## -Garth Iorg 338.101881 ## -Graig Nettles 966.323516 ## -Gary Redus 479.424512 ## -Greg Walker 394.823415 ## -Harold Baines 659.007717 ## - 506.964453 ## -Howard Johnson 304.467806 ## -Hal McRae 633.373918 ## -Harold Reynolds 196.145601 ## - 112.486654 ## - 194.138143 ## -Jesse Barfield 863.455212 ## -Juan Beniquez 646.960799 ## -John Cangelosi 346.123527 ## -Jose Canseco 500.827480 ## -Joe Carter 677.284507 ## -Jack Clark 843.718116 ## -Jody Davis 838.485209 ## -Jim Dwyer 345.094244 ## -Julio Franco 617.529359 ## -Johnny Grubb 560.626400 ## -Jack Howell -4.356316 ## -John Kruk 181.340612 ## -Jeffrey Leonard 410.519620 ## -John Moses 237.128907 ## -Jerry Mumphrey 651.852879 ## -Jim Presley 417.842209 ## -Johnny Ray 750.397269 ## - 63.216075 ## - 1278.472093 ## -Jerry Royster 374.127320 ## - 420.268658 ## -Juan Samuel 618.336690 ## -John Shelby 390.336213 ## -Joel Skinner 113.494435 ## -Jim Sundberg 797.334924 ## -Jose Uribe 333.829426 ## -Joel Youngblood 303.641961 ## -Kevin Bass 536.797341 ## -Kal Daniels 75.216187 ## -Kirk Gibson 748.236695 ## -Ken Griffey 913.580178

20 ## - 925.596247 ## -Ken Landreaux 409.993575 ## -Kevin McReynolds 626.324620 ## -Kevin Mitchell 341.037523 ## -Keith Moreland 778.639045 ## -Ken Phelps 394.890089 ## - 624.186492 ## -Kurt Stillwell 99.720363 ## -Leon Durham 1045.377519 ## -Len Dykstra 534.390025 ## -Lee Lacy 765.882999 ## -Len Matuszek 177.676646 ## -Lloyd Moseby 860.662789 ## -Lance Parrish 795.564692 ## -Lou Whitaker 925.473517 ## -Mike Aldrete 156.887424 ## -Marty Barrett 664.630034 ## -Mike Davis 457.994884 ## -Mike Diaz 239.663998 ## - 217.728199 ## -Mike Easler 722.900952 ## -Mel Hall 523.641696 ## -Mike Kingery -3.554142 ## -Mike Marshall 341.077496 ## - 144.867645 ## -Mike Schmidt 16.275738 ## -Mike Scioscia 541.954671 ## -Mickey Tettleton 196.339897 ## -Milt Thompson 315.670262 ## -Mitch Webster 626.359003 ## - 582.484580 ## -Ozzie Guillen 231.595839 ## -Ozzie Virgil 523.511878 ## -Phil Bradley 533.160556 ## -Phil Garner 579.224171 ## - 374.074055 ## -Paul Molitor 651.897799 ## -Pat Sheridan 286.925221 ## -Rafael Belliard 267.092022 ## - 421.035742 ## -Ron Cey 927.586802 ## -Rob Deer 570.551575 ## -Ron Hassey 552.251102 ## -Reggie Jackson 1279.791932 ## -Rick Leach 239.543591 ## - 536.246461 ## - 568.713220 ## -Rey Quinones 182.090795 ## -Rafael Ramirez 388.367584 ## -Ron Roenicke 450.279130 ## -Ryne Sandberg 776.509182 ## - 365.195250 ## -Rick Schu 219.053531 ## -Ruben Sierra 183.811983

21 ## -Roy Smalley 719.412752 ## -Robby Thompson 346.294420 ## -Rob Wilfong 235.783588 ## -Robin Yount 1066.807801 ## -Sid Bream 860.685642 ## -Steve Buechele 227.279970 ## -Shawon Dunston 472.828407 ## -Scott Fletcher 414.337731 ## -Steve Garvey 1246.619187 ## -Spike Owen 366.880844 ## -Steve Sax 679.205382 ## -Tony Bernazard 775.310161 ## -Tom Brookens 385.047590 ## - 615.639899 ## -Tony Fernandez 592.229205 ## -Tim Flannery 388.853507 ## -Tom Foley 282.269797 ## -Tony Gwynn 667.666916 ## -Terry Harper 191.995298 ## -Tommy Herr 773.264357 ## -Tim Hulett 214.792136 ## -Terry Kennedy 28.708414 ## -Tito Landrum 262.968330 ## - 168.855649 ## -Tony Pena 813.876987 ## -Terry Pendleton 468.545419 ## -Tony Phillips 422.698977 ## -Ted Simmons 739.703606 ## - 360.332390 ## -Tim Wallach 614.765539 ## -Vince Coleman 604.359995 ## -Von Hayes 1122.397998 ## -Vance Law 491.585292 ## - 494.309154 ## -Wade Boggs 860.019659 ## - 464.960407 ## -Willie McGee 634.980330 ## -Willie Randolph 990.884596 ## -Wayne Tolleson 328.863851 ## -Willie Upshaw 1082.018400 ## -Willie Wilson 687.288702

22