Model-Selection.R

Model-Selection.R

model-selection.R mohammad 2021-04-28 library(ggplot2) # model selection # overfitting and underfitting set.seed(100) data_num= 50 noise= rnorm(data_num,0, 500) x <- sample(c(1:1000), data_num, replace = FALSE) y <- (x-520)^3/10000+4*(x-300)^2/10000+ noise deg= 20 ggplot(data = data.frame(x,y),aes(x,y))+ geom_point()+ geom_smooth(method="lm", formula=y~poly(x,deg)) 15000 10000 5000 y 0 −5000 −10000 0 250 500 750 1000 x 1 rand= sample(c(1:data_num), data_num/5, replace = FALSE) x_valid <- x[rand] x_train <- x[-rand] y_valid <- y[rand] y_train <- y[-rand] train_error=c(1:20) valid_error=c(1:20) for (i in 1:length(train_error)) { model <- lm(data = data.frame(y_train, x_train), y_train~ poly(x_train,i)) pred <- predict(model,newdata = data.frame(x_train = x_valid)) train_error[i]= sum((model$residuals)^2)/data_num valid_error[i] <- sum((y_valid-pred)^2)/data_num } plot(c(1:length(train_error)), train_error) 1500000 train_error 500000 0 5 10 15 20 c(1:length(train_error)) plot(c(1:length(train_error)), valid_error) 2 2.0e+07 valid_error 1.0e+07 0.0e+00 5 10 15 20 c(1:length(train_error)) # feature selection and extraction # selection # best set library(ISLR) ## Warning: package 'ISLR' was built under R version 4.0.5 library(leaps) ## Warning: package 'leaps' was built under R version 4.0.5 fix(Hitters) Hitters= na.omit(Hitters) fit= regsubsets(Salary~ ., Hitters) summary(fit) ## Subset selection object ## Call: regsubsets.formula(Salary ~ ., Hitters) ## 19 Variables (and intercept) ## Forced in Forced out ## AtBat FALSE FALSE ## Hits FALSE FALSE ## HmRun FALSE FALSE ## Runs FALSE FALSE ## RBI FALSE FALSE ## Walks FALSE FALSE ## Years FALSE FALSE ## CAtBat FALSE FALSE 3 ## CHits FALSE FALSE ## CHmRun FALSE FALSE ## CRuns FALSE FALSE ## CRBI FALSE FALSE ## CWalks FALSE FALSE ## LeagueN FALSE FALSE ## DivisionW FALSE FALSE ## PutOuts FALSE FALSE ## Assists FALSE FALSE ## Errors FALSE FALSE ## NewLeagueN FALSE FALSE ## 1 subsets of each size up to 8 ## Selection Algorithm: 
exhaustive ## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI ## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*" ## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*" ## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*" ## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " " " " ## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*" " " ## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN ## 1 ( 1 ) " " " " " " " " " " " " " " ## 2 ( 1 ) " " " " " " " " " " " " " " ## 3 ( 1 ) " " " " " " "*" " " " " " " ## 4 ( 1 ) " " " " "*" "*" " " " " " " ## 5 ( 1 ) " " " " "*" "*" " " " " " " ## 6 ( 1 ) " " " " "*" "*" " " " " " " ## 7 ( 1 ) " " " " "*" "*" " " " " " " ## 8 ( 1 ) "*" " " "*" "*" " " " " " " fit2= regsubsets(Salary~ ., Hitters, nvmax = 15) summ= summary(fit2) summ ## Subset selection object ## Call: regsubsets.formula(Salary ~ ., Hitters, nvmax = 15) ## 19 Variables (and intercept) ## Forced in Forced out ## AtBat FALSE FALSE ## Hits FALSE FALSE ## HmRun FALSE FALSE ## Runs FALSE FALSE ## RBI FALSE FALSE ## Walks FALSE FALSE ## Years FALSE FALSE ## CAtBat FALSE FALSE ## CHits FALSE FALSE ## CHmRun FALSE FALSE ## CRuns FALSE FALSE ## CRBI FALSE FALSE ## CWalks FALSE FALSE ## LeagueN FALSE FALSE ## DivisionW FALSE FALSE 4 ## PutOuts FALSE FALSE ## Assists FALSE FALSE ## Errors FALSE FALSE ## NewLeagueN FALSE FALSE ## 1 subsets of each size up to 15 ## Selection Algorithm: exhaustive ## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI ## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*" ## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*" 
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*" ## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " " " " ## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*" " " ## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" ## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" ## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" ## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*" ## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*" ## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*" ## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*" "*" ## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN ## 1 ( 1 ) " " " " " " " " " " " " " " ## 2 ( 1 ) " " " " " " " " " " " " " " ## 3 ( 1 ) " " " " " " "*" " " " " " " ## 4 ( 1 ) " " " " "*" "*" " " " " " " ## 5 ( 1 ) " " " " "*" "*" " " " " " " ## 6 ( 1 ) " " " " "*" "*" " " " " " " ## 7 ( 1 ) " " " " "*" "*" " " " " " " ## 8 ( 1 ) "*" " " "*" "*" " " " " " " ## 9 ( 1 ) "*" " " "*" "*" " " " " " " ## 10 ( 1 ) "*" " " "*" "*" "*" " " " " ## 11 ( 1 ) "*" "*" "*" "*" "*" " " " " ## 12 ( 1 ) "*" "*" "*" "*" "*" " " " " ## 13 ( 1 ) "*" "*" "*" "*" "*" "*" " " ## 14 ( 1 ) "*" "*" "*" "*" "*" "*" " " ## 15 ( 1 ) "*" "*" "*" "*" "*" "*" " " names(summ) ## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj" summ$which ## (Intercept) AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN plot(summ$rsq, type = "l") 0.55 0.50 0.45 summ$rsq 0.40 0.35 2 4 6 8 10 12 14 Index plot(summ$adjr2, type = "l") 6 0.50 0.45 0.40 summ$adjr2 0.35 2 4 6 8 10 12 14 Index plot(summ$rss, type = "l") 7 3.6e+07 3.2e+07 summ$rss 2.8e+07 2.4e+07 2 4 6 8 10 12 14 Index plot(summ$bic, type = "l") 8 −90 −110 summ$bic −130 −150 2 4 6 8 10 12 14 Index plot(summ$cp, type = "l") 9 100 80 60 summ$cp 40 20 2 4 6 8 10 12 14 Index coef(fit2,8) ## (Intercept) AtBat Hits Walks CHmRun CRuns ## 130.9691577 -2.1731903 7.3582935 6.0037597 
1.2339718 0.9651349 ## CWalks DivisionW PutOuts ## -0.8323788 -117.9657795 0.2908431 # forward and backward selection forward= regsubsets(Salary~., Hitters, nvmax = 15, method = "forward") summary(forward) ## Subset selection object ## Call: regsubsets.formula(Salary ~ ., Hitters, nvmax = 15, method = "forward") ## 19 Variables (and intercept) ## Forced in Forced out ## AtBat FALSE FALSE ## Hits FALSE FALSE ## HmRun FALSE FALSE ## Runs FALSE FALSE ## RBI FALSE FALSE ## Walks FALSE FALSE ## Years FALSE FALSE ## CAtBat FALSE FALSE ## CHits FALSE FALSE ## CHmRun FALSE FALSE ## CRuns FALSE FALSE 10 ## CRBI FALSE FALSE ## CWalks FALSE FALSE ## LeagueN FALSE FALSE ## DivisionW FALSE FALSE ## PutOuts FALSE FALSE ## Assists FALSE FALSE ## Errors FALSE FALSE ## NewLeagueN FALSE FALSE ## 1 subsets of each size up to 15 ## Selection Algorithm: forward ## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI ## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*" ## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*" ## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*" ## 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*" ## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" "*" ## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" ## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" ## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" ## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*" ## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*" ## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*" ## 15 ( 1 ) "*"

View Full Text

Details

  • File Type
    pdf
  • Upload Time
    -
  • Content Languages
    English
  • Upload User
    Anonymous/Not logged-in
  • File Pages
    22 Pages
  • File Size
    -

Download

Channel Download Status
Express Download Enable

Copyright

We respect the copyrights and intellectual property rights of all users. All uploaded documents are either original works of the uploader or authorized works of the rightful owners.

  • Documents must not be reproduced or distributed without explicit permission.
  • Documents must not be used for commercial purposes outside of approved use cases.
  • Documents must not be used to infringe on the rights of the original creators.
  • If you believe any content infringes your copyright, please contact us immediately.

Support

For help with questions, suggestions, or problems, please contact us