Model-Selection.R
Total Page:16
File Type:pdf, Size:1020Kb
model-selection.R mohammad 2021-04-28 library(ggplot2) # model selection # overfitting and underfitting set.seed(100) data_num= 50 noise= rnorm(data_num,0, 500) x <- sample(c(1:1000), data_num, replace = FALSE) y <- (x-520)^3/10000+4*(x-300)^2/10000+ noise deg= 20 ggplot(data = data.frame(x,y),aes(x,y))+ geom_point()+ geom_smooth(method="lm", formula=y~poly(x,deg)) 15000 10000 5000 y 0 −5000 −10000 0 250 500 750 1000 x 1 rand= sample(c(1:data_num), data_num/5, replace = FALSE) x_valid <- x[rand] x_train <- x[-rand] y_valid <- y[rand] y_train <- y[-rand] train_error=c(1:20) valid_error=c(1:20) for (i in 1:length(train_error)) { model <- lm(data = data.frame(y_train, x_train), y_train~ poly(x_train,i)) pred <- predict(model,newdata = data.frame(x_train = x_valid)) train_error[i]= sum((model$residuals)^2)/data_num valid_error[i] <- sum((y_valid-pred)^2)/data_num } plot(c(1:length(train_error)), train_error) 1500000 train_error 500000 0 5 10 15 20 c(1:length(train_error)) plot(c(1:length(train_error)), valid_error) 2 2.0e+07 valid_error 1.0e+07 0.0e+00 5 10 15 20 c(1:length(train_error)) # feature selection and extraction # selection # best set library(ISLR) ## Warning: package 'ISLR' was built under R version 4.0.5 library(leaps) ## Warning: package 'leaps' was built under R version 4.0.5 fix(Hitters) Hitters= na.omit(Hitters) fit= regsubsets(Salary~ ., Hitters) summary(fit) ## Subset selection object ## Call: regsubsets.formula(Salary ~ ., Hitters) ## 19 Variables (and intercept) ## Forced in Forced out ## AtBat FALSE FALSE ## Hits FALSE FALSE ## HmRun FALSE FALSE ## Runs FALSE FALSE ## RBI FALSE FALSE ## Walks FALSE FALSE ## Years FALSE FALSE ## CAtBat FALSE FALSE 3 ## CHits FALSE FALSE ## CHmRun FALSE FALSE ## CRuns FALSE FALSE ## CRBI FALSE FALSE ## CWalks FALSE FALSE ## LeagueN FALSE FALSE ## DivisionW FALSE FALSE ## PutOuts FALSE FALSE ## Assists FALSE FALSE ## Errors FALSE FALSE ## NewLeagueN FALSE FALSE ## 1 subsets of each size up to 8 ## Selection Algorithm: 
exhaustive ## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI ## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*" ## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*" ## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*" ## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " " " " ## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*" " " ## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN ## 1 ( 1 ) " " " " " " " " " " " " " " ## 2 ( 1 ) " " " " " " " " " " " " " " ## 3 ( 1 ) " " " " " " "*" " " " " " " ## 4 ( 1 ) " " " " "*" "*" " " " " " " ## 5 ( 1 ) " " " " "*" "*" " " " " " " ## 6 ( 1 ) " " " " "*" "*" " " " " " " ## 7 ( 1 ) " " " " "*" "*" " " " " " " ## 8 ( 1 ) "*" " " "*" "*" " " " " " " fit2= regsubsets(Salary~ ., Hitters, nvmax = 15) summ= summary(fit2) summ ## Subset selection object ## Call: regsubsets.formula(Salary ~ ., Hitters, nvmax = 15) ## 19 Variables (and intercept) ## Forced in Forced out ## AtBat FALSE FALSE ## Hits FALSE FALSE ## HmRun FALSE FALSE ## Runs FALSE FALSE ## RBI FALSE FALSE ## Walks FALSE FALSE ## Years FALSE FALSE ## CAtBat FALSE FALSE ## CHits FALSE FALSE ## CHmRun FALSE FALSE ## CRuns FALSE FALSE ## CRBI FALSE FALSE ## CWalks FALSE FALSE ## LeagueN FALSE FALSE ## DivisionW FALSE FALSE 4 ## PutOuts FALSE FALSE ## Assists FALSE FALSE ## Errors FALSE FALSE ## NewLeagueN FALSE FALSE ## 1 subsets of each size up to 15 ## Selection Algorithm: exhaustive ## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI ## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*" ## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*" 
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*" ## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " " " " ## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*" " " ## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" ## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" ## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" ## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*" ## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*" ## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*" ## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*" "*" ## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN ## 1 ( 1 ) " " " " " " " " " " " " " " ## 2 ( 1 ) " " " " " " " " " " " " " " ## 3 ( 1 ) " " " " " " "*" " " " " " " ## 4 ( 1 ) " " " " "*" "*" " " " " " " ## 5 ( 1 ) " " " " "*" "*" " " " " " " ## 6 ( 1 ) " " " " "*" "*" " " " " " " ## 7 ( 1 ) " " " " "*" "*" " " " " " " ## 8 ( 1 ) "*" " " "*" "*" " " " " " " ## 9 ( 1 ) "*" " " "*" "*" " " " " " " ## 10 ( 1 ) "*" " " "*" "*" "*" " " " " ## 11 ( 1 ) "*" "*" "*" "*" "*" " " " " ## 12 ( 1 ) "*" "*" "*" "*" "*" " " " " ## 13 ( 1 ) "*" "*" "*" "*" "*" "*" " " ## 14 ( 1 ) "*" "*" "*" "*" "*" "*" " " ## 15 ( 1 ) "*" "*" "*" "*" "*" "*" " " names(summ) ## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj" summ$which ## (Intercept) AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun ## 1 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE ## 2 TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE ## 3 TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE ## 4 TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE ## 5 TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE ## 6 TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE ## 7 TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE ## 8 TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE ## 9 TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE 
TRUE FALSE FALSE ## 10 TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE 5 ## 11 TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE ## 12 TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE ## 13 TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE ## 14 TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE FALSE ## 15 TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE FALSE ## CRuns CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN ## 1 FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE ## 2 FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE ## 3 FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE ## 4 FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE ## 5 FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE ## 6 FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE ## 7 FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE ## 8 TRUE FALSE TRUE FALSE TRUE TRUE FALSE FALSE FALSE ## 9 TRUE TRUE TRUE FALSE TRUE TRUE FALSE FALSE FALSE ## 10 TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE ## 11 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE ## 12 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE ## 13 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE ## 14 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE ## 15 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE plot(summ$rsq, type = "l") 0.55 0.50 0.45 summ$rsq 0.40 0.35 2 4 6 8 10 12 14 Index plot(summ$adjr2, type = "l") 6 0.50 0.45 0.40 summ$adjr2 0.35 2 4 6 8 10 12 14 Index plot(summ$rss, type = "l") 7 3.6e+07 3.2e+07 summ$rss 2.8e+07 2.4e+07 2 4 6 8 10 12 14 Index plot(summ$bic, type = "l") 8 −90 −110 summ$bic −130 −150 2 4 6 8 10 12 14 Index plot(summ$cp, type = "l") 9 100 80 60 summ$cp 40 20 2 4 6 8 10 12 14 Index coef(fit2,8) ## (Intercept) AtBat Hits Walks CHmRun CRuns ## 130.9691577 -2.1731903 7.3582935 6.0037597 1.2339718 0.9651349 ## CWalks DivisionW PutOuts ## -0.8323788 -117.9657795 0.2908431 # forward and backward selection forward= regsubsets(Salary~., Hitters, nvmax = 15, 
method = "forward") summary(forward) ## Subset selection object ## Call: regsubsets.formula(Salary ~ ., Hitters, nvmax = 15, method = "forward") ## 19 Variables (and intercept) ## Forced in Forced out ## AtBat FALSE FALSE ## Hits FALSE FALSE ## HmRun FALSE FALSE ## Runs FALSE FALSE ## RBI FALSE FALSE ## Walks FALSE FALSE ## Years FALSE FALSE ## CAtBat FALSE FALSE ## CHits FALSE FALSE ## CHmRun FALSE FALSE ## CRuns FALSE FALSE 10 ## CRBI FALSE FALSE ## CWalks FALSE FALSE ## LeagueN FALSE FALSE ## DivisionW FALSE FALSE ## PutOuts FALSE FALSE ## Assists FALSE FALSE ## Errors FALSE FALSE ## NewLeagueN FALSE FALSE ## 1 subsets of each size up to 15 ## Selection Algorithm: forward ## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI ## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*" ## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" ## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*" ## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*" ## 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*" ## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" "*" ## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" ## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" ## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" ## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*" ## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*" ## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*" ## 15 ( 1 ) "*"