Model-Selection.R
mohammad
2021-04-28

library(ggplot2)

# model selection
# overfitting and underfitting

set.seed(100)
data_num <- 50
noise <- rnorm(data_num, 0, 500)
x <- sample(c(1:1000), data_num, replace = FALSE)
y <- (x - 520)^3 / 10000 + 4 * (x - 300)^2 / 10000 + noise

# fit a deliberately high-degree polynomial to illustrate overfitting
deg <- 20
ggplot(data = data.frame(x, y), aes(x, y)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ poly(x, deg))

[Figure: scatter plot of y against x with the degree-20 polynomial fit]

# hold out one fifth of the points as a validation set
rand <- sample(c(1:data_num), data_num / 5, replace = FALSE)
x_valid <- x[rand]
x_train <- x[-rand]
y_valid <- y[rand]
y_train <- y[-rand]

# mean squared error on the training and validation sets for degrees 1 to 20
train_error <- c(1:20)
valid_error <- c(1:20)
for (i in 1:length(train_error)) {
  model <- lm(y_train ~ poly(x_train, i), data = data.frame(y_train, x_train))
  pred <- predict(model, newdata = data.frame(x_train = x_valid))
  train_error[i] <- sum((model$residuals)^2) / length(y_train)
  valid_error[i] <- sum((y_valid - pred)^2) / length(y_valid)
}

plot(c(1:length(train_error)), train_error)

[Figure: train_error against the polynomial degree 1-20]

plot(c(1:length(train_error)), valid_error)

[Figure: valid_error against the polynomial degree 1-20]

# feature selection and extraction
# selection
# best subset selection
library(ISLR)

## Warning: package 'ISLR' was built under R version 4.0.5

library(leaps)

## Warning: package 'leaps' was built under R version 4.0.5

fix(Hitters)  # opens the data frame in an interactive editor
Hitters <- na.omit(Hitters)
fit <- regsubsets(Salary ~ ., Hitters)
summary(fit)

## Subset selection object
## Call: regsubsets.formula(Salary ~ ., Hitters)
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
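summary(fit) only lists which predictors enter the best subset of each size (up to the default of eight predictors); it does not say which size to keep. A minimal sketch of one common follow-up, comparing the candidate sizes with BIC and adjusted R^2; the nvmax argument and the names fit_full, fit_summary, best_bic are illustrative and not part of the original script:

# allow all 19 predictors instead of the default cap of 8
fit_full <- regsubsets(Salary ~ ., Hitters, nvmax = 19)
fit_summary <- summary(fit_full)

# one candidate model per subset size; pick a size with an error estimate
best_bic   <- which.min(fit_summary$bic)    # size with the lowest BIC
best_adjr2 <- which.max(fit_summary$adjr2)  # size with the highest adjusted R^2

plot(fit_summary$bic, type = "b", xlab = "number of predictors", ylab = "BIC")

# coefficients of the BIC-selected model
coef(fit_full, id = best_bic)

BIC penalizes model size more heavily than adjusted R^2, so it will typically point to a smaller subset.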
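The same idea applies to the polynomial fits above: the training error keeps shrinking as the degree grows, so the degree has to be chosen from the validation error instead. A minimal sketch, assuming x, y, x_train, y_train and valid_error from the loop above are still in the workspace; best_deg and best_model are illustrative names:

# degree with the smallest validation error
best_deg <- which.min(valid_error)
best_deg

# refit with the chosen degree and plot the resulting curve
best_model <- lm(y_train ~ poly(x_train, best_deg),
                 data = data.frame(y_train, x_train))
ggplot(data = data.frame(x, y), aes(x, y)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ poly(x, best_deg))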