###model selection example set.seed(1) n = 100 x1 = rnorm(n) x2 = rnorm(n) x3 = rnorm(n) x4 = rnorm(n) x5 = rnorm(n) y = x1 + 2*x3 + 3*x5 + rnorm(n) dat = data.frame(cbind(x1,x2,x3,x4,x5,y)) full = lm(y~x1+x2+x3+x4+x5,data=dat) null = lm(y~1, data=dat) step(null, scope=list(upper=full, lower=null), direction='both', trace=TRUE) step(null, scope=list(upper=full, lower=null), direction='forward', trace=TRUE) step(full, scope=list(upper=full, lower=null), direction='backward', trace=TRUE) add1 (null, ~x2+x3+x4+x5) drop1 (full) add1 (null, ~x2+x3+x4+x5,test='F') drop1 (full,test='F') ######### best subset using package "bestglm" ############# set.seed(1) n = 100 x1 = rnorm(n) x2 = rnorm(n) x3 = rnorm(n) x4 = rnorm(n) x5 = rnorm(n) y = x1 + 2*x3 + 3*x5 + rnorm(n) Xy = data.frame(cbind(x1,x2,x3,x4,x5,y)) p = 5 names(Xy)<-c(paste("X",1:p,sep=""),"y") fit = bestglm(Xy,IC='LOOCV') fit$Subsets fit = bestglm(Xy,IC='AIC') fit$Subsets fit = bestglm(Xy,IC='BIC') fit$Subsets #####best subset using package "leaps" ############# install.packages('leaps') library(leaps) leaps<-regsubsets(y~x1+x2+x3+x4+x5,data=dat,nbest=10) #nbest means the max number of optimal model for each size # view results summary(leaps) # plot a table of models showing variables in each model. # models are ordered by the selection statistic. plot(leaps,scale="r2") plot(leaps,scale="Cp") plot(leaps,scale="adjr2") plot(leaps,scale="bic") leaps<-regsubsets(y~x1+x2+x3+x4+x5,data=dat,nbest=1) # view results summary(leaps) # plot a table of models showing variables in each model. # models are ordered by the selection statistic. plot(leaps,scale="r2") ###best subset by hand ### set.seed(1) n = 100 x1 = rnorm(n) x2 = rnorm(n) x3 = rnorm(n) x4 = rnorm(n) x5 = rnorm(n) y = x1 + 2*x3 + 3*x5 + rnorm(n) X = cbind(x1,x2,x3,x4,x5) bestsubset <- function(X, y){ P = ncol(X) subsets = expand.grid( rep( list( 0:1), P)) names(subsets)=paste('X',1:P,sep='') stat = NULL SST = sum((y-mean(y))^2) fitall = lm(y~X) n = nrow(X) MSEP = deviance(fitall)/(n-P-1) for(i in 1:nrow(subsets)){ subs = which(subsets[i,]>0) if(length(subs)==0) fit = lm(y~1) else { subX = X[,which(subsets[i,]>0)] fit = lm(y~subX) } p = length(subs)+1 SSE = deviance(fit) R2 = 1-SSE/SST R2a = 1-SSE/SST*(n-1)/(n-p) Cp = SSE/MSEP - (n-2*p) AIC = n*log(SSE)-n*log(n)+2*p BIC = n*log(SSE)-n*log(n)+log(n)*p X1 = as.matrix(cbind(1,X[,subs])) hatMat = X1%*%solve(t(X1)%*%X1)%*%t(X1) eList = fit$residuals dList = eList/(1-diag(hatMat)) PRESS = sum(dList^2) criList = c(length(subs)+1, subsets[i,], SSE, R2, R2a, Cp, AIC, BIC, PRESS) stat=rbind(stat,criList) } rownames(stat)=NULL colnames(stat)=c('p',names(subsets),'SSE','R2','R2a','Cp','AIC','BIC','PRESS') model = NULL model$R2 = which.max(stat[,P+3]) model$R2a = which.max(stat[,P+4]) model$Cp = which.min(stat[,P+5]) model$AIC = which.min(stat[,P+6]) model$BIC = which.min(stat[,P+7]) model$PRESS = which.min(stat[,P+8]) list(model=model, stat=stat) } bestsubset(X,y) ###best subset for surgical example alldat = read.table('surgical.txt') dat0 = alldat[1:54,c(1:4, 9)] dat = alldat[1:54,c(1:4, 10)] X=alldat[1:54,1:4] names(dat0) = c('X1','X2','X3','X4','Y') names(dat) = c('X1','X2','X3','X4','logY') fit0 = lm(Y~.,data=dat0) fit = lm(logY~.,data=dat) par(mfrow=c(2,2)) plot(fit0$fitted, fit0$residuals) qqnorm(fit0$residuals) plot(fit$fitted, fit$residuals) qqnorm(fit$residuals) cor(dat) pairs(dat) logY = dat$logY SST = sum((logY-mean(logY))^2) stat=NULL n=54 MSEP = deviance(fit)/(n-5) varnameList=NULL for(i1 in 0:1) for(i2 in 0:1) for(i3 in 0:1) for(i4 in 0:1) { subs=(1:4)[c(i1==1,i2==1,i3==1,i4==1)] p = length(subs) + 1 if(i1+i2+i3+i4==0) fit1 = lm(logY~1) else{ datTmp = dat[,c(subs,5)] fit1 = lm(logY~., data=datTmp) } SSE = deviance(fit1) R2 = 1-SSE/SST R2a = 1-SSE/SST*(n-1)/(n-p) Cp = SSE/MSEP - (n-2*p) AIC = n*log(SSE)-n*log(n)+2*p BIC = n*log(SSE)-n*log(n)+log(n)*p X1 = as.matrix(cbind(1,X[,subs])) hatMat = X1%*%solve(t(X1)%*%X1)%*%t(X1) eList = fit1$residuals dList = eList/(1-diag(hatMat)) PRESS = sum(dList^2) varList = c("X1","X2","X3","X4")[c(i1==1,i2==1,i3==1,i4==1)] varname = NULL for(j in 1:length(varList)) varname=paste(varname,varList[j],sep=',') varname = substr(varname,2,100) varnameList = c(varnameList,varname) criList = c(p, SSE, R2, R2a, Cp, AIC, BIC, PRESS) stat=rbind(stat,criList) } varnameList[1]='None' stat=round(stat,3) rownames(stat)=varnameList colnames(stat)=c('p','SSE','R2','R2a','Cp','AIC','BIC','PRESS')