bp = read.table('blood-pressure.txt') X = bp[,1] Y = bp[,2] ###step1: do a regular OLS lmfit = lm(Y~X) plot(X, Y) abline(lmfit) resi = residuals(lmfit) par(mfrow=c(1,2)) plot(X,resi) abline(0,0) plot(X,abs(resi)) ###step2: regress absolute residuals on X vfit = lm(abs(resi)~X) abline(vfit) ###step3: use the fitted values to get the weights wlist = vfit$fitted^(-2) ###step4: use weighted least squares wlfit = lm(Y~X, weights=wlist) ###compare the two fit... summary(lmfit) summary(wlfit) ###########Ridge Regression dat = read.table('fat.txt') dat = scale(dat) X1 = dat[,1] X2 = dat[,2] X3 = dat[,3] Y = dat[,4] library(MASS) X = cbind(dat[,1:3]) solve(t(X)%*%X+20*diag(rep(1,3)))%*%t(X)%*%Y lamlist = seq(from=0, to=1, by=0.01) lamLen = length(lamlist) fit = lm.ridge(Y~X1+X2+X3-1, lambda=lamlist) coefMat = matrix(0, lamLen, 3) VIF = matrix(0, lamLen, 3) for(i in 1:length(lamlist)) { coefMat[i,] = solve(t(X)%*%X+lamlist[i]*diag(rep(1,3)))%*%t(X)%*%Y tmp = solve(t(X)%*%X+lamlist[i]*diag(rep(1,3))) VIF[i,] = diag(tmp%*%t(X)%*%X%*%tmp) } fit$coef fit$kHKB fit$kLW par(mfrow=c(2,1)) matplot(lamlist,t(fit$coef),type='l',ylab='beta',xlab='lambda') matplot(lamlist,coefMat,type='l',ylab='beta',xlab='lambda') matplot(lamlist, VIF,,type='l',ylab='VIF',xlab='lambda') ###Least Absolute Deviation Regression install.packages('quantreg') library(quantreg) n = 100 x = rnorm(n) y = 1+x+rnorm(n) y[1] = 100 lsfit = lm(y~x) ladfit = rq(y~x) summary(lsfit) summary(ladfit) ###IRLS (Huber weights) math = read.table('math.txt',header=F) y = math[,2] x = math[,4] lsfit = lm(y~x) plot(x,y) abline(lsfit) lines(lowess(x,y),col=2) ### quadratic fit seems to be reasonable x1 = x-mean(x) x2 = x1^2 qfit = lm(y~x1+x2) plot(x,y) sx = sort(x) lines(sx,qfit$fitted[order(x)]) ### robust quadratic fit mat = NULL for(rep in 1:8){ elist = qfit$residuals MAD = median(abs(elist-median(elist)))/.6745 ulist = elist/MAD wlist = 1.345/pmax(abs(ulist),1.345) qfit = lm(y~x1+x2,weight=wlist) mat=cbind(mat,elist,wlist) } qfit ### lowess implementation insurance = read.table('insurance.txt',header=F) q = 0.5 xh1 = 30 xh2 = 3 x1 = insurance[,1] x2 = insurance[,2] sd1 = sd(x1) sd2 = sd(x2) y = insurance[,3] dlist = (((x1-xh1)/sd1)^2+((x2-xh2)/sd2)^2)^0.5 dq = sort(dlist)[9] wlist = (1-(pmin(dlist,dq)/dq)^3)^3 wlfit = lm(y~x1+x2,weight=wlist) wlfit -134.076 + 3.571 *30 + 10.532 *3 insurance = read.table('insurance.txt',header=F) names(insurance)=c('x1','x2','y') sdlist = apply(insurance,2,sd) insurance[,1]=insurance[,1]/sdlist[1] insurance[,2]=insurance[,2]/sdlist[2] fit = loess(y~x1+x2,data=insurance,span=0.6,degree=1, normalize=FALSE) predict(fit,data.frame(x1=30/sdlist[1],x2=3/sdlist[2])) ### regression trees install.packages('tree') library(tree) data = read.table('steroid.txt',header=F) names(data)<-c('steroid','age') fit = tree(steroid~age, data=data, minsize=5) plot(fit) text(fit) ?tree.control gpa = read.table('gpa.txt',header=F) names(gpa) = c('index','gpa','HSRank','ACT','year') fit = tree(gpa~ ACT + HSRank,data=gpa, minsize=5) plot(fit) text(fit) ### bootstrap set.seed(1) n = 100 x = rnorm(n) y = 1+2*x + rnorm(n) fit = lm(y~x) summary(fit) resi = fit$resi yhat = fit$fitted bvec = NULL ###fixed X sampling for(k in 1:500){ newy = yhat + sample(resi,replace=T) newfit = lm(newy~x) bvec[k] = newfit$coef[2] } sd(bvec) alpha=0.05 2*fit$coef[2]-quantile(bvec,1-alpha/2) 2*fit$coef[2]-quantile(bvec,alpha/2) ###random X sampling for(k in 1:500){ ind = sample(1:length(y),replace=T) newfit = lm(y[ind]~x[ind]) bvec[k] = newfit$coef[2] } sd(bvec) ###confidence intervals confint(fit) alpha=0.05 2*fit$coef[2]-quantile(bvec,1-alpha/2) 2*fit$coef[2]-quantile(bvec,alpha/2)