########R Example for Toluca dataset ####Chapter 1 toluca = read.table('toluca.txt',header=F) n = 25 ###Change the column names names(toluca)<-c("Size", "Hours") ###Scatter Plot plot(toluca,xlim=c(0,150),ylim=c(0,600)) ###Doing linear regression using R function lm() fit = lm(Hours~Size, data=toluca) ###If calculate by hand x = toluca[,1] y = toluca[,2] xmean = mean(x) ymean = mean(y) b1 = sum((x-xmean)*(y-ymean))/sum((x-xmean)^2) b0 = ymean - b1 * xmean ####Residuals resi = fit$residuals ####Or by definition y-yhat yfit = x*b1+b0 resi2 =y-yfit ####Verify the property of residuals sum(resi) sum(x*resi) sum(yfit*resi) ###SSE, MSE, SSE = sum(resi^2) MSE = SSE/(25-2) ###Estimate of sigma s = sqrt(MSE) ###Chapter 2 ###Inference for beta1 sb1sq = MSE/sum((x-xmean)^2) sb1 = sqrt(sb1sq) ###95% confidence interval qt975 = qt(0.975, df=23) b1 - qt975 * sb1 b1 + qt975 * sb1 ###direct function for CI confint(fit) confint(fit,level=0.99) ###two side test for beta1 tstar = b1/sb1 ##Threshold hold for tstar = qt975 ##here tstar > qt975, Reject H_0, accept H_a ###p value 2*(1-pt(tstar, df=23)) ####Inference for beta0 sb0sq = MSE*(1/n+xmean^2/sum((x-xmean)^2)) sb0 =sqrt(sb0sq) ###90% confidence interval qt95 = qt(0.95, df=23) b0 - qt95 * sb0 b0 + qt95 * sb0 #####Inference for E{Y_h} # Let's find a 90% confidence interval of E{Y_h} when X_h=100 units. Xh = 100 Yh = b0 + b1*Xh sYhsq = MSE*(1/n+(Xh-xmean)^2/sum((x-xmean)^2)) sYh = sqrt(sYhsq) Yh - sYh * qt95 Yh + sYh * qt95 ###direct function newx = data.frame(Size=100) predict.lm(fit, newx, interval="confidence",level=.9) ###Prediction for a new observation ###Suppose new observation consists of X_h=100 units, we want a 90% confidence interval sYpredsq = sYhsq +MSE sYpred = sqrt(sYpredsq) Yh - sYpred * qt95 Yh + sYpred * qt95 ###direct function newx = data.frame(Size=100) predict.lm(fit, newx, interval="prediction",level=.9) ####Chapter 3