####### Section 8.4
####### Section 8.4-4, pages 482-486
####### Example: Formaldehyde Concentrations
## CH2O concentration (response) was influenced by
##   the airtightness of the home (x) and
##   the absence (0) or presence (1) of UFFI (z)
##   -- urea formaldehyde foam insulation

data <- read.table("table842.dat", header = TRUE)
#   CH2O Airtightness UFFI
# 1 31.33           0    0

y <- data$CH2O
x <- data$Airtightness
z <- data$UFFI

## pre-analysis
pairs(data)          # scatterplot matrix
cor(data)            # correlation matrix
round(cor(data), 3)  # rounded correlation matrix

## separating data by UFFI=0 or 1
# UFFI = 0
y0 <- y[z == 0]
x0 <- x[z == 0]
fit0 <- lm(y0 ~ x0)
summary(fit0)

# UFFI = 1
y1 <- y[z == 1]
x1 <- x[z == 1]
fit1 <- lm(y1 ~ x1)
summary(fit1)

# reproduce Figure 8.4-1 on page 484
par(mfrow = c(1, 1))
# type = "n" sets up the axes from the full data; the two groups are then
# drawn separately so each gets its own symbol.
plot(x, y,
  xlab = "Airtightness", ylab = "CH2O",
  main = "Scatter Plot of CH2O against Airtightness", type = "n"
)
points(x0, y0, pch = 16)
points(x1, y1, pch = 15, col = "grey")
# coef() instead of fit$coef: no reliance on partial matching of `coefficients`
abline(coef(fit0)[1], coef(fit0)[2], lty = 1)  # separate fit, UFFI = 0
abline(coef(fit1)[1], coef(fit1)[2], lty = 2)  # separate fit, UFFI = 1
legend(0, 70,
  legend = c("UFFI=1", "UFFI=0"),
  pch = c(15, 16), col = c("grey", "black")
)

##########################################################
################## R tips ################################
## Source: R help on "points"
##-------- Showing all the extra & some char graphics symbols ---------

## Draw a grid showing plotting symbols 0..25 followed by the characters
## in `extras`, each labelled with its pch value.
##   extras   character symbols appended after the 26 numeric ones
##   cex      size of the plotted symbols
##   col, bg  outline and fill colour of the symbols
##   coltext  colour of the labels
##   cextext  size of the labels; labels are skipped unless cextext > 0
##   main     plot title
## Called for its plotting side effect.
pchShow <- function(extras = c("*", ".", "o", "O", "0", "+", "-", "|", "%", "#"),
                    cex = 3, ## good for both .Device=="postscript" and "x11"
                    col = "red3", bg = "gold", coltext = "brown",
                    cextext = 1.2,
                    main = paste("plot symbols : points (... pch = *, cex =",
                                 cex, ")")) {
  nex <- length(extras)
  np <- 26 + nex
  ipch <- 0:(np - 1)
  k <- floor(sqrt(np))  # number of symbols per grid column
  dd <- c(-1, 1) / 2    # half-cell margin around the grid
  rx <- dd + range(ix <- ipch %/% k)
  ry <- dd + range(iy <- 3 + (k - 1) - ipch %% k)
  pch <- as.list(ipch)  # list with integers & strings
  if (nex > 0) pch[26 + seq_len(nex)] <- as.list(extras)
  plot(rx, ry, type = "n", axes = FALSE, xlab = "", ylab = "", main = main)
  abline(v = ix, h = iy, col = "lightgray", lty = "dotted")
  for (i in seq_len(np)) {
    pc <- pch[[i]]
    ## 'col' symbols with a 'bg'-colored interior (where available) :
    points(ix[i], iy[i], pch = pc, col = col, bg = bg, cex = cex)
    if (cextext > 0) {
      text(ix[i] - 0.3, iy[i], pc, col = coltext, cex = cextext)
    }
  }
}

par(mfrow = c(1, 1))
pchShow()
################## end of R tips ################################

## fit linear regression model (a), assuming the coefficient of x is the
## same for z=0 or z=1
fita <- lm(y ~ x + z)
summary(fita)
anova(fita)

# diagnostic check: the first four plot.lm panels on one 2x2 page.
# The previous layout is saved and restored so later plots are not
# squeezed into a quarter of the device.
old_par <- par(mfrow = c(2, 2))
plot(fita, which = 1)
plot(fita, which = 2)
plot(fita, which = 3)
plot(fita, which = 4)
par(old_par)

## fit linear regression model (b), allowing the coefficient of x to
## differ between z=0 and z=1
xz <- x * z  # interaction of x and z
fitb <- lm(y ~ x + z + xz)
summary(fitb)
anova(fitb)
# conclusion: the coefficient of xz is not significant, no need to add xz

###### ACH example in "chapter84.sas"
## ACH6: Reading achievement at the end of sixth grade.
## ACH5: Reading achievement at the end of fifth grade.
## APT: A measure of verbal aptitude taken in the fifth grade.
## ATT: A measure of attitude toward school taken in fifth grade.
## INCOME: A measure of parental income (in thousands of dollars per year).
## The purpose of this study is to understand what underlies the reading
## achievement of the students in the district.
data <- read.table("ach.dat", header = TRUE)
#   ID ACH6 ACH5 APT ATT INCOME
# 1  1  7.5  6.6 104  60     67

ach6 <- y <- data$ACH6
ach5 <- data$ACH5
apt <- data$APT
att <- data$ATT
income <- data$INCOME

## pre-analysis
# pairwise scatter plot
pairs(data)          # scatterplot matrix
cor(data)            # correlation matrix
round(cor(data), 3)  # rounded correlation matrix

## model y = ach5 + apt + att + income
fit1 <- lm(y ~ ach5 + apt + att + income)
summary(fit1)
aov(fit1)

## recover SAS output
# Accessor functions (fitted, coef, residuals) are used instead of
# fit1$fitted / fit1$coef / fit1$residual, which only worked through
# partial matching of the component names.
yhat <- fitted(fit1)            # y^hat or predicted value
ybar <- mean(y)                 # y^bar
X <- model.matrix(fit1)         # model matrix [1 X1 X2 X3 X4], n*(p+1)
n <- nrow(model.frame(fit1))    # number of observations
p <- ncol(model.frame(fit1)) - 1  # number of explanatory variables

mse <- sum((y - yhat)^2) / (n - p - 1)  # MSE
sqrt(mse)                               # Root MSE
sqrt(mse) / ybar * 100                  # Coeff Var

# (X'X)^{-1} is needed for beta^hat, its variance matrix and the hat
# matrix, so it is computed once.
xtx_inv <- solve(t(X) %*% X)

bhat <- xtx_inv %*% t(X) %*% y  # beta^hat from the normal equations
coef(fit1)                      # beta^hat as reported by lm

sb2 <- mse * xtx_inv  # s^2(beta vector), estimated variance matrix of beta^hat
sqrt(diag(X %*% sb2 %*% t(X)))  # Std Error of Mean Predict

ei <- residuals(fit1)           # Residuals
H <- X %*% xtx_inv %*% t(X)     # hat matrix, yhat = H y
hii <- diag(H)                  # known as leverage
ser <- sqrt(mse * (1 - hii))    # Std Error Residual
ei / ser                        # Student Residual (Studentized Residual)
(ei^2 / ((p + 1) * mse)) * (hii / (1 - hii)^2)  # Cook's D (Cook's Distance)

sum(ei)                   # Sum of Residuals
sum(ei^2)                 # Sum of Squared Residuals
sum((ei / (1 - hii))^2)   # Predicted Residual SS (PRESS), sum of (Yi - Yi(i))^2