####### Section 8.4

####### Section 8.4-4, pages 482-486
####### Example: Formaldehyde Concentrations
## CH2O concentration (response) is influenced by
##      the airtightness of the home (x) and
##      the absence (0) or presence (1) of UFFI (z) -- urea formaldehyde foam insulation
# Read the formaldehyde data; the first line of the file is taken as column names.
# The argument is spelled out as `header = TRUE`: `head` only worked through
# partial argument matching and `T` is an ordinary variable that can be reassigned.
data <- read.table("table842.dat", header = TRUE)
#    CH2O Airtightness UFFI
#1  31.33            0    0
y <- data$CH2O          # response: CH2O concentration
x <- data$Airtightness  # covariate: airtightness of the home
z <- data$UFFI          # indicator: 0 = UFFI absent, 1 = UFFI present

## pre-analysis
pairs(data)          # scatterplot matrix
cor(data)            # correlation matrix
round(cor(data), 3)  # correlation matrix rounded to 3 decimals

## Fit a separate simple linear regression within each UFFI group
# group with UFFI = 0
no_uffi <- z == 0
y0 <- y[no_uffi]
x0 <- x[no_uffi]
fit0 <- lm(y0 ~ x0)
summary(fit0)
# group with UFFI = 1
has_uffi <- z == 1
y1 <- y[has_uffi]
x1 <- x[has_uffi]
fit1 <- lm(y1 ~ x1)
summary(fit1)
# reproduce Figure 8.4-1 on page 484
par(mfrow = c(1, 1))
# empty frame first, so that both groups share axes spanning the full data range
plot(x, y,
     type = "n",
     xlab = "Airtightness",
     ylab = "CH2O",
     main = "Scatter Plot of CH2O against Airtightness")
points(x0, y0, pch = 16)                # UFFI = 0: filled black circles
points(x1, y1, pch = 15, col = "grey")  # UFFI = 1: filled grey squares
# overlay the two fitted lines: solid for UFFI = 0, dashed for UFFI = 1
b0 <- coef(fit0)
b1 <- coef(fit1)
abline(a = b0[1], b = b0[2], lty = 1)
abline(a = b1[1], b = b1[2], lty = 2)
legend(0, 70,
       legend = c("UFFI=1", "UFFI=0"),
       pch = c(15, 16),
       col = c("grey", "black"))


##########################################################
################## R tips ################################
## Source: R help on "points"
##-------- Showing all the extra & some char graphics symbols ---------
 ## Draw a chart of plotting symbols: pch codes 0..25 followed by the
 ## character symbols given in 'extras', each with its pch value printed
 ## to the left of the symbol.
 ##   extras          character symbols appended after the 26 numeric codes
 ##   cex, col, bg    size, colour and fill passed to points()
 ##   coltext,cextext colour and size of the labels; no labels if cextext <= 0
 ##   main            plot title
 pchShow <-
   function(extras = c("*",".", "o","O","0","+","-","|","%","#"),
            cex = 3, ## good for both .Device=="postscript" and "x11"
            col = "red3", bg = "gold", coltext = "brown", cextext = 1.2,
            main = paste("plot symbols :  points (...  pch = *, cex =",
                         cex,")"))
   {
     n_extra <- length(extras)
     n_sym   <- 26 + n_extra
     codes   <- 0:(n_sym - 1)

     ## lay the symbols out column by column on a grid with 'side' rows
     side    <- floor(sqrt(n_sym))
     col_pos <- codes %/% side
     row_pos <- 3 + (side - 1) - codes %% side

     ## plotting region: grid extent padded by half a cell on every side
     pad   <- c(-1, 1) / 2
     x_lim <- pad + range(col_pos)
     y_lim <- pad + range(row_pos)

     ## list, because it mixes integer codes with character symbols
     symbols <- as.list(codes)
     if (n_extra > 0) {
       symbols[26 + seq_len(n_extra)] <- as.list(extras)
     }

     plot(x_lim, y_lim, type = "n", axes = FALSE, xlab = "", ylab = "",
          main = main)
     abline(v = col_pos, h = row_pos, col = "lightgray", lty = "dotted")

     for (j in seq_len(n_sym)) {
       sym <- symbols[[j]]
       ## 'col' symbols with a 'bg'-colored interior (where available) :
       points(col_pos[j], row_pos[j], pch = sym, col = col, bg = bg, cex = cex)
       if (cextext > 0) {
         text(col_pos[j] - 0.3, row_pos[j], sym, col = coltext, cex = cextext)
       }
     }
   }
 par(mfrow = c(1, 1))
 pchShow()
################## end of R tips ################################


## fit linear regression model (a), assuming the coefficient of x is the same for z=0 or z=1
## (z enters additively, so it only shifts the intercept)
fita <- lm(y ~ x + z)
summary(fita)
anova(fita)
# diagnostic check: lm diagnostic plots 1 to 4 arranged on one 2 x 2 page
par(mfrow = c(2, 2))
for (panel in 1:4) {
  plot(fita, which = panel)
}


## fit linear regression model (b), allowing the coefficient of x to differ between z=0 and z=1
xz <- x * z                   # interaction of x and z: the product of the two columns
fitb <- lm(y ~ x + z + xz)    # the xz coefficient is the change in slope when z=1
summary(fitb)
anova(fitb)
# conclusion: the coefficient of xz is not significant, no need to add xz


###### ACH example in "chapter84.sas"
##  ACH6: Reading achievement at the end of sixth grade.
##  ACH5: Reading achievement at the end of fifth grade.
##  APT:  A measure of verbal aptitude taken in the fifth grade.
##  ATT:  A measure of attitude toward school taken in fifth grade.
##  INCOME: A measure of parental income (in thousands of dollars per year).
##  The purpose of this study is to understand what underlies the reading achievement of the students in the district.
# Read the reading-achievement data; the first line of the file is taken as column names.
# `header = TRUE` is spelled out: `head` relied on partial argument matching and
# `T` is an ordinary variable that can be reassigned.
data <- read.table("ach.dat", header = TRUE)
#   ID ACH6 ACH5 APT ATT INCOME
#1   1  7.5  6.6 104  60     67
ach6 <- y <- data$ACH6   # response (also stored as y for the formulas below)
ach5 <- data$ACH5
apt <- data$APT
att <- data$ATT
income <- data$INCOME

## pre-analysis
# pairwise scatter plot
pairs(data)            # scatterplot matrix
cor(data)              # correlation matrix
round(cor(data), 3)    # rounded correlation matrix

## model y = ach5 + apt + att + income
fit1 <- lm(y ~ ach5 + apt + att + income)
summary(fit1)
aov(fit1)

## recover SAS output
# Fitted values, coefficients and residuals are taken through the extractor
# functions rather than `fit1$fitted` / `$coef` / `$residual`, which only
# worked through partial matching of list names.
yhat <- fitted(fit1)        # y^hat or predicted value
ybar <- mean(y)             # y^bar
X <- model.matrix(fit1)     # model matrix [1 X1 X2 X3 X4], n*(p+1)
n <- nrow(X)                # number of observations
# counted from the design matrix, so it is the number of non-intercept columns
p <- ncol(X) - 1            # number of explanatory variables
mse <- sum((y - yhat)^2) / (n - p - 1)   # MSE
sqrt(mse)                   # Root MSE
sqrt(mse) / ybar * 100      # Coeff Var
XtX_inv <- solve(t(X) %*% X)             # (X'X)^{-1}, computed once and reused below
bhat <- XtX_inv %*% t(X) %*% y           # beta^hat from the normal equations
coef(fit1)                  # beta^hat as reported by lm, for comparison
sb2 <- mse * XtX_inv        # s^2(beta vector), estimates of variance matrix of beta vector
sqrt(diag(X %*% sb2 %*% t(X)))           # Std Error of Mean Predict
ei <- residuals(fit1)                    # Residuals
H <- X %*% XtX_inv %*% t(X)              # hat matrix, yhat = H y
hii <- diag(H)              # known as leverage
ser <- sqrt(mse * (1 - hii))             # Std Error Residual
ei / ser                                 # Student Residual (Studentized Residual)
(ei^2 / ((p + 1) * mse)) * (hii / (1 - hii)^2)   # Cook's D (Cook's Distance)
sum(ei)                                  # Sum of Residuals
sum(ei^2)                                # Sum of Squared Residuals
sum((ei / (1 - hii))^2)                  # Predicted Residual SS (PRESS), sum of (Yi - Yi(i))^2