## For SAS coding on GLM: see
##    http://128.97.141.26/stat/examples/iglm/default.htm
## Reference: An Introduction to Generalized Linear Models, 2nd Edition, Annette J. Dobson

## More reference (dataset and R code):
##    http://statmath.wu.ac.at/courses/heather_turner/index.html

## Basic R functions for GLM
# glm(formula, family = gaussian, data, weights, subset,
#     na.action, start = NULL, etastart, mustart, offset,
#     control = glm.control(...), model = TRUE,
#     method = "glm.t", x = FALSE, y = TRUE,
#     contrasts = NULL, ...)
# "formula": y ~ x1 + x2
#     In formula, "a:b" for an interaction between a and b
#                 "a*b" means "a + b + a:b"
# "family": binomial(link = "logit")
#           gaussian(link = "identity")
#           Gamma(link = "inverse")
#           inverse.gaussian(link = "1/mu2")
#           poisson(link = "log")
# There are several functions available for accessing/displaying components of 
#     the glm object, including:
#           residuals()
#           fitted()
#           predict()
#           coef()
#           deviance()
#           formula()
#           summary()


## Bronchitis Example, Jones (1975)
bronchitis <- read.table("bronchitis.dat",sep=",",header=T)
# cigs: the number of cigarettes smoked per day
# poll: the air pollution level in the locality of residence
# bron: the presence/absence of bronchitis (indicated by 1/0)
str(bronchitis)
#'data.frame':   212 obs. of  3 variables:
# $ bron: int  0 1 0 0 0 0 1 1 0 0 ...
# $ cigs: num  5.15 0 2.5 1.75 6.75 0 0 9.5 0 0.75 ...
# $ poll: num  67.1 66.9 66.7 65.8 64.4 64.4 65.1 66.2 65.9 67.1 ...
attach(bronchitis)  # then objects in the database can be accessed by simply giving their names


## plot the dataset
pairs(bronchitis)
cor(bronchitis)
plot(bron ~ poll)
plot(jitter(bron, 0.1) ~ poll)
plot(bron ~ cigs)
plot(jitter(bron, 0.1) ~ cigs)

plot(poll ~ cigs, xlab = "No. cigarettes/day", 
  ylab = "Pollution level", type = "n")
text(cigs, poll, labels = bron)
legend(20, 65, legend = c("presence", "absence"),
       title = "Bronchitis",
       pch = c("1", "0"))

boxplot(poll ~ bron,
    xlab = "Bronchitis presence/absence (1/0)",
    ylab = "Pollution level")
boxplot(cigs ~ bron,
    xlab = "Bronchitis presence/absence (1/0)",
    ylab = "No. cigarettes/day")

## Comparison of Links
mu.logit <- function(eta) 1/(1 + exp(-eta));          # logit link
mu.probit <- function(eta) pnorm(eta, 0, pi/sqrt(3)); # probit link
mu.cloglog <- function(eta) 1 - exp(-exp(eta));       # complementary log-log link
plot(mu.logit, (-4): 4, xlim = c(-4, 4), ylim = c(0,1),
   xlab = expression(eta), ylab = expression(mu == g^-1 * (eta)))
curve(mu.probit, (-4):4, add = TRUE, lty = 2)
curve(mu.cloglog, (-4):4, add = TRUE, lty = 3)
legend(-4, 1, c("logit", "probit", "complementary log-log"), lty = 1:3)


## fit logistic model
model1 <- glm(bron ~ cigs + poll, family = binomial)
summary(model1)
#Deviance Residuals: #D = sum of r^2
#    Min       1Q   Median       3Q      Max  
#-2.4023  -0.5606  -0.4260  -0.3155   2.3594  

#Coefficients: # based on Wald test
#             Estimate Std. Error z value Pr(>|z|)    
#(Intercept) -10.08491    2.95100  -3.417 0.000632 ***
#cigs          0.21169    0.03813   5.552 2.83e-08 ***
#poll          0.13176    0.04895   2.692 0.007113 ** 

# null model: only the intercept presents, d.f. = n-1
# fitted model: d.f.= n-p
#    Null deviance: 221.78  on 211  degrees of freedom
#Residual deviance: 174.21  on 209  degrees of freedom

# AIC = -2*l_model + 2*p
#AIC: 180.21
AIC(model1) #180.2145
BIC(model1) #190.2842

## plot fitted model
# (deviance) residual plot
plot(residuals(model1) ~ fitted(model1),
  xlab = expression(hat(y)[i]), ylab = expression(r[i]))
abline(0, 0, lty = 2)

par(mfrow=c(2,2))
plot(model1)

# check outliers
r <- residuals(model1)
r[abs(r) > 2]
bronchitis[abs(r) > 2, ]
#    bron cigs poll
#48     1  0.0 61.8
#59     1  0.0 61.7
#87     1  0.0 59.7
#122    1  0.0 55.9
#142    1  0.0 58.9
#147    0 24.9 58.0
sum(bron[cigs == 0])
#[1] 7
sum(cigs==0)
#[1] 54

## try other links
model1a <- glm(bron ~ cigs + poll, family = binomial(link=probit))
summary(model1a)
AIC(model1a)
#[1] 179.0786
BIC(model1a)
#[1] 189.1483

model1b <- glm(bron ~ cigs + poll, family = binomial(link=cloglog))
summary(model1b)
AIC(model1b)
#[1] 186.2768
BIC(model1b)
#[1] 196.3465

model1c <- glm(bron ~ cigs + poll, family = binomial(link=cauchit))
summary(model1c)
AIC(model1c)
#[1] 185.9184
BIC(model1c)
#[1] 195.9882

# model non-smokers separately
smoker = as.numeric(cigs > 0)
model2 <- glm(bron ~ smoker + smoker:(cigs + poll),
              family = binomial)
summary(model2)
#    Null deviance: 221.78  on 211  degrees of freedom
#Residual deviance: 175.66  on 208  degrees of freedom
#AIC: 183.66
par(mfrow=c(2,2))
plot(model2)

temp <- glm(bron ~ smoker*poll,
              family = binomial)
summary(temp)

# Grouping the Data
cutCigs <- cut(cigs, c(-1, 0, 1, 3, 5, 8, 50))
cbind(cigs,cutCigs)
cutPoll <- cut(poll, c(-1, 55, 57.5, 60, 62.5, 65, 100))
xtabs(~cutCigs)
#cutCigs
#(-1,0]  (0,1]  (1,3]  (3,5]  (5,8] (8,50] 
#    54     43     35     21     20     39 
xtabs(~cutPoll)
#cutPoll
#  (-1,55] (55,57.5] (57.5,60] (60,62.5] (62.5,65]  (65,100] 
#       47        50        44        31        26        14 

total <- xtabs( ~ cutCigs + cutPoll)
total
presence <- xtabs(bron ~ cutCigs + cutPoll)
presence
absence <- c(total) - c(presence)
round((total-presence)/total*10)
binData <- as.data.frame.table(presence, responseName = "presence")
binData <- cbind(binData, absence)
binData

model4 <- glm(cbind(presence, absence) ~ cutCigs + cutPoll, family = binomial,
              data = binData)
summary(model4)
#    Null deviance: 98.027  on 34  degrees of freedom
#Residual deviance: 14.877  on 24  degrees of freedom
#AIC: 72.228
par(mfrow=c(2,2))
plot(model4)

# Analysis of Deviance Table (~ANOVA)
anova(model4, test = "Chisq")
#        Df Deviance Resid. Df Resid. Dev P(>|Chi|)    
#NULL                       34     98.027              
#cutCigs  5   73.063        29     24.964 2.359e-14 ***
#cutPoll  5   10.087        24     14.877   0.07281 .  
par(mfrow=c(2,2))
plot(binData$cutPoll, fitted(model4), main="Fitted vs. cutPoll")
plot(binData$cutPoll, residuals(model4), main="Residual vs. cutPoll")
plot(binData$cutCigs, fitted(model4), main="Fitted vs. cutCigs")
plot(binData$cutCigs, residuals(model4), main="Residual vs. cutCigs")


## other models
# treat "cutPoll" as a continuous variable
model5 <- glm(cbind(presence, absence) ~ cutCigs + c(cutPoll),
              family = binomial, data = binData)
summary(model5)
par(mfrow=c(2,2))
plot(model5)
#    Null deviance: 98.027  on 34  degrees of freedom
#Residual deviance: 19.472  on 28  degrees of freedom
#AIC: 68.823
anova(model5, test = "Chisq")

# simplify the factor "cutCigs" further (with levels "0", "(0,3]", "(3,8]", "8+")
attributes(binData$cutCigs)
#$levels
#[1] "(-1,0]" "(0,1]"  "(1,3]"  "(3,5]"  "(5,8]"  "(8,50]"
binData6 = binData
levels(binData6$cutCigs) <- c("0", "(0,3]", "(0,3]", "(3,8]", "(3,8]", "8+")
model6 <- glm(cbind(presence, absence) ~ cutCigs + c(cutPoll),
              family = binomial, data = binData6)
summary(model6)
par(mfrow=c(2,2))
plot(model6)
#    Null deviance: 98.027  on 34  degrees of freedom
#Residual deviance: 19.498  on 30  degrees of freedom
#AIC: 64.848

cigs7 <- cut(cigs, c(-1, 0, 3, 8, 50))
model7 <- glm(bron ~ cigs7 + poll, family = binomial)
summary(model7)
par(mfrow=c(2,2))
plot(model7)
#    Null deviance: 221.78  on 211  degrees of freedom
#Residual deviance: 143.65  on 207  degrees of freedom
#AIC: 153.65

r <- residuals(model7)
r[abs(r) > 2]
bronchitis[abs(r) > 2, ]
#    bron cigs poll
#87     1    0 59.7
#122    1    0 55.9
#142    1    0 58.9


## 5-fold cross-validation for evaluating models
length(bron)   # 212
sum(bron)      #  46

library(boot)      # for function "cv.glm"
## use self-defined cost function for binary response
cost <- function(r, pi = 0) mean(abs(r-pi) > 0.5)
set.seed(678)                   # free to change
cv.glm(data=bronchitis, glmfit=model1, cost=cost, K=5)$delta
#[1] 0.2028302 0.2046992
set.seed(678)                   # free to change
cv.glm(data=bronchitis, glmfit=model1a, cost=cost, K=5)$delta
#[1] 0.2028302 0.1971120
set.seed(678)                   # free to change
cv.glm(data=cbind(bronchitis,smoker), glmfit=model2, cost=cost, K=5)$delta
#[1] 0.2028302 0.1980910
set.seed(678)                   # free to change
cv.glm(data=cbind(bronchitis,cigs7), glmfit=model7, cost=cost, K=5)$delta
#0.2169811 0.2301976

## use default cost function
set.seed(678)                   # free to change
cv.glm(data=cbind(bronchitis,smoker), glmfit=model2, K=5)$delta
#[1] 0.1409895 0.1401409
set.seed(678)                   # free to change
cv.glm(data=bronchitis, glmfit=model1, K=5)$delta
#[1] 0.1399383 0.1390862
set.seed(678)                   # free to change
cv.glm(data=cbind(bronchitis,cigs7), glmfit=model7, K=5)$delta
#0.1248911 0.1237637

set.seed(678)                   # free to change
cv.glm(data=binData, glmfit=model4, K=5)$delta
#0.05389201 0.05139671 
set.seed(678)                   # free to change
cv.glm(data=cbind(binData,c(binData$cutPoll)), glmfit=model5, K=5)$delta
#0.03978364 0.03888970 
set.seed(678)                   # free to change
cv.glm(data=cbind(binData6,c(binData6$cutPoll)), glmfit=model6, K=5)$delta
#0.04515505 0.04399620