## More examples for GLM

## reference: http://statmath.wu.ac.at/courses/heather_turner/index.html

## Example 2: Budworm Data
## Reference: Collett (1991, Modelling binary data) 
# It's on the toxicity of the pyrethoid trans - cypermethrin to the tobacco budworm. 
#   Batches of 20 moths of each sex were exposed to varying doses of the
#   pyrethoid for three days and the number knocked out in each batch was recorded.

## Data (out of 20):
#Dose   1 2 4  8 16 32
#Male   1 4 9 13 18 20
#Female 0 2 6 10 12 16

## a picture of Tobacco_budworm
# http://en.wikipedia.org/wiki/File:Tobacco_budworm_2.jpg

dose = c(1,2,4,8,16,32,1,2,4,8,16,32)
dead <- c(1, 4, 9, 13, 18, 20, 0, 2, 6, 10, 12, 16)
sex <- factor(rep(c("M", "F"), c(6, 6)))

# pre-analysis
pairs(cbind(dead,dose,sex))
plot(c(1, 32), c(0, 1), type = "n", xlab = "dose", ylab = "prob")
text(dose, dead/20, labels = sex)

ldose <- rep(0:5, 2)  # log scale
pairs(cbind(dead,ldose,sex))
plot(c(0, 5), c(0, 1), type = "n", xlab = "log(dose)", ylab = "prob")
text(ldose, dead/20, labels = sex)

## models under consideration
# (1) eta ~ ldose
# (2) eta ~ ldose + sex
# (3) eta ~ ldose + sex + ldose:sex

# model 1
y <- cbind(dead, 20 - dead)
fit1 <- glm(y ~ ldose, family = binomial)
summary(fit1)
par(mfrow=c(2,2))
plot(fit1)

# model 2
fit2 <- glm(y ~ ldose + sex, family = binomial)
summary(fit2)
par(mfrow=c(2,2))
plot(fit2)
#Coefficients:
#            Estimate Std. Error z value Pr(>|z|)    
#(Intercept)  -3.4732     0.4685  -7.413 1.23e-13 ***
#ldose         1.0642     0.1311   8.119 4.70e-16 ***
#sexM          1.1007     0.3558   3.093  0.00198 ** 
#    Null deviance: 124.8756  on 11  degrees of freedom
#Residual deviance:   6.7571  on  9  degrees of freedom
#AIC: 42.867


# model 3
fit3 <- glm(y ~ sex*ldose, family = binomial)
summary(fit3)
par(mfrow=c(2,2))
plot(fit3)
anova(fit3, test = "Chisq")


# model 2 (fit2) looks better

## Goodness-of-fit
# D ~ Chisq_(n-p)
# Residual deviance:   6.7571  on  9  degrees of freedom
1 - pchisq(q=6.7571, df=9)   # p-value
#[1] 0.662392

## confidence interval
stats:::confint.lm(fit2)   # Wald confidence interval (symmetric)
#                 2.5 %    97.5 %
#(Intercept) -4.5330217 -2.413289
#ldose        0.7676962  1.360732
#sexM         0.2958066  1.905680

confint(fit2)      # Profile confidence interval (asymmetric)
#                 2.5 %    97.5 %
#(Intercept) -4.4581438 -2.613610
#ldose        0.8228708  1.339058
#sexM         0.4192265  1.820471


## prediction
# empirical odds of death
emp.logits <- log((dead + 0.5)/(20.5 - dead))  
plot(c(1, 32), range(emp.logits), type = "n", xlab = "dose",
     ylab = "emp.logit", log = "x")
text(dose, emp.logits, labels = sex)

# plot fitted model on the logit scale
par(mfrow=c(1,1))
plot(c(1, 32), range(emp.logits), type = "n", xlab = "dose", ylab = "emp.logit", log = "x")
text(dose, emp.logits, labels = sex)
lines(dose[sex == "M"],
predict(fit2, type = "link")[sex == "M"], col = 3)
lines(dose[sex == "M"],
predict(fit2, type = "link")[sex == "F"], col = 2)

# plot fitted model on the probability scale
par(mfrow=c(1,1))
plot(c(1, 32), c(0, 1), type = "n", xlab = "dose",
ylab = "prob", log = "x")
text(dose, dead/20, labels = sex)
ld <- seq(0, 5, 0.1)
newdat <- data.frame(ldose = c(ld, ld),
sex = gl(2, length(ld),labels = c("F", "M")))
lines(2^ld, predict(fit2, type = "response", newdat = subset(newdat, sex == "M")),col = 3)
lines(2^ld, predict(fit2, type = "response", newdat = subset(newdat, sex == "F")),col = 2)