## For SAS coding on GLM: see ## http://128.97.141.26/stat/examples/iglm/default.htm ## Reference: An Introduction to Generalized Linear Models, 2nd Edition, Annette J. Dobson ## More reference (dataset and R code): ## http://statmath.wu.ac.at/courses/heather_turner/index.html ## Basic R functions for GLM # glm(formula, family = gaussian, data, weights, subset, # na.action, start = NULL, etastart, mustart, offset, # control = glm.control(...), model = TRUE, # method = "glm. t", x = FALSE, y = TRUE, # contrasts = NULL, ...) # "formula": y ~ x1 + x2 # In formula, "a:b" for an interaction between a and b # "a*b" means "a + b + a:b" # "family": binomial(link = "logit") # gaussian(link = "identity") # Gamma(link = "inverse") # inverse.gaussian(link = "1/mu2") # poisson(link = "log") # There are several functions available for accessing/displaying components of # the glm object, including: # residuals() # fitted() # predict() # coef() # deviance() # formula() # summary() ## Bronchitis Example, Jones (1975) bronchitis <- read.table("bronchitis.dat",sep=",",header=T) # cigs: the number of cigarettes smoked per day # poll: the air pollution level in the locality of residence # bron: the presence/absence of bronchitis (indicated by 1/0) str(bronchitis) #'data.frame': 212 obs. of 3 variables: # $ bron: int 0 1 0 0 0 0 1 1 0 0 ... # $ cigs: num 5.15 0 2.5 1.75 6.75 0 0 9.5 0 0.75 ... # $ poll: num 67.1 66.9 66.7 65.8 64.4 64.4 65.1 66.2 65.9 67.1 ... attach(bronchitis) # then objects in the database can be accessed by simply giving their names ## plot the dataset pairs(bronchitis) cor(bronchitis) plot(bron ~ poll) plot(jitter(bron, 0.1) ~ poll) plot(bron ~ cigs) plot(jitter(bron, 0.1) ~ cigs) plot(poll ~ cigs, xlab = "No. cigarettes/day", ylab = "Pollution level", type = "n") text(cigs, poll, labels = bron) legend(20, 65, legend = c("presence", "absence"), title = "Bronchitis", pch = c("1", "0")) boxplot(poll ~ bron, xlab = "Bronchitis presence/absence (1/0)", ylab = "Pollution level") boxplot(cigs ~ bron, xlab = "Bronchitis presence/absence (1/0)", ylab = "No. cigarettes/day") ## Comparison of Links mu.logit <- function(eta) 1/(1 + exp(-eta)); # logit link mu.probit <- function(eta) pnorm(eta, 0, pi/sqrt(3)); # probit link mu.cloglog <- function(eta) 1 - exp(-exp(eta)); # complementary log-log link plot(mu.logit, (-4): 4, xlim = c(-4, 4), ylim = c(0,1), xlab = expression(eta), ylab = expression(mu == g^-1 * (eta))) curve(mu.probit, (-4):4, add = TRUE, lty = 2) curve(mu.cloglog, (-4):4, add = TRUE, lty = 3) legend(-4, 1, c("logit", "probit", "complementary log-log"), lty = 1:3) ## fit logistic model model1 <- glm(bron ~ cigs + poll, family = binomial) summary(model1) #Deviance Residuals: #D = sum of r^2 # Min 1Q Median 3Q Max #-2.4023 -0.5606 -0.4260 -0.3155 2.3594 #Coefficients: # based on Wald test # Estimate Std. Error z value Pr(>|z|) #(Intercept) -10.08491 2.95100 -3.417 0.000632 *** #cigs 0.21169 0.03813 5.552 2.83e-08 *** #poll 0.13176 0.04895 2.692 0.007113 ** # null model: only the intercept presents, d.f. = n-1 # fitted model: d.f.= n-p # Null deviance: 221.78 on 211 degrees of freedom #Residual deviance: 174.21 on 209 degrees of freedom # AIC = -2*l_model + 2*p #AIC: 180.21 AIC(model1) #180.2145 BIC(model1) #190.2842 ## plot fitted model # (deviance) residual plot plot(residuals(model1) ~ fitted(model1), xlab = expression(hat(y)[i]), ylab = expression(r[i])) abline(0, 0, lty = 2) par(mfrow=c(2,2)) plot(model1) # check outliers r <- residuals(model1) r[abs(r) > 2] bronchitis[abs(r) > 2, ] # bron cigs poll #48 1 0.0 61.8 #59 1 0.0 61.7 #87 1 0.0 59.7 #122 1 0.0 55.9 #142 1 0.0 58.9 #147 0 24.9 58.0 sum(bron[cigs == 0]) #[1] 7 sum(cigs==0) #[1] 54 ## try other links model1a <- glm(bron ~ cigs + poll, family = binomial(link=probit)) summary(model1a) AIC(model1a) #[1] 179.0786 BIC(model1a) #[1] 189.1483 model1b <- glm(bron ~ cigs + poll, family = binomial(link=cloglog)) summary(model1b) AIC(model1b) #[1] 186.2768 BIC(model1b) #[1] 196.3465 model1c <- glm(bron ~ cigs + poll, family = binomial(link=cauchit)) summary(model1c) AIC(model1c) #[1] 185.9184 BIC(model1c) #[1] 195.9882 # model non-smokers separately smoker = as.numeric(cigs > 0) model2 <- glm(bron ~ smoker + smoker:(cigs + poll), family = binomial) summary(model2) # Null deviance: 221.78 on 211 degrees of freedom #Residual deviance: 175.66 on 208 degrees of freedom #AIC: 183.66 par(mfrow=c(2,2)) plot(model2) temp <- glm(bron ~ smoker*poll, family = binomial) summary(temp) # Grouping the Data cutCigs <- cut(cigs, c(-1, 0, 1, 3, 5, 8, 50)) cbind(cigs,cutCigs) cutPoll <- cut(poll, c(-1, 55, 57.5, 60, 62.5, 65, 100)) xtabs(~cutCigs) #cutCigs #(-1,0] (0,1] (1,3] (3,5] (5,8] (8,50] # 54 43 35 21 20 39 xtabs(~cutPoll) #cutPoll # (-1,55] (55,57.5] (57.5,60] (60,62.5] (62.5,65] (65,100] # 47 50 44 31 26 14 total <- xtabs( ~ cutCigs + cutPoll) total presence <- xtabs(bron ~ cutCigs + cutPoll) presence absence <- c(total) - c(presence) round((total-presence)/total*10) binData <- as.data.frame.table(presence, responseName = "presence") binData <- cbind(binData, absence) binData model4 <- glm(cbind(presence, absence) ~ cutCigs + cutPoll, family = binomial, data = binData) summary(model4) # Null deviance: 98.027 on 34 degrees of freedom #Residual deviance: 14.877 on 24 degrees of freedom #AIC: 72.228 par(mfrow=c(2,2)) plot(model4) # Analysis of Deviance Table (~ANOVA) anova(model4, test = "Chisq") # Df Deviance Resid. Df Resid. Dev P(>|Chi|) #NULL 34 98.027 #cutCigs 5 73.063 29 24.964 2.359e-14 *** #cutPoll 5 10.087 24 14.877 0.07281 . par(mfrow=c(2,2)) plot(binData$cutPoll, fitted(model4), main="Fitted vs. cutPoll") plot(binData$cutPoll, residuals(model4), main="Residual vs. cutPoll") plot(binData$cutCigs, fitted(model4), main="Fitted vs. cutCigs") plot(binData$cutCigs, residuals(model4), main="Residual vs. cutCigs") ## other models # treat "cutPoll" as a continuous variable model5 <- glm(cbind(presence, absence) ~ cutCigs + c(cutPoll), family = binomial, data = binData) summary(model5) par(mfrow=c(2,2)) plot(model5) # Null deviance: 98.027 on 34 degrees of freedom #Residual deviance: 19.472 on 28 degrees of freedom #AIC: 68.823 anova(model5, test = "Chisq") # simplify the factor "cutCigs" further (with levels "0", "(0,3]", "(3,8]", "8+") attributes(binData$cutCigs) #$levels #[1] "(-1,0]" "(0,1]" "(1,3]" "(3,5]" "(5,8]" "(8,50]" binData6 = binData levels(binData6$cutCigs) <- c("0", "(0,3]", "(0,3]", "(3,8]", "(3,8]", "8+") model6 <- glm(cbind(presence, absence) ~ cutCigs + c(cutPoll), family = binomial, data = binData6) summary(model6) par(mfrow=c(2,2)) plot(model6) # Null deviance: 98.027 on 34 degrees of freedom #Residual deviance: 19.498 on 30 degrees of freedom #AIC: 64.848 cigs7 <- cut(cigs, c(-1, 0, 3, 8, 50)) model7 <- glm(bron ~ cigs7 + poll, family = binomial) summary(model7) par(mfrow=c(2,2)) plot(model7) # Null deviance: 221.78 on 211 degrees of freedom #Residual deviance: 143.65 on 207 degrees of freedom #AIC: 153.65 r <- residuals(model7) r[abs(r) > 2] bronchitis[abs(r) > 2, ] # bron cigs poll #87 1 0 59.7 #122 1 0 55.9 #142 1 0 58.9 ## 5-fold cross-validation for evaluating models length(bron) # 212 sum(bron) # 46 library(boot) # for function "cv.glm" ## use self-defined cost function for binary response cost <- function(r, pi = 0) mean(abs(r-pi) > 0.5) set.seed(678) # free to change cv.glm(data=bronchitis, glmfit=model1, cost=cost, K=5)$delta #[1] 0.2028302 0.2046992 set.seed(678) # free to change cv.glm(data=bronchitis, glmfit=model1a, cost=cost, K=5)$delta #[1] 0.2028302 0.1971120 set.seed(678) # free to change cv.glm(data=cbind(bronchitis,smoker), glmfit=model2, cost=cost, K=5)$delta #[1] 0.2028302 0.1980910 set.seed(678) # free to change cv.glm(data=cbind(bronchitis,cigs7), glmfit=model7, cost=cost, K=5)$delta #0.2169811 0.2301976 ## use default cost function set.seed(678) # free to change cv.glm(data=cbind(bronchitis,smoker), glmfit=model2, K=5)$delta #[1] 0.1409895 0.1401409 set.seed(678) # free to change cv.glm(data=bronchitis, glmfit=model1, K=5)$delta #[1] 0.1399383 0.1390862 set.seed(678) # free to change cv.glm(data=cbind(bronchitis,cigs7), glmfit=model7, K=5)$delta #0.1248911 0.1237637 set.seed(678) # free to change cv.glm(data=binData, glmfit=model4, K=5)$delta #0.05389201 0.05139671 set.seed(678) # free to change cv.glm(data=cbind(binData,c(binData$cutPoll)), glmfit=model5, K=5)$delta #0.03978364 0.03888970 set.seed(678) # free to change cv.glm(data=cbind(binData6,c(binData6$cutPoll)), glmfit=model6, K=5)$delta #0.04515505 0.04399620