Stat 214 Logistic Regression Handout

Stat 214 --- Logistic Regression Handout

SOURCE: Hosmer and Lemeshow (2000) Applied Logistic Regression: Second Edition. Data were collected at Baystate Medical Center, Springfield, Massachusetts during 1986. The goal of this study was to identify risk factors associated with giving birth to a low birth weight baby (weighing less than 2500 grams). Data were collected on 189 women, 59 of whom had low birth weight babies and 130 of whom had normal birth weight babies. Four variables which were thought to be of importance were age, weight of the subject at her last menstrual period, race, and the number of physician visits during the first trimester of pregnancy.

LIST OF VARIABLES: Identification Code: ID Low Birth Weight (0 = Birth Weight >= 2500g,1 = Birth Weight < 2500g): LOW Age of the Mother in Years AGE Weight in Pounds at the Last Menstrual Period LWT Race (1 = White, 2 = Black, 3 = Other) RACE Smoking Status During Pregnancy (1 = Yes, 0 = No) SMOKE History of Premature Labor (0 = None, 1 = One, etc.) PTL History of Hypertension (1 = Yes, 0 = No) HT Presence of Uterine Irritability (1 = Yes, 0 = No) UI Number of Physician Visits during the First Trimester FTV (0 = None, 1 = One, 2 = Two, etc.) Birth Weight in Grams BWT

# read data in HLdata <- read.table("C:/Classes/Stat214/lowbwt.txt", sep="\t",quote="",header=TRUE) attach(HLdata)

# take a look at first 3 rows of dataset HLdata[1:3,] ID LOW AGE LWT RACE SMOKE PTL HT UI FTV BWT 1 85 0 19 182 2 0 0 0 1 0 2523 2 86 0 33 155 3 0 0 0 0 3 2551 3 87 0 20 105 1 1 0 0 0 1 2557

# fit a logit model with low as the dep. var. and age, lwt, and smoke # as the covariates logit.out <- glm(LOW~AGE+LWT+SMOKE, family=binomial(link=logit), data=HLdata)

# fit a probit model with low as the dep. var. and age, lwt, and smoke # as the covariates probit.out <- glm(LOW~AGE+LWT+SMOKE, family=binomial(link=probit),data=HLdata)

# take a look at the logit results summary(logit.out) Call: glm(formula = LOW ~ AGE + LWT + SMOKE, family = binomial(link = logit), data = HLdata) Deviance Residuals: Min 1Q Median 3Q Max -1.2829 -0.8650 -0.6938 1.2624 2.0103 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) 1.368225 1.014262 1.349 0.1773 AGE -0.038995 0.032726 -1.192 0.2334 LWT -0.012139 0.006135 -1.979 0.0479 * SMOKE 0.670764 0.325878 2.058 0.0396 * --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1) Null deviance: 234.67 on 188 degrees of freedom Residual deviance: 222.88 on 185 degrees of freedom AIC: 230.88

Number of Fisher Scoring iterations: 4

# take a look at the probit results summary(probit.out)

Call: glm(formula = LOW ~ AGE + LWT + SMOKE, family = binomial(link = probit), data = HLdata)

Deviance Residuals: Min 1Q Median 3Q Max -1.2785 -0.8714 -0.6891 1.2587 2.0285

Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) 0.818464 0.596821 1.371 0.1703 AGE -0.024405 0.019424 -1.256 0.2090 LWT -0.007215 0.003538 -2.039 0.0414 * SMOKE 0.416977 0.197264 2.114 0.0345 * --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1

(Dispersion parameter for binomial family taken to be 1)

Null deviance: 234.67 on 188 degrees of freedom Residual deviance: 222.67 on 185 degrees of freedom AIC: 230.67

# extract just the coefficients from the logit output object coefficients(logit.out)

(Intercept) AGE LWT SMOKE 1.36822568 -0.03899458 -0.01213854 0.67076320

# put the logit coefficients in a new object called beta.logit beta.logit <- coefficients(logit.out)

# plot low on age adding some jitter to low plot(AGE, jitter(LOW, .1)) # Now we're going to plot the predicted probabilities of a 120 lb. #woman who is a smoker giving birth to a low birthweight child at #different ages. First we need to construct a new matrix of covariate #values that corresponds to our hypothetical women.

X <- cbind(1, seq(from=14, to=45, by=1), 120, 1)

# multiply this matrix by our logit coefficients to get the value of #the linear predictor. Xb <- X %*% beta.logit

# Now use the logistic cdf to transform the linear predictor into probabilities prob<-1/(1+exp(-Xb))

# now plot these probabilities as a function of age on the pre-existing # graph of low on age lines(seq(from=14, to=45, by=1), prob, col="red")

## the odds ratio of having a low weight baby for a smoker relative to a non-smoker (holding age and weight fixed): exp(0.67076320) [1] 1.955729

## The odds that a woman of any given age and weight will give birth to a ## low weight baby are almost twice as high if she is a smoker (this is a statement about odds, not about probabilities)

### A 95% CI for the odds ratio exp(beta3 +/- 1.96*SE(beta3)), using the logit standard error of SMOKE (0.325878) exp(0.67076320-1.96*0.325878) [1] 1.032561 exp(0.67076320+1.96*0.325878) [1] 3.704262

# # Now we're going to plot the predicted probabilities of a 120 lb. woman # who is NOT a smoker giving birth to a low birthweight child at different # ages. First we need to construct a new matrix of covariate values that # corresponds to our hypothetical women.

X <- cbind(1, seq(from=14, to=45, by=1), 120, 0)

# multiply this matrix by our logit coefficients to get the value of the linear predictor.

Xb <- X %*% beta.logit

# Now use the logistic cdf to transform the linear predictor into # probabilities prob <- exp(Xb)/(1+exp(Xb))

# now plot these probabilities as a function of age on the pre-existing # graph of low on age lines(seq(from=14, to=45, by=1), prob, col="blue")

# create a 3-d plot of the fitted probability of a low birth weight baby
# as a function of the mother's weight and age

# grids of covariate values at which the fitted surface is evaluated
weight <- seq(from = 80, to = 250, length.out = 100)
age <- seq(from = 14, to = 45, length.out = 100)

# Fitted probability from the logit model LOW ~ AGE + LWT + SMOKE with the
# SMOKE indicator fixed at 1 (its coefficient, 0.670763, enters as a constant).
#   weight: weight in pounds at the last menstrual period (numeric vector)
#   age:    age of the mother in years (numeric vector)
# Returns a vector of probabilities with the same length as the inputs.
logit.prob.fun <- function(weight, age) {
  eta <- 1.368226 - 0.038995 * age - 0.012139 * weight + 0.670763
  # plogis(eta) is the logistic cdf exp(eta) / (1 + exp(eta))
  plogis(eta)
}

# persp(x, y, z) reads z[i, j] as the height at (x[i], y[j]), so the matrix
# must have age along its rows and weight along its columns to match the
# persp(age, weight, ...) call below. Building it as outer(weight, age, ...)
# would silently draw the transposed surface because both grids have
# length 100.
prob <- outer(age, weight, function(a, w) logit.prob.fun(w, a))

# matrix prob is plotted below in the 3D plot
persp(age, weight, prob, theta = 30, phi = 30, expand = 0.5, col = "lightblue")

# let's fit another logit model logit1.out <- glm(LOW~AGE+LWT+SMOKE+HT+UI, family=binomial, data=HLdata) summary(logit1.out)

Call: glm(formula = LOW ~ AGE + LWT + SMOKE + HT + UI, family = binomial, data = HLdata)

Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) 1.399794 1.080407 1.296 0.1951 AGE -0.034073 0.033674 -1.012 0.3116 LWT -0.015447 0.006587 -2.345 0.0190 * SMOKE 0.647540 0.336650 1.923 0.0544 . HT 1.893274 0.683392 2.770 0.0056 ** UI 0.884607 0.444051 1.992 0.0464 * --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Null deviance: 234.67 on 188 degrees of freedom Residual deviance: 211.78 on 183 degrees of freedom AIC: 223.78

# fit another logit model including race HLdata$AfrAm <- HLdata$RACE==2 HLdata$othrace <- HLdata$RACE==3 logit2.out <- glm(LOW~AGE + LWT +AfrAm+othrace+SMOKE + HT + UI, family=binomial, data=HLdata) summary(logit2.out)

Call: glm(formula = LOW ~ AGE + LWT + AfrAm + othrace + SMOKE + HT + UI, family = binomial, data = HLdata)

Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) 0.437240 1.191931 0.367 0.7137 AGE -0.018256 0.035354 -0.516 0.6056 LWT -0.016285 0.006859 -2.374 0.0176 * AfrAmTRUE 1.280641 0.526695 2.431 0.0150 * othraceTRUE 0.901880 0.434362 2.076 0.0379 * SMOKE 1.027571 0.393931 2.609 0.0091 ** HT 1.857617 0.688848 2.697 0.0070 ** UI 0.895387 0.448494 1.996 0.0459 * --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Null deviance: 234.67 on 188 degrees of freedom Residual deviance: 203.95 on 181 degrees of freedom AIC: 219.95 Number of Fisher Scoring iterations: 4

# Let's conduct a likelihood ratio test of model 1 vs. model 2 # Here the constrained model is model 1 and the unconstrained model is model 2. # Since 2 constraints are applied, the test statistic under the null follows # a chi-square distribution with 2 degrees of freedom lr <- deviance(logit1.out) - deviance(logit2.out) lr [1] 7.829775

1 - pchisq(lr, 2) [1] 0.01994279

ROC Curves library(ROCR) #need the ROCR package

## here’s the first model we fitted logit.out <- glm(LOW~AGE+LWT+SMOKE, family=binomial(link=logit), data=HLdata) pred.vals <- prediction(logit.out$fitted.values,LOW) perf <- performance(pred.vals, measure = "tpr", x.measure = "fpr")

## plot the ROC curve for the predicted response values by the fitted model plot(perf, colorize=TRUE) #to obtain the area under the curve performance(pred.vals, measure = "auc")

An object of class "performance" Slot "x.name": [1] "None"

Slot "y.name": [1] "Area under the ROC curve"

Slot "alpha.name": [1] "none"

Slot "x.values": list()

Slot "y.values": [[1]] [1] 0.6531291

Slot "alpha.values": list()

## the AUC value is 0.6531291. we can also obtain it by performance(pred.vals, measure = "auc")@y.values [[1]] [1] 0.6531291 par(new=T) logit2.out <- glm(LOW~AGE + LWT +AfrAm+othrace+SMOKE + HT + UI, family=binomial, data=HLdata) pred.vals <- prediction(logit2.out$fitted.values,LOW) perf <- performance(pred.vals, measure = "tpr", x.measure = "fpr")

## plot the ROC curve for the predicted response values by the fitted model plot(perf, colorize=TRUE) # with AUC: performance(pred.vals, measure = "auc")@y.values [[1]] [1] 0.7343546