-
Notifications
You must be signed in to change notification settings - Fork 0
/
linear_regression.R
238 lines (178 loc) · 8.73 KB
/
linear_regression.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
# Introduction
## ══════════════
# • Learning objectives:
##   • Learn the R formula interface
##   • Specify factor contrasts to test specific hypotheses
##   • Perform model comparisons
##   • Run and interpret a variety of regression models in R

## Set working directory
## ─────────────────────────
## Setting the working directory at the start of an R session avoids
## typing full path names for data and other files.

# set the working directory (uncomment the line matching your machine)
# setwd("~/Desktop/Rstatistics")
# setwd("C:/Users/dataclass/Desktop/Rstatistics")

## You might also start by listing the files in your working directory
getwd()          # print the current working directory
dir("dataSets")  # files in the dataSets folder (dir is an alias of list.files)
## Load the states data
## ────────────────────────
# Read the serialized states data set from disk
states.data <- readRDS("dataSets/states.rds")
# Build a small lookup table pairing each variable name with its label
states.attrs <- attributes(states.data)
states.info <- data.frame(states.attrs[c("names", "var.labels")])
# Inspect the last few variable labels
tail(states.info, 8)
## Linear regression
## ═══════════════════
## Examine the data before fitting models
## ──────────────────────────────────────────
## Start by examining the data to check for problems.
# Keep just the two columns of interest (all rows)
sts.ex.sat <- states.data[, c("expense", "csat")]
summary(sts.ex.sat)
# Correlation between expense and csat
cor(sts.ex.sat)
## Plot the data before fitting models
## ───────────────────────────────────────
## Look for multivariate outliers, non-linear relationships, etc.
# Scatter plot of expense vs csat
plot(sts.ex.sat)
## Linear regression example
## ─────────────────────────────
## • Linear regression models can be fit with the `lm()' function
## • For example, we can use `lm' to predict SAT scores based on
##   per-pupil expenditures:
# Fit the regression of csat on expense
sat.mod <- lm(csat ~ expense, data = states.data)
# Summarize and print the results
summary(sat.mod)  # coefficient table, R-squared, F statistic, etc.
## Why is the association between expense and SAT scores /negative/?
## ─────────────────────────────────────────────────────────────────────
## Many people find it surprising that the per-capita expenditure on
## students is negatively related to SAT scores. The beauty of multiple
## regression is that we can try to pull these apart. What would the
## association between expense and SAT scores be if there were no
## difference among the states in the percentage of students taking the
## SAT?
# Add percent (share of students taking the SAT) as a control
sat.mod.pct <- lm(csat ~ expense + percent, data = states.data)
summary(sat.mod.pct)
## The lm class and methods
## ────────────────────────────
## OK, we fit our model. Now what?
## • Examine the model object:
class(sat.mod)  # fitted models are S3 objects of class "lm"
names(sat.mod)  # components stored in the fitted-model list
methods(class = class(sat.mod))[1:9]  # first nine S3 methods available for lm
## • Use function methods to get more information about the fit
confint(sat.mod)  # confidence intervals for the coefficients (95% default)
# hist(residuals(sat.mod))  # optional: histogram of the residuals
## Linear Regression Assumptions
## ─────────────────────────────────
## • Ordinary least squares regression relies on several assumptions,
##   including that the residuals are normally distributed and
##   homoscedastic, the errors are independent, and the relationships
##   are linear.
## • Investigate these assumptions visually by plotting your model:
par(mfrow = c(1, 2), mar = c(4, 4, 2, 2))  # optional: two panels per page
plot(sat.mod, which = c(1, 2))  # residuals-vs-fitted and normal Q-Q plots
## Comparing models
## ────────────────────
## Do congressional voting patterns predict SAT scores over and above
## expense? Fit two models and compare them:
# Drop rows with missing values once, so both models are fit on exactly
# the same observations (required for a valid anova() comparison)
states.complete <- na.omit(states.data)
# Fit another model, adding house and senate as predictors
sat.voting.mod <- lm(csat ~ expense + house + senate, data = states.complete)
sat.mod <- update(sat.mod, data = states.complete)
# Compare the nested models with an F test
anova(sat.mod, sat.voting.mod)
coef(summary(sat.voting.mod))
## Exercise: least squares regression
## ────────────────────────────────────────
## Use the /states.rds/ data set. Fit a model predicting energy consumed
## per capita (energy) from the percentage of residents living in
## metropolitan areas (metro). Be sure to
## 1. Examine/plot the data before fitting the model
## 2. Print and interpret the model `summary'
## 3. `plot' the model to look for deviations from modeling assumptions
# Load data (fresh copy, independent of states.data above)
states <- readRDS("dataSets/states.rds")
# NOTE(review): na.omit() on the whole data frame drops every row with a
# missing value in ANY column, not just metro/energy — this may discard
# more observations than this model strictly needs; verify intent.
statesEnergy <- na.omit(states)
summary(statesEnergy)
# Scatter plot of the predictor against the response
plot(statesEnergy$metro, statesEnergy$energy)
# Fit a model. (The model object is named `energy`, the same word as the
# response column; R keeps these separate, but the shadowing is easy to
# misread.)
energy <- lm(energy ~ metro, data=statesEnergy)
summary(energy)
# So far it seems metro alone does not have a strong impact on energy;
# we'll need to extend the model.
## Select one or more additional predictors to add to your model and
## repeat steps 1-3. Is this model significantly better than the model
## with /metro/ as the only predictor?
# Add toxic and green as additional predictors
energy2 <- lm(energy ~ metro + toxic + green, data=statesEnergy)
summary(energy2)
# Diagnostic plots for the extended model. NOTE(review): an earlier
# par(mfrow = c(1, 2)) call may still be in effect, arranging these
# plots two per page.
plot(energy2)
# This model fits much better than the metro-only model.
# F test comparing the nested models (both fit on statesEnergy, so the
# anova() comparison is valid)
anova(energy, energy2)
coef(summary(energy2))
## Interactions and factors
## ══════════════════════════
## Modeling interactions
## ─────────────────────────
## Interactions allow us to assess the extent to which the association
## between one predictor and the outcome depends on a second predictor.
## For example: does the association between expense and SAT scores
## depend on the median income in the state?
# expense*income expands to expense + income + expense:income
sat.expense.by.percent <- lm(csat ~ expense * income, data = states.data)
# Show the coefficient table
coef(summary(sat.expense.by.percent))
## Regression with categorical predictors
## ──────────────────────────────────────────
## Let's try to predict SAT scores from region, a categorical variable.
## Note that you must make sure R does not think your categorical
## variable is numeric.
# Make sure R knows region is categorical
str(states.data$region)
states.data$region <- factor(states.data$region)
# Add region to the model; lm() dummy-codes the factor automatically
sat.region <- lm(csat ~ region, data = states.data)
# Show the results
coef(summary(sat.region))  # regression coefficients table
anova(sat.region)          # ANOVA table
## Again, *make sure to tell R which variables are categorical by
## converting them to factors!*
## Setting factor reference groups and contrasts
## ─────────────────────────────────────────────────
## In the previous example we used the default contrasts for region. The
## default in R is treatment contrasts, with the first level as the
## reference. We can change the reference group or use another coding
## scheme using the `C' function.
# print default contrasts (one dummy column per non-reference level)
contrasts(states.data$region)
# change the reference group
# (`base = 4` is forwarded through C()'s `...` to the contrast function,
# making the fourth region level the reference category)
coef(summary(lm(csat ~ C(region, base=4),
data=states.data)))
# change the coding scheme
# (Helmert contrasts: each level is compared with the mean of the
# preceding levels)
coef(summary(lm(csat ~ C(region, contr.helmert),
data=states.data)))
## See also `?contrasts', `?contr.treatment', and `?relevel'.
## Exercise: interactions and factors
## ────────────────────────────────────────
## Use the states data set.
## 1. Add on to the regression equation that you created in exercise 1 by
##    generating an interaction term and testing the interaction.
## 2. Try adding region to the model. Are there significant differences
##    across the four regions?
# toxic*green expands to toxic + green + toxic:green
energy3 <- lm(energy ~ metro + toxic*green, data=statesEnergy)
summary(energy3)
coef(summary(energy3))
# Adding regions
# FIX: statesEnergy was built from the separately loaded `states` object,
# so the earlier factor() conversion applied to states.data$region does
# not carry over here. Convert explicitly so lm() treats region as
# categorical rather than numeric (a no-op if it is already a factor).
statesEnergy$region <- factor(statesEnergy$region)
energy4 <- lm(energy ~ region, data=statesEnergy)
summary(energy4)
anova(energy4)
# There are differences across regions, but region does not appear to
# significantly improve the model.