-
Notifications
You must be signed in to change notification settings - Fork 0
/
02_envi_cov_data_formatting.R
262 lines (214 loc) · 11.1 KB
/
02_envi_cov_data_formatting.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
##################################################
### The purpose of this script is to format environmental
### covariate data and save the formatted output
### to be used in the IPM
### Last update date: 07/26/2024
###################################################
# Prepare packages
list.of.packages <- c("tidyverse", "GGaly", "rnoaa", "todor")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)){install.packages(new.packages)}
lapply(list.of.packages, require, character.only = TRUE)
#######################################
###### Loading all climate data #######
#######################################
# Load climate data by state
ga.dat <- read.csv("raw_dat/environmental_covariates/GA_ClimDat.csv")
fl.dat <- read.csv("raw_dat/environmental_covariates/FL_ClimDat.csv")
sc.dat <- read.csv("raw_dat/environmental_covariates/SC_ClimDat.csv")
nc.dat <- read.csv("raw_dat/environmental_covariates/NC_ClimDat.csv")
tn.dat <- read.csv("raw_dat/environmental_covariates/TN_ClimDat.csv")
al.dat <- read.csv("raw_dat/environmental_covariates/AL_ClimDat.csv")
ms.dat <- read.csv("raw_dat/environmental_covariates/MS_ClimDat.csv")
ar.dat <- read.csv("raw_dat/environmental_covariates/AR_ClimDat.csv")
la.dat <- read.csv("raw_dat/environmental_covariates/LA_ClimDat.csv")
tx.dat <- read.csv("raw_dat/environmental_covariates/TX_ClimDat.csv")
ky.dat <- read.csv("raw_dat/environmental_covariates/KY_ClimDat.csv")
mo.dat <- read.csv("raw_dat/environmental_covariates/MO_ClimDat.csv")
nm.dat <- read.csv("raw_dat/environmental_covariates/NM_ClimDat.csv")
az.dat <- read.csv("raw_dat/environmental_covariates/AZ_ClimDat.csv")
ok.dat <- read.csv("raw_dat/environmental_covariates/OK_ClimDat.csv")
ks.dat <- read.csv("raw_dat/environmental_covariates/KS_ClimDat.csv")
# Climate data from mid-latitude cities
chicago <- read.csv("raw_dat/environmental_covariates/Chicago.csv")
nplatte <- read.csv("raw_dat/environmental_covariates/NorthPlatte.csv")
desmoines <- read.csv("raw_dat/environmental_covariates/DesMoines.csv")
cleveland <- read.csv("raw_dat/environmental_covariates/Cleveland.csv")
# Palmer drought severity index
pdsi <- read.csv("raw_dat/environmental_covariates/PDSI.csv")
# NAO
nao <- read.csv("raw_dat/environmental_covariates/NAO.csv")
#############################
####### Data processing #####
#############################
## REVIEW: When reviewing this code, I realized that a lot of the code was for data exploration and the data contained in the 'winter' object are not the data we are using in the model, since it contains data from all southern states. We further subset the data down to one weather station from each state (i.e., the winter.new object below). That said, should we still keep all of the code?
# Combine all state climate data
dat <- rbind(ga.dat, fl.dat, sc.dat, nc.dat, tn.dat, al.dat, ms.dat, ar.dat, la.dat, tx.dat, ky.dat, mo.dat, nm.dat, az.dat, ok.dat, ks.dat)
# Weather stations in Mexico
mx.dat <- dat[str_sub(dat$NAME, -2, -1)=="MX",]
# Remove data from Mexico
dat <- dat[!(str_sub(dat$NAME, -2, -1)=="MX"),]
dat <- dat %>%
distinct(.keep_all = TRUE)
dat$YEAR <- str_sub(dat$DATE, 1, 4)
dat$MONTH <- str_sub(dat$DATE, 6, 7)
dat$STATE <- str_sub(dat$NAME, -5, -4)
# Identify southern states
states <- c("GA", "FL", "SC", "NC", "TN", "AL", "MS", "AR", "LA", "TX", "KY", "MO", "NM", "AZ", "OK", "KS")
# Identify winter months
w.months <- c("01", "09", "10", "11", "12")
#subset only southern states
dat <- dat[dat$STATE%in%states, ]
# Subset winter covariates
# DT32: number of days per month where minimum temp was below freezing
# DX32: number of days per month where maximum temp was below freezing
# PRCP: monthly amount of precipitation (in)
winter <- dat %>%
select(STATION, NAME, DT32, DX32, PRCP, YEAR, MONTH, STATE) %>%
filter(MONTH %in% c("01", "09", "10", "11", "12"))
winter$YEAR <- as.numeric(winter$YEAR)
# January of following year are considered part of the previous winter
year.new <- rep(NA, nrow(winter))
for(i in 1:nrow(winter)){
year.new[i] <- ifelse(winter$MONTH[i] %in% c("01"), winter[i,6]-1, winter[i,6])
}
# Incorporate newly assigned year to dataframe
winter <- winter %>%
mutate(YEAR2 = year.new) %>%
drop_na() %>%
group_by(STATE, NAME, YEAR2) %>%
summarise(n_months = n()) %>%
pivot_wider(names_from = YEAR2, values_from = n_months) %>%
select(-c('1990', '2021')) %>%
mutate_if(is.integer, as.numeric)
winter$Sum_mo <- rowSums(winter[,3:ncol(winter)])
# Checking if there are months with missing data
winter <- winter %>%
filter(Sum_mo == 150)
# Winter precipitation table
# winter.prcp <- winter %>%
# select(NAME, PRCP, YEAR2, MONTH, STATE) %>%
# group_by(STATE, NAME, YEAR2) %>%
# summarise(n_months = n()) %>%
# pivot_wider(names_from = YEAR2, values_from = n_months) %>%
# drop_na() %>%
# select(-c('1990', '2021')) %>%
# mutate_if(is.integer, as.numeric) %>%
# mutate(Sum_mo = rowSums(winter[,3:ncol(winter)]))
#
# winter.prcp$Sum_mo <- rowSums(winter.prcp[,3:ncol(winter.prcp)])
# winter.prcp <- winter.prcp %>%
# filter(Sum_mo==180)
#
# write.csv(winter.prcp, "Winter_precipitation.csv")
#
# # Number of days minimum temp below freezing
# winter.dt32 <- winter %>%
# select(NAME, DT32, YEAR2, MONTH, STATE) %>%
# group_by(STATE, NAME, YEAR2) %>%
# summarise(n_months = n()) %>%
# pivot_wider(names_from = YEAR2, values_from = n_months) %>%
# drop_na() %>%
# select(-c('1990', '2021')) %>%
# mutate_if(is.integer, as.numeric)
#
# winter.dt32$Sum_mo <- rowSums(winter.dt32[,3:ncol(winter.dt32)])
# winter.dt32 <- winter.dt32 %>%
# filter(Sum_mo==180)
#
# # Number of days maximum temp below freezing
# winter.dx32 <- winter %>%
# select(NAME, DX32, YEAR2, MONTH, STATE) %>%
# group_by(STATE, NAME, YEAR2) %>%
# summarise(n_months = n()) %>%
# pivot_wider(names_from = YEAR2, values_from = n_months) %>%
# drop_na() %>%
# select(-c('1990', '2021')) %>%
# mutate_if(is.integer, as.numeric)
#
# winter.dx32$Sum_mo <- rowSums(winter.dx32[,3:ncol(winter.dx32)])
# winter.dx32 <- winter.dx32 %>%
# filter(Sum_mo==180)
# winter.prcp$NAME==winter.dt32$NAME
#
# # Join all dataframes and keep stations have complete sets of data
# all <- winter.prcp[,c(1:2,33)] %>%
# full_join(winter.dt32[,c(1:2,33)], by = "NAME") %>%
# full_join(winter.dx32[,c(1:2,33)], by = "NAME") %>%
# filter(Sum_mo==180 & Sum_mo.x==180 & Sum_mo.y==180)
# Weather stations from which we want to keep the climate data
stations <- c("MACON MIDDLE GA REGIONAL AIRPORT, GA US", "ORLANDO INTERNATIONAL AIRPORT, FL US", "COLUMBIA METROPOLITAN AIRPORT, SC US", "SALISBURY 9 WNW, NC US", "MURFREESBORO 5 N, TN US", "BIRMINGHAM AIRPORT, AL US", "JACKSON INTERNATIONAL AIRPORT, MS US", "LITTLE ROCK AIRPORT ADAMS FIELD, AR US", "ALEXANDRIA, LA US","ABILENE REGIONAL AIRPORT, TX US","LEXINGTON BLUEGRASS AIRPORT, KY US", "ROLLA MISSOURI S AND T, MO US", "ALBUQUERQUE INTERNATIONAL AIRPORT, NM US", "PHOENIX AIRPORT, AZ US","OKLAHOMA CITY WILL ROGERS WORLD AIRPORT, OK US", "HUTCHINSON 10 SW, KS US")
## REVIEW: Please review whether the year assignment of environmental covariate data make sense. Are we matching the covariate data from the correct year with the correct survival estimates?
winter.new <- dat %>%
filter(NAME %in% stations & MONTH %in% w.months) %>%
mutate_at("YEAR", as.numeric) %>%
# Data from January is still considered winter of the previous year
mutate(YEAR2 = case_when(MONTH %in% c("01") ~ YEAR-1,
TRUE ~ YEAR)) %>%
filter(!(YEAR2 %in% c(1990, 2021)))
avg <- winter.new %>%
group_by(YEAR2) %>%
summarise(avg_prcp = mean(PRCP), avg_dt32 = mean(DT32), avg_dx32 = mean(DX32))
# Format PDSI data
pdsi$Year <- str_sub(pdsi$ID, -4, -1)
pdsi$Year <- as.numeric(pdsi$Year)
years <- c(1991:2021)
pdsi.new <- pdsi %>%
select(State, Year, January, September:December) %>%
filter(Year %in% years & State %in% states) %>%
group_by(State) %>%
## NOTE: Same as other envi covs above, January of following year is part of the winter of previous year
mutate(January = lead(January, order_by = State))
mean.pdsi <- pdsi.new %>%
group_by(Year) %>%
summarise(across(where(is.numeric), mean))
mean.pdsi <- cbind(mean.pdsi, Mean.pdsi = rowMeans(mean.pdsi[,2:6]))
mean.pdsi <- mean.pdsi[complete.cases(mean.pdsi),]
#mean.pdsi <- mean.pdsi[1:nrow(mean.pdsi)-1,]
# Format NAO data
# subset from 1991-2021
nao <- nao %>%
filter(Year %in% years)
# Winter months are Sept - Jan
nao.jan <- nao %>%
select(January)
nao.sepdec <- nao %>%
select(Year,September:December)
## NOTE: lead(nao.jan) shifts January data to the previous year
nao.new <- cbind(nao.sepdec, lead(nao.jan))
mean.nao <- cbind(nao.new, Mean.nao = rowMeans(nao.new[,2:6]))
## 2021 is missing data from January, so remove that year (which is not needed for the model)
mean.nao <- mean.nao[complete.cases(mean.nao),]
#mean.nao <- mean.nao[1:nrow(mean.nao)-1,]
# Midlatitude
midlat <- rbind(chicago, nplatte, desmoines, cleveland)
midlat$YEAR <- str_sub(midlat$DATE, 1, 4)
midlat$MONTH <- str_sub(midlat$DATE, 6, 7)
midlat$STATE <- str_sub(midlat$NAME, -5, -4)
midlat$YEAR <- as.numeric(midlat$YEAR)
midlat <- midlat %>%
select(STATION, NAME, DT32, DX32, PRCP, SNOW, YEAR, MONTH, STATE) %>%
filter(MONTH %in% c("01", "09", "10", "11", "12"))
midlat.new <- midlat %>%
filter(MONTH %in% w.months) %>%
mutate_at("YEAR", as.numeric) %>%
mutate(YEAR2 = case_when(MONTH %in% c("01") ~ YEAR-1,
TRUE ~ YEAR)) %>%
filter(!(YEAR2 %in% c(1989, 1990, 2021, 2022))) # Only need data from 1991 - 2020
## REVIEW: there is one month from one of the weather stations missing snow data. So we exclude that month when we take the average.
midlat %>% filter(if_any(everything(), is.na)) # Only one row contains NA: Chicago Midway Airport snow record September 2012
midlat.avg <- midlat.new %>%
group_by(YEAR2) %>%
summarise(avg_prcp_ml = mean(PRCP), avg_dt32_ml = mean(DT32), avg_dx32_ml = mean(DX32), avg_snow_ml = mean(SNOW, na.rm = TRUE))
# Putting everything in one table
new.dat <- as.data.frame(cbind(avg, Mean.nao = mean.nao$Mean.nao, Mean.pdsi = mean.pdsi$Mean.pdsi, midlat.avg[2:5]))
saveRDS(new.dat, "data/envi_cov.rda")
# May pond counts (estimates in thousands 1990-2019)
ponds <- c(3508.5, 3200, 3608.9, 3611.7, 5984.8, 6335.4, 7482.2, 7458.2, 4586.9, 6704.3, 3946.9, 4640.4, 2720.0, 5190.1, 3919.6, 5381.2, 6093.9, 7002.7, 4431.4, 6434.0, 6665.0, 8132.2, 5544.0, 6891.7, 7181.2, 6307.7, 5012.5, 6096.0, 5227.4, 4990.3)
# Standardize ponds
ponds.std <- (ponds-mean(ponds))/sd(ponds)
# correlation between envi covs (without ponds)
ggpairs(new.dat[,2:ncol(new.dat)])
# NOTE: correlation with ponds (off set by one year because want to see whether winter conditions from previous year are correlated with pond counts of following year)
new.dat2 <- cbind(new.dat, ponds = c(ponds.std[2:length(ponds.std)], NA))
ggpairs(new.dat2)