-
Notifications
You must be signed in to change notification settings - Fork 4
/
08_final-covar-data.Rmd
394 lines (313 loc) · 12 KB
/
08_final-covar-data.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
---
editor_options:
chunk_output_type: console
---
# Adding Covariates to Checklist Data
In this section, we prepare a final list of covariates, after taking into account spatial sampling bias, temporal bias and observer expertise scores (examined in previous sections).
## Prepare libraries and data
```{r load_libs_data, , message=FALSE, warning=FALSE}
# load libs for data
library(dplyr)
library(readr)
library(stringr)
library(purrr)
library(glue)
library(tidyr)
# check for velox and install
library(devtools)
if (!"velox" %in% installed.packages()) {
install_github("hunzikp/velox")
}
# load spatial
library(raster)
library(rgeos)
library(velox)
library(sf)
# load saved data object
load("results/02_data_prelim_processing.rdata")
```
## Spatial subsampling
Sampling bias can be introduced into citizen science due to the often ad-hoc nature of data collection (Sullivan et al. 2014). For eBird, this translates into checklists reported when convenient, rather than at regular or random points in time and space, leading to non-independence in the data if observations are spatio-temporally clustered (Johnston et al. 2021). Spatio-temporal autocorrelation in the data can be reduced by sub-sampling at an appropriate spatial resolution, and by avoiding temporal clustering. We estimated two simple measures of spatial clustering: the distance from each site to the nearest road (road data from OpenStreetMap) and the nearest-neighbor distance for each site. Sites were strongly tied to roads (mean distance to road ± SD = 390.77 ± 859.15 m; range = 0.28 m – 7.64 km) and were on average only 297 m away from another site (SD = 553 m; range = 0.14 m – 12.85 km) (Figure 3). This analysis was done in the previous section.
Here, to further reduce spatial autocorrelation, we divided the study area into a grid of 1km wide square cells and picked checklists from one site at random within each grid cell.
```{r separate_pres_abs}
# grid based spatial thinning
gridsize <- 500 # grid size in metres
effort_distance_max <- 1000 # removing checklists with this distance
# make grids across the study site
hills <- st_read("data/spatial/hillsShapefile/Nil_Ana_Pal.shp") %>%
st_transform(32643)
grid <- st_make_grid(hills, cellsize = gridsize)
# filtering on !pres_abs keeps absences
# this absence data will be thinned
data_thin_absences <- filter(dataGrouped, !pres_abs)
data_presences <- filter(dataGrouped, pres_abs)
# split data by species
data_thin_absences <- split(
x = data_thin_absences,
f = data_thin_absences$scientific_name
)
```
### Counting presence observation proportion
```{r}
data_presence_prop <- count(data_presences, scientific_name, name = "presences") %>%
mutate(
absences = map_int(data_thin_absences, nrow),
presence_prop = presences / (presences + absences)
)
# mean and sd of presence prop
data_presence_prop %>%
summarise(mean(presence_prop), sd(presence_prop))
```
```{r sp_thin_absences}
# spatial thinning on each species retains
# site with maximum visits per grid cell
data_thin_absences <- map(data_thin_absences, function(df) {
# count visits per locality
df <- group_by(df, locality) %>%
mutate(tot_effort = length(sampling_event_identifier)) %>%
ungroup()
# remove sites with distances above spatial independence
df <- df %>%
dplyr::filter(effort_distance_km <= effort_distance_max) %>%
st_as_sf(coords = c("longitude", "latitude")) %>%
`st_crs<-`(4326)
# transform to regional UTM 43N and add id
df <- df %>%
st_transform(32643) %>%
mutate(coordId = 1:nrow(.)) %>%
bind_cols(as_tibble(st_coordinates(.)))
# whcih cell has which coords
grid_overlap <- st_contains(grid, df) %>%
unclass() %>%
purrr::discard(.p = is_empty)
# count length of grid overlap list
# this is the number of cells with points in them
sampled_cells <- length(grid_overlap)
# make tibble
grid_overlap <- tibble(
uid_cell = seq(length(grid_overlap)), # the uid_cell is specific to this sp.
coordId = grid_overlap
)
# unnest
grid_overlap <- unnest(grid_overlap, cols = "coordId")
# join grid cell overlap with coordinate data
df <- left_join(df,
grid_overlap,
by = "coordId"
) %>%
st_drop_geometry()
# for each uid_cell, select coord where effort is max
points_max <- df %>%
group_by(uid_cell) %>%
dplyr::filter(tot_effort == max(tot_effort)) %>%
# there may be multiple rows with max effort, select first
dplyr::filter(row_number() == 1)
# check for number of samples
assertthat::assert_that(
assertthat::are_equal(sampled_cells, nrow(points_max),
msg = "spatial thinning error: more samples than\\
sampled cells"
)
)
# check that there is one sample per cell
assertthat::assert_that(
assertthat::are_equal(
max(count(points_max, uid_cell)$n), 1
)
)
# return data without UID cell and coordinate Id
dplyr::select(ungroup(points_max), -uid_cell, -coordId, -tot_effort)
})
# remove old data
rm(dataGrouped)
```
### Count absences after spatial thinning
```{r}
data_presence_prop <- data_presence_prop %>%
mutate(
absences_sp_thin = map_int(data_thin_absences, nrow)
)
```
## Temporal subsampling
Additionally, from each selected site, we randomly selected a maximum of 10 **absence** checklists, which reduced temporal clustering. We kept **all** presence checklists.
```{r subsample_data}
# subsample data for random 10 observations
dataSubsample <- map(data_thin_absences, function(df) {
df <- ungroup(df)
df_to_locality <- split(x = df, f = df$locality)
df_samples <- map_if(
.x = df_to_locality,
.p = function(x) {
nrow(x) > 10
},
.f = function(x) sample_n(x, 10, replace = FALSE)
)
bind_rows(df_samples)
})
```
### Count absences after temporal thinning
```{r}
data_presence_prop <- data_presence_prop %>%
mutate(
absences_tmp_thin = map_int(dataSubsample, nrow),
presence_prop_post_thin = presences / (presences + absences_tmp_thin)
)
# save data
write_csv(data_presence_prop, "results/08_data-class-balance.csv")
```
```{r join_pres_abs}
# bind all spatially and temporally thinned absences rows for data frame
dataSubsample <- bind_rows(dataSubsample)
# convert presence data to UTM 43 N and long-lat to X-Y
data_presences <- bind_cols(
data_presences,
as_tibble(
st_as_sf(
data_presences,
coords = c("longitude", "latitude"),
crs = 4326
) %>%
st_transform(32643) %>%
st_coordinates()
)
)
# drop long lat
data_presences <- dplyr::select(data_presences, -longitude, -latitude)
# join ALL PRESENCES and THINNED ABSENCES
dataSubsample <- bind_rows(dataSubsample, data_presences)
# check joined data
assertthat::assert_that(
max(apply(dataSubsample, 2, function(x) sum(is.na(x)))) == 0,
msg = "some columns missing from one of the datasets"
)
# remove previous data
rm(data_thin_absences)
```
## Add checklist calibration index
Load the CCI computed in the previous section. The CCI was the lone observer’s expertise score for single-observer checklists, and the highest expertise score among observers for group checklists.
```{r add_expertise}
# read in obs score and extract numbers
expertiseScore <- read_csv("results/04_data-obsExpertise-score.csv") %>%
mutate(numObserver = str_extract(observer, "\\d+")) %>%
dplyr::select(-observer)
# group seis consist of multiple observers
# in this case, seis need to have the highest expertise observer score
# as the associated covariate
# get unique observers per sei
dataSeiScore <- distinct(
dataSubsample, sampling_event_identifier,
observer_id
) %>%
# make list column of observers
mutate(observers = str_split(observer_id, ",")) %>%
unnest(cols = c(observers)) %>%
# add numeric observer id
mutate(numObserver = str_extract(observers, "\\d+")) %>%
# now get distinct sei and observer id numeric
distinct(sampling_event_identifier, numObserver)
# now add expertise score to sei
dataSeiScore <- left_join(dataSeiScore, expertiseScore,
by = "numObserver"
) %>%
# get max expertise score per sei
group_by(sampling_event_identifier) %>%
summarise(expertise = max(score))
# add to dataCovar
dataSubsample <- left_join(dataSubsample, dataSeiScore,
by = "sampling_event_identifier"
)
# remove data without expertise score
dataSubsample <- filter(dataSubsample, !is.na(expertise))
```
## Add climatic and landscape covariates
Reload climate and land cover predictors prepared previously.
```{r add_landcovars}
# list landscape covariate stacks
landscape_files <- "data/spatial/landscape_resamp01_km.tif"
# read in as stacks
landscape_data <- stack(landscape_files)
# get proper names
elev_names <- c("elev", "slope", "aspect")
chelsa_names <- c("bio_1", "bio_12")
names(landscape_data) <- as.character(glue('{c(elev_names, chelsa_names, "landcover")}'))
```
## Spatial buffers around selected checklists
Every checklist on eBird is associated with a latitude and longitude. However, the coordinates entered by an observer may not accurately depict the location at which a species was detected. This can occur for two reasons: first, traveling checklists are often associated with a single location along the route travelled by observers; and second, checklist locations could be assigned to a ‘hotspot’ – a location that is marked by eBird as being frequented by multiple observers. In many cases, an observation might be assigned to a hotspot even though the observation was not made at the precise location of the hotspot (Praveen J 2017). Johnston et al. (2021) showed that a large proportion of observations occurred within a 3km grid, even for those checklists up to 5km in length. Hence to adjust for spatial precision, we considered a minimum radius of 2.5km around each unique locality when sampling environmental covariate values.
```{r point_buffer}
# assign neighbourhood radius in m
sample_radius <- 2.5 * 1e3
# get distinct points and make buffer
ebird_buff <- dataSubsample %>%
ungroup() %>%
distinct(X, Y) %>%
# remove NAs
drop_na()
# convert to spatial features
ebird_buff <- st_as_sf(ebird_buff, coords = c("X", "Y"), crs = 32643) %>%
# add long lat
bind_cols(as_tibble(st_coordinates(.))) %>%
# make buffer around points
st_buffer(dist = sample_radius)
```
## Spatial buffer-wide covariates
### Mean climatic covariates
All climatic covariates are sampled by considering the mean values within a 2.5km radius as discussed above and prefixed "am_".
```{r mean_landscape}
# get area mean for all preds except landcover, which is the last one
stk <- raster::dropLayer(landscape_data, "landcover") # removing landcover here
velstk <- velox(stk)
# velox raster value extraction
dextr <- velstk$extract(
sp = ebird_buff, df = TRUE,
fun = function(x) mean(x, na.rm = T)
)
# assign names for joining
names(dextr) <- c("id", names(stk))
env_area_mean <- as_tibble(dextr)
# add id to buffer data
ebird_buff <- mutate(ebird_buff,
id = seq(nrow(ebird_buff))
)
# join to buffer data
ebird_buff <- inner_join(ebird_buff, env_area_mean)
```
### Proportions of land cover type
All land cover covariates were sampled by considering the proportion of each land cover type within a 2.5km radius.
```{r pland}
# get the last element of each stack from the list
# this is the landcover at that resolution
lc <- landscape_data[["landcover"]] # accessing landcover here
lc_velox <- velox(lc)
lc_vals <- lc_velox$extract(sp = ebird_buff, df = TRUE)
names(lc_vals) <- c("id", "lc")
# get landcover proportions
lc_prop <- count(lc_vals, id, lc) %>%
group_by(id) %>%
mutate(
lc = glue('lc_{str_pad(lc, 2, pad = "0")}'),
prop = n / sum(n)
) %>%
dplyr::select(-n) %>%
tidyr::pivot_wider(
names_from = lc,
values_from = prop,
values_fill = list(prop = 0)
) %>%
ungroup()
# join to data
ebird_buff <- mutate(ebird_buff, lc_prop)
```
### Link environmental covariates to checklists
```{r land_to_obs}
# drop geometry
ebird_buff <- st_drop_geometry(ebird_buff)
# link to dataSubsample
dataSubsample <- inner_join(dataSubsample, ebird_buff,
by = c("X", "Y")
)
```
Save data to file.
```{r spit_scale}
# write to file
write_csv(dataSubsample, path = glue("results/08_data-covars-2.5km.csv"))
```