diff --git a/R/mol_flagged_by_source.R b/R/mol_flagged_by_source.R new file mode 100644 index 0000000..42ac8e0 --- /dev/null +++ b/R/mol_flagged_by_source.R @@ -0,0 +1,64 @@ +#' Flag records that have prexisting quality control attributes +#' +#' This function flags records with quality control attributes provided by the source (eg GBIF 'issues'). The flag is applied my matching a vector of user-specified strings to one or more columns. +#' +#' @family prefilter +#' @param data data.frame. Containing quality control attributes. +#' @param flagStrings character. A vector of terms denoting sub-quality observations. +#' @param flagCols character string. One or more column names containing quality +#' control attributes (e.g. 'issue' for GBIF data). +#' +#' @details The function uses regex matching of multiple terms to identify +#' observations tagged with one or more of the quality control attributes. If +#' length of flagStrings is greater than one, terms are collapsed using the "|" +#' operator. +#' +#' @return A data.frame containing the column ".flagged_by_source" +#' .Compliant (TRUE) for observations that match one or more flagStrings; +#' otherwise "FALSE". +#' +#' @importFrom dplyr if_any all_of +#' @importFrom stringr str_c str_detect +#' +#' @author Matthew S. Rogan +#' +#' @export +#' +#' @examples +#' x <- data.frame( +#' issue = c("", "", "", "TAXON_MATCH_FUZZY"), +#' occurrenceStatus = c("", "ABSENT", "", "") +#' ) +#' +#' mol_flagged_by_source( +#' data = x, +#' flagStrings = c("ABSENT", "TAXON_MATCH_FUZZY"), +#' flagCols = c("issue", "occurrenceStatus") +#' ) +bdc_flagged_by_source <- function( + data, + flagStrings, + flagCols +){ + + check_col(data, flagCols) + + terms <- stringr::str_c(flagStrings, collapse = "|") + + data <- data %>% + dplyr::mutate(.flagged_by_source = dplyr::if_any(tidyselect::all_of(flagCols), + ~stringr::str_detect(.x, terms))) + + message( + paste( + "\mol_flagged_by_source:\nFlagged", + sum(data$.flagged_by_source == FALSE), + "observations with flagged attributes.", + "\nOne column was added to the database.\n" + ) + ) + + return(data) +} + + diff --git a/R/mol_roi.R b/R/mol_roi.R new file mode 100644 index 0000000..e207a06 --- /dev/null +++ b/R/mol_roi.R @@ -0,0 +1,270 @@ +#' Identify records located outside a given region of interest. +#' +#' This function flags records that are outside a particular spatial region of +#' interest (roi). ROIs can be specified in a variety of raster and vector +#' formats. +#' +#' @family space +#' @param data data.frame. Containing geographical coordinates. +#' @param roi spatial region of interest. Can be provided as an sf, sfc, +#' SpatialPolygons, SpatialPolygonsDataFrame, rasterLayer, spatRaster, +#' or as a file path with extension ".shp", ".gpkg", or ".tif". +#' @param lat character string. The column name with latitude in decimal degrees +#' and WGS84. Default = "decimalLatitude". +#' @param lon character string. The column with longitude in decimal degrees and +#' WGS84. Default = "decimalLongitude". +#' @param byExtOnly logical. If true, occurrences are only flagged if located +#' outside the roi extent. If false, occurrences are checked for whether they are +#' located within the roi polygons or raster mask. +#' @param maskValue numeric or character. Raster value corresponding to "outside" +#' the region of interest. Ignored if roi is provided as a vector layer. +#' +#' @details This test identifies records outside a particular region of interest +#' defined by a raster or vector layer +#' +#' @return A tibble containing the column ".outside_roi". Compliant +#' (TRUE) if coordinate is outside user defined roi; otherwise "FALSE". +#' +#' @author Matthew S. Rogan +#' +#' @importFrom sf st_bbox st_as_sf st_as_sfc st_within +#' @importFrom stringr str_detect +#' +#' @export mol_roi +#' +#' @examples +#' x <- data.frame( +#' decimalLatitude = c(1, 2, 3, 5), +#' decimalLongitude = c(1, 2, 3, 4) +#' ) +#' +#' roi <- sf::st_as_sfc(sf::st_bbox(c(xmin = 0, xmax = 4, ymax = 4, ymin = 0), crs = 4326)) +#' +#' bdc_roi( +#' data = x, +#' roi = roi, +#' lat = "decimalLatitude", +#' lon = "decimalLongitude" +#' ) +#' + +bdc_roi <- + function(data, + roi, + lat = "decimalLatitude", + lon = "decimalLongitude", + byExtOnly = FALSE, + maskValue = NULL){ + + ### Run checks + if (!is.data.frame(data)) { + stop(deparse(substitute(data)), " is not a data.frame", call. = FALSE) + } + + check_col(data, c(lat, lon)) + + if(!any(c("character", + "sf", + "sfc", + "SpatialPolygons", + "RasterLayer", + "SpatRaster") %in% class(roi))){ + stop("The region of interest is not in a valid format.") + } + + ### converts coordinates columns to numeric + data <- + data %>% + tibble::as_tibble() %>% + tibble::rowid_to_column("id_temp") %>% + dplyr::mutate(decimalLatitude = as.numeric(.data[[lat]]), + decimalLongitude = as.numeric(.data[[lon]]), + .outside_roi = TRUE) + + ### Screen NAs + dataCoords <- data %>% + dplyr::select(dplyr::all_of(c("id_temp", lon, lat))) %>% + dplyr::filter(!is.na(.data[[lat]]), + !is.na(.data[[lon]])) + + if(nrow(dataCoords) == 0) stop("No valid coordinates.") + + ### Read file path + if(is.character(roi)){ + + # check file exists + if(!file.exists(roi)) stop("The ROI filepath does not exist.") + + # check proper format + if(!any(stringr::str_detect(roi, + c("\\.shp$", + "\\.gpkg$", + "\\.tif$", + "\\.tiff$")))){ + stop("ROI input files must have '.shp', '.gpkg' or '.tif' file extensions.") + } + + # load roi + if(stringr::str_detect(roi, "\\.tif")){ + suppressWarnings({ + check_require_cran("terra") + }) + roi <- terra::rast(roi) + } else{ + roi <- sf::read_sf(roi) + } + + } + + ### Convert to sf/terra + if("SpatialPolygons" %in% .class2(roi)) roi <- sf::st_as_sfc(roi) + if("RasterLayer" %in% class(roi)){ + suppressWarnings({ + check_require_cran("terra") + }) + roi <- terra::rast(roi) + } + + ### Run appropriate check + if(any(c("sf", "sfc") %in% class(roi))){ + + if(!any(is.na(maskValue), is.null(maskValue))){ + warning("maskValue is ignored when ROI is provided as a vector layer.") + } + + unflagged <- roi_sf(dataCoords, + roi, + lat, + lon, + byExtOnly = byExtOnly) + } else{ + unflagged <- roi_rast(dataCoords, + roi, + lat, + lon, + byExtOnly = byExtOnly, + maskValue = maskValue) + } + + ### Update data frame + data$.outside_roi[data$id_temp %in% unflagged] <- FALSE + out <- data %>% + dplyr::select(-id_temp) + + if(sum(out$.outside_roi) == 0){ + message("No coordinates were located outside the region of interest.") + } else{ + message(paste(sum(out$.outside_roi), + "occurrences were flagged as outside the region of interest.", + "One column was added to the database.")) + } + + return(out) + + } + +roi_sf <- + function(dataCoords, + roi, + lat, + lon, + byExtOnly = FALSE){ + + ### Check CRS + if(sf::st_crs(roi) != sf::st_crs(4326)){ + message("Reprojecting ROI to WGS84.") + roi <- roi %>% sf::st_transform(4326) + } + + ### Filter by extent + ext <- sf::st_bbox(roi) + + crpd <- dataCoords %>% + dplyr::filter(dplyr::between(.data[[lat]], ext["ymin"], ext["ymax"]), + dplyr::between(.data[[lon]], ext["xmin"], ext["xmax"])) + + + if(any(byExtOnly, nrow(crpd) == 0)){ + unflgd <- crpd$id_temp + + } else{ + + ### Consolidate roi + suppressMessages({ + s2_status <- sf::sf_use_s2() + sf::sf_use_s2(FALSE) + + roi <- roi %>% + sf::st_union() %>% + sf::st_combine() + + if(s2_status) sf::sf_use_s2(TRUE) + + unflgd <- crpd %>% + sf::st_as_sf(coords = c(lon, lat), + crs = 4326) %>% + dplyr::mutate(within = sf::st_within(., + roi, + sparse = F)[,1]) %>% + sf::st_drop_geometry() %>% + dplyr::filter(within) %>% + dplyr::pull(id_temp) + }) + } + + return(unflgd) + } + +roi_rast <- + function(dataCoords, + roi, + lat, + lon, + byExtOnly = FALSE, + maskValue = NA){ + + ### Reproject to raster CRS + dataCoords[, c("tempX", "tempY")] <- terra::geom(project(vect(dataCoords, + geom = c(lon, lat), + crs = "EPSG:4326"), + crs(roi)), + wkt = FALSE, + hex = FALSE, + df = TRUE)[,c("x", "y")] + + ### Filter by extent + Ext <- ext(roi) + + crpd <- dataCoords %>% + dplyr::filter(dplyr::between(tempX, Ext[1], Ext[2]), + dplyr::between(tempY, Ext[3], Ext[4])) + + + if(any(byExtOnly, nrow(crpd) == 0)){ + unflgd <- crpd$id_temp + + } else{ + if(is.null(maskValue)){ + warning("No maskValue specied. Assuming maskValue is NA") + maskValue <- NA + } + smpld <- crpd %>% + dplyr::mutate(value = terra::extract(roi, + crpd[,c("tempX", "tempY")], + method = "simple")[,2]) %>% + dplyr::filter(!is.na(value)) + + if(!is.na(maskValue)){ + smpld <- smpld %>% + dplyr::filter(value != maskValue) + } + + unflgd <- smpld$id_temp + } + + return(unflgd) + } + + + + diff --git a/R/mol_spatiotemporal_duplicate.R b/R/mol_spatiotemporal_duplicate.R new file mode 100644 index 0000000..329103a --- /dev/null +++ b/R/mol_spatiotemporal_duplicate.R @@ -0,0 +1,126 @@ +#' Identify spatiotemporally duplicated records +#' +#' This function flags records that are spatialtemporal duplicates +#' +#' @param data data.frame. Containing information about the location, taxonomy, +#' date, and source of the observation. +#' @param lat character string. The column name with latitude in decimal degree +#' and in WGS84. Default = "decimalLatitude". +#' @param lon character string. The column with longitude in decimal degree and +#' in WGS84. Default = "decimalLongitude". +#' @param date character string. The column name with temporal information of the occurrence record. +#' @param recordCols character string. Names of columns that distinguish records collected simultaneously (e.g., specimenID). +#' @param priorityCol character. Name of column that indicates how to prioritize which duplicate record is flagged. +#' @param priorityOrder a vector of values in the priority column in descending order of priority. +#' @param ndec integer. Number of decimal places in the decimalDegree coordinates at which to determine spatial duplicates. +#' +#' @details Records with NA in the lat, lon, or date columns are excluded. If 'date' argument is NULL, function flags spatial duplicates and the flag column name is changed accordingly. +#' +#' priorityCol is used to arrange the data prior to identifying duplicates. The vector in priorityOrder is used to define levels in a factor prior to ordering. If priorityOrder is NULL and priorityCol isn't, priorityCol is simply sorted in ascending order. Values in the priority column that are not specified in the priorityOrder will be arranged as lowest priority, with a warning. +#' +#' @return A data.frame containing the column ".spatiotemporal_duplicate", or ".spatial_duplicate" when date = NULL. +#' .Compliant (TRUE) if if observation is a spatiotemporal duplicate; otherwise "FALSE". +#' +#' @importFrom lubridate as_date +#' +#' @export +#' +#' @examples + +bdc_spatiotemporal_duplicate <- function( + data, + lat = "decimalLatitude", + lon = "decimalLongitude", + date = "eventDate", + recordCols = NULL, + priorityCol = NULL, + priorityOrder = NULL, + digits = 3){ + + ### Run checks + if (!is.data.frame(data)) { + stop(deparse(substitute(data)), " is not a data.frame", call. = FALSE) + } + + dup_cols <- c(lon, lat, date, recordCols) + + check_col(data, c(dup_cols, priorityCols)) + + if(!is.null(priorityOrder) & is.null(priorityCols)){ + stop("No column specified for sorting by priority.\nEither specify a valid column name as priorityCol or set priorityOrder to NULL.") + } + + # check mismatches between priorityCol and priorityOrder + if(!is.null(priorityCol){ + if(is.null(priorityOrder)){ + warning("The order of priority for values in the priority column has not been specified.\nValues will be prioritized in ascending order.") + } + + if(!is.null(priorityOrder)){ + if(!any(priorityOrder %in% unique(data[[priorityCol]]))){ + stop("The values specified as the priority order do not appear in the priority column. Recheck arguments.") + } + + if(!all(priorityOrder %in% unique(data[[priorityCol]]))){ + warning("Not all values in the priority order occur in the priority column.\nThese values will be ignored.") + } + + if(!all(unique(data[[priorityCol]]) %in% priorityOrder)){ + warning("Not all values in the priority column are included in the priority order.\nThese values will be treated as lowest priority.") + } + } + } + + if(!is.numeric(ndec)) stop("the number of decimals (ndec) argument must be provided as an integer.") + + if(as.integer(ndec) != ndec) stop("the number of decimals ('ndec') argument must be provided as an integer.") + + ### update data + flagCol <- dplyr::if_else(is.null(date), + ".spatial_duplicate", + ".spatiotemporal_duplicate") + data <- data %>% + dplyr::mutate(decimalLatitude = as.numeric(.data[[lat]]), + decimalLongitude = as.numeric(.data[[lon]]), + !! flagCol := FALSE) %>% + tibble::rowid_to_column("id_temp") + + + ### subset and rearrange data + data_temp <- data %>% + dplyr::select(dplyr::all_of(c(lat, lon, date, recordCols, priorityCol))) %>% + dplyr::filter(!dplyr::if_any(c(lon, lat, date), ~is.na(.))) %>% + dplyr::mutate(dplyr::across(dplyr::all_of(c(lon, lat)), ~round(., digits = ndec))) + + if(!is.null(priorityCol)){ + if(!is.null(priorityOrder)){ + data_temp[[priorityCol]] <- factor(data_temp[[priorityCol]], + levels = priorityOrder) + } + + data_temp <- data_temp %>% + dplyr::arrange(across(all_of(priorityCol))) + } + + ### get flagged ids + flagged <- data_temp %>% + filter(duplicated(dplyr::select(., all_of(dup_cols)))) %>% + pull(id_temp) + + ### update flag col + data[[flagCol]][data$id_temp %in% flagged] <- TRUE + + message( + paste( + "\nbdc_spatiotemporal_duplicate:\nFlagged", + sum(data[[flagCol]]), + "observations that were", + paste0(str_remove(flagCol, "^\\."), "s."), + "\nOne column was added to the database.\n" + ) + ) + + return(data) + +} +