From e089719073f008528b3f1af00cfb181625ae2794 Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Fri, 23 Aug 2024 17:08:59 -0700 Subject: [PATCH 01/35] Add sf and tribal query options to TADA_DataRetrieval --- R/DataDiscoveryRetrieval.R | 771 ++++++++++++++++++++++++++++--------- 1 file changed, 584 insertions(+), 187 deletions(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index e0478e95..a9fc3ea3 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -5,7 +5,9 @@ #' exceptions of endDate and startDate match the web service call format from the #' online WQP GUI. endDate and startDate match the format suggested in USGS's #' dataRetrieval package (endDate = "YYYY-MM-DD"), which is a more familiar date -#' format for R users than the WQP GUI's endDateHi = "MM-DD-YYYY". +#' format for R users than the WQP GUI's endDateHi = "MM-DD-YYYY". aoi_sf, +#' tribal_area_type, and tribe_name_parcel do not have corresponding inputs in +#' the web service. #' #' Multiple fields are queried together using AND logic, but multiple values within #' one field are queried together using OR logic. For example, within @@ -17,7 +19,11 @@ #' of the query fields. #' characteristicName and Characteristic Group also work as an AND, therefore the #' characteristicName must fall within the Characteristic Group when both are entered. -#' +#' +#' aoi_sf cannot be used with tribal_area_type. If countrycode, countycode, huc, +#' siteid, or statecode are used with aoi_sf or tribal_area_type they will be ignored +#' under the assumption that the sf object or tribal location are the intended +#' area of interest. 
#' #' Users can reference the \href{https://www.epa.gov/waterdata/storage-and-retrieval-and-water-quality-exchange-domain-services-and-downloads}{WQX domain tables} #' to find allowable values for queries, e.g., reference the WQX domain table to find countycode and statecode: https://cdx.epa.gov/wqx/download/DomainValues/County_CSV.zip @@ -33,12 +39,15 @@ #' #' @param startDate Start Date string in the format YYYY-MM-DD, for example, "2020-01-01" #' @param endDate End Date string in the format YYYY-MM-DD, for example, "2020-01-01" +#' @param aoi_sf An sf object to use for a query area of interest #' @param countrycode Code that identifies a country or ocean (e.g. countrycode = "CA" for Canada, countrycode = "OA" for Atlantic Ocean). See https://www.waterqualitydata.us/Codes/countrycode for options. #' @param statecode FIPS state alpha code that identifies a state (e.g. statecode = "DE" for Delaware). See https://www.waterqualitydata.us/Codes/statecode for options. #' @param countycode FIPS county name. Note that a state code must also be supplied (e.g. statecode = "AL", countycode = "Chilton"). See https://www.waterqualitydata.us/Codes/countycode for options. #' @param huc A numeric code denoting a hydrologic unit. Example: "04030202". Different size hucs can be entered. See https://epa.maps.arcgis.com/home/item.html?id=796992f4588c401fabec7446ecc7a5a3 for a map with HUCS. Click on a HUC to find the associated code. #' @param siteid Unique monitoring location identifier. #' @param siteType Type of waterbody. See https://www.waterqualitydata.us/Codes/sitetype for options. +#' @param tribal_area_type One of the six tribal spatial layers: "Alaska Native Allotments", "Alaska Native Villages", "American Indian Reservations", "Off-reservation Trust Lands", "Oklahoma Tribal Statistical Areas", or "Virginia Federally Recognized Tribes". +#' @param tribe_name_parcel The name of a tribe corresponding to an entry in the TRIBE_NAME field of the specified tribal_area_type. 
OR if the type is Alaska Native Allotments" then the corresponding PARCEL_NO. #' @param characteristicName Name of parameter. See https://www.waterqualitydata.us/Codes/characteristicName for options. #' @param characteristicType Groups of environmental measurements/parameters. See https://www.waterqualitydata.us/Codes/characteristicType for options. #' @param sampleMedia Sampling substrate such as water, air, or sediment. See https://www.waterqualitydata.us/Codes/sampleMedia for options. @@ -49,6 +58,12 @@ #' #' @return TADA-compatible dataframe #' +#' @note +#' Alaska Native Villages and Virginia Federally Recognized Tribes are point +#' geometries in the Map Service, not polygons. At the time of this writing they +#' do not return any data when used for WQP bbox queries and so are set to return +#' errors when used with this function. +#' #' @export #' #' @examples @@ -160,11 +175,14 @@ #' TADA_DataRetrieval <- function(startDate = "null", endDate = "null", + aoi_sf = NULL, countrycode = "null", countycode = "null", huc = "null", siteid = "null", siteType = "null", + tribal_area_type = "null", + tribe_name_parcel = "null", characteristicName = "null", characteristicType = "null", sampleMedia = "null", @@ -173,158 +191,537 @@ TADA_DataRetrieval <- function(startDate = "null", project = "null", providers = "null", applyautoclean = TRUE) { + + + + # Check for incomplete or inconsistent inputs: + + # If both an sf object and tribe information are provided it's unclear what + # the priority should be for the query + if( !is.null(aoi_sf) & + ( (tribal_area_type != "null") | (tribe_name_parcel != "null") ) ){ + stop( + paste0( + "Both sf data and tribal information have been provided. ", + "Please use only one of these query options." + ) + ) + } + + # Check for other arguments that indicate location. 
Function will ignore + # these inputs but warn the user + if( + # sf object provided + (!is.null(aoi_sf) & inherits(aoi_sf, "sf")) & + # with additional location info + any( (countrycode != "null"), (countycode != "null"), (huc != "null"), + (siteid != "null"), (statecode != "null") ) + ){ + warning( + paste0( + "Location information has been provided in addition to an sf object. ", + "Only the sf object will be used in the query." + ) + ) + } else if( + # Tribe info provided + (tribal_area_type != "null") & + # with additional location info + any( (countrycode != "null"), (countycode != "null"), (huc != "null"), + (siteid != "null"), (statecode != "null") ) + ){ + warning( + paste0( + "Location information has been provided in addition to tribal information. ", + "Only the tribal information will be used in the query." + ) + ) + } + + # Insufficient tribal info provided + if( (tribal_area_type == "null") & (tribe_name_parcel != "null") ){ + stop("A tribal_area_type is required if tribe_name_parcel is provided.") + } + + # Set query parameters WQPquery <- list() - - if (!"null" %in% statecode) { - load(system.file("extdata", "statecodes_df.Rdata", package = "EPATADA")) - statecode <- as.character(statecode) - statecodes_sub <- statecodes_df %>% dplyr::filter(STUSAB %in% statecode) - statecd <- paste0("US:", statecodes_sub$STATE) - if (nrow(statecodes_sub) == 0) { - stop("State code is not valid. Check FIPS state/territory abbreviations.") + + + # If an sf object OR tribal info are provided they will be the basis of the query + # (The tribal data handling uses sf objects as well) + if( (!is.null(aoi_sf) & inherits(aoi_sf, "sf")) | (tribal_area_type != "null") ){ + + sf::sf_use_s2(FALSE) + + # Build the non-sf part of the query: + + # StartDate + if (length(startDate) > 1) { + if (is.na(suppressWarnings(lubridate::parse_date_time(startDate[1], orders = "ymd")))) { + stop("Incorrect date format. 
Please use the format YYYY-MM-DD.") + } + WQPquery <- c(WQPquery, startDate = list(startDate)) + } else if (startDate != "null") { + if (is.na(suppressWarnings(lubridate::parse_date_time(startDate, orders = "ymd")))) { + stop("Incorrect date format. Please use the format YYYY-MM-DD.") + } + WQPquery <- c(WQPquery, startDate = startDate) } - if (length(statecode) >= 1) { - WQPquery <- c(WQPquery, statecode = list(statecd)) + # SiteType + if (length(siteType) > 1) { + WQPquery <- c(WQPquery, siteType = list(siteType)) + } else if (siteType != "null") { + WQPquery <- c(WQPquery, siteType = siteType) } - } - - if (length(huc) > 1) { - WQPquery <- c(WQPquery, huc = list(huc)) - } else if (huc != "null") { - WQPquery <- c(WQPquery, huc = huc) - } - - if (length(startDate) > 1) { - if (is.na(suppressWarnings(lubridate::parse_date_time(startDate[1], orders = "ymd")))) { - stop("Incorrect date format. Please use the format YYYY-MM-DD.") + # CharacteristicName + if (length(characteristicName) > 1) { + WQPquery <- c(WQPquery, characteristicName = list(characteristicName)) + } else if (characteristicName != "null") { + WQPquery <- c(WQPquery, characteristicName = characteristicName) } - WQPquery <- c(WQPquery, startDate = list(startDate)) - } else if (startDate != "null") { - if (is.na(suppressWarnings(lubridate::parse_date_time(startDate, orders = "ymd")))) { - stop("Incorrect date format. 
Please use the format YYYY-MM-DD.") + # CharacteristicType + if (length(characteristicType) > 1) { + WQPquery <- c(WQPquery, characteristicType = list(characteristicType)) + } else if (characteristicType != "null") { + WQPquery <- c(WQPquery, characteristicType = characteristicType) } - WQPquery <- c(WQPquery, startDate = startDate) - } - - if (length(countrycode) > 1) { - WQPquery <- c(WQPquery, countrycode = list(countrycode)) - } else if (countrycode != "null") { - WQPquery <- c(WQPquery, countrycode = countrycode) - } - - if (length(countycode) > 1) { - WQPquery <- c(WQPquery, countycode = list(countycode)) - } else if (countycode != "null") { - WQPquery <- c(WQPquery, countycode = countycode) - } - - if (length(siteid) > 1) { - WQPquery <- c(WQPquery, siteid = list(siteid)) - } else if (siteid != "null") { - WQPquery <- c(WQPquery, siteid = siteid) - } - - if (length(siteType) > 1) { - WQPquery <- c(WQPquery, siteType = list(siteType)) - } else if (siteType != "null") { - WQPquery <- c(WQPquery, siteType = siteType) - } - - if (length(characteristicName) > 1) { - WQPquery <- c(WQPquery, characteristicName = list(characteristicName)) - } else if (characteristicName != "null") { - WQPquery <- c(WQPquery, characteristicName = characteristicName) - } - - if (length(characteristicType) > 1) { - WQPquery <- c(WQPquery, characteristicType = list(characteristicType)) - } else if (characteristicType != "null") { - WQPquery <- c(WQPquery, characteristicType = characteristicType) - } - - if (length(sampleMedia) > 1) { - WQPquery <- c(WQPquery, sampleMedia = list(sampleMedia)) - } else if (sampleMedia != "null") { - WQPquery <- c(WQPquery, sampleMedia = sampleMedia) - } - - if (length(project) > 1) { - WQPquery <- c(WQPquery, project = list(project)) - } else if (project != "null") { - WQPquery <- c(WQPquery, project = project) - } - - if (length(providers) > 1) { - WQPquery <- c(WQPquery, providers = list(providers)) - } else if (providers != "null") { - WQPquery <- 
c(WQPquery, providers = providers) - } - - if (length(organization) > 1) { - WQPquery <- c(WQPquery, organization = list(organization)) - } else if (organization != "null") { - WQPquery <- c(WQPquery, organization = organization) - } - - if (length(endDate) > 1) { - if (is.na(suppressWarnings(lubridate::parse_date_time(endDate[1], orders = "ymd")))) { - stop("Incorrect date format. Please use the format YYYY-MM-DD.") + # SampleMedia + if (length(sampleMedia) > 1) { + WQPquery <- c(WQPquery, sampleMedia = list(sampleMedia)) + } else if (sampleMedia != "null") { + WQPquery <- c(WQPquery, sampleMedia = sampleMedia) } - WQPquery <- c(WQPquery, endDate = list(endDate)) - } else if (endDate != "null") { - if (is.na(suppressWarnings(lubridate::parse_date_time(endDate, orders = "ymd")))) { - stop("Incorrect date format. Please use the format YYYY-MM-DD.") + # Project + if (length(project) > 1) { + WQPquery <- c(WQPquery, project = list(project)) + } else if (project != "null") { + WQPquery <- c(WQPquery, project = project) } - WQPquery <- c(WQPquery, endDate = endDate) - } - - # Retrieve all 3 profiles - print("Downloading WQP query results. This may take some time depending upon the query size.") - print(WQPquery) - results.DR <- dataRetrieval::readWQPdata(WQPquery, - dataProfile = "resultPhysChem", - ignore_attributes = TRUE - ) - # check if any results are available - if ((nrow(results.DR) > 0) == FALSE) { - print("Returning empty results dataframe: Your WQP query returned no results (no data available). Try a different query. 
Removing some of your query filters OR broadening your search area may help.") - TADAprofile.clean <- results.DR - } else { - sites.DR <- dataRetrieval::whatWQPsites(WQPquery) - - projects.DR <- dataRetrieval::readWQPdata(WQPquery, - ignore_attributes = TRUE, - service = "Project" + # Provider + if (length(providers) > 1) { + WQPquery <- c(WQPquery, providers = list(providers)) + } else if (providers != "null") { + WQPquery <- c(WQPquery, providers = providers) + } + # Organization + if (length(organization) > 1) { + WQPquery <- c(WQPquery, organization = list(organization)) + } else if (organization != "null") { + WQPquery <- c(WQPquery, organization = organization) + } + # EndDate + if (length(endDate) > 1) { + if (is.na(suppressWarnings(lubridate::parse_date_time(endDate[1], orders = "ymd")))) { + stop("Incorrect date format. Please use the format YYYY-MM-DD.") + } + WQPquery <- c(WQPquery, endDate = list(endDate)) + } else if (endDate != "null") { + if (is.na(suppressWarnings(lubridate::parse_date_time(endDate, orders = "ymd")))) { + stop("Incorrect date format. 
Please use the format YYYY-MM-DD.") + } + WQPquery <- c(WQPquery, endDate = endDate) + } + + # sf AOI prep for query + + # If tribe info is provided then grab the corresponding sf object: + if(tribal_area_type != "null"){ + + # Make a reference table for tribal area type + url matching + # (options that don't return results are commented out for now) + map_service_urls <- tibble::tribble( + ~tribal_area, ~url, + "Alaska Native Allotments", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/0", + # "Alaska Native Villages", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/1", + "American Indian Reservations", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/2", + "Off-reservation Trust Lands", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/3", + "Oklahoma Tribal Statistical Areas", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/4"# , + # "Virginia Federally Recognized Tribes", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/5" + ) + + # Keep to a single type: + if(length(tribal_area_type) > 1){ + stop("tribal_area_type must be of length 1.") + } + + # These two layers will not return any data when used for bboxes + if(tribal_area_type == "Alaska Native Villages"){ + stop("Alaska Native Villages data are centroid points, not spatial boundaries.") + } else if(tribal_area_type == "Virginia Federally Recognized Tribes") { + stop("Federally recognized tribal entities in Virginia do not have any available spatial boundaries.") + } + + # These area types allow filtering by TRIBE_NAME (unique within each type) + if(tribal_area_type %in% c( + # "Alaska Native Villages", + "American Indian Reservations", + "Off-reservation Trust Lands", + "Oklahoma Tribal Statistical Areas"#, + # "Virginia Federally Recognized Tribes" + ) + ){ + + # Get the relevant url + aoi_sf <- filter(map_service_urls, + tribal_area == tribal_area_type)$url %>% + # Pull data + 
arcgislayers::arc_open() %>% + # Return sf + arcgislayers::arc_select() %>% + # If a value provided, then filter + {if ((tribe_name_parcel != "null") & (tribe_name_parcel != "null")) { + filter(., TRIBE_NAME %in% tribe_name_parcel) + } else { + . + }} + + # Otherwise filter by PARCEL_NO (Note that values in this col are not unique) + } else if(tribal_area_type == "Alaska Native Allotments"){ + + aoi_sf <- filter(map_service_urls, + tribal_area == tribal_area_type)$url %>% + arcgislayers::arc_open() %>% + arcgislayers::arc_select() %>% + {if ((tribe_name_parcel != "null") & (tribe_name_parcel != "null")) { + filter(., PARCEL_NO %in% tribe_name_parcel) + } else { + . + }} + + } else { + stop("Tribal area type not recognized. Refer to TADA_TribalOptions() for query options.") + } + + } + + # Match CRS + if(sf::st_crs(aoi_sf) != 4326){ + aoi_sf <- sf::st_transform(aoi_sf, crs = 4326) + } + + # Get bbox of the sf object + input_bbox <- sf::st_bbox(aoi_sf) + + # Query site info within the bbox + bbox_sites <- dataRetrieval::whatWQPsites( + WQPquery, + bBox = c(input_bbox$xmin, input_bbox$ymin, input_bbox$xmax, input_bbox$ymax) ) - - TADAprofile <- TADA_JoinWQPProfiles( - FullPhysChem = results.DR, - Sites = sites.DR, - Projects = projects.DR + + # Check if any sites are within the aoi + if ( (nrow(bbox_sites) > 0 ) == FALSE) { + stop("No monitoring sites were returned within your area of interest (no data available).") + } + + # Reformat returned info as sf + bbox_sites_sf <- TADA_MakeSpatial(bbox_sites, crs = 4326) + + # Subset sites to only within shapefile and get IDs + clipped_sites_sf <- bbox_sites_sf[aoi_sf, ] + + clipped_site_ids <- clipped_sites_sf$MonitoringLocationIdentifier + + # Check number of sites returned. More than 300 will require a map() approach + if( length(clipped_site_ids) > 300 ) { + warning( + paste0( + "More than 300 sites are matched by the AOI and query terms. 
", + "If your AOI is a county, state, country, or HUC boundary it would be more efficient to provide a code instead of an sf object." + ) + ) + + # Split IDs into a list + id_cluster_list <- split(x = clipped_site_ids, + f = ceiling(seq_along(clipped_site_ids) / 300)) + + print("Downloading WQP query results. This may take some time depending upon the query size.") + + # List of query results + results.DR <- purrr::map( + .x = id_cluster_list, + .f = ~suppressMessages( + dataRetrieval::readWQPdata( + siteid = .x, + WQPquery, + dataProfile = "resultPhysChem", + ignore_attributes = TRUE + ) + ) %>% + # To allow row binds + dplyr::mutate(across(everything(), as.character)) + ) %>% + list_rbind() + + # Check if any results were returned + if ( (nrow(results.DR) > 0 ) == FALSE) { + print( + paste0( + "Returning empty results dataframe: ", + "Your WQP query returned no results (no data available). ", + "Try a different query. ", + "Removing some of your query filters OR broadening your search area may help." + ) + ) + TADAprofile.clean <- results.DR + } else { + + # Get site metadata + sites.DR <- clipped_sites_sf %>% + as_tibble() %>% + select(-geometry) + + # Get project metadata + projects.DR <- dataRetrieval::readWQPdata( + siteid = clipped_site_ids, + WQPquery, + ignore_attributes = TRUE, + service = "Project" + ) + + # Join results, sites, projects + TADAprofile <- TADA_JoinWQPProfiles( + FullPhysChem = results.DR, + Sites = sites.DR, + Projects = projects.DR + ) + + # need to specify this or throws error when trying to bind rows. + # Temporary fix for larger issue where data structure for all columns + # should be specified. + TADAprofile <- TADAprofile %>% dplyr::mutate( + across(everything(), as.character) + ) + + # run TADA_AutoClean function + if (applyautoclean == TRUE) { + print("Data successfully downloaded. 
Running TADA_AutoClean function.") + + TADAprofile.clean <- TADA_AutoClean(TADAprofile) + } else { + TADAprofile.clean <- TADAprofile + } + } + + return(TADAprofile.clean) + + # Less than 300 sites: + } else { + + # Retrieve all 3 profiles + print("Downloading WQP query results. This may take some time depending upon the query size.") + print(WQPquery) + + # Get results + results.DR <- dataRetrieval::readWQPdata( + siteid = clipped_site_ids, + WQPquery, + dataProfile = "resultPhysChem", + ignore_attributes = TRUE + ) + + # check if any results were returned + if ((nrow(results.DR) > 0) == FALSE) { + paste0( + "Returning empty results dataframe: ", + "Your WQP query returned no results (no data available). ", + "Try a different query. ", + "Removing some of your query filters OR broadening your search area may help." + ) + TADAprofile.clean <- results.DR + } else { + + # Get site metadata + sites.DR <- dataRetrieval::whatWQPsites(WQPquery) + + # Get project metadata + projects.DR <- dataRetrieval::readWQPdata(WQPquery, + ignore_attributes = TRUE, + service = "Project") + + # Join results, sites, projects + TADAprofile <- TADA_JoinWQPProfiles( + FullPhysChem = results.DR, + Sites = sites.DR, + Projects = projects.DR + ) + + # need to specify this or throws error when trying to bind rows. + # Temporary fix for larger issue where data structure for all columns + # should be specified. + TADAprofile <- TADAprofile %>% dplyr::mutate( + across(everything(), as.character) + ) + + # run TADA_AutoClean function + if (applyautoclean == TRUE) { + print("Data successfully downloaded. 
Running TADA_AutoClean function.") + + TADAprofile.clean <- TADA_AutoClean(TADAprofile) + } else { + TADAprofile.clean <- TADAprofile + } + } + + return(TADAprofile.clean) + + } + + # If no sf object provided: + } else { + # Set query parameters + WQPquery <- list() + + if (!"null" %in% statecode) { + load(system.file("extdata", "statecodes_df.Rdata", package = "EPATADA")) + statecode <- as.character(statecode) + statecodes_sub <- statecodes_df %>% dplyr::filter(STUSAB %in% statecode) + statecd <- paste0("US:", statecodes_sub$STATE) + if (nrow(statecodes_sub) == 0) { + stop("State code is not valid. Check FIPS state/territory abbreviations.") + } + if (length(statecode) >= 1) { + WQPquery <- c(WQPquery, statecode = list(statecd)) + } + } + + if (length(huc) > 1) { + WQPquery <- c(WQPquery, huc = list(huc)) + } else if (huc != "null") { + WQPquery <- c(WQPquery, huc = huc) + } + + if (length(startDate) > 1) { + if (is.na(suppressWarnings(lubridate::parse_date_time(startDate[1], orders = "ymd")))) { + stop("Incorrect date format. Please use the format YYYY-MM-DD.") + } + WQPquery <- c(WQPquery, startDate = list(startDate)) + } else if (startDate != "null") { + if (is.na(suppressWarnings(lubridate::parse_date_time(startDate, orders = "ymd")))) { + stop("Incorrect date format. 
Please use the format YYYY-MM-DD.") + } + WQPquery <- c(WQPquery, startDate = startDate) + } + + if (length(countrycode) > 1) { + WQPquery <- c(WQPquery, countrycode = list(countrycode)) + } else if (countrycode != "null") { + WQPquery <- c(WQPquery, countrycode = countrycode) + } + + if (length(countycode) > 1) { + WQPquery <- c(WQPquery, countycode = list(countycode)) + } else if (countycode != "null") { + WQPquery <- c(WQPquery, countycode = countycode) + } + + if (length(siteid) > 1) { + WQPquery <- c(WQPquery, siteid = list(siteid)) + } else if (siteid != "null") { + WQPquery <- c(WQPquery, siteid = siteid) + } + + if (length(siteType) > 1) { + WQPquery <- c(WQPquery, siteType = list(siteType)) + } else if (siteType != "null") { + WQPquery <- c(WQPquery, siteType = siteType) + } + + if (length(characteristicName) > 1) { + WQPquery <- c(WQPquery, characteristicName = list(characteristicName)) + } else if (characteristicName != "null") { + WQPquery <- c(WQPquery, characteristicName = characteristicName) + } + + if (length(characteristicType) > 1) { + WQPquery <- c(WQPquery, characteristicType = list(characteristicType)) + } else if (characteristicType != "null") { + WQPquery <- c(WQPquery, characteristicType = characteristicType) + } + + if (length(sampleMedia) > 1) { + WQPquery <- c(WQPquery, sampleMedia = list(sampleMedia)) + } else if (sampleMedia != "null") { + WQPquery <- c(WQPquery, sampleMedia = sampleMedia) + } + + if (length(project) > 1) { + WQPquery <- c(WQPquery, project = list(project)) + } else if (project != "null") { + WQPquery <- c(WQPquery, project = project) + } + + if (length(providers) > 1) { + WQPquery <- c(WQPquery, providers = list(providers)) + } else if (providers != "null") { + WQPquery <- c(WQPquery, providers = providers) + } + + if (length(organization) > 1) { + WQPquery <- c(WQPquery, organization = list(organization)) + } else if (organization != "null") { + WQPquery <- c(WQPquery, organization = organization) + } + + if 
(length(endDate) > 1) { + if (is.na(suppressWarnings(lubridate::parse_date_time(endDate[1], orders = "ymd")))) { + stop("Incorrect date format. Please use the format YYYY-MM-DD.") + } + WQPquery <- c(WQPquery, endDate = list(endDate)) + } else if (endDate != "null") { + if (is.na(suppressWarnings(lubridate::parse_date_time(endDate, orders = "ymd")))) { + stop("Incorrect date format. Please use the format YYYY-MM-DD.") + } + WQPquery <- c(WQPquery, endDate = endDate) + } + + # Retrieve all 3 profiles + print("Downloading WQP query results. This may take some time depending upon the query size.") + print(WQPquery) + results.DR <- dataRetrieval::readWQPdata(WQPquery, + dataProfile = "resultPhysChem", + ignore_attributes = TRUE ) - - # need to specify this or throws error when trying to bind rows. Temporary fix for larger - # issue where data structure for all columns should be specified. - cols <- names(TADAprofile) - - TADAprofile <- TADAprofile %>% dplyr::mutate_at(cols, as.character) - - # run TADA_AutoClean function - if (applyautoclean == TRUE) { - print("Data successfully downloaded. Running TADA_AutoClean function.") - - TADAprofile.clean <- TADA_AutoClean(TADAprofile) + # check if any results are available + if ((nrow(results.DR) > 0) == FALSE) { + print("Returning empty results dataframe: Your WQP query returned no results (no data available). Try a different query. Removing some of your query filters OR broadening your search area may help.") + TADAprofile.clean <- results.DR } else { - TADAprofile.clean <- TADAprofile + sites.DR <- dataRetrieval::whatWQPsites(WQPquery) + + projects.DR <- dataRetrieval::readWQPdata(WQPquery, + ignore_attributes = TRUE, + service = "Project" + ) + + TADAprofile <- TADA_JoinWQPProfiles( + FullPhysChem = results.DR, + Sites = sites.DR, + Projects = projects.DR + ) + + # need to specify this or throws error when trying to bind rows. Temporary fix for larger + # issue where data structure for all columns should be specified. 
+ cols <- names(TADAprofile) + + TADAprofile <- TADAprofile %>% dplyr::mutate_at(cols, as.character) + + # run TADA_AutoClean function + if (applyautoclean == TRUE) { + print("Data successfully downloaded. Running TADA_AutoClean function.") + + TADAprofile.clean <- TADA_AutoClean(TADAprofile) + } else { + TADAprofile.clean <- TADAprofile + } } + + return(TADAprofile.clean) } - - return(TADAprofile.clean) + } - #' Read in WQP data using the Water Quality Portal (WQP) web services #' #' Go to the WQP website (https://www.waterqualitydata.us/) and fill out the @@ -379,25 +776,25 @@ TADA_ReadWQPWebServices <- function(webservice) { # read in csv from WQP web service if (grepl("zip=yes", webservice)) { webservice <- stringr::str_replace(webservice, "zip=yes", "zip=no") - + # download data webservice <- data.table::fread(toString(webservice)) - + # if input df was not downloaded using USGS's dataRetrieval, then the # column names will include / separators instead of . and TADA uses . # (e.g. ResultMeasure/MeasureUnitCode vs. ResultMeasure.MeasureUnitCode) colnames(webservice) <- gsub("/", ".", colnames(webservice)) - + return(webservice) } else { # download data webservice <- data.table::fread(toString(webservice)) - + # if input df was not downloaded using USGS's dataRetrieval, then the # column names will include / separators instead of . and TADA uses . # (e.g. ResultMeasure/MeasureUnitCode vs. 
ResultMeasure.MeasureUnitCode) colnames(webservice) <- gsub("/", ".", colnames(webservice)) - + return(webservice) } } @@ -487,11 +884,11 @@ TADA_BigDataRetrieval <- function(startDate = "null", maxrecs = 250000, applyautoclean = FALSE) { start_T <- Sys.time() - + if (!"null" %in% statecode & !"null" %in% huc) { stop("Please provide either state code(s) OR huc(s) to proceed.") } - + if (!startDate == "null") { startDat <- lubridate::ymd(startDate) startYearLo <- lubridate::year(startDat) @@ -500,7 +897,7 @@ TADA_BigDataRetrieval <- function(startDate = "null", startDat <- lubridate::ymd(startDate) startYearLo <- lubridate::year(startDat) } - + # Logic: if the input endDate is not null, convert to date and obtain year # for summary if (!endDate == "null") { @@ -511,7 +908,7 @@ TADA_BigDataRetrieval <- function(startDate = "null", endDat <- lubridate::ymd(endDate) endYearHi <- lubridate::year(endDat) } - + # Create readWQPsummary query WQPquery <- list() if (length(characteristicName) > 1) { @@ -529,7 +926,7 @@ TADA_BigDataRetrieval <- function(startDate = "null", } else if (siteType != "null") { WQPquery <- c(WQPquery, siteType = siteType) } - + if (!"null" %in% statecode) { load(system.file("extdata", "statecodes_df.Rdata", package = "EPATADA")) statecode <- as.character(statecode) @@ -547,45 +944,45 @@ TADA_BigDataRetrieval <- function(startDate = "null", WQPquery <- c(WQPquery, statecode = statecd) } } - + if (length(huc) > 1) { WQPquery <- c(WQPquery, huc = list(huc)) } else if (huc != "null") { WQPquery <- c(WQPquery, huc = huc) } - + if (length(countrycode) > 1) { WQPquery <- c(WQPquery, countrycode = list(countrycode)) } else if (countrycode != "null") { WQPquery <- c(WQPquery, countrycode = countrycode) } - + if (length(countycode) > 1) { WQPquery <- c(WQPquery, countycode = list(countycode)) } else if (countycode != "null") { WQPquery <- c(WQPquery, countycode = countycode) } - + if (length(organization) > 1) { WQPquery <- c(WQPquery, organization = 
list(organization)) } else if (organization != "null") { WQPquery <- c(WQPquery, organization = organization) } - + # cut down on summary query time if possible based on big data query diffdat <- lubridate::time_length(difftime(Sys.Date(), startDat), "years") - + if (diffdat <= 1) { WQPquery <- c(WQPquery, summaryYears = 1) } - + if (diffdat > 1 & diffdat <= 5) { WQPquery <- c(WQPquery, summaryYears = 5) } - + print("Building site summary table for chunking result downloads...") df_summary <- dataRetrieval::readWQPsummary(WQPquery) - + ## NOTE: if query brings back no results, function returns empty # dataRetrieval profile, not empty summary if (nrow(df_summary) > 0) { @@ -595,7 +992,7 @@ TADA_BigDataRetrieval <- function(startDate = "null", YearSummarized >= startYearLo, YearSummarized <= endYearHi ) - + rm(df_summary) # if there are still site records when filtered to years of interest.... if (dim(sites)[1] > 0) { @@ -621,15 +1018,15 @@ TADA_BigDataRetrieval <- function(startDate = "null", i <- i + 1 groupings <- plyr::rbind.fill(groupings, group) } - + x$group <- i - + groupings <- plyr::rbind.fill(groupings, x) } return(groupings) } - - + + # get total number of results per site and separate out sites with >250000 results tot_sites <- sites %>% dplyr::group_by(MonitoringLocationIdentifier) %>% @@ -637,14 +1034,14 @@ TADA_BigDataRetrieval <- function(startDate = "null", dplyr::arrange(tot_n) smallsites <- tot_sites %>% dplyr::filter(tot_n <= maxrecs) bigsites <- tot_sites %>% dplyr::filter(tot_n > maxrecs) - + df <- data.frame() - + if (dim(smallsites)[1] > 0) { smallsitesgrp <- make_groups(smallsites, maxrecs) - + print(paste0("Downloading data from sites with fewer than ", maxrecs, " results by grouping them together.")) - + for (i in 1:max(smallsitesgrp$group)) { site_chunk <- subset(smallsitesgrp$MonitoringLocationIdentifier, smallsitesgrp$group == i) joins <- TADA_DataRetrieval( @@ -660,29 +1057,29 @@ TADA_BigDataRetrieval <- function(startDate = "null", df 
<- dplyr::bind_rows(df, joins) } } - + rm(smallsites, smallsitesgrp) } - + if (dim(bigsites)[1] > 0) { print(paste0("Downloading data from sites with greater than ", maxrecs, " results, chunking queries by shorter time intervals...")) - + bsitesvec <- unique(bigsites$MonitoringLocationIdentifier) - + for (i in 1:length(bsitesvec)) { mlidsum <- subset(sites, sites$MonitoringLocationIdentifier == bsitesvec[i]) mlidsum <- mlidsum %>% dplyr::group_by(MonitoringLocationIdentifier, YearSummarized) %>% dplyr::summarise(tot_n = sum(ResultCount)) site_chunk <- unique(mlidsum$MonitoringLocationIdentifier) - + bigsitegrps <- make_groups(mlidsum, maxrecs) - + for (i in 1:max(bigsitegrps$group)) { yearchunk <- subset(bigsitegrps$YearSummarized, bigsitegrps$group == i) startD <- paste0(min(yearchunk), "-01-01") endD <- paste0(max(yearchunk), "-12-31") - + joins <- TADA_DataRetrieval( startDate = startD, endDate = endD, @@ -692,7 +1089,7 @@ TADA_BigDataRetrieval <- function(startDate = "null", sampleMedia = sampleMedia, applyautoclean = FALSE ) - + if (dim(joins)[1] > 0) { df <- dplyr::bind_rows(df, joins) } @@ -708,18 +1105,18 @@ TADA_BigDataRetrieval <- function(startDate = "null", warning("Query returned no data. Function returns an empty dataframe.") return(df_summary) } - + df <- subset(df, as.Date(df$ActivityStartDate, "%Y-%m-%d") >= startDat & as.Date(df$ActivityStartDate, "%Y-%m-%d") <= endDat) - + if (applyautoclean == TRUE) { print("Applying TADA_AutoClean function...") df <- TADA_AutoClean(df) } - + # timing function for efficiency tests. 
difference <- difftime(Sys.time(), start_T, units = "mins") print(difference) - + return(df) } @@ -756,20 +1153,20 @@ TADA_JoinWQPProfiles <- function(FullPhysChem = "null", Sites = "null", Projects = "null") { FullPhysChem.df <- FullPhysChem - + Sites.df <- Sites - + Projects.df <- Projects - + # Join station data to full phys/chem (FullPhysChem.df) if (length(Sites.df > 1)) { if (nrow(Sites.df) > 0) { join1 <- FullPhysChem.df %>% # join stations to results dplyr::left_join(Sites.df, - by = "MonitoringLocationIdentifier", - multiple = "all", - relationship = "many-to-many" + by = "MonitoringLocationIdentifier", + multiple = "all", + relationship = "many-to-many" ) %>% # remove ".x" suffix from column names dplyr::rename_at(dplyr::vars(dplyr::ends_with(".x")), ~ stringr::str_replace(., "\\..$", "")) %>% @@ -781,8 +1178,8 @@ TADA_JoinWQPProfiles <- function(FullPhysChem = "null", } else { join1 <- FullPhysChem.df } - - + + # Add QAPP columns from project if (length(Projects.df) > 1) { if (nrow(Projects.df) > 0) { @@ -808,6 +1205,6 @@ TADA_JoinWQPProfiles <- function(FullPhysChem = "null", } else { join2 <- join1 } - + return(join2) } From b268ce8cb602cec8e561d94caf28ad771f4c2e8e Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Mon, 26 Aug 2024 10:38:10 -0700 Subject: [PATCH 02/35] Add TADA_TribalOptions to R/GeospatialFunctions.R --- R/GeospatialFunctions.R | 227 +++++++++++++++++++++++++++------------- 1 file changed, 156 insertions(+), 71 deletions(-) diff --git a/R/GeospatialFunctions.R b/R/GeospatialFunctions.R index f4a9e48f..d95875db 100644 --- a/R/GeospatialFunctions.R +++ b/R/GeospatialFunctions.R @@ -29,13 +29,13 @@ #' TADA_MakeSpatial <- function(.data, crs = 4326) { if (!"LongitudeMeasure" %in% colnames(.data) | - !"LatitudeMeasure" %in% colnames(.data) | - !"HorizontalCoordinateReferenceSystemDatumName" %in% colnames(.data)) { + !"LatitudeMeasure" %in% colnames(.data) | + 
!"HorizontalCoordinateReferenceSystemDatumName" %in% colnames(.data)) { stop("The dataframe does not contain WQP-style latitude and longitude data (column names `HorizontalCoordinateReferenceSystemDatumName`, `LatitudeMeasure`, and `LongitudeMeasure`.") } else if (!is.null(.data) & inherits(.data, "sf")) { stop("Your data is already a spatial object.") } - + suppressMessages(suppressWarnings({ # Make a reference table for CRS and EPSG codes # List should include all codes in WQX domain (see HorizontalCoordinateReferenceSystemDatum CSV at https://www.epa.gov/waterdata/storage-and-retrieval-and-water-quality-exchange-domain-services-and-downloads) @@ -59,7 +59,7 @@ TADA_MakeSpatial <- function(.data, crs = 4326) { "WGS72", 6322, "HARN", 4152 ) - + # join our CRS reference table to our original WQP dataframe: sf <- .data %>% tibble::rowid_to_column(var = "index") %>% @@ -89,7 +89,7 @@ TADA_MakeSpatial <- function(.data, crs = 4326) { dplyr::arrange(index) %>% dplyr::select(-c(index, epsg)) })) - + return(sf) } @@ -120,10 +120,10 @@ fetchATTAINS <- function(.data) { if (is.null(.data) | nrow(.data) == 0) { stop("There is no data in your `data` object to use as a bounding box for selecting ATTAINS features.") } - + # EPSG we want our ATTAINS data to be in (always 4326 for this function) our_epsg <- 4326 - + # If data is already spatial, just make sure it is in the right CRS # and add an index as the WQP observations' unique identifier... if (!is.null(.data) & inherits(.data, "sf")) { @@ -139,7 +139,7 @@ fetchATTAINS <- function(.data) { # convert dataframe to a spatial object TADA_MakeSpatial(.data = ., crs = our_epsg) } - + baseurls <- c( # ATTAINS catchments: "https://gispub.epa.gov/arcgis/rest/services/OW/ATTAINS_Assessment/MapServer/3/query?", # ATTAINS points: @@ -149,13 +149,13 @@ fetchATTAINS <- function(.data) { # ATTAINS polygons: "https://gispub.epa.gov/arcgis/rest/services/OW/ATTAINS_Assessment/MapServer/2/query?" 
) - + feature_downloader <- function(baseurls) { # starting at feature 1 (i.e., no offset): offset <- 0 # empty list to store all features in all_features <- list() - + # bounding box (with some minor wiggle) of user's WQP data suppressMessages(suppressWarnings({ bbox <- .data %>% @@ -165,12 +165,12 @@ fetchATTAINS <- function(.data) { # encode for use within the API URL urltools::url_encode(.) })) - + # The ATTAINS API has a limit of 2000 features that can be pulled in at once. # Therefore, we must split the call into manageable "chunks" using a moving # window of what features to pull in, then munging all the separate API calls # together. - + repeat { query <- urltools::param_set(baseurls, key = "geometry", value = bbox) %>% urltools::param_set(key = "inSR", value = our_epsg) %>% @@ -191,7 +191,7 @@ fetchATTAINS <- function(.data) { urltools::param_set(key = "returnDistinctValues", value = "false") %>% urltools::param_set(key = "returnExtentOnly", value = "false") %>% urltools::param_set(key = "featureEncoding", value = "esriDefault") - + # Fetch features within the offset window and append to list: features <- suppressMessages(suppressWarnings({ tryCatch( @@ -203,30 +203,30 @@ fetchATTAINS <- function(.data) { } ) })) - - + + # Exit loop if no more features or error occurred if (is.null(features) || nrow(features) == 0) { break } - + all_features <- c(all_features, list(features)) # once done, change offset by 2000 features: offset <- offset + 2000 - + if (offset == 4000) { print("Your TADA data covers a large spatial range. 
The ATTAINS pull may take a while.") } } - + all_features <- dplyr::bind_rows(all_features) } - + final_features <- baseurls %>% purrr::map(~ feature_downloader(.)) - + names(final_features) <- c("ATTAINS_catchments", "ATTAINS_points", "ATTAINS_lines", "ATTAINS_polygons") - + return(final_features) } @@ -276,16 +276,16 @@ TADA_GetATTAINS <- function(.data, return_sf = TRUE) { "ATTAINS.visionpriority303d", "ATTAINS.areasqkm", "ATTAINS.catchmentareasqkm", "ATTAINS.catchmentstatecode", "ATTAINS.catchmentresolution", "ATTAINS.Shape_Area" ) - + if (any(attains_names %in% colnames(.data))) { stop("Your data has already been joined with ATTAINS data.") } - + if (nrow(.data) == 0) { print("Your Water Quality Portal dataframe has no observations. Returning an empty dataframe with empty ATTAINS features.") - + # if no WQP observations, return a modified `data` with empty ATTAINS-related columns: - + col_val_list <- stats::setNames( object = rep( x = list(NA), @@ -293,19 +293,19 @@ TADA_GetATTAINS <- function(.data, return_sf = TRUE) { ), nm = attains_names ) - + # Add ATTAINS columns with NA values no_WQP_data <- .data %>% dplyr::mutate(index = NA) %>% dplyr::bind_cols(col_val_list) - + # In this case we'll need to return empty ATTAINS objects if (return_sf == TRUE) { ATTAINS_catchments <- NULL ATTAINS_lines <- NULL ATTAINS_points <- NULL ATTAINS_polygons <- NULL - + return(list( "TADA_with_ATTAINS" = no_WQP_data, "ATTAINS_catchments" = ATTAINS_catchments, @@ -318,14 +318,14 @@ TADA_GetATTAINS <- function(.data, return_sf = TRUE) { return(no_WQP_data) } } - + # If data doesn't already contain ATTAINS data and isn't an empty dataframe: suppressMessages(suppressWarnings({ sf::sf_use_s2(FALSE) - + # If data is already spatial, just make sure it is in the right CRS # and add unique WQP ID for identifying obs with more than one ATTAINS assessment unit - + if (!is.null(.data) & inherits(.data, "sf")) { if (sf::st_crs(.data)$epsg != 4326) { TADA_DataRetrieval_data <- .data %>% @@ 
-344,9 +344,9 @@ TADA_GetATTAINS <- function(.data, return_sf = TRUE) { tibble::rowid_to_column(var = "index") } })) - + attains_features <- try(fetchATTAINS(.data = TADA_DataRetrieval_data), silent = TRUE) - + suppressMessages(suppressWarnings({ # grab the ATTAINS catchments within our WQP bbox: nearby_catchments <- NULL @@ -364,11 +364,11 @@ TADA_GetATTAINS <- function(.data, return_sf = TRUE) { silent = TRUE ) })) - + # if no ATTAINS data, return original dataframe with empty ATTAINS columns: if (is.null(nearby_catchments)) { print("There are no ATTAINS features associated with these WQP observations. Returning original dataframe with empty ATTAINS columns and empty ATTAINS geospatial features.") - + col_val_list <- stats::setNames( object = rep( x = list(NA), @@ -376,18 +376,18 @@ TADA_GetATTAINS <- function(.data, return_sf = TRUE) { ), nm = attains_names ) - + # return a modified `.data` with empty ATTAINS-related columns: no_ATTAINS_data <- .data %>% dplyr::bind_cols(col_val_list) %>% tibble::rowid_to_column(var = "index") - + if (return_sf == TRUE) { ATTAINS_catchments <- NULL ATTAINS_lines <- NULL ATTAINS_points <- NULL ATTAINS_polygons <- NULL - + return(list( "TADA_with_ATTAINS" = no_ATTAINS_data, "ATTAINS_catchments" = ATTAINS_catchments, @@ -398,7 +398,7 @@ TADA_GetATTAINS <- function(.data, return_sf = TRUE) { } else { return(no_ATTAINS_data) } - + # If there IS ATTAINS data... 
} else { suppressMessages(suppressWarnings({ @@ -406,18 +406,18 @@ TADA_GetATTAINS <- function(.data, return_sf = TRUE) { TADA_with_ATTAINS <- TADA_DataRetrieval_data %>% # left join = TRUE to preserve all observations (with or without ATTAINS features): sf::st_join(., nearby_catchments, left = TRUE) - + if (return_sf == FALSE) { return(TADA_with_ATTAINS) } - + # CATCHMENT FEATURES # use original catchment pull, but return column names to original ATTAINS_catchments <- nearby_catchments colnames(ATTAINS_catchments) <- gsub("ATTAINS.", "", colnames(ATTAINS_catchments)) # due to the rename, must re-set geometry column: sf::st_geometry(ATTAINS_catchments) <- "geometry" - + # POINT FEATURES - try to pull point AU data if it exists. Otherwise, move on... ATTAINS_points <- NULL try( @@ -428,7 +428,7 @@ TADA_GetATTAINS <- function(.data, return_sf = TRUE) { dplyr::distinct(assessmentunitidentifier, .keep_all = TRUE), silent = TRUE ) - + # LINE FEATURES - try to pull line AU data if it exists. Otherwise, move on... ATTAINS_lines <- NULL try( @@ -439,7 +439,7 @@ TADA_GetATTAINS <- function(.data, return_sf = TRUE) { dplyr::distinct(assessmentunitidentifier, .keep_all = TRUE), silent = TRUE ) - + # POLYGON FEATURES - try to pull polygon AU data if it exists. Otherwise, move on... ATTAINS_polygons <- NULL try( @@ -451,7 +451,7 @@ TADA_GetATTAINS <- function(.data, return_sf = TRUE) { silent = TRUE ) })) - + return(list( "TADA_with_ATTAINS" = TADA_with_ATTAINS, "ATTAINS_catchments" = ATTAINS_catchments, @@ -504,17 +504,17 @@ TADA_ViewATTAINS <- function(.data) { ) %in% names(.data))) { stop("Your input dataframe was not produced from `TADA_GetATTAINS()` or it was modified. 
Please create your list of ATTAINS features using `TADA_GetATTAINS()` and confirm that return_sf has been set to TRUE.") } - + ATTAINS_table <- .data[["TADA_with_ATTAINS"]] ATTAINS_catchments <- .data[["ATTAINS_catchments"]] ATTAINS_points <- .data[["ATTAINS_points"]] ATTAINS_lines <- .data[["ATTAINS_lines"]] ATTAINS_polygons <- .data[["ATTAINS_polygons"]] - + if (nrow(ATTAINS_table) == 0) { stop("Your WQP dataframe has no observations.") } - + required_columns <- c( "LongitudeMeasure", "LatitudeMeasure", "HorizontalCoordinateReferenceSystemDatumName", @@ -522,27 +522,27 @@ TADA_ViewATTAINS <- function(.data) { "MonitoringLocationName", "ResultIdentifier", "ActivityStartDate", "OrganizationIdentifier" ) - + if (!any(required_columns %in% colnames(ATTAINS_table))) { stop("Your dataframe does not contain the necessary WQP-style column names.") } - + suppressMessages(suppressWarnings({ sf::sf_use_s2(FALSE) - + # if data was spatial, remove for downstream leaflet dev: try(ATTAINS_table <- ATTAINS_table %>% - sf::st_drop_geometry(), silent = TRUE) - + sf::st_drop_geometry(), silent = TRUE) + tada.pal <- TADA_ColorPalette() - + colors <- data.frame( overallstatus = c("Not Supporting", "Fully Supporting", "Not Assessed"), col = c(tada.pal[3], tada.pal[4], tada.pal[7]), dark_col = c(tada.pal[12], tada.pal[6], tada.pal[11]), priority = c(1, 2, 3) ) - + # POINT FEATURES - try to pull point AU data if it exists. Otherwise, move on... try( points_mapper <- ATTAINS_points %>% @@ -554,7 +554,7 @@ TADA_ViewATTAINS <- function(.data) { dplyr::right_join(., tibble::as_tibble(sf::st_coordinates(ATTAINS_points)), by = c("index" = "L1")), silent = TRUE ) - + # LINE FEATURES - try to pull line AU data if it exists. Otherwise, move on... try( lines_mapper <- ATTAINS_lines %>% @@ -562,7 +562,7 @@ TADA_ViewATTAINS <- function(.data) { dplyr::mutate(type = "Line Feature"), silent = TRUE ) - + # POLYGON FEATURES - try to pull polygon AU data if it exists. Otherwise, move on... 
try( polygons_mapper <- ATTAINS_polygons %>% @@ -570,7 +570,7 @@ TADA_ViewATTAINS <- function(.data) { dplyr::mutate(type = "Polygon Feature"), silent = TRUE ) - + # Develop WQP site stats (e.g. count of observations, parameters, per site) sumdat <- ATTAINS_table %>% dplyr::group_by(MonitoringLocationIdentifier, MonitoringLocationName, LatitudeMeasure, LongitudeMeasure) %>% @@ -586,15 +586,15 @@ TADA_ViewATTAINS <- function(.data) { LatitudeMeasure = as.numeric(LatitudeMeasure), LongitudeMeasure = as.numeric(LongitudeMeasure) ) - + # Basemap for AOI: map <- leaflet::leaflet() %>% leaflet::addProviderTiles("Esri.WorldTopoMap", - group = "World topo", - options = leaflet::providerTileOptions( - updateWhenZooming = FALSE, - updateWhenIdle = TRUE - ) + group = "World topo", + options = leaflet::providerTileOptions( + updateWhenZooming = FALSE, + updateWhenIdle = TRUE + ) ) %>% leaflet::clearShapes() %>% leaflet::fitBounds( @@ -614,7 +614,7 @@ TADA_ViewATTAINS <- function(.data) { opacity = 1, title = "Legend" ) - + # Add ATTAINS catchment outlines (if they exist): try( map <- map %>% @@ -626,7 +626,7 @@ TADA_ViewATTAINS <- function(.data) { ), silent = TRUE ) - + # Add ATTAINS polygon features (if they exist): try( map <- map %>% @@ -645,7 +645,7 @@ TADA_ViewATTAINS <- function(.data) { ), silent = TRUE ) - + # Add ATTAINS lines features (if they exist): try( map <- map %>% @@ -663,7 +663,7 @@ TADA_ViewATTAINS <- function(.data) { ), silent = TRUE ) - + # Add ATTAINS point features (if they exist): try( map <- map %>% @@ -682,7 +682,7 @@ TADA_ViewATTAINS <- function(.data) { ), silent = TRUE ) - + # Add WQP observation features (should always exist): try( map <- map %>% @@ -702,12 +702,97 @@ TADA_ViewATTAINS <- function(.data) { ), silent = TRUE ) - + if (is.null(ATTAINS_lines) & is.null(ATTAINS_points) & is.null(ATTAINS_polygons)) { print("No ATTAINS data associated with this Water Quality Portal data.") } - + # Return leaflet map of TADA WQ and its associated ATTAINS 
data return(map) })) } + + +#' Access options available for querying tribal spatial data with `TADA_DataRetrieval()`. +#' +#' @description +#' This function provides access to [six layer datasets](https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer) +#' containing spatial data related to tribal lands: "Alaska Native Allotments", +#' "Alaska Native Villages", "American Indian Reservations", "Off-reservation Trust Lands", +#' "Oklahoma Tribal Statistical Areas", and "Virginia Federally Recognized Tribes". +#' These datasets are used by `TADA_DataRetrieval()` when retrieving spatial data +#' for tribal lands specified by the user. +#' +#' The purpose of `TADA_TribalOptions()` is to allow the user to review the available +#' data in those datasets and identify the records they would like to query with +#' `TADA_DataRetrieval()`. +#' +#' An interactive map of the six layer datasets is available on ArcGIS Online Map +#' Viewer here: https://www.arcgis.com/apps/mapviewer/index.html?url=https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer&source=sd +#' +#' @param tribal_area_type A character string. Must be one of the six tribal +#' spatial layers: "Alaska Native Allotments", "Alaska Native Villages", +#' "American Indian Reservations", "Off-reservation Trust Lands", +#' "Oklahoma Tribal Statistical Areas", or "Virginia Federally Recognized Tribes". +#' +#' @param return_sf Logical. Should the function return the dataset as an `sf` +#' object (TRUE) or a data frame (FALSE)? Defaults to FALSE. +#' +#' @returns A data frame or `sf` object containing the specified layer from the EPA +#' Map Service. +#' +#' @note +#' Alaska Native Villages and Virginia Federally Recognized Tribes are point +#' geometries in the Map Service, not polygons. At the time of this writing they +#' do not return any data when used for WQP bbox queries. 
+#' +#' @seealso [TADA_DataRetrieval()] +#' + +TADA_TribalOptions <- function(tribal_area_type, return_sf = FALSE){ + + # Make a reference table for tribal area type + url matching + map_service_urls <- tibble::tribble( + ~tribal_area, ~url, + "Alaska Native Allotments", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/0", + "Alaska Native Villages", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/1", + "American Indian Reservations", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/2", + "Off-reservation Trust Lands", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/3", + "Oklahoma Tribal Statistical Areas", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/4", + "Virginia Federally Recognized Tribes", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/5" + ) + + # Confirm usable string provided + if( !(tribal_area_type %in% map_service_urls$tribal_area) ){ + stop("tribal_area_type must match one of the six tribal spatial layer names.") + } + + if( tribal_area_type %in% + c("Alaska Native Villages", "Virginia Federally Recognized Tribes") ){ + warning( + paste0( + "Alaska Native Villages and Virginia Federally Recognized Tribes are point geometries in the Map Service, not polygons. ", + "At the time of this writing they do not return any data when used for WQP bbox queries." 
+ ) + ) + } + + # Query Map Service + tribal_area_sf <- dplyr::filter(map_service_urls, + tribal_area == tribal_area_type)$url %>% + arcgislayers::arc_open() %>% + # Return sf + arcgislayers::arc_select() %>% + sf::st_make_valid() + + # Convert to df if needed, export + if(return_sf == FALSE){ + return( + as.data.frame(tribal_area_sf) %>% + sf::st_drop_geometry() + ) + } else { + return(tribal_area_sf) + } + +} From 49ddc839c3c821c0f3841b3b27210a2c85733735 Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Wed, 20 Nov 2024 12:39:28 -0800 Subject: [PATCH 03/35] tribal options edits --- NAMESPACE | 1 + R/DataDiscoveryRetrieval.R | 68 ++++++++++++++++++++------------------ R/GeospatialFunctions.R | 2 ++ man/TADA_DataRetrieval.Rd | 24 +++++++++++++- man/TADA_TribalOptions.Rd | 44 ++++++++++++++++++++++++ 5 files changed, 105 insertions(+), 34 deletions(-) create mode 100644 man/TADA_TribalOptions.Rd diff --git a/NAMESPACE b/NAMESPACE index 9bd8f6cb..0b286c45 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -74,6 +74,7 @@ export(TADA_SimpleCensoredMethods) export(TADA_Stats) export(TADA_SubstituteDeprecatedChars) export(TADA_SummarizeColumn) +export(TADA_TribalOptions) export(TADA_TwoCharacteristicScatterplot) export(TADA_UniqueCharUnitSpeciation) export(TADA_ViewATTAINS) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index 8bae1989..a5f2ff59 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -247,17 +247,13 @@ TADA_DataRetrieval <- function(startDate = "null", stop("A tribal_area_type is required if tribe_name_parcel is provided.") } - # Set query parameters WQPquery <- list() - # If an sf object OR tribal info are provided they will be the basis of the query # (The tribal data handling uses sf objects as well) if( (!is.null(aoi_sf) & inherits(aoi_sf, "sf")) | (tribal_area_type != "null") ){ - sf::sf_use_s2(FALSE) - # Build the non-sf part of the query: # StartDate @@ -399,6 +395,9 @@ 
TADA_DataRetrieval <- function(startDate = "null", } + # Check and/or fix geometry + aoi_sf <- sf::st_make_valid(aoi_sf) + # Match CRS if(sf::st_crs(aoi_sf) != 4326){ aoi_sf <- sf::st_transform(aoi_sf, crs = 4326) @@ -1000,35 +999,7 @@ TADA_BigDataRetrieval <- function(startDate = "null", rm(df_summary) # if there are still site records when filtered to years of interest.... if (dim(sites)[1] > 0) { - # function for chunking by records - make_groups <- function(x, maxrecs) { - if (sum(x$tot_n) <= maxrecs | dim(x)[1] == 1) { # WARNING: if there's only one row and it's more than maxrecs, it will try to run the query anyway - groupings <- x - groupings$group <- 1 - } else { - groupings <- data.frame() - group <- data.frame() - i <- 1 - while (nrow(x) > nrow(group)) { - x$csum <- cumsum(x$tot_n) - brk <- which(x$csum > maxrecs)[1] - group <- x[1:(brk - 1), ] - group$group <- i - if (brk > 1) { - x <- x[brk:length(x$tot_n), ] - } else { - x <- x[2:length(x$tot_n), ] - } - i <- i + 1 - groupings <- plyr::rbind.fill(groupings, group) - } - - x$group <- i - - groupings <- plyr::rbind.fill(groupings, x) - } - return(groupings) - } + # get total number of results per site and separate out sites with >250000 results @@ -1212,3 +1183,34 @@ TADA_JoinWQPProfiles <- function(FullPhysChem = "null", return(join2) } + + +# function for chunking by records +make_groups <- function(x, maxrecs) { + if (sum(x$tot_n) <= maxrecs | dim(x)[1] == 1) { # WARNING: if there's only one row and it's more than maxrecs, it will try to run the query anyway + groupings <- x + groupings$group <- 1 + } else { + groupings <- data.frame() + group <- data.frame() + i <- 1 + while (nrow(x) > nrow(group)) { + x$csum <- cumsum(x$tot_n) + brk <- which(x$csum > maxrecs)[1] + group <- x[1:(brk - 1), ] + group$group <- i + if (brk > 1) { + x <- x[brk:length(x$tot_n), ] + } else { + x <- x[2:length(x$tot_n), ] + } + i <- i + 1 + groupings <- plyr::rbind.fill(groupings, group) + } + + x$group <- i + + groupings <- 
plyr::rbind.fill(groupings, x) + } + return(groupings) +} diff --git a/R/GeospatialFunctions.R b/R/GeospatialFunctions.R index d95875db..8b6bb16b 100644 --- a/R/GeospatialFunctions.R +++ b/R/GeospatialFunctions.R @@ -748,6 +748,8 @@ TADA_ViewATTAINS <- function(.data) { #' #' @seealso [TADA_DataRetrieval()] #' +#' @export +#' TADA_TribalOptions <- function(tribal_area_type, return_sf = FALSE){ diff --git a/man/TADA_DataRetrieval.Rd b/man/TADA_DataRetrieval.Rd index dfd58fd8..e49f1e03 100644 --- a/man/TADA_DataRetrieval.Rd +++ b/man/TADA_DataRetrieval.Rd @@ -7,11 +7,14 @@ TADA_DataRetrieval( startDate = "null", endDate = "null", + aoi_sf = NULL, countrycode = "null", countycode = "null", huc = "null", siteid = "null", siteType = "null", + tribal_area_type = "null", + tribe_name_parcel = "null", characteristicName = "null", characteristicType = "null", sampleMedia = "null", @@ -27,6 +30,8 @@ TADA_DataRetrieval( \item{endDate}{End Date string in the format YYYY-MM-DD, for example, "2020-01-01"} +\item{aoi_sf}{An sf object to use for a query area of interest} + \item{countrycode}{Code that identifies a country or ocean (e.g. countrycode = "CA" for Canada, countrycode = "OA" for Atlantic Ocean). See https://www.waterqualitydata.us/Codes/countrycode for options.} \item{countycode}{FIPS county name. Note that a state code must also be supplied (e.g. statecode = "AL", countycode = "Chilton"). See https://www.waterqualitydata.us/Codes/countycode for options.} @@ -37,6 +42,10 @@ TADA_DataRetrieval( \item{siteType}{Type of waterbody. 
See https://www.waterqualitydata.us/Codes/sitetype for options.} +\item{tribal_area_type}{One of the six tribal spatial layers: "Alaska Native Allotments", "Alaska Native Villages", "American Indian Reservations", "Off-reservation Trust Lands", "Oklahoma Tribal Statistical Areas", or "Virginia Federally Recognized Tribes".} + +\item{tribe_name_parcel}{The name of a tribe corresponding to an entry in the TRIBE_NAME field of the specified tribal_area_type. OR if the type is Alaska Native Allotments" then the corresponding PARCEL_NO.} + \item{characteristicName}{Name of parameter. See https://www.waterqualitydata.us/Codes/characteristicName for options.} \item{characteristicType}{Groups of environmental measurements/parameters. See https://www.waterqualitydata.us/Codes/characteristicType for options.} @@ -62,7 +71,9 @@ dataframe. Note that the inputs (e.g. project, organization, siteType) with the exceptions of endDate and startDate match the web service call format from the online WQP GUI. endDate and startDate match the format suggested in USGS's dataRetrieval package (endDate = "YYYY-MM-DD"), which is a more familiar date -format for R users than the WQP GUI's endDateHi = "MM-DD-YYYY". +format for R users than the WQP GUI's endDateHi = "MM-DD-YYYY". aoi_sf, +tribal_area_type, and tribe_name_parcel do not have corresponding inputs in +the web service. } \details{ Multiple fields are queried together using AND logic, but multiple values within @@ -76,6 +87,11 @@ of the query fields. characteristicName and Characteristic Group also work as an AND, therefore the characteristicName must fall within the Characteristic Group when both are entered. +aoi_sf cannot be used with tribal_area_type. If countrycode, countycode, huc, +siteid, or statecode are used with aoi_sf or tribal_area_type they will be ignored +under the assumption that the sf object or tribal location are the intended +area of interest. 
+ Users can reference the \href{https://www.epa.gov/waterdata/storage-and-retrieval-and-water-quality-exchange-domain-services-and-downloads}{WQX domain tables} to find allowable values for queries, e.g., reference the WQX domain table to find countycode and statecode: https://cdx.epa.gov/wqx/download/DomainValues/County_CSV.zip Alternatively, you can use the WQP services to find areas where data is available in the US: https://www.waterqualitydata.us/Codes/countycode @@ -88,6 +104,12 @@ Note: TADA_DataRetrieval (by leveraging dataRetrieval), automatically converts the date times to UTC. It also automatically converts the data to dates, datetimes, numerics based on a standard algorithm. See: ?dataRetrieval::readWQPdata } +\note{ +Alaska Native Villages and Virginia Federally Recognized Tribes are point +geometries in the Map Service, not polygons. At the time of this writing they +do not return any data when used for WQP bbox queries and so are set to return +errors when used with this function. +} \examples{ \dontrun{ # example for WI diff --git a/man/TADA_TribalOptions.Rd b/man/TADA_TribalOptions.Rd new file mode 100644 index 00000000..05396d87 --- /dev/null +++ b/man/TADA_TribalOptions.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GeospatialFunctions.R +\name{TADA_TribalOptions} +\alias{TADA_TribalOptions} +\title{Access options available for querying tribal spatial data with \code{TADA_DataRetrieval()}.} +\usage{ +TADA_TribalOptions(tribal_area_type, return_sf = FALSE) +} +\arguments{ +\item{tribal_area_type}{A character string. Must be one of the six tribal +spatial layers: "Alaska Native Allotments", "Alaska Native Villages", +"American Indian Reservations", "Off-reservation Trust Lands", +"Oklahoma Tribal Statistical Areas", or "Virginia Federally Recognized Tribes".} + +\item{return_sf}{Logical. Should the function return the dataset as an \code{sf} +object (TRUE) or a data frame (FALSE)? 
Defaults to FALSE.} +} +\value{ +A data frame or \code{sf} object containing the specified layer from the EPA +Map Service. +} +\description{ +This function provides access to \href{https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer}{six layer datasets} +containing spatial data related to tribal lands: "Alaska Native Allotments", +"Alaska Native Villages", "American Indian Reservations", "Off-reservation Trust Lands", +"Oklahoma Tribal Statistical Areas", and "Virginia Federally Recognized Tribes". +These datasets are used by \code{TADA_DataRetrieval()} when retrieving spatial data +for tribal lands specified by the user. + +The purpose of \code{TADA_TribalOptions()} is to allow the user to review the available +data in those datasets and identify the records they would like to query with +\code{TADA_DataRetrieval()}. + +An interactive map of the six layer datasets is available on ArcGIS Online Map +Viewer here: https://www.arcgis.com/apps/mapviewer/index.html?url=https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer&source=sd +} +\note{ +Alaska Native Villages and Virginia Federally Recognized Tribes are point +geometries in the Map Service, not polygons. At the time of this writing they +do not return any data when used for WQP bbox queries. 
+} +\seealso{ +\code{\link[=TADA_DataRetrieval]{TADA_DataRetrieval()}} +} From 57e5a036bb207bb1e251a90d425a017783e337a0 Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Wed, 20 Nov 2024 12:47:15 -0800 Subject: [PATCH 04/35] TADA_DR rewrite --- R/DataDiscoveryRetrieval.R | 218 ++++++++++++++++++++++++------------- 1 file changed, 140 insertions(+), 78 deletions(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index a5f2ff59..f689ae47 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -194,11 +194,10 @@ TADA_DataRetrieval <- function(startDate = "null", organization = "null", project = "null", providers = "null", + maxrecs = 250000, applyautoclean = TRUE) { - - - - # Check for incomplete or inconsistent inputs: + + # Check for incomplete or inconsistent inputs: # If both an sf object and tribe information are provided it's unclear what # the priority should be for the query @@ -210,7 +209,7 @@ TADA_DataRetrieval <- function(startDate = "null", "Please use only one of these query options." ) ) - } + } # Check for other arguments that indicate location. 
Function will ignore # these inputs but warn the user @@ -406,55 +405,59 @@ TADA_DataRetrieval <- function(startDate = "null", # Get bbox of the sf object input_bbox <- sf::st_bbox(aoi_sf) - # Query site info within the bbox - bbox_sites <- dataRetrieval::whatWQPsites( + # Query info on available data within the bbox + bbox_avail <- dataRetrieval::whatWQPdata( WQPquery, bBox = c(input_bbox$xmin, input_bbox$ymin, input_bbox$xmax, input_bbox$ymax) ) # Check if any sites are within the aoi - if ( (nrow(bbox_sites) > 0 ) == FALSE) { + if ( (nrow(bbox_avail) > 0 ) == FALSE) { stop("No monitoring sites were returned within your area of interest (no data available).") } # Reformat returned info as sf - bbox_sites_sf <- TADA_MakeSpatial(bbox_sites, crs = 4326) + bbox_sites_sf <- dataRetrieval::whatWQPsites( + siteid = bbox_avail$MonitoringLocationIdentifier + ) %>% + TADA_MakeSpatial(., crs = 4326) # Subset sites to only within shapefile and get IDs clipped_sites_sf <- bbox_sites_sf[aoi_sf, ] clipped_site_ids <- clipped_sites_sf$MonitoringLocationIdentifier - # Check number of sites returned. More than 300 will require a map() approach - if( length(clipped_site_ids) > 300 ) { + record_count <- bbox_avail %>% + dplyr::filter(MonitoringLocationIdentifier %in% clipped_site_ids) %>% + dplyr::pull(resultCount) %>% + sum() + + site_count <- length(clipped_site_ids) + + # Check for either more than 300 sites or more records than max_recs. + # If either is true then we'll approach the pull as a "big data" pull + if( site_count > 300 | record_count > maxrecs) { warning( paste0( - "More than 300 sites are matched by the AOI and query terms. ", + "The number of sites and/or records matched by the AOI and query terms is large, so the download may take some time. ", "If your AOI is a county, state, country, or HUC boundary it would be more efficient to provide a code instead of an sf object." 
) ) - # Split IDs into a list - id_cluster_list <- split(x = clipped_site_ids, - f = ceiling(seq_along(clipped_site_ids) / 300)) - - print("Downloading WQP query results. This may take some time depending upon the query size.") + # Use helper function to download large data volume + results.DR <- suppressMessages( + TADA_BigDataHelper( + record_summary = bbox_avail %>% + dplyr::select(MonitoringLocationIdentifier, resultCount) %>% + dplyr::filter(MonitoringLocationIdentifier %in% clipped_site_ids), + WQPquery = WQPquery, + maxrecs = maxrecs, + maxsites = 300 + ) + ) - # List of query results - results.DR <- purrr::map( - .x = id_cluster_list, - .f = ~suppressMessages( - dataRetrieval::readWQPdata( - siteid = .x, - WQPquery, - dataProfile = "resultPhysChem", - ignore_attributes = TRUE - ) - ) %>% - # To allow row binds - dplyr::mutate(across(everything(), as.character)) - ) %>% - list_rbind() + rm(bbox_avail, bbox_sites_sf) + gc() # Check if any results were returned if ( (nrow(results.DR) > 0 ) == FALSE) { @@ -466,6 +469,7 @@ TADA_DataRetrieval <- function(startDate = "null", "Removing some of your query filters OR broadening your search area may help." ) ) + # Empty TADAprofile.clean <- results.DR } else { @@ -487,13 +491,8 @@ TADA_DataRetrieval <- function(startDate = "null", FullPhysChem = results.DR, Sites = sites.DR, Projects = projects.DR - ) - - # need to specify this or throws error when trying to bind rows. - # Temporary fix for larger issue where data structure for all columns - # should be specified. 
- TADAprofile <- TADAprofile %>% dplyr::mutate( - across(everything(), as.character) + ) %>% dplyr::mutate( + across(tidyselect::everything(), as.character) ) # run TADA_AutoClean function @@ -508,7 +507,7 @@ TADA_DataRetrieval <- function(startDate = "null", return(TADAprofile.clean) - # Less than 300 sites: + # Doesn't meet "big data" threshold: } else { # Retrieve all 3 profiles @@ -523,7 +522,7 @@ TADA_DataRetrieval <- function(startDate = "null", ignore_attributes = TRUE ) - # check if any results were returned + # Check if any results were returned if ((nrow(results.DR) > 0) == FALSE) { paste0( "Returning empty results dataframe: ", @@ -535,28 +534,28 @@ TADA_DataRetrieval <- function(startDate = "null", } else { # Get site metadata - sites.DR <- dataRetrieval::whatWQPsites(WQPquery) + sites.DR <- clipped_sites_sf %>% + as_tibble() %>% + select(-geometry) # Get project metadata - projects.DR <- dataRetrieval::readWQPdata(WQPquery, - ignore_attributes = TRUE, - service = "Project") + projects.DR <- dataRetrieval::readWQPdata( + siteid = clipped_site_ids, + WQPquery, + ignore_attributes = TRUE, + service = "Project" + ) # Join results, sites, projects TADAprofile <- TADA_JoinWQPProfiles( FullPhysChem = results.DR, Sites = sites.DR, Projects = projects.DR + ) %>% dplyr::mutate( + across(tidyselect::everything(), as.character) ) - # need to specify this or throws error when trying to bind rows. - # Temporary fix for larger issue where data structure for all columns - # should be specified. - TADAprofile <- TADAprofile %>% dplyr::mutate( - across(everything(), as.character) - ) - - # run TADA_AutoClean function + # Run TADA_AutoClean function if (applyautoclean == TRUE) { print("Data successfully downloaded. 
Running TADA_AutoClean function.") @@ -572,6 +571,7 @@ TADA_DataRetrieval <- function(startDate = "null", # If no sf object provided: } else { + # Set query parameters WQPquery <- list() @@ -678,37 +678,58 @@ TADA_DataRetrieval <- function(startDate = "null", WQPquery <- c(WQPquery, endDate = endDate) } - # Retrieve all 3 profiles - print("Downloading WQP query results. This may take some time depending upon the query size.") - print(WQPquery) - results.DR <- dataRetrieval::readWQPdata(WQPquery, - dataProfile = "resultPhysChem", - ignore_attributes = TRUE - ) - # check if any results are available - if ((nrow(results.DR) > 0) == FALSE) { - print("Returning empty results dataframe: Your WQP query returned no results (no data available). Try a different query. Removing some of your query filters OR broadening your search area may help.") - TADAprofile.clean <- results.DR - } else { - sites.DR <- dataRetrieval::whatWQPsites(WQPquery) + # Query info on available data + query_avail <- dataRetrieval::whatWQPdata(WQPquery) + + site_count <- length(query_avail$MonitoringLocationIdentifier) + + record_count <- query_avail %>% + dplyr::pull(resultCount) %>% + sum() + + # Check for either more than 300 sites or more records than max_recs. + # If either is true then we'll approach the pull as a "big data" pull + if(site_count > 300 | record_count > maxrecs) { + warning( + "The number of sites and/or records matched by the query terms is large, so the download may take some time." 
+ ) + + # Use helper function to download large data volume + results.DR <- suppressMessages( + TADA_BigDataHelper( + record_summary = query_avail %>% + dplyr::select(MonitoringLocationIdentifier, resultCount), + WQPquery = WQPquery, + maxrecs = maxrecs, + maxsites = 300 + ) + ) + + rm(query_avail) + gc() - projects.DR <- dataRetrieval::readWQPdata(WQPquery, - ignore_attributes = TRUE, - service = "Project" + # Get site metadata + sites.DR <- dataRetrieval::whatWQPsites( + siteid = unique(results.DR$MonitoringLocationIdentifier) ) + # Get project metadata + projects.DR <- dataRetrieval::readWQPdata( + siteid = unique(results.DR$MonitoringLocationIdentifier), + WQPquery, + ignore_attributes = TRUE, + service = "Project" + ) + + # Join results, sites, projects TADAprofile <- TADA_JoinWQPProfiles( FullPhysChem = results.DR, Sites = sites.DR, Projects = projects.DR + ) %>% dplyr::mutate( + across(tidyselect::everything(), as.character) ) - # need to specify this or throws error when trying to bind rows. Temporary fix for larger - # issue where data structure for all columns should be specified. - cols <- names(TADAprofile) - - TADAprofile <- TADAprofile %>% dplyr::mutate_at(cols, as.character) - # run TADA_AutoClean function if (applyautoclean == TRUE) { print("Data successfully downloaded. Running TADA_AutoClean function.") @@ -717,11 +738,52 @@ TADA_DataRetrieval <- function(startDate = "null", } else { TADAprofile.clean <- TADAprofile } + + return(TADAprofile.clean) + + # If not a "big data" pull: + } else { + # Retrieve all 3 profiles + print("Downloading WQP query results. This may take some time depending upon the query size.") + print(WQPquery) + results.DR <- dataRetrieval::readWQPdata(WQPquery, + dataProfile = "resultPhysChem", + ignore_attributes = TRUE + ) + + # check if any results are available + if ((nrow(results.DR) > 0) == FALSE) { + print("Returning empty results dataframe: Your WQP query returned no results (no data available). Try a different query. 
Removing some of your query filters OR broadening your search area may help.") + TADAprofile.clean <- results.DR + } else { + sites.DR <- dataRetrieval::whatWQPsites(WQPquery) + + projects.DR <- dataRetrieval::readWQPdata(WQPquery, + ignore_attributes = TRUE, + service = "Project" + ) + + TADAprofile <- TADA_JoinWQPProfiles( + FullPhysChem = results.DR, + Sites = sites.DR, + Projects = projects.DR + ) %>% dplyr::mutate( + across(tidyselect::everything(), as.character) + ) + + # run TADA_AutoClean function + if (applyautoclean == TRUE) { + print("Data successfully downloaded. Running TADA_AutoClean function.") + + TADAprofile.clean <- TADA_AutoClean(TADAprofile) + } else { + TADAprofile.clean <- TADAprofile + } + } + + return(TADAprofile.clean) } - - return(TADAprofile.clean) } - } From 7d9ac2aa74a87a525ecc42837184a1c601b64aa3 Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Wed, 20 Nov 2024 12:51:03 -0800 Subject: [PATCH 05/35] Helper function for large queries --- R/DataDiscoveryRetrieval.R | 93 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index f689ae47..1d9923e4 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -865,6 +865,99 @@ TADA_ReadWQPWebServices <- function(webservice) { } +TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs, maxsites = 300){ + + # Get total number of results per site and separate out sites with >maxrecs results + tot_sites <- record_summary %>% + # dplyr::filter(MonitoringLocationIdentifier %in% clipped_site_ids) %>% + dplyr::group_by(MonitoringLocationIdentifier) %>% + dplyr::summarise(tot_n = sum(resultCount)) %>% + dplyr::filter(tot_n > 0) %>% + dplyr::arrange(tot_n) + + # Sites with less than/equal to maxrecs + smallsites <- tot_sites %>% dplyr::filter(tot_n <= maxrecs) + # Sites with more than maxrecs + bigsites <- tot_sites %>% dplyr::filter(tot_n > 
maxrecs) + + df_small <- data.frame() + df_big <- data.frame() + + # Work with small sites first: + # Build download groups. Total record count limited to value of maxrecs. + # Number of sites per download group limited to 300. + if (dim(smallsites)[1] > 0) { + smallsitesgrp <- smallsites %>% + mutate(group = MESS::cumsumbinning( + x = tot_n, + threshold = maxrecs, + maxgroupsize = 300 + )) + + # Status update to user + print( + paste0("Downloading data from sites with fewer than ", + maxrecs, + " results by grouping them together.") + ) + + # Download the data for each group + for (i in 1:max(smallsitesgrp$group)) { + small_site_chunk <- subset(smallsitesgrp$MonitoringLocationIdentifier, + smallsitesgrp$group == i) + # Query result data + results_small <- dataRetrieval::readWQPdata( + siteid = small_site_chunk, + WQPquery, + dataProfile = "resultPhysChem", + ignore_attributes = TRUE + ) %>% + dplyr::mutate(across(everything(), as.character)) + + # If data is returned, stack with what's already been retrieved + if (dim(results_small)[1] > 0) { + df_small <- dplyr::bind_rows(df_small, results_small) + } + } + + rm(smallsites, smallsitesgrp) + gc() + + # Large sites (>= maxrecs) next: + if (dim(bigsites)[1] > 0) { + print( + paste0("Downloading data from sites with greater than ", + maxrecs, + " results, chunking queries by site.") + ) + + # Unique site IDs + bsitesvec <- unique(bigsites$MonitoringLocationIdentifier) + + # For each site + for (i in 1:length(bsitesvec)) { + # Download each site's data individually + results_big <- dataRetrieval::readWQPdata( + siteid = bsitesvec[i], + WQPquery, + dataProfile = "resultPhysChem", + ignore_attributes = TRUE + ) %>% + dplyr::mutate(across(everything(), as.character)) + + if (dim(results_big)[1] > 0) { + df_big <- dplyr::bind_rows(df_big, results_big) + } + } + } + rm(bigsites) + gc() + } + + df_out <- bind_rows(df_small, df_big) + + return(df_out) +} #' Large WQP data pulls using dataRetrieval #' From 
2b06dbd20209302c26636eb87e6ea6b2950113da Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Wed, 20 Nov 2024 13:12:23 -0800 Subject: [PATCH 06/35] Document bigdatahelper --- R/DataDiscoveryRetrieval.R | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index 1d9923e4..56a2e57e 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -864,12 +864,25 @@ TADA_ReadWQPWebServices <- function(webservice) { } } - -TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs, maxsites = 300){ +#' Assist with large WQP data pulls using dataRetrieval +#' +#' This is a helper function that takes large WQP (waterqualitydata.us) queries +#' and splits them up into multiple, smaller queries. By default it pulls data +#' for up to 300 sites or 250,000 data records at a time and then joins the separate +#' pulls back together to produce a single TADA compatible dataframe as the output. +#' Computer memory may limit the size of data frames that your R console will +#' be able to hold in one session. +#' +#' @param record_summary MonitoringLocationIdentifier and resultCount columns from the output of `dataRetrieval::whatWQPdata` for the WQP query being used. +#' @param WQPquery A named list of query terms to supply dataRetrieval functions. +#' @param maxrecs Maximum number of records to query at once. +#' @param maxsites Maximum number of sites to query at once. 
+#' +#' @return TADA-compatible dataframe +TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsites = 300){ # Get total number of results per site and separate out sites with >maxrecs results tot_sites <- record_summary %>% - # dplyr::filter(MonitoringLocationIdentifier %in% clipped_site_ids) %>% dplyr::group_by(MonitoringLocationIdentifier) %>% dplyr::summarise(tot_n = sum(resultCount)) %>% dplyr::filter(tot_n > 0) %>% From 929dc3740330facf4d7126d11781543ac8e770e8 Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Thu, 21 Nov 2024 08:42:16 -0800 Subject: [PATCH 07/35] update geospatial funs --- R/GeospatialFunctions.R | 2123 +++++++++++++++++++++++++++++---------- 1 file changed, 1610 insertions(+), 513 deletions(-) diff --git a/R/GeospatialFunctions.R b/R/GeospatialFunctions.R index 8b6bb16b..85f9a9d2 100644 --- a/R/GeospatialFunctions.R +++ b/R/GeospatialFunctions.R @@ -2,7 +2,7 @@ #' #' Transform a Water Quality Portal dataframe into a geospatial sf object. #' -#' Adds one new column to input dataset, 'geometry', which allows for mapping and additional geospatial capabilities. Check out the TADAModule2.Rmd for an example workflow. +#' Adds one new column to input dataframe, 'geometry', which allows for mapping and additional geospatial capabilities. Check out the TADAModule2.Rmd for an example workflow. #' #' @param .data A dataframe created by `TADA_DataRetrieval()`. #' @param crs The coordinate reference system (CRS) you would like the returned point features to be in. The default is CRS 4326 (WGS84). 
@@ -29,13 +29,15 @@ #' TADA_MakeSpatial <- function(.data, crs = 4326) { if (!"LongitudeMeasure" %in% colnames(.data) | - !"LatitudeMeasure" %in% colnames(.data) | - !"HorizontalCoordinateReferenceSystemDatumName" %in% colnames(.data)) { + !"LatitudeMeasure" %in% colnames(.data) | + !"HorizontalCoordinateReferenceSystemDatumName" %in% colnames(.data)) { stop("The dataframe does not contain WQP-style latitude and longitude data (column names `HorizontalCoordinateReferenceSystemDatumName`, `LatitudeMeasure`, and `LongitudeMeasure`.") } else if (!is.null(.data) & inherits(.data, "sf")) { stop("Your data is already a spatial object.") } - + + print("Transforming your data into a spatial object.") + suppressMessages(suppressWarnings({ # Make a reference table for CRS and EPSG codes # List should include all codes in WQX domain (see HorizontalCoordinateReferenceSystemDatum CSV at https://www.epa.gov/waterdata/storage-and-retrieval-and-water-quality-exchange-domain-services-and-downloads) @@ -44,8 +46,9 @@ TADA_MakeSpatial <- function(.data, crs = 4326) { "NAD83", 4269, "WGS84", 4326, "NAD27", 4267, - "UNKWN", crs, # Unknowns and NAs should go to user supplied default - "OTHER", 4326, + "UNKWN", crs, # Unknowns and NAs should go to user supplied crs or default + "Unknown", crs, + "OTHER", crs, "OLDHI", 4135, "AMSMA", 4169, "ASTRO", 4727, @@ -59,13 +62,25 @@ TADA_MakeSpatial <- function(.data, crs = 4326) { "WGS72", 6322, "HARN", 4152 ) - + + # Check the CRS column for NA or "UNKWN" and warn user if any are found + if (any(is.na(.data$HorizontalCoordinateReferenceSystemDatumName)) || + any(.data$HorizontalCoordinateReferenceSystemDatumName %in% c("UNKWN", "Unknown", "OTHER"))) { + print(paste0("Your WQP data frame contains observations without a listed coordinate reference system (CRS). 
For these, we have assigned CRS ", crs, ".")) + } # join our CRS reference table to our original WQP dataframe: sf <- .data %>% tibble::rowid_to_column(var = "index") %>% dplyr::mutate( lat = as.numeric(LatitudeMeasure), - lon = as.numeric(LongitudeMeasure) + lon = as.numeric(LongitudeMeasure), + # If `HorizontalCoordinateReferenceSystemDatumName` is NA... + HorizontalCoordinateReferenceSystemDatumName = ifelse(is.na(HorizontalCoordinateReferenceSystemDatumName), + # ... assign it the same crs as the user-supplied crs: + paste0(epsg_codes %>% dplyr::filter(epsg == as.numeric(crs)) %>% .[1, 1]), + # otherwise, preserve the original crs + HorizontalCoordinateReferenceSystemDatumName + ) ) %>% # Add EPSG codes dplyr::left_join( @@ -89,17 +104,18 @@ TADA_MakeSpatial <- function(.data, crs = 4326) { dplyr::arrange(index) %>% dplyr::select(-c(index, epsg)) })) - + return(sf) } #' fetchATTAINS #' -#' Fetch ATTAINS features (entity submitted points, lines, polygons representing their assessment units; and EPA snapshot of the associated NHDPlus HR catchments that the entity submitted features fall within) within a bounding box produced from a set of TADA spatial features. +#' Fetches ATTAINS features (state- or tribe- or other entity- submitted points, lines, and polygons representing their assessment units; and the EPA snapshot of the associated NHDPlus HR catchments that the state- or tribe- or other entity- submitted features fall within) within a bounding box produced from a set of TADA spatial features. #' #' @param .data A dataframe developed using `TADA_DataRetrieval()` or `TADA_MakeSpatial()`. -#' @return spatial features (ATTAINS_catchments, ATTAINS_points, ATTAINS_lines, and ATTAINS_polygons) that are within the spatial bounding box of water quality observations. +#' @param catchments_only Whether to return just the summarized ATTAINS catchment features, or both the catchments and raw ATTAINS features. TRUE or FALSE. 
+#' @return Spatial features (ATTAINS_catchments, ATTAINS_points, ATTAINS_lines, and ATTAINS_polygons) that are within the spatial bounding box of water quality observations. #' #' @seealso [TADA_MakeSpatial()] #' @seealso [TADA_DataRetrieval()] @@ -114,32 +130,44 @@ TADA_MakeSpatial <- function(.data, crs = 4326) { #' applyautoclean = TRUE #' ) #' -#' nv_attains_features <- fetchATTAINS(tada_data) +#' nv_attains_features <- fetchATTAINS(tada_data, catchments_only = FALSE) #' } -fetchATTAINS <- function(.data) { - if (is.null(.data) | nrow(.data) == 0) { - stop("There is no data in your `data` object to use as a bounding box for selecting ATTAINS features.") - } - +fetchATTAINS <- function(.data, catchments_only = FALSE) { + sf::sf_use_s2(FALSE) + + print("Depending on your data's observation count and its spatial range, the ATTAINS pull may take a while.") + # EPSG we want our ATTAINS data to be in (always 4326 for this function) our_epsg <- 4326 - + # If data is already spatial, just make sure it is in the right CRS # and add an index as the WQP observations' unique identifier... if (!is.null(.data) & inherits(.data, "sf")) { if (sf::st_crs(.data)$epsg != our_epsg) { .data <- .data %>% - sf::st_transform(our_epsg) + sf::st_transform(our_epsg) %>% + dplyr::distinct(geometry, .keep_all = TRUE) } else { - .data <- .data + .data <- .data %>% + dplyr::distinct(geometry, .keep_all = TRUE) } + } else if (!"LongitudeMeasure" %in% colnames(.data) | + !"LatitudeMeasure" %in% colnames(.data) | + !"HorizontalCoordinateReferenceSystemDatumName" %in% colnames(.data)) { + stop("The dataframe does not contain WQP-style latitude and longitude data (column names `HorizontalCoordinateReferenceSystemDatumName`, `LatitudeMeasure`, and `LongitudeMeasure`.") } else { # ... Otherwise transform into a spatial object then do the same thing: .data <- .data %>% + data.table::data.table(.) 
%>% + dplyr::distinct(LongitudeMeasure, LatitudeMeasure, .keep_all = TRUE) %>% # convert dataframe to a spatial object TADA_MakeSpatial(.data = ., crs = our_epsg) } - + + if (is.null(.data) | nrow(.data) == 0) { + stop("There is no data in your `data` object to use as a bounding box for selecting ATTAINS features.") + } + baseurls <- c( # ATTAINS catchments: "https://gispub.epa.gov/arcgis/rest/services/OW/ATTAINS_Assessment/MapServer/3/query?", # ATTAINS points: @@ -149,30 +177,32 @@ fetchATTAINS <- function(.data) { # ATTAINS polygons: "https://gispub.epa.gov/arcgis/rest/services/OW/ATTAINS_Assessment/MapServer/2/query?" ) - - feature_downloader <- function(baseurls) { + + # bounding box of user's WQP data + suppressMessages(suppressWarnings({ + bbox_raw <- .data %>% + sf::st_bbox(.) + bbox <- bbox_raw %>% + # convert bounding box to characters + toString(.) %>% + # encode for use within the API URL + urltools::url_encode(.) + })) + + + feature_downloader <- function(baseurls, sf_bbox) { # starting at feature 1 (i.e., no offset): offset <- 0 # empty list to store all features in all_features <- list() - - # bounding box (with some minor wiggle) of user's WQP data - suppressMessages(suppressWarnings({ - bbox <- .data %>% - sf::st_bbox(.) %>% - # convert bounding box to characters - toString(.) %>% - # encode for use within the API URL - urltools::url_encode(.) - })) - + # The ATTAINS API has a limit of 2000 features that can be pulled in at once. # Therefore, we must split the call into manageable "chunks" using a moving # window of what features to pull in, then munging all the separate API calls # together. - + repeat { - query <- urltools::param_set(baseurls, key = "geometry", value = bbox) %>% + query <- urltools::param_set(baseurls, key = "geometry", value = sf_bbox) %>% urltools::param_set(key = "inSR", value = our_epsg) %>% # Total of 2000 features at a time... 
urltools::param_set(key = "resultRecordCount", value = 2000) %>% @@ -191,7 +221,7 @@ fetchATTAINS <- function(.data) { urltools::param_set(key = "returnDistinctValues", value = "false") %>% urltools::param_set(key = "returnExtentOnly", value = "false") %>% urltools::param_set(key = "featureEncoding", value = "esriDefault") - + # Fetch features within the offset window and append to list: features <- suppressMessages(suppressWarnings({ tryCatch( @@ -203,243 +233,975 @@ fetchATTAINS <- function(.data) { } ) })) - - + # Exit loop if no more features or error occurred if (is.null(features) || nrow(features) == 0) { break } - + all_features <- c(all_features, list(features)) # once done, change offset by 2000 features: offset <- offset + 2000 - + if (offset == 4000) { - print("Your TADA data covers a large spatial range. The ATTAINS pull may take a while.") + + } + } + + all_features <- dplyr::bind_rows(all_features) %>% + # remove duplicate features (precautionary) + dplyr::distinct(.keep_all = TRUE) + } + + # Function used to grab assessment unit "WaterType". + # Sweet spot chunk size wise is 200: + split_vector <- function(vector, chunk_size = 200) { + # Number of chunks needed + num_chunks <- ceiling(length(vector) / chunk_size) + + # Split the vector into chunks + chunks <- split(vector, ceiling(seq_along(vector) / chunk_size)) + + return(chunks) + } + + # If the area of the bbox is massive (about the area of California or larger), AND there + # aren't that many actual monitoring locations (100)... OR the bbox is about the size of New Hampshire, and the observations are under 25... + # ... 
speed up processing by going site-by-site: + if (nrow(.data) <= 100 & as.numeric(sf::st_area(sf::st_as_sfc(bbox_raw))) >= 4e+11 || nrow(.data) <= 25 & as.numeric(sf::st_area(sf::st_as_sfc(bbox_raw))) >= 1e+11) { + catchment_features <- vector("list", length = nrow(.data)) + + for (i in 1:nrow(.data)) { + # bounding box of user's WQP data + suppressMessages(suppressWarnings({ + bbox <- .data[i, ] %>% + sf::st_buffer(0.0000001) %>% + sf::st_bbox(.) %>% + # convert bounding box to characters + toString(.) %>% + # encode for use within the API URL + urltools::url_encode(.) + })) + + catchment_features[[i]] <- feature_downloader(baseurls = baseurls[1], sf_bbox = bbox) + } + + catchment_features <- catchment_features %>% + purrr::keep(~ nrow(.) > 0) %>% + dplyr::bind_rows() + + if (length(catchment_features) == 0 || is.null(catchment_features)) { + print("There are no ATTAINS features associated with your area of interest.") + } else { + ## GRABBING WATER TYPE: + + # Use ATTAINS API to grab, for each assessment unit, its WaterType. + # Query the API in "chunks" so it doesn't break. Sweet spot is ~200: + all_units <- unique(catchment_features$assessmentunitidentifier) + chunks <- split_vector(all_units, chunk_size = 200) + water_types <- vector("list", length = length(chunks)) + + for (i in 1:length(chunks)) { + dat <- httr::GET(paste0("https://attains.epa.gov/attains-public/api/assessmentUnits?assessmentUnitIdentifier=", paste(chunks[[i]], collapse = ","))) %>% + httr::content(., as = "text", encoding = "UTF-8") %>% + jsonlite::fromJSON(.) 
+ + water_types[[i]] <- dat[["items"]] %>% + tidyr::unnest("assessmentUnits") %>% + tidyr::unnest("waterTypes") %>% + dplyr::select( + assessmentUnitIdentifier, + waterTypeCode + ) + } + + water_types <- dplyr::bind_rows(water_types) + + try(catchment_features <- dplyr::left_join(catchment_features, water_types, by = c("assessmentunitidentifier" = "assessmentUnitIdentifier"))) + } + + # If only interested in grabbing catchment data, return just the catchments + if (catchments_only == TRUE) { + return(list("ATTAINS_catchments" = catchment_features)) + } + + # Otherwise, start grabbing the raw ATTAINS features that intersect those + # catchments + points <- vector("list", length = nrow(catchment_features)) + lines <- vector("list", length = nrow(catchment_features)) + polygons <- vector("list", length = nrow(catchment_features)) + + for (i in 1:nrow(catchment_features)) { + # bounding box of catchments + suppressMessages(suppressWarnings({ + bbox <- catchment_features[i, ] %>% + sf::st_bbox(.) %>% + # convert bounding box to characters + toString(.) %>% + # encode for use within the API URL + urltools::url_encode(.) + })) + + points[[i]] <- feature_downloader(baseurls = baseurls[2], sf_bbox = bbox) + lines[[i]] <- feature_downloader(baseurls = baseurls[3], sf_bbox = bbox) + polygons[[i]] <- feature_downloader(baseurls = baseurls[4], sf_bbox = bbox) + } + + points <- points %>% + purrr::keep(~ nrow(.) > 0) %>% + dplyr::bind_rows() + try( + points <- points %>% dplyr::left_join(., water_types, by = c("assessmentunitidentifier" = "assessmentUnitIdentifier")), + silent = TRUE + ) + + lines <- lines %>% + purrr::keep(~ nrow(.) > 0) %>% + dplyr::bind_rows() + try( + lines <- lines %>% dplyr::left_join(., water_types, by = c("assessmentunitidentifier" = "assessmentUnitIdentifier")), + silent = TRUE + ) + + polygons <- polygons %>% + purrr::keep(~ nrow(.) 
> 0) %>% + dplyr::bind_rows() + try( + polygons <- polygons %>% dplyr::left_join(., water_types, by = c("assessmentunitidentifier" = "assessmentUnitIdentifier")), + silent = TRUE + ) + + final_features <- list( + "ATTAINS_catchments" = catchment_features, + "ATTAINS_points" = points, + "ATTAINS_lines" = lines, + "ATTAINS_polygons" = polygons + ) + + return(final_features) + + # Otherwise, just use the bbox in one pull: + } else { + catchment_features <- feature_downloader(baseurls = baseurls[1], sf_bbox = bbox) + + if (length(catchment_features) == 0 || is.null(catchment_features)) { + print("There are no ATTAINS features associated with your area of interest.") + } else { + ## GRABBING WATER TYPE: + + # Use ATTAINS API to grab, for each assessment unit, its WaterType. + # Query the API in "chunks" so it doesn't break: + all_units <- unique(catchment_features$assessmentunitidentifier) + chunks <- split_vector(all_units, chunk_size = 200) + water_types <- vector("list", length = length(chunks)) + + for (i in 1:length(chunks)) { + dat <- httr::GET(paste0("https://attains.epa.gov/attains-public/api/assessmentUnits?assessmentUnitIdentifier=", paste(chunks[[i]], collapse = ","))) %>% + httr::content(., as = "text", encoding = "UTF-8") %>% + jsonlite::fromJSON(.) 
+ + water_types[[i]] <- dat[["items"]] %>% + tidyr::unnest("assessmentUnits") %>% + tidyr::unnest("waterTypes") %>% + dplyr::select( + assessmentUnitIdentifier, + waterTypeCode + ) } + + water_types <- dplyr::bind_rows(water_types) + + try(catchment_features <- dplyr::left_join(catchment_features, water_types, by = c("assessmentunitidentifier" = "assessmentUnitIdentifier")), silent = TRUE) + } + + # If only interested in grabbing catchment data, return just the catchments + if (catchments_only == TRUE) { + return(list("ATTAINS_catchments" = catchment_features)) } - - all_features <- dplyr::bind_rows(all_features) + + # Otherwise, start grabbing the raw ATTAINS features that intersect those + # catchments + + # bounding box of catchments + try(suppressMessages(suppressWarnings({ + bbox <- catchment_features %>% + sf::st_bbox(.) %>% + # convert bounding box to characters + toString(.) %>% + # encode for use within the API URL + urltools::url_encode(.) + })), silent = TRUE) + + # Download associated point, line, and polygon features using catchment bbox + other_features <- baseurls[2:4] %>% + purrr::map(function(baseurl) { + features <- feature_downloader(baseurls = baseurl, sf_bbox = bbox) + + if (!is.null(features) && nrow(features) > 0) { + features <- try(dplyr::left_join(features, water_types, by = c("assessmentunitidentifier" = "assessmentUnitIdentifier")), silent = TRUE) + } + + return(features) + }) + + final_features <- list( + "ATTAINS_catchments" = catchment_features, + "ATTAINS_points" = other_features[[1]], + "ATTAINS_lines" = other_features[[2]], + "ATTAINS_polygons" = other_features[[3]] + ) + + return(final_features) } - - final_features <- baseurls %>% - purrr::map(~ feature_downloader(.)) - - names(final_features) <- c("ATTAINS_catchments", "ATTAINS_points", "ATTAINS_lines", "ATTAINS_polygons") - - return(final_features) } -#' TADA_GetATTAINS +#' fetchNHD #' -#' Link catchment-based ATTAINS assessment unit data (EPA snapshot of NHDPlus HR catchments 
associated with entity submitted assessment unit features - points, lines, and polygons) to Water Quality Portal observations, often imported via `TADA_DataRetrieval()`. This function returns the same raw objects that are mapped in `TADA_ViewATTAINS()`. +#' Fetches NHD features from either the high resolution or medium resolution version of the National Hydrography Dataset (NHD) that intersect catchments containing TADA Water Quality Portal observations. #' -#' Adds the following ATTAINS columns to the input dataframe or list: "ATTAINS.organizationid", "ATTAINS.submissionid", "ATTAINS.hasprotectionplan", "ATTAINS.assessmentunitname", "ATTAINS.nhdplusid", "ATTAINS.tas303d", "ATTAINS.isthreatened", "ATTAINS.state", "ATTAINS.on303dlist", "ATTAINS.organizationname", "ATTAINS.region", "ATTAINS.Shape_Length", "ATTAINS.reportingcycle", "ATTAINS.assmnt_joinkey", "ATTAINS.hastmdl", "ATTAINS.orgtype", "ATTAINS.permid_joinkey", "ATTAINS.catchmentistribal", "ATTAINS.ircategory", "ATTAINS.waterbodyreportlink", "ATTAINS.assessmentunitidentifier", "ATTAINS.overallstatus", "ATTAINS.isassessed", "ATTAINS.isimpaired", "ATTAINS.has4bplan", "ATTAINS.huc12", "ATTAINS.hasalternativeplan", "ATTAINS.visionpriority303d", "ATTAINS.areasqkm", "ATTAINS.catchmentareasqkm", "ATTAINS.catchmentstatecode", "ATTAINS.catchmentresolution", "ATTAINS.Shape_Area". Check out the TADAModule2.Rmd for an example workflow. -#' -#' @param .data A dataframe created by `TADA_DataRetrieval()` or the sf equivalent made by `TADA_MakeSpatial()`. -#' @param return_sf Whether to return the associated ATTAINS_catchments, ATTAINS_lines, ATTAINS_points, and ATTAINS_polygons shapefile objects. TRUE (yes, return) or FALSE (no, do not return). All ATTAINS features are in WGS84 (crs = 4326). +#' @param .data A dataframe created by `TADA_DataRetrieval()` or the geospatial equivalent made by `TADA_MakeSpatial()`. 
#' @param resolution Whether to download the NHDPlus HiRes resolution ("Hi") or medium NHDPlus V2 resolution ("Med") version of the National Hydrography Dataset (NHD). Default is "Hi".
#' @param features Which NHD features to return: "catchments", "flowlines", "waterbodies", or any combination.
#'
#' @return A list containing all selected NHD features associated with the WQP observations of interest. Or, if a single feature type is selected, a single geospatial object instead of a list. Default is "catchments" only.
#'
#' @seealso [TADA_DataRetrieval()]
#' @seealso [TADA_MakeSpatial()]
#'
#' @examples
#' \dontrun{
#' tada_data <- TADA_DataRetrieval(
#'   startDate = "1990-01-01",
#'   endDate = "1990-01-15",
#'   characteristicName = "pH",
#'   statecode = "CO",
#'   applyautoclean = TRUE
#' )
#'
#' nhd_data <- fetchNHD(
#'   .data = tada_data,
#'   resolution = "Hi",
#'   features = c("catchments", "waterbodies", "flowlines")
#' )
#' }
fetchNHD <- function(.data, resolution = "Hi", features = "catchments") {
  suppressMessages(suppressWarnings({
    # If data is already spatial, just make sure it is in the right CRS...
    if (!is.null(.data) && inherits(.data, "sf")) {
      if (sf::st_crs(.data)$epsg != 4326) {
        geospatial_data <- .data %>%
          sf::st_transform(4326)
      } else {
        geospatial_data <- .data
      }
    } else {
      # ...otherwise transform the dataframe into a spatial object first.
      # NOTE(review): this path also adds a `geometry_join` copy of the geometry
      # column, while the already-sf path does not — confirm this asymmetry is
      # intended by downstream joins.
      geospatial_data <- .data %>%
        # convert dataframe to a spatial object
        TADA_MakeSpatial(.data = ., crs = 4326) %>%
        dplyr::mutate(geometry_join = geometry)
    }
  }))

  # Reduce WQP data to unique coordinates so each site is only queried once
  unique_sites <- dplyr::distinct(geospatial_data, geometry)

  # If user wants HighRes NHD...
  if (resolution %in% c("Hi", "hi")) {
    suppressMessages(suppressWarnings({
      # Map server for NHDPlus_HR that is used to download features:
      nhd_plus_hr_url <- "https://hydro.nationalmap.gov/arcgis/rest/services/NHDPlus_HR/MapServer"

      # bounding box of user's WQP data (tiny buffer so a point yields a
      # non-degenerate bbox for the spatial filter)
      wqp_bboxes <- unique_sites %>%
        sf::st_buffer(1e-07) %>%
        dplyr::rowwise() %>%
        dplyr::mutate(bbox = purrr::map(geometry, sf::st_bbox)) %>%
        sf::st_as_sfc()

      # open the nhd_hr - which contains a bunch of layers
      nhd_hr <- arcgislayers::arc_open(nhd_plus_hr_url)

      # list the layers of the nhdhr object
      # arcgislayers::list_items(nhd_hr)

      # select the layer by id from the items list called above (10 is HR catchments)
      nhd_hr_catchments <- arcgislayers::get_layer(nhd_hr, 10)

      # use bboxes of the sites to return their associated catchments;
      # each request is wrapped in try() because a site may have no catchment
      nhd_catchments_stored <- vector("list", length = length(wqp_bboxes))

      for (i in seq_along(wqp_bboxes)) {
        try(
          nhd_catchments_stored[[i]] <- arcgislayers::arc_select(nhd_hr_catchments,
            filter_geom = wqp_bboxes[i],
            crs = sf::st_crs(wqp_bboxes[i])
          ) %>%
            sf::st_make_valid(),
          silent = TRUE
        )
      }

      nhd_catchments_stored <- nhd_catchments_stored %>%
        purrr::keep(~ !is.null(.)) %>%
        dplyr::bind_rows() %>%
        dplyr::distinct()

      # standardize the returned columns (wrapped in try(): the select fails
      # harmlessly when the bind above produced an empty tibble)
      try(
        nhd_catchments_stored <- nhd_catchments_stored %>%
          dplyr::select(nhdplusid,
            catchmentareasqkm = areasqkm
          ) %>%
          dplyr::mutate(
            NHD.nhdplusid = as.character(nhdplusid),
            NHD.resolution = "HR",
            NHD.catchmentareasqkm = as.numeric(catchmentareasqkm)
          ) %>%
          dplyr::select(NHD.nhdplusid, NHD.resolution, NHD.catchmentareasqkm, geometry),
        silent = TRUE
      )
    }))

    # Empty version of the df will be returned if no associated catchments
    # to avoid breaking downstream fxns reliant on catchment info.
    if (nrow(nhd_catchments_stored) == 0 && "catchments" %in% features) {
      print("No NHD HR features associated with your area of interest.")
      nhd_catchments_stored <- tibble::tibble(
        NHD.nhdplusid = character(),
        NHD.resolution = character(),
        NHD.catchmentareasqkm = numeric()
      )
    }

    # Catchments are the AOI used to find flowlines/waterbodies, so without
    # them no other feature type can be retrieved:
    if (nrow(nhd_catchments_stored) == 0 && !"catchments" %in% features) {
      stop("No NHD HR features associated with your area of interest.")
    }

    if (length(features) == 1 && features == "catchments") {
      return(nhd_catchments_stored)
    }

    # Grab flowlines -
    if ("flowlines" %in% features && nrow(nhd_catchments_stored) > 0) {
      suppressMessages(suppressWarnings({
        # use catchments to grab other NHD features
        geospatial_aoi <- nhd_catchments_stored %>%
          sf::st_as_sfc()

        # select the layer by id from the items list (3 is HR flowlines)
        nhd_hr_flowlines <- arcgislayers::get_layer(nhd_hr, 3)

        # use catchments to return associated flowlines
        nhd_flowlines_stored <- vector("list", length = length(geospatial_aoi))

        for (i in seq_along(geospatial_aoi)) {
          try(
            nhd_flowlines_stored[[i]] <- arcgislayers::arc_select(nhd_hr_flowlines,
              filter_geom = geospatial_aoi[i],
              crs = sf::st_crs(geospatial_aoi[i])
            ) %>%
              sf::st_make_valid(),
            silent = TRUE
          )

          # so all returned meta data binds properly, must transform all columns
          # into characters, EXCEPT for the geometry column:
          try(
            geometry_col <- sf::st_geometry(nhd_flowlines_stored[[i]]),
            silent = TRUE
          )

          try(
            nhd_flowlines_stored[[i]] <- nhd_flowlines_stored[[i]] %>%
              dplyr::mutate(dplyr::across(dplyr::where(~ !identical(., geometry_col)), ~ as.character(.))),
            silent = TRUE
          )
        }

        # drop failed requests (NULL slots, or try-error strings as a precaution)
        nhd_flowlines_stored <- nhd_flowlines_stored %>%
          purrr::keep(~ !is.null(.)) %>%
          purrr::keep(~ !is.character(.)) %>%
          dplyr::bind_rows() %>%
          dplyr::distinct()
      }))

      if (length(features) == 1 && features == "flowlines") {
        if (is.null(nhd_flowlines_stored) || nrow(nhd_flowlines_stored) == 0) {
          print("There are no NHD flowlines associated with your area of interest.")
        }

        return(nhd_flowlines_stored)
      }

      if (is.null(nhd_flowlines_stored) || nrow(nhd_flowlines_stored) == 0) {
        print("There are no NHD flowlines associated with your area of interest.")
      }
    }

    # Grab waterbodies -
    if ("waterbodies" %in% features && nrow(nhd_catchments_stored) > 0) {
      suppressMessages(suppressWarnings({
        geospatial_aoi <- nhd_catchments_stored %>%
          sf::st_as_sfc()

        # select the layer by id from the items list called above (9 is HR waterbodies)
        nhd_hr_waterbodies <- arcgislayers::get_layer(nhd_hr, 9)

        # use catchments to return associated waterbodies
        nhd_waterbodies_stored <- vector("list", length = length(geospatial_aoi))

        for (i in seq_along(geospatial_aoi)) {
          try(
            nhd_waterbodies_stored[[i]] <- arcgislayers::arc_select(nhd_hr_waterbodies,
              # where = query,
              filter_geom = geospatial_aoi[i],
              crs = sf::st_crs(geospatial_aoi[i])
            ) %>%
              sf::st_make_valid(),
            silent = TRUE
          )

          # so all returned meta data binds properly, must transform all columns
          # into characters, EXCEPT for the geometry column:
          try(
            geometry_col <- sf::st_geometry(nhd_waterbodies_stored[[i]]),
            silent = TRUE
          )

          try(
            nhd_waterbodies_stored[[i]] <- nhd_waterbodies_stored[[i]] %>%
              dplyr::mutate(dplyr::across(dplyr::where(~ !identical(., geometry_col)), ~ as.character(.))),
            silent = TRUE
          )
        }

        nhd_waterbodies_stored <- nhd_waterbodies_stored %>%
          purrr::keep(~ !is.null(.)) %>%
          purrr::keep(~ !is.character(.)) %>%
          dplyr::bind_rows() %>%
          dplyr::distinct()
      }))

      if (length(features) == 1 && features == "waterbodies") {
        if (is.null(nhd_waterbodies_stored) || nrow(nhd_waterbodies_stored) == 0) {
          print("There are no NHD waterbodies associated with your area of interest.")
        }

        return(nhd_waterbodies_stored)
      }

      if (is.null(nhd_waterbodies_stored) || nrow(nhd_waterbodies_stored) == 0) {
        print("There are no NHD waterbodies associated with your area of interest.")
      }
    }

    # Combinations of features selected, and what they return:

    if (length(features) == 2 && "catchments" %in% features && "flowlines" %in% features) {
      nhd_list <- list(
        "NHD_catchments" = nhd_catchments_stored,
        "NHD_flowlines" = nhd_flowlines_stored
      )

      return(nhd_list)
    } else if (length(features) == 2 && "catchments" %in% features && "waterbodies" %in% features) {
      nhd_list <- list(
        "NHD_catchments" = nhd_catchments_stored,
        "NHD_waterbodies" = nhd_waterbodies_stored
      )

      return(nhd_list)
    } else if (length(features) == 2 && "flowlines" %in% features && "waterbodies" %in% features) {
      nhd_list <- list(
        "NHD_flowlines" = nhd_flowlines_stored,
        "NHD_waterbodies" = nhd_waterbodies_stored
      )

      return(nhd_list)
    } else if (length(features) == 3 && "catchments" %in% features && "flowlines" %in% features && "waterbodies" %in% features) {
      nhd_list <- list(
        "NHD_catchments" = nhd_catchments_stored,
        "NHD_flowlines" = nhd_flowlines_stored,
        "NHD_waterbodies" = nhd_waterbodies_stored
      )

      return(nhd_list)
    } else {
      stop("Please select between 'catchments', 'flowlines', 'waterbodies', or any combination for `features` argument.")
    }

    # If user wants NHDPlus V2...
  } else if (resolution %in% c("Med", "med")) {
    suppressMessages(suppressWarnings({
      nhd_catchments <- vector("list", length = nrow(unique_sites))

      for (i in seq_len(nrow(unique_sites))) {
        # Use {nhdplusTools} to grab associated catchments...
        try(
          nhd_catchments[[i]] <- nhdplusTools::get_nhdplus(AOI = unique_sites[i, ], realization = "catchment") %>%
            sf::st_make_valid() %>%
            dplyr::select(
              comid = featureid,
              catchmentareasqkm = areasqkm
            ) %>%
            dplyr::mutate(
              NHD.comid = as.character(comid),
              NHD.resolution = "nhdplusV2",
              NHD.catchmentareasqkm = as.numeric(catchmentareasqkm)
            ) %>%
            dplyr::select(NHD.comid, NHD.resolution, NHD.catchmentareasqkm, geometry),
          silent = TRUE
        )
      }

      nhd_catchments <- nhd_catchments %>%
        purrr::keep(~ !is.null(.))

      try(
        nhd_catchments <- dplyr::bind_rows(nhd_catchments) %>%
          dplyr::distinct(),
        silent = TRUE
      )

      # if NHD catchments are not in the correct CRS, transform them
      try(
        if (sf::st_crs(nhd_catchments) != sf::st_crs(geospatial_data)) {
          nhd_catchments <- nhd_catchments %>%
            sf::st_transform(sf::st_crs(geospatial_data)$epsg)
        },
        silent = TRUE
      )
    }))

    # Empty version of the df will be returned if no associated catchments
    # to avoid breaking downstream fxns reliant on catchment info.
    if (nrow(nhd_catchments) == 0 && "catchments" %in% features) {
      print("No NHDPlus V2 features associated with your WQP observations.")
      nhd_catchments <- tibble::tibble(
        NHD.comid = character(),
        NHD.resolution = character(),
        NHD.catchmentareasqkm = numeric()
      )
    }

    if (nrow(nhd_catchments) == 0 && !"catchments" %in% features) {
      stop("No NHDPlus V2 features associated with your WQP observations.")
    }

    if (length(features) == 1 && features == "catchments") {
      return(nhd_catchments)
    }

    # Grab flowlines -
    if ("flowlines" %in% features && nrow(nhd_catchments) > 0) {
      suppressMessages(suppressWarnings({
        nhd_flowlines <- vector("list", length = nrow(nhd_catchments))

        # use catchments as the AOI rows to grab other NHD features:
        aoi_rows <- nhd_catchments

        for (i in seq_len(nrow(aoi_rows))) {
          # Use {nhdplusTools} to grab associated flowlines...
          try(
            nhd_flowlines[[i]] <- nhdplusTools::get_nhdplus(AOI = aoi_rows[i, ], realization = "flowline") %>%
              sf::st_make_valid(),
            silent = TRUE
          )

          # so all returned meta data binds properly, must transform all columns
          # into characters, EXCEPT for the geometry column:
          try(
            geometry_col <- sf::st_geometry(nhd_flowlines[[i]]),
            silent = TRUE
          )

          try(
            nhd_flowlines[[i]] <- nhd_flowlines[[i]] %>%
              dplyr::mutate(dplyr::across(dplyr::where(~ !identical(., geometry_col)), ~ as.character(.))),
            silent = TRUE
          )
        }

        nhd_flowlines <- nhd_flowlines %>%
          purrr::keep(~ !is.null(.))

        # FIX: the pipe previously sat OUTSIDE try(), so distinct() was applied
        # to try()'s return value and its result discarded:
        try(
          nhd_flowlines <- dplyr::bind_rows(nhd_flowlines) %>%
            dplyr::distinct(),
          silent = TRUE
        )

        # if NHD flowlines are not in the correct CRS, transform them
        try(
          if (sf::st_crs(nhd_flowlines) != sf::st_crs(geospatial_data)) {
            nhd_flowlines <- nhd_flowlines %>%
              sf::st_transform(sf::st_crs(geospatial_data)$epsg)
          },
          silent = TRUE
        )
      }))

      if (nrow(nhd_flowlines) == 0 && "flowlines" %in% features) {
        print("No NHDPlus V2 flowlines associated with your WQP observations.")
      }

      if (length(features) == 1 && features == "flowlines") {
        return(nhd_flowlines)
      }
    }

    # Grab waterbodies -
    if ("waterbodies" %in% features && nrow(nhd_catchments) > 0) {
      suppressMessages(suppressWarnings({
        nhd_waterbodies <- vector("list", length = nrow(nhd_catchments))

        # use catchments as the AOI rows to grab other NHD features:
        aoi_rows <- nhd_catchments

        for (i in seq_len(nrow(aoi_rows))) {
          # Use {nhdplusTools} to grab associated waterbodies...
          try(
            nhd_waterbodies[[i]] <- nhdplusTools::get_waterbodies(AOI = aoi_rows[i, ]) %>%
              sf::st_make_valid(),
            silent = TRUE
          )

          # so all returned meta data binds properly, must transform all columns
          # into characters, EXCEPT for the geometry column:
          try(
            geometry_col <- sf::st_geometry(nhd_waterbodies[[i]]),
            silent = TRUE
          )

          try(
            nhd_waterbodies[[i]] <- nhd_waterbodies[[i]] %>%
              dplyr::mutate(dplyr::across(dplyr::where(~ !identical(., geometry_col)), ~ as.character(.))),
            silent = TRUE
          )
        }

        nhd_waterbodies <- nhd_waterbodies %>%
          purrr::keep(~ !is.null(.))

        try(
          nhd_waterbodies <- dplyr::bind_rows(nhd_waterbodies) %>%
            dplyr::distinct(),
          silent = TRUE
        )

        # if NHD waterbodies are not in the correct CRS, transform them
        try(
          if (sf::st_crs(nhd_waterbodies) != sf::st_crs(geospatial_data)) {
            nhd_waterbodies <- nhd_waterbodies %>%
              sf::st_transform(sf::st_crs(geospatial_data)$epsg)
          },
          silent = TRUE
        )
      }))

      if (nrow(nhd_waterbodies) == 0 && "waterbodies" %in% features) {
        print("No NHDPlus V2 waterbodies associated with your WQP observations.")
      }

      if (length(features) == 1 && features == "waterbodies") {
        return(nhd_waterbodies)
      }
    }

    # Combinations of features selected, and what they return:

    if (length(features) == 2 && "catchments" %in% features && "flowlines" %in% features) {
      nhd_list <- list(
        "NHD_catchments" = nhd_catchments,
        "NHD_flowlines" = nhd_flowlines
      )

      return(nhd_list)
    } else if (length(features) == 2 && "catchments" %in% features && "waterbodies" %in% features) {
      nhd_list <- list(
        "NHD_catchments" = nhd_catchments,
        "NHD_waterbodies" = nhd_waterbodies
      )

      return(nhd_list)
    } else if (length(features) == 2 && "flowlines" %in% features && "waterbodies" %in% features) {
      nhd_list <- list(
        "NHD_flowlines" = nhd_flowlines,
        "NHD_waterbodies" = nhd_waterbodies
      )

      return(nhd_list)
    } else if (length(features) == 3 && "catchments" %in% features && "flowlines" %in% features && "waterbodies" %in% features) {
      nhd_list <- list(
        "NHD_catchments" = nhd_catchments,
        "NHD_flowlines" = nhd_flowlines,
        "NHD_waterbodies" = nhd_waterbodies
      )

      return(nhd_list)
    } else {
      stop("Please select between 'catchments', 'flowlines', 'waterbodies', or any combination for `features` argument.")
    }
  } else {
    stop('User-supplied resolution unavailable. Please select between "Med" or "Hi".')
  }
}



#' TADA_GetATTAINS
#'
#' Link catchment-based ATTAINS assessment unit data (EPA snapshot of NHDPlus HR catchments associated with entity submitted assessment unit features - points, lines, and polygons) to Water Quality Portal observations, often imported via `TADA_DataRetrieval()`. This function returns the objects that can be mapped in `TADA_ViewATTAINS()`. Check out the
#' TADAModule2.Rmd for an example workflow.
#'
#' Adds one new column to input dataframe, 'index', which identifies rows that are the same observation but are linked to multiple ATTAINS assessment units. It is possible for a single TADA WQP observation to have multiple ATTAINS assessment units linked to it and subsequently more than one row of data.
#'
#' If TADA_MakeSpatial has not yet been run, this function runs it which also adds another new column to the input dataframe, 'geometry', which allows for mapping and additional geospatial capabilities.
#'
#' @param .data A dataframe created by `TADA_DataRetrieval()` or the sf equivalent made by `TADA_MakeSpatial()`.
#' @param fill_catchments Whether the user would like to return NHD catchments for WQP observations not associated with an ATTAINS assessment unit (TRUE or FALSE). When fill_catchments = TRUE, the returned list splits observations into two dataframes: WQP observations with ATTAINS catchment data, and WQP observations without ATTAINS catchment data. Defaults to FALSE.
#' @param resolution If fill_catchments = TRUE, whether to use NHDPlus V2 "Med" catchments or NHDPlus HiRes "Hi" catchments. Default is NHDPlus HiRes ("Hi").
+#' @param return_sf Whether to return the ATTAINS associated catchments, lines, points, and polygon shapefile objects along with the data frame(s). TRUE (yes, return list) or FALSE (no, do not return). All shapefile features are in WGS84 (crs = 4326). If fill_catchments = TRUE and return_sf = TRUE, the function will additionally return the raw catchment features associated with the observations in TADA_without_ATTAINS in a new shapefile called without_ATTAINS_catchments. Defaults to TRUE. +#' +#' @return A modified `TADA_DataRetrieval()` dataframe or list with additional columns associated with the ATTAINS assessment unit data, and, if fill_catchments = TRUE, an additional dataframe of the observations without intersecting ATTAINS features. +#' Moreover, if return_sf = TRUE, this function will additionally return the raw ATTAINS and catchment shapefile features associated with those observations. +#' +#' @seealso [TADA_DataRetrieval()] +#' @seealso [TADA_MakeSpatial()] +#' @seealso [TADA_ViewATTAINS()] +#' +#' @export +#' +#' @examples +#' \dontrun{ +#' tada_data <- TADA_DataRetrieval( +#' startDate = "2018-05-01", +#' endDate = "2018-07-31", +#' characteristicName = "pH", +#' statecode = "IL", +#' applyautoclean = TRUE +#' ) +#' +#' # note: these example ATTAINS data retrieval queries below may take a long time (10+ minutes) to run +#' +#' tada_attains <- TADA_GetATTAINS(tada_data, fill_catchments = FALSE, return_sf = FALSE) +#' +#' tada_attains_sf <- TADA_GetATTAINS(tada_data, fill_catchments = FALSE, return_sf = TRUE) +#' +#' tada_attains_filled <- TADA_GetATTAINS(tada_data, fill_catchments = TRUE, resolution = "Hi", return_sf = FALSE) +#' +#' tada_attains_filled_sf <- TADA_GetATTAINS(tada_data, fill_catchments = TRUE, resolution = "Hi", return_sf = TRUE) +#' } +TADA_GetATTAINS <- function(.data, fill_catchments = FALSE, resolution = "Hi", return_sf = TRUE) { + sf::sf_use_s2(FALSE) + + attains_names <- c( + "ATTAINS.organizationid", "ATTAINS.submissionid", 
"ATTAINS.hasprotectionplan", + "ATTAINS.assessmentunitname", "ATTAINS.nhdplusid", "ATTAINS.tas303d", + "ATTAINS.isthreatened", "ATTAINS.state", "ATTAINS.on303dlist", + "ATTAINS.organizationname", "ATTAINS.region", "ATTAINS.Shape_Length", + "ATTAINS.reportingcycle", "ATTAINS.assmnt_joinkey", "ATTAINS.hastmdl", + "ATTAINS.orgtype", "ATTAINS.permid_joinkey", "ATTAINS.catchmentistribal", + "ATTAINS.ircategory", "ATTAINS.waterbodyreportlink", "ATTAINS.assessmentunitidentifier", + "ATTAINS.overallstatus", "ATTAINS.isassessed", "ATTAINS.isimpaired", + "ATTAINS.has4bplan", "ATTAINS.huc12", "ATTAINS.hasalternativeplan", + "ATTAINS.visionpriority303d", "ATTAINS.areasqkm", "ATTAINS.catchmentareasqkm", + "ATTAINS.catchmentstatecode", "ATTAINS.catchmentresolution", "ATTAINS.waterTypeCode", + "ATTAINS.Shape_Area" + ) + + if (any(attains_names %in% colnames(.data))) { + stop("Your data has already been joined with ATTAINS data.") + } + + if (nrow(.data) == 0) { + print("Your Water Quality Portal dataframe has no observations. 
Returning an empty dataframe with empty ATTAINS features.") + + # if no WQP observations, return a modified `data` with empty ATTAINS-related columns: + + # Add ATTAINS columns with NA values + col_val_list <- stats::setNames( + object = rep( + x = list(NA), + times = length(attains_names) + ), + nm = attains_names + ) + + no_WQP_data <- .data %>% + dplyr::mutate(index = NA) %>% + dplyr::bind_cols(col_val_list) + + # In this case we'll need to return empty ATTAINS objects + if (return_sf == TRUE) { + ATTAINS_catchments <- NULL + ATTAINS_lines <- NULL + ATTAINS_points <- NULL + ATTAINS_polygons <- NULL + + return(list( + "TADA_with_ATTAINS" = no_WQP_data, + "ATTAINS_catchments" = ATTAINS_catchments, + "ATTAINS_points" = ATTAINS_points, + "ATTAINS_lines" = ATTAINS_lines, + "ATTAINS_polygons" = ATTAINS_polygons + )) + # If ATTAINS objects not requested, then just return the dataframe: + } else { + return(no_WQP_data) + } + } + + # If data doesn't already contain ATTAINS data and isn't an empty dataframe: + suppressMessages(suppressWarnings({ + # If data is already spatial, just make sure it is in the right CRS + # and add unique WQP ID for identifying obs with more than one ATTAINS assessment unit + + if (!is.null(.data) & inherits(.data, "sf")) { + if (sf::st_crs(.data)$epsg != 4326) { + TADA_DataRetrieval_data <- .data %>% + sf::st_transform(4326) %>% + tibble::rowid_to_column(var = "index") + } else { + TADA_DataRetrieval_data <- .data %>% + tibble::rowid_to_column(var = "index") + } + } else { + # ... 
Otherwise transform into a spatial object then do the same thing: + TADA_DataRetrieval_data <- .data %>% + # convert dataframe to a spatial object + TADA_MakeSpatial(.data = ., crs = 4326) %>% + # add unique WQP ID for identifying obs with more than one ATTAINS assessment unit + tibble::rowid_to_column(var = "index") + } + })) + + if (return_sf == TRUE) { + # grab all ATTAINS features that intersect our WQP objects: + attains_features <- try(fetchATTAINS(.data = TADA_DataRetrieval_data), silent = TRUE) + } + + if (return_sf == FALSE) { + # grab all ATTAINS features that intersect our WQP objects: + attains_features <- try(fetchATTAINS(.data = TADA_DataRetrieval_data, catchments_only = FALSE), silent = TRUE) + } + + # Tidy up the intersecting catchment objects: + suppressMessages(suppressWarnings({ + nearby_catchments <- NULL + # (Wrapped with "try" because it is possible that no ATTAINS data exists in the bbox.) + try( + nearby_catchments <- attains_features[["ATTAINS_catchments"]] %>% + # remove unnecessary columns: + dplyr::select(-c(OBJECTID, GLOBALID)) %>% + # select only catchments that have WQP observations in them: + .[TADA_DataRetrieval_data, ] %>% + # add prefix "ATTAINS" to ATTAINS data + dplyr::rename_with(~ paste0("ATTAINS.", .), dplyr::everything()) %>% + # get rid of dupes (as a precaution) + dplyr::distinct(.keep_all = TRUE), + silent = TRUE + ) + if (is.null(nearby_catchments) || nrow(nearby_catchments) == 0) { + nearby_catchments <- NULL + } + })) + + # If no ATTAINS data associated with WQP obs... + if (is.null(nearby_catchments)) { + col_val_list <- stats::setNames( + object = rep( + x = list(NA), + times = length(attains_names) + ), + nm = attains_names + ) + + # ...return a modified `.data` with empty ATTAINS-related columns: + no_ATTAINS_data <- .data %>% + dplyr::bind_cols(col_val_list) %>% + tibble::rowid_to_column(var = "index") + + print("There are no ATTAINS catchments associated with these WQP observations. 
Returning an empty data frame for `TADA_with_ATTAINS`.") + + if (fill_catchments == FALSE) { + # If there are no intersecting ATTAINS catchments, fill_catchments = FALSE, and return_sf = TRUE, return empty sf features with the + # empty TADA_with_ATTAINS df. + if (return_sf == TRUE) { + ATTAINS_catchments <- NULL + ATTAINS_lines <- NULL + ATTAINS_points <- NULL + ATTAINS_polygons <- NULL + + return(list( + "TADA_with_ATTAINS" = no_ATTAINS_data, + "ATTAINS_catchments" = ATTAINS_catchments, + "ATTAINS_points" = ATTAINS_points, + "ATTAINS_lines" = ATTAINS_lines, + "ATTAINS_polygons" = ATTAINS_polygons + )) + } else { + # If there are no intersecting ATTAINS catchments, fill_catchments = FALSE, and return_sf = FALSE, just return the + # empty TADA_with_ATTAINS df. + return(no_ATTAINS_data) + } + } else if (fill_catchments == TRUE) { + # "Downloading NHD data to fill in missing ATTAINS features. Depending on the number of observations and their spatial extent, this might take a while... + nhd_catchments <- fetchNHD(.data = TADA_DataRetrieval_data, resolution = resolution) + + TADA_without_ATTAINS <- TADA_DataRetrieval_data %>% + sf::st_join(nhd_catchments, left = TRUE) + + # If there are no intersecting ATTAINS catchments, fill_catchments = TRUE, and return_sf = TRUE, return empty sf features with the + # empty TADA_with_ATTAINS df PLUS the intersecting NHD catchment features of choice and TADA_without_ATTAINS dataframe. 
+ if (return_sf == TRUE) { + ATTAINS_catchments <- NULL + ATTAINS_lines <- NULL + ATTAINS_points <- NULL + ATTAINS_polygons <- NULL + + return(list( + # must remove all obs from TADA_with_ATTAINS, since all exist in TADA_without_ATTAINS + "TADA_with_ATTAINS" = no_ATTAINS_data[0, ], + "TADA_without_ATTAINS" = TADA_without_ATTAINS, + "ATTAINS_catchments" = ATTAINS_catchments, + "ATTAINS_points" = ATTAINS_points, + "ATTAINS_lines" = ATTAINS_lines, + "ATTAINS_polygons" = ATTAINS_polygons, + "without_ATTAINS_catchments" = nhd_catchments + )) + + # If there are no intersecting ATTAINS catchments, fill_catchments = TRUE, and return_sf = FALSE return empty sf features with the + # empty TADA_with_ATTAINS df PLUS just the TADA_without_ATTAINS df (i.e., no shapefiles returned). + } else { + return(list( + "TADA_with_ATTAINS" = no_ATTAINS_data[0, ], + "TADA_without_ATTAINS" = TADA_without_ATTAINS + )) + } + } + } + + # If there IS at least some ATTAINS data, and fill_catchments = FALSE... + if (!is.null(nearby_catchments) & fill_catchments == FALSE) { + suppressMessages(suppressWarnings({ + # ... link WQP features to the ATTAINS catchment feature(s) they land in: + TADA_with_ATTAINS <- TADA_DataRetrieval_data %>% + # (left join = TRUE to preserve all observations (with or without ATTAINS features):) + sf::st_join(., nearby_catchments, left = TRUE) + + # If there are intersecting ATTAINS catchments, fill_catchments = FALSE, and return_sf = FALSE, return just the + # TADA_with_ATTAINS df + if (return_sf == FALSE) { + return(TADA_with_ATTAINS) + } + # ... 
otherwise return_sf = TRUE, and therefore must grab ATTAINS features, too: + + # CATCHMENT FEATURES + # use original catchment pull, but return column names to original + ATTAINS_catchments <- nearby_catchments + colnames(ATTAINS_catchments) <- gsub("ATTAINS.", "", colnames(ATTAINS_catchments)) + # due to the rename, must re-set geometry column: + sf::st_geometry(ATTAINS_catchments) <- "geometry" + + # POINT FEATURES - try to pull point AU data if it exists. Otherwise, move on... + ATTAINS_points <- NULL + try( + ATTAINS_points <- attains_features[["ATTAINS_points"]] %>% + # subset to only ATTAINS point features in the same NHD HR catchments as WQP observations + .[nearby_catchments, ] %>% + # make sure no duplicate features exist + dplyr::distinct(assessmentunitidentifier, .keep_all = TRUE), + silent = TRUE + ) + if (is.null(ATTAINS_points) || nrow(ATTAINS_points) == 0) { + ATTAINS_points <- NULL + } + + # LINE FEATURES - try to pull line AU data if it exists. Otherwise, move on... + ATTAINS_lines <- NULL + try( + ATTAINS_lines <- attains_features[["ATTAINS_lines"]] %>% # subset to only ATTAINS line features in the same NHD HR catchments as WQP observations .[nearby_catchments, ] %>% # make sure no duplicate line features exist dplyr::distinct(assessmentunitidentifier, .keep_all = TRUE), silent = TRUE ) - + if (is.null(ATTAINS_lines) || nrow(ATTAINS_lines) == 0) { + ATTAINS_lines <- NULL + } + # POLYGON FEATURES - try to pull polygon AU data if it exists. Otherwise, move on... 
ATTAINS_polygons <- NULL try( @@ -450,8 +1212,11 @@ TADA_GetATTAINS <- function(.data, return_sf = TRUE) { dplyr::distinct(assessmentunitidentifier, .keep_all = TRUE), silent = TRUE ) + if (is.null(ATTAINS_polygons) || nrow(ATTAINS_polygons) == 0) { + ATTAINS_polygons <- NULL + } })) - + # If there are ATTAINS catchments, fill_catchments = FALSE, and return_sf = TRUE: return(list( "TADA_with_ATTAINS" = TADA_with_ATTAINS, "ATTAINS_catchments" = ATTAINS_catchments, @@ -459,7 +1224,126 @@ TADA_GetATTAINS <- function(.data, return_sf = TRUE) { "ATTAINS_lines" = ATTAINS_lines, "ATTAINS_polygons" = ATTAINS_polygons )) - } + + # If there IS at least some ATTAINS data, and fill_catchments = TRUE... + } else if (!is.null(nearby_catchments) & fill_catchments == TRUE) { + # ... link WQP features to the ATTAINS catchment feature(s) they land in: + TADA_with_ATTAINS <- TADA_DataRetrieval_data %>% + sf::st_join(., nearby_catchments, left = TRUE) + + missing_attains <- dplyr::filter(TADA_with_ATTAINS, is.na(ATTAINS.submissionid)) + + # Splitting up sites with and without ATTAINS, so remove those without ATTAINS: + TADA_with_ATTAINS <- TADA_with_ATTAINS %>% + dplyr::filter(!is.na(ATTAINS.submissionid)) + + # If there are no WQP observations without missing ATTAINS features, return empty df for + # TADA_without_ATTAINS + if (nrow(missing_attains) == 0) { + print('All WQP features intersect an ATTAINS catchment. 
Returning empty dataframe for "TADA_without_ATTAINS".') + + if (resolution %in% c("Med", "med")) { + TADA_without_ATTAINS <- tibble::tibble( + NHD.comid = character(), + NHD.resolution = character(), + NHD.catchmentareasqkm = numeric() + ) + nhd_catchments <- NULL + } else if (resolution %in% c("Hi", "hi")) { + TADA_without_ATTAINS <- tibble::tibble( + NHD.nhdplusid = character(), + NHD.resolution = character(), + NHD.catchmentareasqkm = numeric() + ) + + nhd_catchments <- NULL + } else { + stop('Please select between "Med" or "Hi" for your NHD resolution.') + } + } + + # If there are some observations with no attains features, grab those sites' intersecting NHD catchments: + if (nrow(missing_attains) > 0) { + # Downloading NHD data to fill in missing ATTAINS features. Depending on the number of observations and + # their spatial extent, this can take a while. + nhd_catchments <- fetchNHD( + .data = missing_attains, + resolution = resolution + ) + + TADA_without_ATTAINS <- missing_attains %>% + # left join = TRUE to preserve all observations: + sf::st_join(., nhd_catchments, left = TRUE) + + # if there are intersecting ATTAINS, fill_catchments = TRUE, and if return_sf = FALSE, return just the dfs: + if (return_sf == FALSE) { + return(list( + "TADA_with_ATTAINS" = TADA_with_ATTAINS, + "TADA_without_ATTAINS" = TADA_without_ATTAINS + )) + } + } + + # CATCHMENT FEATURES + # use original catchment pull, but return column names to original + ATTAINS_catchments <- nearby_catchments + colnames(ATTAINS_catchments) <- gsub("ATTAINS.", "", colnames(ATTAINS_catchments)) + # due to the rename, must re-set geometry column: + sf::st_geometry(ATTAINS_catchments) <- "geometry" + + # POINT FEATURES - try to pull point AU data if it exists. Otherwise, move on... 
+ ATTAINS_points <- NULL + try( + ATTAINS_points <- attains_features[["ATTAINS_points"]] %>% + # subset to only ATTAINS point features in the same NHD HR catchments as WQP observations + .[nearby_catchments, ] %>% + # make sure no duplicate features exist + dplyr::distinct(assessmentunitidentifier, .keep_all = TRUE), + silent = TRUE + ) + if (is.null(ATTAINS_points) || nrow(ATTAINS_points) == 0) { + ATTAINS_points <- NULL + } + + # LINE FEATURES - try to pull line AU data if it exists. Otherwise, move on... + ATTAINS_lines <- NULL + try( + ATTAINS_lines <- attains_features[["ATTAINS_lines"]] %>% + # subset to only ATTAINS line features in the same NHD HR catchments as WQP observations + .[nearby_catchments, ] %>% + # make sure no duplicate line features exist + dplyr::distinct(assessmentunitidentifier, .keep_all = TRUE), + silent = TRUE + ) + if (is.null(ATTAINS_lines) || nrow(ATTAINS_lines) == 0) { + ATTAINS_lines <- NULL + } + + # POLYGON FEATURES - try to pull polygon AU data if it exists. Otherwise, move on... + ATTAINS_polygons <- NULL + try( + ATTAINS_polygons <- attains_features[["ATTAINS_polygons"]] %>% + # subset to only ATTAINS polygon features in the same NHD HR catchments as WQP observations + .[nearby_catchments, ] %>% + # make sure no duplicate polygon features exist + dplyr::distinct(assessmentunitidentifier, .keep_all = TRUE), + silent = TRUE + ) + if (is.null(ATTAINS_polygons) || nrow(ATTAINS_polygons) == 0) { + ATTAINS_polygons <- NULL + } + + # if there is ATTAINS catchment data, fill_catchments = TRUE, return_sf = TRUE, return everything! 
+ return(list( + "TADA_with_ATTAINS" = TADA_with_ATTAINS, + "TADA_without_ATTAINS" = TADA_without_ATTAINS, + "ATTAINS_catchments" = ATTAINS_catchments, + "ATTAINS_points" = ATTAINS_points, + "ATTAINS_lines" = ATTAINS_lines, + "ATTAINS_polygons" = ATTAINS_polygons, + "without_ATTAINS_catchments" = nhd_catchments + )) + } # else {stop("Some combination of arguments is impossible.")} } @@ -467,11 +1351,10 @@ TADA_GetATTAINS <- function(.data, return_sf = TRUE) { #' #' Visualizes the data returned from TADA_GetATTAINS if return_sf was set to TRUE. #' -#' This function visualizes the raw ATTAINS features that are linked to the -#' TADA Water Quality Portal observations. For the function to work properly, -#' the input dataframe must be the list produced from `TADA_GetATTAINS()` -#' with `return_sf = TRUE`. The map also displays the Water Quality Portal -#' monitoring locations used to find the ATTAINS features. Check out the +#' This function visualizes the shapefile features generated with TADA_GetATTAINS and the associated +#' TADA Water Quality Portal monitoring locations used to find the ATTAINS features. For the function to work properly, +#' .data must be the list produced from `TADA_GetATTAINS()` +#' with `return_sf = TRUE`. Check out the #' TADAModule2.Rmd for an example workflow. #' #' @param .data A list containing a data frame and ATTAINS shapefile objects created by `TADA_GetATTAINS()` with the return_sf argument set to TRUE. @@ -499,302 +1382,516 @@ TADA_GetATTAINS <- function(.data, return_sf = TRUE) { #' } TADA_ViewATTAINS <- function(.data) { if (!any(c( - "TADA_with_ATTAINS", "ATTAINS_catchments", "ATTAINS_points", + "ATTAINS_catchments", "ATTAINS_points", "ATTAINS_lines", "ATTAINS_polygons" ) %in% names(.data))) { - stop("Your input dataframe was not produced from `TADA_GetATTAINS()` or it was modified. 
Please create your list of ATTAINS features using `TADA_GetATTAINS()` and confirm that return_sf has been set to TRUE.") + stop("Your input dataframe was not produced from `TADA_GetATTAINS(return_sf = TRUE)`, or it was modified. Please create your list of ATTAINS features using `TADA_GetATTAINS()` and confirm that return_sf had been set to TRUE.") } - + ATTAINS_table <- .data[["TADA_with_ATTAINS"]] ATTAINS_catchments <- .data[["ATTAINS_catchments"]] ATTAINS_points <- .data[["ATTAINS_points"]] ATTAINS_lines <- .data[["ATTAINS_lines"]] ATTAINS_polygons <- .data[["ATTAINS_polygons"]] - - if (nrow(ATTAINS_table) == 0) { - stop("Your WQP dataframe has no observations.") - } - - required_columns <- c( - "LongitudeMeasure", "LatitudeMeasure", - "HorizontalCoordinateReferenceSystemDatumName", - "CharacteristicName", "MonitoringLocationIdentifier", - "MonitoringLocationName", "ResultIdentifier", - "ActivityStartDate", "OrganizationIdentifier" - ) - - if (!any(required_columns %in% colnames(ATTAINS_table))) { - stop("Your dataframe does not contain the necessary WQP-style column names.") - } - - suppressMessages(suppressWarnings({ - sf::sf_use_s2(FALSE) - - # if data was spatial, remove for downstream leaflet dev: - try(ATTAINS_table <- ATTAINS_table %>% - sf::st_drop_geometry(), silent = TRUE) - - tada.pal <- TADA_ColorPalette() - - colors <- data.frame( - overallstatus = c("Not Supporting", "Fully Supporting", "Not Assessed"), - col = c(tada.pal[3], tada.pal[4], tada.pal[7]), - dark_col = c(tada.pal[12], tada.pal[6], tada.pal[11]), - priority = c(1, 2, 3) - ) - - # POINT FEATURES - try to pull point AU data if it exists. Otherwise, move on... - try( - points_mapper <- ATTAINS_points %>% - dplyr::left_join(., colors, by = "overallstatus") %>% - dplyr::mutate(type = "Point Feature") %>% - tibble::rowid_to_column(var = "index") %>% - # some point features are actually multipoint features. 
Must extract all coordinates for mapping - # later: - dplyr::right_join(., tibble::as_tibble(sf::st_coordinates(ATTAINS_points)), by = c("index" = "L1")), - silent = TRUE - ) - - # LINE FEATURES - try to pull line AU data if it exists. Otherwise, move on... - try( - lines_mapper <- ATTAINS_lines %>% - dplyr::left_join(., colors, by = "overallstatus") %>% - dplyr::mutate(type = "Line Feature"), - silent = TRUE - ) - - # POLYGON FEATURES - try to pull polygon AU data if it exists. Otherwise, move on... - try( - polygons_mapper <- ATTAINS_polygons %>% - dplyr::left_join(., colors, by = "overallstatus") %>% - dplyr::mutate(type = "Polygon Feature"), - silent = TRUE - ) - - # Develop WQP site stats (e.g. count of observations, parameters, per site) - sumdat <- ATTAINS_table %>% - dplyr::group_by(MonitoringLocationIdentifier, MonitoringLocationName, LatitudeMeasure, LongitudeMeasure) %>% - dplyr::summarize( - Sample_Count = length(unique(ResultIdentifier)), - Visit_Count = length(unique(ActivityStartDate)), - Parameter_Count = length(unique(CharacteristicName)), - Organization_Count = length(unique(OrganizationIdentifier)), - ATTAINS_AUs = as.character(list(unique(ATTAINS.assessmentunitidentifier))) - ) %>% - dplyr::mutate( - ATTAINS_AUs = ifelse(is.na(ATTAINS_AUs), "None", ATTAINS_AUs), - LatitudeMeasure = as.numeric(LatitudeMeasure), - LongitudeMeasure = as.numeric(LongitudeMeasure) - ) - - # Basemap for AOI: - map <- leaflet::leaflet() %>% - leaflet::addProviderTiles("Esri.WorldTopoMap", - group = "World topo", - options = leaflet::providerTileOptions( - updateWhenZooming = FALSE, - updateWhenIdle = TRUE - ) - ) %>% - leaflet::clearShapes() %>% - leaflet::fitBounds( - lng1 = min(sumdat$LongitudeMeasure), - lat1 = min(sumdat$LatitudeMeasure), - lng2 = max(sumdat$LongitudeMeasure), - lat2 = max(sumdat$LatitudeMeasure) - ) %>% - leaflet.extras::addResetMapButton() %>% - leaflet::addLegend( - position = "bottomright", - colors = c(tada.pal[3], tada.pal[4], tada.pal[7], 
"black", NA), - labels = c( - "ATTAINS: Not Supporting", "ATTAINS: Supporting", "ATTAINS: Not Assessed", "Water Quality Observation(s)", - "NHDPlus HR catchments containing water quality observations + ATTAINS feature are represented as clear polygons with black outlines." - ), - opacity = 1, - title = "Legend" - ) - - # Add ATTAINS catchment outlines (if they exist): - try( - map <- map %>% - leaflet::addPolygons( - data = ATTAINS_catchments, - color = "black", - weight = 1, fillOpacity = 0, - popup = paste0("NHDPlus HR Catchment ID: ", ATTAINS_catchments$nhdplusid) - ), - silent = TRUE - ) - - # Add ATTAINS polygon features (if they exist): - try( - map <- map %>% - leaflet::addPolygons( - data = polygons_mapper, - color = ~ polygons_mapper$col, - fill = ~ polygons_mapper$col, - weight = 3, fillOpacity = 1, - popup = paste0( - "Assessment Unit Name: ", polygons_mapper$assessmentunitname, - "
Assessment Unit ID: ", polygons_mapper$assessmentunitidentifier, - "
Status: ", polygons_mapper$overallstatus, - "
Assessment Unit Type: ", polygons_mapper$type, - "
ATTAINS Link" - ) - ), - silent = TRUE - ) - - # Add ATTAINS lines features (if they exist): - try( - map <- map %>% - leaflet::addPolylines( - data = lines_mapper, - color = ~ lines_mapper$col, - weight = 4, fillOpacity = 1, - popup = paste0( - "Assessment Unit Name: ", lines_mapper$assessmentunitname, - "
Assessment Unit ID: ", lines_mapper$assessmentunitidentifier, - "
Status: ", lines_mapper$overallstatus, - "
Assessment Unit Type: ", lines_mapper$type, - "
ATTAINS Link" - ) - ), - silent = TRUE - ) - - # Add ATTAINS point features (if they exist): - try( - map <- map %>% - leaflet::addCircleMarkers( - data = points_mapper, - lng = ~X, lat = ~Y, - color = ~ points_mapper$col, fillColor = ~ points_mapper$col, - fillOpacity = 1, stroke = TRUE, weight = 1.5, radius = 5, - popup = paste0( - "Assessment Unit Name: ", points_mapper$assessmentunitname, - "
Assessment Unit ID: ", points_mapper$assessmentunitidentifier, - "
Status: ", points_mapper$overallstatus, - "
Assessment Unit Type: ", points_mapper$type, - "
ATTAINS Link" - ) - ), - silent = TRUE - ) - - # Add WQP observation features (should always exist): - try( - map <- map %>% - leaflet::addCircleMarkers( - data = sumdat, - lng = ~LongitudeMeasure, lat = ~LatitudeMeasure, - color = "grey", fillColor = "black", - fillOpacity = 0.8, stroke = TRUE, weight = 1.5, radius = 6, - popup = paste0( - "Site ID: ", sumdat$MonitoringLocationIdentifier, - "
Site Name: ", sumdat$MonitoringLocationName, - "
Measurement Count: ", sumdat$Sample_Count, - "
Visit Count: ", sumdat$Visit_Count, - "
Characteristic Count: ", sumdat$Parameter_Count, - "
ATTAINS Assessment Unit(s): ", sumdat$ATTAINS_AUs - ) - ), - silent = TRUE + + # ATTAINS API seems to be missing some AU data that is still preserved in the catchment layer. + # Use catchments for those instances for mapping purposes: + missing_raw_features <- NULL + + try(missing_raw_features <- ATTAINS_catchments %>% + dplyr::filter(!assessmentunitidentifier %in% c( + ATTAINS_points$assessmentunitidentifier, + ATTAINS_lines$assessmentunitidentifier, + ATTAINS_polygons$assessmentunitidentifier + )), silent = TRUE) + + if (!"without_ATTAINS_catchments" %in% names(.data)) { + if (nrow(ATTAINS_table) == 0) { + stop("Your WQP dataframe has no observations.") + } + + required_columns <- c( + "LongitudeMeasure", "LatitudeMeasure", + "HorizontalCoordinateReferenceSystemDatumName", + "CharacteristicName", "MonitoringLocationIdentifier", + "MonitoringLocationName", "ResultIdentifier", + "ActivityStartDate", "OrganizationIdentifier" ) - - if (is.null(ATTAINS_lines) & is.null(ATTAINS_points) & is.null(ATTAINS_polygons)) { - print("No ATTAINS data associated with this Water Quality Portal data.") + + if (!any(required_columns %in% colnames(ATTAINS_table))) { + stop("Your dataframe does not contain the necessary WQP-style column names.") } - - # Return leaflet map of TADA WQ and its associated ATTAINS data - return(map) - })) -} + suppressMessages(suppressWarnings({ + sf::sf_use_s2(FALSE) -#' Access options available for querying tribal spatial data with `TADA_DataRetrieval()`. -#' -#' @description -#' This function provides access to [six layer datasets](https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer) -#' containing spatial data related to tribal lands: "Alaska Native Allotments", -#' "Alaska Native Villages", "American Indian Reservations", "Off-reservation Trust Lands", -#' "Oklahoma Tribal Statistical Areas", and "Virginia Federally Recognized Tribes". 
-#' These datasets are used by `TADA_DataRetrieval()` when retrieving spatial data -#' for tribal lands specified by the user. -#' -#' The purpose of `TADA_TribalOptions()` is to allow the user to review the available -#' data in those datasets and identify the records they would like to query with -#' `TADA_DataRetrieval()`. -#' -#' An interactive map of the six layer datasets is available on ArcGIS Online Map -#' Viewer here: https://www.arcgis.com/apps/mapviewer/index.html?url=https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer&source=sd -#' -#' @param tribal_area_type A character string. Must be one of the six tribal -#' spatial layers: "Alaska Native Allotments", "Alaska Native Villages", -#' "American Indian Reservations", "Off-reservation Trust Lands", -#' "Oklahoma Tribal Statistical Areas", or "Virginia Federally Recognized Tribes". -#' -#' @param return_sf Logical. Should the function return the dataset as an `sf` -#' object (TRUE) or a data frame (FALSE)? Defaults to FALSE. -#' -#' @returns A data frame or `sf` object containing the specified layer from the EPA -#' Map Service. -#' -#' @note -#' Alaska Native Villages and Virginia Federally Recognized Tribes are point -#' geometries in the Map Service, not polygons. At the time of this writing they -#' do not return any data when used for WQP bbox queries. 
-#' -#' @seealso [TADA_DataRetrieval()] -#' -#' @export -#' - -TADA_TribalOptions <- function(tribal_area_type, return_sf = FALSE){ - - # Make a reference table for tribal area type + url matching - map_service_urls <- tibble::tribble( - ~tribal_area, ~url, - "Alaska Native Allotments", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/0", - "Alaska Native Villages", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/1", - "American Indian Reservations", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/2", - "Off-reservation Trust Lands", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/3", - "Oklahoma Tribal Statistical Areas", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/4", - "Virginia Federally Recognized Tribes", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/5" - ) - - # Confirm usable string provided - if( !(tribal_area_type %in% map_service_urls$tribal_area) ){ - stop("tribal_area_type must match one of the six tribal spatial layer names.") - } - - if( tribal_area_type %in% - c("Alaska Native Villages", "Virginia Federally Recognized Tribes") ){ - warning( - paste0( - "Alaska Native Villages and Virginia Federally Recognized Tribes are point geometries in the Map Service, not polygons. ", - "At the time of this writing they do not return any data when used for WQP bbox queries." + # if data was spatial, remove for downstream leaflet dev: + try(ATTAINS_table <- ATTAINS_table %>% + sf::st_drop_geometry(), silent = TRUE) + + tada.pal <- TADA_ColorPalette() + + colors <- data.frame( + overallstatus = c("Not Supporting", "Fully Supporting", "Not Assessed"), + col = c(tada.pal[3], tada.pal[4], tada.pal[7]), + dark_col = c(tada.pal[12], tada.pal[6], tada.pal[11]), + priority = c(1, 2, 3) ) - ) + + # POINT FEATURES - try to pull point AU data if it exists. Otherwise, move on... 
+ try( + points_mapper <- ATTAINS_points %>% + dplyr::left_join(., colors, by = "overallstatus") %>% + dplyr::mutate(type = "Point Feature") %>% + tibble::rowid_to_column(var = "index") %>% + # some point features are actually multipoint features. Must extract all coordinates for mapping + # later: + dplyr::right_join(., tibble::as_tibble(sf::st_coordinates(ATTAINS_points)), by = c("index" = "L1")), + silent = TRUE + ) + + # LINE FEATURES - try to pull line AU data if it exists. Otherwise, move on... + try( + lines_mapper <- ATTAINS_lines %>% + dplyr::left_join(., colors, by = "overallstatus") %>% + dplyr::mutate(type = "Line Feature"), + silent = TRUE + ) + + # POLYGON FEATURES - try to pull polygon AU data if it exists. Otherwise, move on... + try( + polygons_mapper <- ATTAINS_polygons %>% + dplyr::left_join(., colors, by = "overallstatus") %>% + dplyr::mutate(type = "Polygon Feature"), + silent = TRUE + ) + + # CATCHMENT FEATURES - try to pull missing feature AU data if it exists. Otherwise, move on... + try( + missing_raw_mapper <- missing_raw_features %>% + dplyr::left_join(., colors, by = "overallstatus") %>% + dplyr::mutate(type = "Raw Feature Unavailable"), + silent = TRUE + ) + + # Develop WQP site stats (e.g. 
count of observations, parameters, per site) + sumdat <- ATTAINS_table %>% + dplyr::group_by(MonitoringLocationIdentifier, MonitoringLocationName, LatitudeMeasure, LongitudeMeasure) %>% + dplyr::summarize( + Sample_Count = length(unique(ResultIdentifier)), + Visit_Count = length(unique(ActivityStartDate)), + Parameter_Count = length(unique(CharacteristicName)), + Organization_Count = length(unique(OrganizationIdentifier)), + ATTAINS_AUs = as.character(list(unique(ATTAINS.assessmentunitidentifier))) + ) %>% + dplyr::mutate( + ATTAINS_AUs = ifelse(is.na(ATTAINS_AUs), "None", ATTAINS_AUs), + LatitudeMeasure = as.numeric(LatitudeMeasure), + LongitudeMeasure = as.numeric(LongitudeMeasure) + ) + + # Basemap for AOI: + map <- leaflet::leaflet() %>% + leaflet::addProviderTiles("Esri.WorldTopoMap", + group = "World topo", + options = leaflet::providerTileOptions( + updateWhenZooming = FALSE, + updateWhenIdle = TRUE + ) + ) %>% + leaflet::clearShapes() %>% + leaflet::fitBounds( + lng1 = min(sumdat$LongitudeMeasure), + lat1 = min(sumdat$LatitudeMeasure), + lng2 = max(sumdat$LongitudeMeasure), + lat2 = max(sumdat$LatitudeMeasure) + ) %>% + leaflet.extras::addResetMapButton() %>% + leaflet::addLegend( + position = "bottomright", + colors = c(tada.pal[3], tada.pal[4], tada.pal[7], "black", NA), + labels = c( + "ATTAINS: Not Supporting", "ATTAINS: Supporting", "ATTAINS: Not Assessed", "Water Quality Observation(s)", + "NHDPlus HR catchments containing water quality observations + ATTAINS feature are represented as clear polygons with black outlines." 
+ ), + opacity = 1, + title = "Legend" + ) + + # Add ATTAINS catchment outlines (if they exist): + try( + map <- map %>% + leaflet::addPolygons( + data = ATTAINS_catchments, + color = "black", + weight = 1, fillOpacity = 0, + popup = paste0("NHDPlus HR Catchment ID: ", ATTAINS_catchments$nhdplusid) + ), + silent = TRUE + ) + + # Add ATTAINS catchment outlines as AUs: + try( + map <- map %>% + leaflet::addPolygons( + data = missing_raw_mapper, + color = ~ missing_raw_mapper$col, + fill = ~ missing_raw_mapper$col, + weight = 3, fillOpacity = 0.25, + popup = paste0( + "Assessment Unit Name: ", missing_raw_mapper$assessmentunitname, + "
Assessment Unit ID: ", missing_raw_mapper$assessmentunitidentifier, + "
Status: ", missing_raw_mapper$overallstatus, + "
Assessment Unit Type: ", missing_raw_mapper$type, + "
ATTAINS Link", + "
NHDPlus HR Catchment ID: ", missing_raw_mapper$nhdplusid + ) + ), + silent = TRUE + ) + + # Add ATTAINS polygon features (if they exist): + try( + map <- map %>% + leaflet::addPolygons( + data = polygons_mapper, + color = ~ polygons_mapper$col, + fill = ~ polygons_mapper$col, + weight = 3, fillOpacity = 1, + popup = paste0( + "Assessment Unit Name: ", polygons_mapper$assessmentunitname, + "
Assessment Unit ID: ", polygons_mapper$assessmentunitidentifier, + "
Status: ", polygons_mapper$overallstatus, + "
Assessment Unit Type: ", polygons_mapper$type, + "
ATTAINS Link" + ) + ), + silent = TRUE + ) + + # Add ATTAINS lines features (if they exist): + try( + map <- map %>% + leaflet::addPolylines( + data = lines_mapper, + color = ~ lines_mapper$col, + weight = 4, fillOpacity = 1, + popup = paste0( + "Assessment Unit Name: ", lines_mapper$assessmentunitname, + "
Assessment Unit ID: ", lines_mapper$assessmentunitidentifier, + "
Status: ", lines_mapper$overallstatus, + "
Assessment Unit Type: ", lines_mapper$type, + "
ATTAINS Link" + ) + ), + silent = TRUE + ) + + # Add ATTAINS point features (if they exist): + try( + map <- map %>% + leaflet::addCircleMarkers( + data = points_mapper, + lng = ~X, lat = ~Y, + color = ~ points_mapper$col, fillColor = ~ points_mapper$col, + fillOpacity = 1, stroke = TRUE, weight = 1.5, radius = 5, + popup = paste0( + "Assessment Unit Name: ", points_mapper$assessmentunitname, + "
Assessment Unit ID: ", points_mapper$assessmentunitidentifier, + "
Status: ", points_mapper$overallstatus, + "
Assessment Unit Type: ", points_mapper$type, + "
ATTAINS Link" + ) + ), + silent = TRUE + ) + + # Add WQP observation features (should always exist): + try( + map <- map %>% + leaflet::addCircleMarkers( + data = sumdat, + lng = ~LongitudeMeasure, lat = ~LatitudeMeasure, + color = "grey", fillColor = "black", + fillOpacity = 0.8, stroke = TRUE, weight = 1.5, radius = 6, + popup = paste0( + "Site ID: ", sumdat$MonitoringLocationIdentifier, + "
Site Name: ", sumdat$MonitoringLocationName, + "
Measurement Count: ", sumdat$Sample_Count, + "
Visit Count: ", sumdat$Visit_Count, + "
Characteristic Count: ", sumdat$Parameter_Count, + "
ATTAINS Assessment Unit(s): ", sumdat$ATTAINS_AUs + ) + ), + silent = TRUE + ) + + if (is.null(ATTAINS_lines) & is.null(ATTAINS_points) & is.null(ATTAINS_polygons)) { + print("No ATTAINS data associated with this Water Quality Portal data.") + } + + # Return leaflet map of TADA WQ and its associated ATTAINS data + return(map) + })) } - - # Query Map Service - tribal_area_sf <- dplyr::filter(map_service_urls, - tribal_area == tribal_area_type)$url %>% - arcgislayers::arc_open() %>% - # Return sf - arcgislayers::arc_select() %>% - sf::st_make_valid() - - # Convert to df if needed, export - if(return_sf == FALSE){ - return( - as.data.frame(tribal_area_sf) %>% - sf::st_drop_geometry() + + if ("without_ATTAINS_catchments" %in% names(.data)) { + without_ATTAINS_table <- .data[["TADA_without_ATTAINS"]] + + if (nrow(ATTAINS_table) == 0 & nrow(without_ATTAINS_table) == 0) { + stop("Your WQP dataframe has no observations.") + } + + required_columns <- c( + "LongitudeMeasure", "LatitudeMeasure", + "HorizontalCoordinateReferenceSystemDatumName", + "CharacteristicName", "MonitoringLocationIdentifier", + "MonitoringLocationName", "ResultIdentifier", + "ActivityStartDate", "OrganizationIdentifier" ) - } else { - return(tribal_area_sf) + + if (!any(required_columns %in% colnames(ATTAINS_table))) { + stop("Your dataframe does not contain the necessary WQP-style column names.") + } + + without_ATTAINS_catchments <- NULL + try(without_ATTAINS_catchments <- .data[["without_ATTAINS_catchments"]] %>% + dplyr::rename(nhd = 1), silent = TRUE) + + suppressMessages(suppressWarnings({ + sf::sf_use_s2(FALSE) + + # if data was spatial, remove for downstream leaflet dev. 
+ # But first if no data in the ATTAINS table, add in required column names to + # without ATTAINS data: + if (nrow(ATTAINS_table) == 0) { + new_columns <- names(ATTAINS_table)[grep("^ATTAINS\\.", names(ATTAINS_table))] + ATTAINS_table <- without_ATTAINS_table %>% + sf::st_drop_geometry() + + ATTAINS_table[new_columns] <- NA + } else { + ATTAINS_table <- ATTAINS_table %>% + sf::st_drop_geometry() %>% + dplyr::bind_rows(without_ATTAINS_table) + } + + tada.pal <- TADA_ColorPalette() + + colors <- data.frame( + overallstatus = c("Not Supporting", "Fully Supporting", "Not Assessed"), + col = c(tada.pal[3], tada.pal[4], tada.pal[7]), + dark_col = c(tada.pal[12], tada.pal[6], tada.pal[11]), + priority = c(1, 2, 3) + ) + + # POINT FEATURES - try to pull point AU data if it exists. Otherwise, move on... + try( + points_mapper <- ATTAINS_points %>% + dplyr::left_join(., colors, by = "overallstatus") %>% + dplyr::mutate(type = "Point Feature") %>% + tibble::rowid_to_column(var = "index") %>% + # some point features are actually multipoint features. Must extract all coordinates for mapping + # later: + dplyr::right_join(., tibble::as_tibble(sf::st_coordinates(ATTAINS_points)), by = c("index" = "L1")), + silent = TRUE + ) + + # LINE FEATURES - try to pull line AU data if it exists. Otherwise, move on... + try( + lines_mapper <- ATTAINS_lines %>% + dplyr::left_join(., colors, by = "overallstatus") %>% + dplyr::mutate(type = "Line Feature"), + silent = TRUE + ) + + # POLYGON FEATURES - try to pull polygon AU data if it exists. Otherwise, move on... + try( + polygons_mapper <- ATTAINS_polygons %>% + dplyr::left_join(., colors, by = "overallstatus") %>% + dplyr::mutate(type = "Polygon Feature"), + silent = TRUE + ) + + # CATCHMENT FEATURES - try to pull missing feature AU data if it exists. Otherwise, move on... 
+ try( + missing_raw_mapper <- missing_raw_features %>% + dplyr::left_join(., colors, by = "overallstatus") %>% + dplyr::mutate(type = "Raw Feature Unavailable"), + silent = TRUE + ) + + # Develop WQP site stats (e.g. count of observations, parameters, per site) + sumdat <- ATTAINS_table %>% + dplyr::group_by(MonitoringLocationIdentifier, MonitoringLocationName, LatitudeMeasure, LongitudeMeasure) %>% + dplyr::summarize( + Sample_Count = length(unique(ResultIdentifier)), + Visit_Count = length(unique(ActivityStartDate)), + Parameter_Count = length(unique(CharacteristicName)), + Organization_Count = length(unique(OrganizationIdentifier)), + ATTAINS_AUs = as.character(list(unique(ATTAINS.assessmentunitidentifier))) + ) %>% + dplyr::mutate( + ATTAINS_AUs = ifelse(is.na(ATTAINS_AUs), "None", ATTAINS_AUs), + LatitudeMeasure = as.numeric(LatitudeMeasure), + LongitudeMeasure = as.numeric(LongitudeMeasure) + ) + + # Basemap for AOI: + map <- leaflet::leaflet() %>% + leaflet::addProviderTiles("Esri.WorldTopoMap", + group = "World topo", + options = leaflet::providerTileOptions( + updateWhenZooming = FALSE, + updateWhenIdle = TRUE + ) + ) %>% + leaflet::clearShapes() %>% + leaflet::fitBounds( + lng1 = min(sumdat$LongitudeMeasure), + lat1 = min(sumdat$LatitudeMeasure), + lng2 = max(sumdat$LongitudeMeasure), + lat2 = max(sumdat$LatitudeMeasure) + ) %>% + leaflet.extras::addResetMapButton() %>% + leaflet::addLegend( + position = "bottomright", + colors = c(tada.pal[3], tada.pal[4], tada.pal[7], "black", NA, NA), + labels = c( + "ATTAINS: Not Supporting", "ATTAINS: Supporting", "ATTAINS: Not Assessed", "Water Quality Observation(s)", + "NHDPlus HiRes catchments containing a WQP site + ATTAINS feature(s) are represented as clear polygons.", + "Intersecting grey catchments are those without an ATTAINS feature if fill_catchments = TRUE." 
+ ), + opacity = 1, + title = "Legend" + ) + + # Add ATTAINS catchment outlines (if they exist): + try( + map <- map %>% + leaflet::addPolygons( + data = ATTAINS_catchments, + color = "black", + weight = 1, fillOpacity = 0, + popup = paste0("NHDPlus HR Catchment ID: ", ATTAINS_catchments$nhdplusid) + ), + silent = TRUE + ) + + # Add missing catchment outlines (if they exist): + try( + map <- map %>% + leaflet::addPolygons( + data = without_ATTAINS_catchments, + color = "black", fillColor = "grey", + weight = 1, fillOpacity = 0.3, + popup = paste0(without_ATTAINS_catchments$NHD.resolution, " catchment ID: ", without_ATTAINS_catchments$nhd) + ), + silent = TRUE + ) + + # Add ATTAINS catchment outlines as AUs: + try( + map <- map %>% + leaflet::addPolygons( + data = missing_raw_mapper, + color = ~ missing_raw_mapper$col, + fill = ~ missing_raw_mapper$col, + weight = 3, fillOpacity = 0.25, + popup = paste0( + "Assessment Unit Name: ", missing_raw_mapper$assessmentunitname, + "
Assessment Unit ID: ", missing_raw_mapper$assessmentunitidentifier, + "
Status: ", missing_raw_mapper$overallstatus, + "
Assessment Unit Type: ", missing_raw_mapper$type, + "
ATTAINS Link", + "
NHDPlus HR Catchment ID: ", missing_raw_mapper$nhdplusid + ) + ), + silent = TRUE + ) + + # Add ATTAINS polygon features (if they exist): + try( + map <- map %>% + leaflet::addPolygons( + data = polygons_mapper, + color = ~ polygons_mapper$col, + fill = ~ polygons_mapper$col, + weight = 3, fillOpacity = 1, + popup = paste0( + "Assessment Unit Name: ", polygons_mapper$assessmentunitname, + "
Assessment Unit ID: ", polygons_mapper$assessmentunitidentifier, + "
Status: ", polygons_mapper$overallstatus, + "
Assessment Unit Type: ", polygons_mapper$type, + "
ATTAINS Link" + ) + ), + silent = TRUE + ) + + # Add ATTAINS lines features (if they exist): + try( + map <- map %>% + leaflet::addPolylines( + data = lines_mapper, + color = ~ lines_mapper$col, + weight = 4, fillOpacity = 1, + popup = paste0( + "Assessment Unit Name: ", lines_mapper$assessmentunitname, + "
Assessment Unit ID: ", lines_mapper$assessmentunitidentifier, + "
Status: ", lines_mapper$overallstatus, + "
Assessment Unit Type: ", lines_mapper$type, + "
ATTAINS Link" + ) + ), + silent = TRUE + ) + + # Add ATTAINS point features (if they exist): + try( + map <- map %>% + leaflet::addCircleMarkers( + data = points_mapper, + lng = ~X, lat = ~Y, + color = ~ points_mapper$col, fillColor = ~ points_mapper$col, + fillOpacity = 1, stroke = TRUE, weight = 1.5, radius = 5, + popup = paste0( + "Assessment Unit Name: ", points_mapper$assessmentunitname, + "
Assessment Unit ID: ", points_mapper$assessmentunitidentifier, + "
Status: ", points_mapper$overallstatus, + "
Assessment Unit Type: ", points_mapper$type, + "
ATTAINS Link" + ) + ), + silent = TRUE + ) + + # Add WQP observation features (should always exist): + try( + map <- map %>% + leaflet::addCircleMarkers( + data = sumdat, + lng = ~LongitudeMeasure, lat = ~LatitudeMeasure, + color = "grey", fillColor = "black", + fillOpacity = 0.8, stroke = TRUE, weight = 1.5, radius = 6, + popup = paste0( + "Site ID: ", sumdat$MonitoringLocationIdentifier, + "
Site Name: ", sumdat$MonitoringLocationName, + "
Measurement Count: ", sumdat$Sample_Count, + "
Visit Count: ", sumdat$Visit_Count, + "
Characteristic Count: ", sumdat$Parameter_Count, + "
ATTAINS Assessment Unit(s): ", sumdat$ATTAINS_AUs + ) + ), + silent = TRUE + ) + + if (is.null(ATTAINS_lines) & is.null(ATTAINS_points) & is.null(ATTAINS_polygons)) { + print("No ATTAINS data associated with this Water Quality Portal data.") + } + + # Return leaflet map of TADA WQ and its associated ATTAINS data + return(map) + })) } - } From 001c2ebf54ec47a618073e6dbf2570b04eb53343 Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Thu, 21 Nov 2024 16:29:07 -0800 Subject: [PATCH 08/35] warning -> message & add dplyr:: --- R/DataDiscoveryRetrieval.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index 56a2e57e..b83abac4 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -54,7 +54,7 @@ #' @param organization A string of letters and/or numbers (some additional characters also possible) used to signify an organization with data in the Water Quality Portal. See https://www.waterqualitydata.us/Codes/organization for options. #' @param project A string of letters and/or numbers (some additional characters also possible) used to signify a project with data in the Water Quality Portal. See https://www.waterqualitydata.us/Codes/project for options. #' @param providers Leave blank to include all, or specify "STEWARDS", "STORET" (i.e., WQX), and/or "NWIS". See https://www.waterqualitydata.us/Codes/providers for options. -#' @param applyautoclean Logical, defaults to TRUE. Applies TADA_AutoClean function on the returned data profile. +#' @param applyautoclean Logical, defaults to TRUE. Applies TADA_AutoClean function on the returned data profile. Suggest switching to FALSE for queries that are expected to be large. 
#' #' @return TADA-compatible dataframe #' @@ -196,8 +196,8 @@ TADA_DataRetrieval <- function(startDate = "null", providers = "null", maxrecs = 250000, applyautoclean = TRUE) { - - # Check for incomplete or inconsistent inputs: + + # Check for incomplete or inconsistent inputs: # If both an sf object and tribe information are provided it's unclear what # the priority should be for the query @@ -437,7 +437,7 @@ TADA_DataRetrieval <- function(startDate = "null", # Check for either more than 300 sites or more records than max_recs. # If either is true then we'll approach the pull as a "big data" pull if( site_count > 300 | record_count > maxrecs) { - warning( + message( paste0( "The number of sites and/or records matched by the AOI and query terms is large, so the download may take some time. ", "If your AOI is a county, state, country, or HUC boundary it would be more efficient to provide a code instead of an sf object." @@ -901,7 +901,7 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi # Number of sites per download group limited to 300. if (dim(smallsites)[1] > 0) { smallsitesgrp <- smallsites %>% - mutate(group = MESS::cumsumbinning( + dplyr::mutate(group = MESS::cumsumbinning( x = tot_n, threshold = maxrecs, maxgroupsize = 300 @@ -1167,7 +1167,7 @@ TADA_BigDataRetrieval <- function(startDate = "null", rm(df_summary) # if there are still site records when filtered to years of interest.... 
if (dim(sites)[1] > 0) { - + # get total number of results per site and separate out sites with >250000 results From 19d80a2c459c9dd43c6ae9837bedffd140bac755 Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Fri, 22 Nov 2024 10:02:37 -0800 Subject: [PATCH 09/35] odds and ends --- NAMESPACE | 1 - man/TADA_BigDataHelper.Rd | 28 +++++++++++++++++++++++++ man/TADA_DataRetrieval.Rd | 3 ++- man/TADA_TribalOptions.Rd | 44 --------------------------------------- 4 files changed, 30 insertions(+), 46 deletions(-) create mode 100644 man/TADA_BigDataHelper.Rd delete mode 100644 man/TADA_TribalOptions.Rd diff --git a/NAMESPACE b/NAMESPACE index 0b286c45..9bd8f6cb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -74,7 +74,6 @@ export(TADA_SimpleCensoredMethods) export(TADA_Stats) export(TADA_SubstituteDeprecatedChars) export(TADA_SummarizeColumn) -export(TADA_TribalOptions) export(TADA_TwoCharacteristicScatterplot) export(TADA_UniqueCharUnitSpeciation) export(TADA_ViewATTAINS) diff --git a/man/TADA_BigDataHelper.Rd b/man/TADA_BigDataHelper.Rd new file mode 100644 index 00000000..7968424c --- /dev/null +++ b/man/TADA_BigDataHelper.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DataDiscoveryRetrieval.R +\name{TADA_BigDataHelper} +\alias{TADA_BigDataHelper} +\title{Assist with large WQP data pulls using dataRetrieval} +\usage{ +TADA_BigDataHelper(record_summary, WQPquery, maxrecs = 250000, maxsites = 300) +} +\arguments{ +\item{record_summary}{MonitoringLocationIdentifier and resultCount columns from the output of \code{dataRetrieval::whatWQPdata} for the WQP query being used.} + +\item{WQPquery}{A named list of query terms to supply dataRetrieval functions.} + +\item{maxrecs}{Maximum number of records to query at once.} + +\item{maxsites}{Maximum number of sites to query at once.} +} +\value{ +TADA-compatible dataframe +} +\description{ +This is a helper function that takes large WQP 
(waterqualitydata.us) queries +and splits them up into multiple, smaller queries. By default it pulls data +for up to 300 sites or 250,000 data records at a time and then joins the separate +pulls back together to produce a single TADA compatible dataframe as the output. +Computer memory may limit the size of data frames that your R console will +be able to hold in one session. +} diff --git a/man/TADA_DataRetrieval.Rd b/man/TADA_DataRetrieval.Rd index e49f1e03..24b30c87 100644 --- a/man/TADA_DataRetrieval.Rd +++ b/man/TADA_DataRetrieval.Rd @@ -22,6 +22,7 @@ TADA_DataRetrieval( organization = "null", project = "null", providers = "null", + maxrecs = 250000, applyautoclean = TRUE ) } @@ -60,7 +61,7 @@ TADA_DataRetrieval( \item{providers}{Leave blank to include all, or specify "STEWARDS", "STORET" (i.e., WQX), and/or "NWIS". See https://www.waterqualitydata.us/Codes/providers for options.} -\item{applyautoclean}{Logical, defaults to TRUE. Applies TADA_AutoClean function on the returned data profile.} +\item{applyautoclean}{Logical, defaults to TRUE. Applies TADA_AutoClean function on the returned data profile. Suggest switching to FALSE for queries that are expected to be large.} } \value{ TADA-compatible dataframe diff --git a/man/TADA_TribalOptions.Rd b/man/TADA_TribalOptions.Rd deleted file mode 100644 index 05396d87..00000000 --- a/man/TADA_TribalOptions.Rd +++ /dev/null @@ -1,44 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/GeospatialFunctions.R -\name{TADA_TribalOptions} -\alias{TADA_TribalOptions} -\title{Access options available for querying tribal spatial data with \code{TADA_DataRetrieval()}.} -\usage{ -TADA_TribalOptions(tribal_area_type, return_sf = FALSE) -} -\arguments{ -\item{tribal_area_type}{A character string. 
Must be one of the six tribal -spatial layers: "Alaska Native Allotments", "Alaska Native Villages", -"American Indian Reservations", "Off-reservation Trust Lands", -"Oklahoma Tribal Statistical Areas", or "Virginia Federally Recognized Tribes".} - -\item{return_sf}{Logical. Should the function return the dataset as an \code{sf} -object (TRUE) or a data frame (FALSE)? Defaults to FALSE.} -} -\value{ -A data frame or \code{sf} object containing the specified layer from the EPA -Map Service. -} -\description{ -This function provides access to \href{https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer}{six layer datasets} -containing spatial data related to tribal lands: "Alaska Native Allotments", -"Alaska Native Villages", "American Indian Reservations", "Off-reservation Trust Lands", -"Oklahoma Tribal Statistical Areas", and "Virginia Federally Recognized Tribes". -These datasets are used by \code{TADA_DataRetrieval()} when retrieving spatial data -for tribal lands specified by the user. - -The purpose of \code{TADA_TribalOptions()} is to allow the user to review the available -data in those datasets and identify the records they would like to query with -\code{TADA_DataRetrieval()}. - -An interactive map of the six layer datasets is available on ArcGIS Online Map -Viewer here: https://www.arcgis.com/apps/mapviewer/index.html?url=https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer&source=sd -} -\note{ -Alaska Native Villages and Virginia Federally Recognized Tribes are point -geometries in the Map Service, not polygons. At the time of this writing they -do not return any data when used for WQP bbox queries. 
-} -\seealso{ -\code{\link[=TADA_DataRetrieval]{TADA_DataRetrieval()}} -} From a71a49594189c2f1e9ce4aa77bfe2785abef57bb Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Fri, 3 Jan 2025 16:41:27 -0800 Subject: [PATCH 10/35] Add user prompts to data retrieval --- R/DataDiscoveryRetrieval.R | 61 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index b83abac4..84c3b45c 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -54,6 +54,8 @@ #' @param organization A string of letters and/or numbers (some additional characters also possible) used to signify an organization with data in the Water Quality Portal. See https://www.waterqualitydata.us/Codes/organization for options. #' @param project A string of letters and/or numbers (some additional characters also possible) used to signify a project with data in the Water Quality Portal. See https://www.waterqualitydata.us/Codes/project for options. #' @param providers Leave blank to include all, or specify "STEWARDS", "STORET" (i.e., WQX), and/or "NWIS". See https://www.waterqualitydata.us/Codes/providers for options. +#' @param maxrecs Maximum number of records to query at once. +#' @param ask A logical value indicating whether the user should be asked for approval before downloads begin. #' @param applyautoclean Logical, defaults to TRUE. Applies TADA_AutoClean function on the returned data profile. Suggest switching to FALSE for queries that are expected to be large. #' #' @return TADA-compatible dataframe @@ -195,6 +197,7 @@ TADA_DataRetrieval <- function(startDate = "null", project = "null", providers = "null", maxrecs = 250000, + ask = TRUE, applyautoclean = TRUE) { # Check for incomplete or inconsistent inputs: @@ -432,6 +435,19 @@ TADA_DataRetrieval <- function(startDate = "null", dplyr::pull(resultCount) %>% sum() + # Should we proceed with downloads? 
If ask == TRUE then ask the user. + if(ask == TRUE){ + user_decision <- ask_user(n_records = record_count) + + # Act on input + if(user_decision == "yes") { + print("Proceeding with download.") + } else { + stop("Cancelled by user.", call. = FALSE) + } + } + + # Continue now with site count site_count <- length(clipped_site_ids) # Check for either more than 300 sites or more records than max_recs. @@ -687,6 +703,18 @@ TADA_DataRetrieval <- function(startDate = "null", dplyr::pull(resultCount) %>% sum() + # Should we proceed with downloads? If ask == TRUE then ask the user. + if(ask == TRUE){ + user_decision <- ask_user(n_records = record_count) + + # Act on input + if(user_decision == "yes") { + print("Proceeding with download.") + } else { + stop("Cancelled by user.", call. = FALSE) + } + } + # Check for either more than 300 sites or more records than max_recs. # If either is true then we'll approach the pull as a "big data" pull if(site_count > 300 | record_count > maxrecs) { @@ -1352,6 +1380,39 @@ TADA_JoinWQPProfiles <- function(FullPhysChem = "null", return(join2) } +#' Ask user to approve WQP downloads +#' +#' Once record counts have been retrieved from the Water Quality Portal (WQP) for +#' a query, this function is used to prompt the user to decide (i.e., "yes"/"no") +#' whether the download should proceed. The user is also reminded of the limits of +#' Microsoft Excel for row counts as a comparison. +#' +#' @param n_records A numeric value indicating the number of records that will be downloaded from the WQP if the user decides to proceed. +ask_user <- function(n_records){ + + # Text to show user + user_prompt <- cat( + "Your WQP query will return ", + n_records, + " records.\nFor reference, Microsoft Excel will only display ~one million.\n", + "Would you like to continue with the download? 
[yes/no] ", + sep = "" + ) + + # Ask user if they want to continue & check for valid response + while(TRUE){ + user_input <- readline(prompt = user_prompt) + # Convert response to lower and no whitespace + user_input <- tolower(trimws(user_input)) + if (user_input == "yes" || user_input == "no") { + return(user_input) + } else { + cat("Invalid input. Please enter 'yes' or 'no'.\n") + } + } + +} + # function for chunking by records make_groups <- function(x, maxrecs) { From 0d55e7452a3884b1835f93332143f2da0d0e36b6 Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Mon, 6 Jan 2025 10:58:22 -0800 Subject: [PATCH 11/35] Add progress bar & change warnings to messages --- R/DataDiscoveryRetrieval.R | 96 +++++++++++++++++++++++--------------- 1 file changed, 58 insertions(+), 38 deletions(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index 84c3b45c..52c6f789 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -409,9 +409,11 @@ TADA_DataRetrieval <- function(startDate = "null", input_bbox <- sf::st_bbox(aoi_sf) # Query info on available data within the bbox - bbox_avail <- dataRetrieval::whatWQPdata( - WQPquery, - bBox = c(input_bbox$xmin, input_bbox$ymin, input_bbox$xmax, input_bbox$ymax) + bbox_avail <- suppressMessages( + dataRetrieval::whatWQPdata( + WQPquery, + bBox = c(input_bbox$xmin, input_bbox$ymin, input_bbox$xmax, input_bbox$ymax) + ) ) # Check if any sites are within the aoi @@ -420,8 +422,10 @@ TADA_DataRetrieval <- function(startDate = "null", } # Reformat returned info as sf - bbox_sites_sf <- dataRetrieval::whatWQPsites( - siteid = bbox_avail$MonitoringLocationIdentifier + bbox_sites_sf <- suppressMessages( + dataRetrieval::whatWQPsites( + siteid = bbox_avail$MonitoringLocationIdentifier + ) ) %>% TADA_MakeSpatial(., crs = 4326) @@ -461,7 +465,7 @@ TADA_DataRetrieval <- function(startDate = "null", ) # Use helper function to download large data volume - results.DR 
<- suppressMessages( + results.DR <- withCallingHandlers( TADA_BigDataHelper( record_summary = bbox_avail %>% dplyr::select(MonitoringLocationIdentifier, resultCount) %>% @@ -469,9 +473,11 @@ TADA_DataRetrieval <- function(startDate = "null", WQPquery = WQPquery, maxrecs = maxrecs, maxsites = 300 - ) + ), + message = function(m) message(m$message) ) + rm(bbox_avail, bbox_sites_sf) gc() @@ -495,11 +501,13 @@ TADA_DataRetrieval <- function(startDate = "null", select(-geometry) # Get project metadata - projects.DR <- dataRetrieval::readWQPdata( - siteid = clipped_site_ids, - WQPquery, - ignore_attributes = TRUE, - service = "Project" + projects.DR <- suppressMessages( + dataRetrieval::readWQPdata( + siteid = clipped_site_ids, + WQPquery, + ignore_attributes = TRUE, + service = "Project" + ) ) # Join results, sites, projects @@ -531,11 +539,13 @@ TADA_DataRetrieval <- function(startDate = "null", print(WQPquery) # Get results - results.DR <- dataRetrieval::readWQPdata( - siteid = clipped_site_ids, - WQPquery, - dataProfile = "resultPhysChem", - ignore_attributes = TRUE + results.DR <- suppressMessages( + dataRetrieval::readWQPdata( + siteid = clipped_site_ids, + WQPquery, + dataProfile = "resultPhysChem", + ignore_attributes = TRUE + ) ) # Check if any results were returned @@ -555,11 +565,13 @@ TADA_DataRetrieval <- function(startDate = "null", select(-geometry) # Get project metadata - projects.DR <- dataRetrieval::readWQPdata( - siteid = clipped_site_ids, - WQPquery, - ignore_attributes = TRUE, - service = "Project" + projects.DR <- suppressMessages( + dataRetrieval::readWQPdata( + siteid = clipped_site_ids, + WQPquery, + ignore_attributes = TRUE, + service = "Project" + ) ) # Join results, sites, projects @@ -695,7 +707,7 @@ TADA_DataRetrieval <- function(startDate = "null", } # Query info on available data - query_avail <- dataRetrieval::whatWQPdata(WQPquery) + query_avail <- supressMessages(dataRetrieval::whatWQPdata(WQPquery)) site_count <- 
length(query_avail$MonitoringLocationIdentifier) @@ -718,7 +730,7 @@ TADA_DataRetrieval <- function(startDate = "null", # Check for either more than 300 sites or more records than max_recs. # If either is true then we'll approach the pull as a "big data" pull if(site_count > 300 | record_count > maxrecs) { - warning( + message( "The number of sites and/or records matched by the query terms is large, so the download may take some time." ) @@ -737,16 +749,20 @@ TADA_DataRetrieval <- function(startDate = "null", gc() # Get site metadata - sites.DR <- dataRetrieval::whatWQPsites( - siteid = unique(results.DR$MonitoringLocationIdentifier) + sites.DR <- suppressMessages( + dataRetrieval::whatWQPsites( + siteid = unique(results.DR$MonitoringLocationIdentifier) + ) ) # Get project metadata - projects.DR <- dataRetrieval::readWQPdata( - siteid = unique(results.DR$MonitoringLocationIdentifier), - WQPquery, - ignore_attributes = TRUE, - service = "Project" + projects.DR <- suppressMessages( + dataRetrieval::readWQPdata( + siteid = unique(results.DR$MonitoringLocationIdentifier), + WQPquery, + ignore_attributes = TRUE, + service = "Project" + ) ) # Join results, sites, projects @@ -774,9 +790,11 @@ TADA_DataRetrieval <- function(startDate = "null", # Retrieve all 3 profiles print("Downloading WQP query results. This may take some time depending upon the query size.") print(WQPquery) - results.DR <- dataRetrieval::readWQPdata(WQPquery, - dataProfile = "resultPhysChem", - ignore_attributes = TRUE + results.DR <- suppressMessages( + dataRetrieval::readWQPdata(WQPquery, + dataProfile = "resultPhysChem", + ignore_attributes = TRUE + ) ) # check if any results are available @@ -784,11 +802,13 @@ TADA_DataRetrieval <- function(startDate = "null", print("Returning empty results dataframe: Your WQP query returned no results (no data available). Try a different query. 
Removing some of your query filters OR broadening your search area may help.") TADAprofile.clean <- results.DR } else { - sites.DR <- dataRetrieval::whatWQPsites(WQPquery) + sites.DR <- suppressMessages(dataRetrieval::whatWQPsites(WQPquery)) - projects.DR <- dataRetrieval::readWQPdata(WQPquery, - ignore_attributes = TRUE, - service = "Project" + projects.DR <- suppressMessages( + dataRetrieval::readWQPdata(WQPquery, + ignore_attributes = TRUE, + service = "Project" + ) ) TADAprofile <- TADA_JoinWQPProfiles( From 27038fb5b7b9553299ac2343bcb2f789d1731bbb Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Thu, 9 Jan 2025 09:35:36 -0800 Subject: [PATCH 12/35] Include progress bar internals --- R/DataDiscoveryRetrieval.R | 69 ++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index 52c6f789..cd09ccad 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -967,11 +967,13 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi small_site_chunk <- subset(smallsitesgrp$MonitoringLocationIdentifier, smallsitesgrp$group == i) # Query result data - results_small <- dataRetrieval::readWQPdata( - siteid = small_site_chunk, - WQPquery, - dataProfile = "resultPhysChem", - ignore_attributes = TRUE + results_small <- suppressMessages( + dataRetrieval::readWQPdata( + siteid = small_site_chunk, + WQPquery, + dataProfile = "resultPhysChem", + ignore_attributes = TRUE + ) ) %>% dplyr::mutate(across(everything(), as.character)) @@ -979,42 +981,57 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi if (dim(results_small)[1] > 0) { df_small <- dplyr::bind_rows(df_small, results_small) } + + # Update progress + setTxtProgressBar(pb = small_prog_bar, value = nrow(df_small)) } + # Close progress bar when complete + close(small_prog_bar) rm(smallsites, 
smallsitesgrp) gc() + } + + # Large sites (>= maxrecs) next: + if (dim(bigsites)[1] > 0) { + print( + paste0("Downloading data from sites with greater than ", + maxrecs, + " results, chunking queries by site.") + ) - # Large sites (>= maxrecs) next: - if (dim(bigsites)[1] > 0) { - print( - paste0("Downloading data from sites with greater than ", - maxrecs, - " results, chunking queries by site.") - ) - - # Unique site IDs - bsitesvec <- unique(bigsites$MonitoringLocationIdentifier) - - # For each site - for (i in 1:length(bsitesvec)) { - # Download each site's data individually - results_big <- dataRetrieval::readWQPdata( + big_prog_bar <- txtProgressBar(min = 0, max = sum(bigsites$tot_n), style = 3) + + # Unique site IDs + bsitesvec <- unique(bigsites$MonitoringLocationIdentifier) + + # For each site + for (i in 1:length(bsitesvec)) { + # Download each site's data individually + results_big <- suppressMessages( + dataRetrieval::readWQPdata( siteid = bsitesvec[i], WQPquery, dataProfile = "resultPhysChem", ignore_attributes = TRUE - ) %>% - dplyr::mutate(across(everything(), as.character)) - - if (dim(results_big)[1] > 0) { - df_big <- dplyr::bind_rows(df_big, results_big) - } + ) + )%>% + dplyr::mutate(across(everything(), as.character)) + + if (dim(results_big)[1] > 0) { + df_big <- dplyr::bind_rows(df_big, results_big) } + # Update progress + setTxtProgressBar(pb = big_prog_bar, value = nrow(df_big)) } + # Close progress bar when complete + close(big_prog_bar) + rm(bigsites) gc() } + df_out <- bind_rows(df_small, df_big) return(df_out) From 02de221d3ac253cb286d50b0fe46d8ed9cec20c3 Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Thu, 9 Jan 2025 09:36:17 -0800 Subject: [PATCH 13/35] Update DataDiscoveryRetrieval.R --- R/DataDiscoveryRetrieval.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index cd09ccad..03721dc0 100644 --- a/R/DataDiscoveryRetrieval.R +++ 
b/R/DataDiscoveryRetrieval.R @@ -962,6 +962,8 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi " results by grouping them together.") ) + small_prog_bar <- txtProgressBar(min = 0, max = sum(smallsites$tot_n), style = 3) + # Download the data for each group for (i in 1:max(smallsitesgrp$group)) { small_site_chunk <- subset(smallsitesgrp$MonitoringLocationIdentifier, From 6e16a9bf556e67e5059cc7590cdac0e053b651a7 Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Fri, 10 Jan 2025 10:53:13 -0800 Subject: [PATCH 14/35] Apply suggestions from code review Co-authored-by: Katie/Ryn Willi (she/her) <49761053+kathryn-willi@users.noreply.github.com> --- R/DataDiscoveryRetrieval.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index 03721dc0..80ab5bf9 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -707,7 +707,7 @@ TADA_DataRetrieval <- function(startDate = "null", } # Query info on available data - query_avail <- supressMessages(dataRetrieval::whatWQPdata(WQPquery)) + query_avail <- suppressMessages(dataRetrieval::whatWQPdata(WQPquery)) site_count <- length(query_avail$MonitoringLocationIdentifier) @@ -1034,7 +1034,7 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi } - df_out <- bind_rows(df_small, df_big) + df_out <- dplyr::bind_rows(df_small, df_big) return(df_out) } From 0c42ea505b79847e63f3be2beca63a83e83aafcc Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Fri, 10 Jan 2025 11:27:30 -0800 Subject: [PATCH 15/35] Apply suggestions from code review --- R/DataDiscoveryRetrieval.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index 80ab5bf9..bdf17323 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -498,7 
+498,7 @@ TADA_DataRetrieval <- function(startDate = "null", # Get site metadata sites.DR <- clipped_sites_sf %>% as_tibble() %>% - select(-geometry) + dplyr::select(-geometry) # Get project metadata projects.DR <- suppressMessages( @@ -562,7 +562,7 @@ TADA_DataRetrieval <- function(startDate = "null", # Get site metadata sites.DR <- clipped_sites_sf %>% as_tibble() %>% - select(-geometry) + dplyr::select(-geometry) # Get project metadata projects.DR <- suppressMessages( From 21cf9c5580db099f02dc013b4cc5ac9b9ad6100f Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Fri, 10 Jan 2025 11:31:39 -0800 Subject: [PATCH 16/35] Update DataDiscoveryRetrieval.R --- R/DataDiscoveryRetrieval.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index bdf17323..fa4ef4d0 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -497,7 +497,7 @@ TADA_DataRetrieval <- function(startDate = "null", # Get site metadata sites.DR <- clipped_sites_sf %>% - as_tibble() %>% + dplyr::as_tibble() %>% dplyr::select(-geometry) # Get project metadata @@ -561,7 +561,7 @@ TADA_DataRetrieval <- function(startDate = "null", # Get site metadata sites.DR <- clipped_sites_sf %>% - as_tibble() %>% + dplyr::as_tibble() %>% dplyr::select(-geometry) # Get project metadata From 152e979eb7fbe0a6f75b11734313d8b4afc2af4e Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Fri, 17 Jan 2025 12:16:24 -0800 Subject: [PATCH 17/35] http errors --- R/DataDiscoveryRetrieval.R | 90 +++++++++++++++++++++++++++++--------- 1 file changed, 69 insertions(+), 21 deletions(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index fa4ef4d0..5817e519 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -249,15 +249,15 @@ TADA_DataRetrieval <- function(startDate = "null", stop("A tribal_area_type is required 
if tribe_name_parcel is provided.") } - # Set query parameters - WQPquery <- list() - # If an sf object OR tribal info are provided they will be the basis of the query # (The tribal data handling uses sf objects as well) if( (!is.null(aoi_sf) & inherits(aoi_sf, "sf")) | (tribal_area_type != "null") ){ # Build the non-sf part of the query: + # Set query parameters + WQPquery <- list() + # StartDate if (length(startDate) > 1) { if (is.na(suppressWarnings(lubridate::parse_date_time(startDate[1], orders = "ymd")))) { @@ -409,25 +409,55 @@ TADA_DataRetrieval <- function(startDate = "null", input_bbox <- sf::st_bbox(aoi_sf) # Query info on available data within the bbox - bbox_avail <- suppressMessages( - dataRetrieval::whatWQPdata( - WQPquery, - bBox = c(input_bbox$xmin, input_bbox$ymin, input_bbox$xmax, input_bbox$ymax) - ) + # Don't want to print every message that's returned by WQP + quiet_whatWQPdata <- purrr::quietly(dataRetrieval::whatWQPdata) + + # Try getting WQP info + quiet_bbox_avail <- quiet_whatWQPdata( + WQPquery, + bBox = c(input_bbox$xmin, input_bbox$ymin, input_bbox$xmax, input_bbox$ymax) ) + # Alert & stop if an http error was received + if(is.null(quiet_bbox_avail$result)){ + + stop_message <- quiet_bbox_avail$messages %>% + grep(pattern = "failed|HTTP", x = ., ignore.case = FALSE, value = TRUE) %>% + paste("\n", ., collapse = "") %>% + paste("The WQP request returned a NULL with the following message(s): \n", + ., + collapse = "\n") + + stop(stop_message) + + } + + # Use result only + bbox_avail <- quiet_bbox_avail$result + # Check if any sites are within the aoi if ( (nrow(bbox_avail) > 0 ) == FALSE) { stop("No monitoring sites were returned within your area of interest (no data available).") } + quiet_bbox_sites <- quiet_whatWQPdata( + siteid = bbox_avail$MonitoringLocationIdentifier + ) + + if(is.null(quiet_bbox_sites$result)){ + + stop_message <- quiet_bbox_sites$messages %>% + grep(pattern = "failed|HTTP", x = ., ignore.case = FALSE, value = 
TRUE) %>% + paste("\n", ., collapse = "") %>% + paste("The WQP request returned a NULL with the following message(s): \n", + ., + collapse = "\n") + stop(stop_message) + + } + # Reformat returned info as sf - bbox_sites_sf <- suppressMessages( - dataRetrieval::whatWQPsites( - siteid = bbox_avail$MonitoringLocationIdentifier - ) - ) %>% - TADA_MakeSpatial(., crs = 4326) + bbox_sites_sf <- TADA_MakeSpatial(quiet_bbox_sites$result, crs = 4326) # Subset sites to only within shapefile and get IDs clipped_sites_sf <- bbox_sites_sf[aoi_sf, ] @@ -516,7 +546,7 @@ TADA_DataRetrieval <- function(startDate = "null", Sites = sites.DR, Projects = projects.DR ) %>% dplyr::mutate( - across(tidyselect::everything(), as.character) + dplyr::across(tidyselect::everything(), as.character) ) # run TADA_AutoClean function @@ -580,7 +610,7 @@ TADA_DataRetrieval <- function(startDate = "null", Sites = sites.DR, Projects = projects.DR ) %>% dplyr::mutate( - across(tidyselect::everything(), as.character) + dplyr::across(tidyselect::everything(), as.character) ) # Run TADA_AutoClean function @@ -707,7 +737,25 @@ TADA_DataRetrieval <- function(startDate = "null", } # Query info on available data - query_avail <- suppressMessages(dataRetrieval::whatWQPdata(WQPquery)) + # Don't want to print every message that's returned by WQP + quiet_whatWQPdata <- purrr::quietly(dataRetrieval::whatWQPdata) + + quiet_query_avail <- quiet_whatWQPdata(WQPquery) + + if(is.null(quiet_query_avail$result)){ + + stop_message <- quiet_query_avail$messages %>% + grep(pattern = "failed|HTTP", x = ., ignore.case = FALSE, value = TRUE) %>% + paste("\n", ., collapse = "") %>% + paste("The WQP request returned a NULL with the following message(s): \n", + ., + collapse = "\n") + + stop(stop_message) + + } + + query_avail <- quiet_query_avail$result site_count <- length(query_avail$MonitoringLocationIdentifier) @@ -771,7 +819,7 @@ TADA_DataRetrieval <- function(startDate = "null", Sites = sites.DR, Projects = projects.DR ) 
%>% dplyr::mutate( - across(tidyselect::everything(), as.character) + dplyr::across(tidyselect::everything(), as.character) ) # run TADA_AutoClean function @@ -816,7 +864,7 @@ TADA_DataRetrieval <- function(startDate = "null", Sites = sites.DR, Projects = projects.DR ) %>% dplyr::mutate( - across(tidyselect::everything(), as.character) + dplyr::across(tidyselect::everything(), as.character) ) # run TADA_AutoClean function @@ -977,7 +1025,7 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi ignore_attributes = TRUE ) ) %>% - dplyr::mutate(across(everything(), as.character)) + dplyr::mutate(dplyr::across(everything(), as.character)) # If data is returned, stack with what's already been retrieved if (dim(results_small)[1] > 0) { @@ -1018,7 +1066,7 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi ignore_attributes = TRUE ) )%>% - dplyr::mutate(across(everything(), as.character)) + dplyr::mutate(dplyr::across(everything(), as.character)) if (dim(results_big)[1] > 0) { df_big <- dplyr::bind_rows(df_big, results_big) From fd3019faa910ab4cc5c2cf66a29861be5603a3ac Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Fri, 17 Jan 2025 15:54:01 -0800 Subject: [PATCH 18/35] TADA_TribalOptions + vignette + logic --- R/DataDiscoveryRetrieval.R | 100 +++++++++++++++++++++++++++++++++---- vignettes/TADAModule1.Rmd | 23 ++++++++- 2 files changed, 110 insertions(+), 13 deletions(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index 5817e519..f972d2d5 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -205,7 +205,7 @@ TADA_DataRetrieval <- function(startDate = "null", # If both an sf object and tribe information are provided it's unclear what # the priority should be for the query if( !is.null(aoi_sf) & - ( (tribal_area_type != "null") | (tribe_name_parcel != "null") ) ){ + any( (tribal_area_type != "null") | (tribe_name_parcel 
!= "null") )){ stop( paste0( "Both sf data and tribal information have been provided. ", @@ -245,7 +245,7 @@ TADA_DataRetrieval <- function(startDate = "null", } # Insufficient tribal info provided - if( (tribal_area_type == "null") & (tribe_name_parcel != "null") ){ + if( (tribal_area_type == "null") & all(tribe_name_parcel != "null") ){ stop("A tribal_area_type is required if tribe_name_parcel is provided.") } @@ -365,15 +365,15 @@ TADA_DataRetrieval <- function(startDate = "null", ){ # Get the relevant url - aoi_sf <- filter(map_service_urls, - tribal_area == tribal_area_type)$url %>% + aoi_sf <- dplyr::filter(map_service_urls, + tribal_area == tribal_area_type)$url %>% # Pull data arcgislayers::arc_open() %>% # Return sf arcgislayers::arc_select() %>% # If a value provided, then filter - {if ((tribe_name_parcel != "null") & (tribe_name_parcel != "null")) { - filter(., TRIBE_NAME %in% tribe_name_parcel) + {if (all(tribe_name_parcel != "null") ) { + dplyr::filter(., TRIBE_NAME %in% tribe_name_parcel) } else { . }} @@ -381,12 +381,12 @@ TADA_DataRetrieval <- function(startDate = "null", # Otherwise filter by PARCEL_NO (Note that values in this col are not unique) } else if(tribal_area_type == "Alaska Native Allotments"){ - aoi_sf <- filter(map_service_urls, - tribal_area == tribal_area_type)$url %>% + aoi_sf <- dplyr::filter(map_service_urls, + tribal_area == tribal_area_type)$url %>% arcgislayers::arc_open() %>% arcgislayers::arc_select() %>% - {if ((tribe_name_parcel != "null") & (tribe_name_parcel != "null")) { - filter(., PARCEL_NO %in% tribe_name_parcel) + {if (all(tribe_name_parcel != "null")) { + dplyr::filter(., PARCEL_NO %in% tribe_name_parcel) } else { . 
}} @@ -440,7 +440,10 @@ TADA_DataRetrieval <- function(startDate = "null", stop("No monitoring sites were returned within your area of interest (no data available).") } - quiet_bbox_sites <- quiet_whatWQPdata( + + quiet_whatWQPsites <- purrr::quietly(dataRetrieval::whatWQPsites) + + quiet_bbox_sites <- quiet_whatWQPsites( siteid = bbox_avail$MonitoringLocationIdentifier ) @@ -882,6 +885,81 @@ TADA_DataRetrieval <- function(startDate = "null", } } +#' Access options available for querying tribal spatial data with `TADA_DataRetrieval()`. +#' +#' @description +#' This function provides access to [six layer datasets](https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer) +#' containing spatial data related to tribal lands: "Alaska Native Allotments", +#' "Alaska Native Villages", "American Indian Reservations", "Off-reservation Trust Lands", +#' "Oklahoma Tribal Statistical Areas", and "Virginia Federally Recognized Tribes". +#' These datasets are used by `TADA_DataRetrieval()` when retrieving spatial data +#' for tribal lands specified by the user. +#' +#' The purpose of `TADA_TribalOptions()` is to allow the user to review the available +#' data in those datasets and identify the records they would like to query with +#' `TADA_DataRetrieval()`. +#' +#' An interactive map of the six layer datasets is available on ArcGIS Online Map +#' Viewer here: https://www.arcgis.com/apps/mapviewer/index.html?url=https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer&source=sd +#' +#' @param tribal_area_type A character string. Must be one of the six tribal +#' spatial layers: "Alaska Native Allotments", "Alaska Native Villages", +#' "American Indian Reservations", "Off-reservation Trust Lands", +#' "Oklahoma Tribal Statistical Areas", or "Virginia Federally Recognized Tribes". +#' +#' @param return_sf Logical. Should the function return the dataset as an `sf` +#' object (TRUE) or a data frame (FALSE)? Defaults to FALSE. 
+#' +#' @returns A data frame or `sf` object containing the specified layer from the EPA +#' Map Service. +#' +#' @note +#' Alaska Native Villages and Virginia Federally Recognized Tribes are point +#' geometries in the Map Service, not polygons. At the time of this writing they +#' do not return any data when used for WQP bbox queries. +#' +#' @export +#' +#' @seealso [TADA_DataRetrieval()] +#' + +TADA_TribalOptions <- function(tribal_area_type, return_sf = FALSE){ + + # Make a reference table for tribal area type + url matching + map_service_urls <- tibble::tribble( + ~tribal_area, ~url, + "Alaska Native Allotments", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/0", + "Alaska Native Villages", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/1", + "American Indian Reservations", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/2", + "Off-reservation Trust Lands", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/3", + "Oklahoma Tribal Statistical Areas", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/4", + "Virginia Federally Recognized Tribes", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/5" + ) + + # Confirm usable string provided + if( !(tribal_area_type %in% map_service_urls$tribal_area) ){ + stop("tribal_area_type must match one of the six tribal spatial layer names.") + } + + # Query Map Service + tribal_area_sf <- dplyr::filter(map_service_urls, + tribal_area == tribal_area_type)$url %>% + arcgislayers::arc_open() %>% + # Return sf + arcgislayers::arc_select() %>% + sf::st_make_valid() + + # Convert to df if needed, export + if(return_sf == FALSE){ + return( + as.data.frame(tribal_area_sf) %>% + sf::st_drop_geometry() + ) + } else { + return(tribal_area_sf) + } + +} #' Read in WQP data using the Water Quality Portal (WQP) web services #' diff --git a/vignettes/TADAModule1.Rmd b/vignettes/TADAModule1.Rmd index 96ed73f2..e7aafc03 100644 --- 
a/vignettes/TADAModule1.Rmd +++ b/vignettes/TADAModule1.Rmd @@ -93,7 +93,7 @@ package functions. It joins three WQP profiles: Site, Sample Results data in the Characteristic, Speciation, Fraction, and Unit fields to uppercase and addresses result values that include special characters. -This function uses the same inputs as the dataRetrieval `readWQPdata` +This function accepts the same inputs as the dataRetrieval `readWQPdata` function. `readWQPdata` does not restrict the characteristics pulled from [Water Quality Portal (WQP)](https://www.waterqualitydata.us/). @@ -124,6 +124,16 @@ Data retrieval filters include: - characteristicType +In addition to these filters, TADA_DataRetrieval accepts additional +geospatial-related filters that are not included in the dataRetrieval +`readWQPdata` function: + +- aoi_sf + +- tribal_area_type + +- tribe_name_parcel + The default TADA_DataRetrieval function automatically runs the **TADA_AutoClean** function. In this example, we will set **TADA_AutoClean = FALSE** and run it as a separate step in the work @@ -159,6 +169,15 @@ Tips: identifiers for monitoring locations within USGS NWIS or EPA's WQX databases separately. +3. The aoi_sf and tribal arguments are meant to be used on their own. + For example, if both an aoi_sf argument and tribal information are + provided an error is returned because it's unclear what the priority + location should be for the query. Similarly, aoi_sf and + tribal_area_type are not meant to be used with location-related + filters (e.g., statecode, siteid). In these instances a warning is + returned but the query proceeds by using only the aoi_sf or + tribal_area_type information. + Additional resources: - Review function documentation by entering the following code into @@ -201,7 +220,7 @@ du Lac Band), 5) Pueblo of Tesuque, and 6) The Chickasaw Nation We will move forward with this example in the remainder of the vignette. 
```{r TADA_DataRetrieval} -TADAProfile <- TADA_DataRetrieval(organization = c("REDLAKE_WQX", "SFNOES_WQX", "PUEBLO_POJOAQUE", "FONDULAC_WQX", "PUEBLOOFTESUQUE", "CNENVSER"), startDate = "2018-01-01", endDate = "2019-01-01", applyautoclean = FALSE) +TADAProfile <- TADA_DataRetrieval(organization = c("REDLAKE_WQX", "SFNOES_WQX", "PUEBLO_POJOAQUE", "FONDULAC_WQX", "PUEBLOOFTESUQUE", "CNENVSER"), startDate = "2018-01-01", endDate = "2019-01-01", applyautoclean = FALSE, ask = FALSE) ``` ## USGS dataRetrieval From a8dcb05d67dc878128cb3b70f536c9fbcce973dc Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Tue, 21 Jan 2025 14:16:20 -0800 Subject: [PATCH 19/35] Vignette updates --- R/DataDiscoveryRetrieval.R | 2 +- vignettes/TADAModule1.Rmd | 85 ++++++++++++++++++++++++++++++++++---- 2 files changed, 78 insertions(+), 9 deletions(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index f972d2d5..8e8f9859 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -54,7 +54,7 @@ #' @param organization A string of letters and/or numbers (some additional characters also possible) used to signify an organization with data in the Water Quality Portal. See https://www.waterqualitydata.us/Codes/organization for options. #' @param project A string of letters and/or numbers (some additional characters also possible) used to signify a project with data in the Water Quality Portal. See https://www.waterqualitydata.us/Codes/project for options. #' @param providers Leave blank to include all, or specify "STEWARDS", "STORET" (i.e., WQX), and/or "NWIS". See https://www.waterqualitydata.us/Codes/providers for options. -#' @param maxrecs Maximum number of records to query at once. +#' @param maxrecs Maximum number of records to query at once (i.e., without breaking into smaller queries). #' @param ask A logical value indicating whether the user should be asked for approval before downloads begin. 
#' @param applyautoclean Logical, defaults to TRUE. Applies TADA_AutoClean function on the returned data profile. Suggest switching to FALSE for queries that are expected to be large. #' diff --git a/vignettes/TADAModule1.Rmd b/vignettes/TADAModule1.Rmd index e7aafc03..23042ed5 100644 --- a/vignettes/TADAModule1.Rmd +++ b/vignettes/TADAModule1.Rmd @@ -196,7 +196,7 @@ Additional resources: Tutorial](https://waterdata.usgs.gov/blog/dataretrieval/) Use the code below to download data from the WQP using -TADA_DataRetrieval. Edit the code chuck below to define your own WQP +TADA_DataRetrieval. Edit the code chunk below to define your own WQP query inputs. Downloads using TADA_DataRetrieval will have the same columns each time, @@ -217,12 +217,74 @@ This example includes monitoring data collected from Jan 2018 to Jan & Fox Nation, 3) Pueblo of Pojoaque, 4) Minnesota Chippewa Tribe (Fond du Lac Band), 5) Pueblo of Tesuque, and 6) The Chickasaw Nation -We will move forward with this example in the remainder of the vignette. - ```{r TADA_DataRetrieval} TADAProfile <- TADA_DataRetrieval(organization = c("REDLAKE_WQX", "SFNOES_WQX", "PUEBLO_POJOAQUE", "FONDULAC_WQX", "PUEBLOOFTESUQUE", "CNENVSER"), startDate = "2018-01-01", endDate = "2019-01-01", applyautoclean = FALSE, ask = FALSE) ``` +We will move forward with this example in the remainder of the vignette. + +We will first use a subset of this example to demonstrate using new +TADA_DataRetrieval options that allow for **spatial** or +**tribe-specific** queries: + +Focusing just on the "PUEBLO_POJOAQUE" organization, rerun the example +above: + +```{r TADA_DataRetrieval_single} +TADAProfile_single <- TADA_DataRetrieval( + organization = "PUEBLO_POJOAQUE", + startDate = "2018-01-01", + endDate = "2019-01-01", + applyautoclean = FALSE, + ask = FALSE +) +``` + +The same results can now be obtained using a combination of the +tribal_area_type and tribe_name_parcel arguments. Both must be used +together. 
The tribal_area_type argument indicates which one of the [six +layer +datasets](https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer) +of tribal land data to query within. tribe_name_parcel is the specific +name of the tribal land of interest as listed in the layer. (Running +TADA_TribalOptions("American Indian Reservations") could be used here to +determine the correct spelling for this argument, as listed in the +TRIBE_NAME column.) + +```{r TADA_DataRetrieval_tribal} +TADAProfile_tribal <- TADA_DataRetrieval( + tribal_area_type = "American Indian Reservations", + tribe_name_parcel = "Pueblo of Pojoaque, New Mexico", + startDate = "2018-01-01", + endDate = "2019-01-01", + applyautoclean = FALSE, + ask = FALSE +) +# After removing attribute metadata they are equivalent: +all.equal(data.frame(TADAProfile_single), data.frame(TADAProfile_tribal)) +``` + +Additionally, the aoi_sf argument can be used to provide an sf spatial object +as a query filter. We can match the output of the two short examples above like so: + +```{r TADA_DataRetrieval_spatial} +TADAProfile_spatial <- TADA_DataRetrieval( + aoi_sf = tigris::native_areas() %>% filter(NAMELSAD == "Pueblo of Pojoaque"), + startDate = "2018-01-01", + endDate = "2019-01-01", + applyautoclean = FALSE, + ask = FALSE +) + +all.equal(data.frame(TADAProfile_single), data.frame(TADAProfile_spatial)) +``` + +**Note**: In this example the output data is identical from these three input +methods. However, in some instances this may not be the case. This is because +the tribal_area_type method is based on spatial data and so spatial boundaries +must be taken into account when comparing query results. The same applies when +using aoi_sf results. + ## USGS dataRetrieval Uncomment below (optional) if you would like to review differences @@ -236,19 +298,26 @@ as part of the data retrieval process. 
# dataRetrieval_example <- dataRetrieval::readWQPdata(organization = c("REDLAKE_WQX", "SFNOES_WQX", "PUEBLO_POJOAQUE", "FONDULAC_WQX", "PUEBLOOFTESUQUE", "CNENVSER"), startDate = "2018-01-01", endDate = "2019-01-01", ignore_attributes = TRUE) ``` -## TADA_BigDataRetrieval +## Big Data Queries If you need to download a large amount of data from across a large area, -and the TADA_DataRetrieval function is not working due to WQP timeout -issues, then the **TADA_BigDataRetrieval** function may work better. +the TADA_DataRetrieval function now handles this automatically. Whereas in the +past there was a second function (TADA_BigDataRetrieval) to do this, the +standard TADA_DataRetrieval function now checks the number of results in each +query and uses similar methods as TADA_BigDataRetrieval when necessary. -This function does multiple synchronous data calls to the WQP +The function does multiple synchronous data calls to the WQP (waterqualitydata.us). It uses the WQP summary service to limit the sites downloaded to only those with relevant data. It pulls back data from set number of stations at a time and then joins the data back together to produce a single TADA compatible dataframe as the output. -See ?TADA_BigDataRetrieval for more details. WARNING, some of the +TADA_DataRetrieval now also prompts the user (when ask = TRUE) to confirm that +they want to download the dataset. As part of this prompt the expected number +of rows of data are provided to help in making the decision. As the downloads +occur, a progress bar is shown as well. + +See ?TADA_DataRetrieval for more details. WARNING, some of the examples below can take multiple HOURS to run. The total run time depends on your query inputs. From d8fe5f3b60893948b064cac7998162609c65187c Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Wed, 22 Jan 2025 15:21:18 -0800 Subject: [PATCH 20/35] Update testing, build, etc. 
--- .Rbuildignore | 2 + .gitignore | 2 + NAMESPACE | 1 + R/Utilities.R | 6 ++- man/TADA_DataRetrieval.Rd | 5 +++ man/TADA_TribalOptions.Rd | 44 ++++++++++++++++++++ man/ask_user.Rd | 17 ++++++++ tests/testthat/test-CensoredDataSuite.R | 7 ++-- tests/testthat/test-DataDiscoveryRetrieval.R | 34 ++++++++------- tests/testthat/test-ResultFlagsDependent.R | 2 +- tests/testthat/test-Tables.R | 2 +- tests/testthat/test-UnitConversions.R | 3 +- vignettes/TADAModule1.Rmd | 10 ++--- vignettes/TADAModule2.Rmd | 3 +- 14 files changed, 108 insertions(+), 30 deletions(-) create mode 100644 man/TADA_TribalOptions.Rd create mode 100644 man/ask_user.Rd diff --git a/.Rbuildignore b/.Rbuildignore index 0b0fbfc0..4d9bb147 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -9,3 +9,5 @@ readme.html .gitignore ^articles ^vignettes/articles$ +^doc$ +^Meta$ diff --git a/.gitignore b/.gitignore index ffaf1f85..c109a3ac 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,5 @@ testing_log.txt # test data from AK AK_EPATADA_troubleshooting +/doc/ +/Meta/ diff --git a/NAMESPACE b/NAMESPACE index 9bd8f6cb..0b286c45 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -74,6 +74,7 @@ export(TADA_SimpleCensoredMethods) export(TADA_Stats) export(TADA_SubstituteDeprecatedChars) export(TADA_SummarizeColumn) +export(TADA_TribalOptions) export(TADA_TwoCharacteristicScatterplot) export(TADA_UniqueCharUnitSpeciation) export(TADA_ViewATTAINS) diff --git a/R/Utilities.R b/R/Utilities.R index c1ca0f94..efa91f3b 100644 --- a/R/Utilities.R +++ b/R/Utilities.R @@ -964,7 +964,8 @@ TADA_RandomTestingData <- function(number_of_days = 1, choose_random_state = FAL startDate = as.character(random_start_date), endDate = as.character(end_date), statecode = state, - applyautoclean = TRUE + applyautoclean = TRUE, + ask = FALSE ) } @@ -973,7 +974,8 @@ TADA_RandomTestingData <- function(number_of_days = 1, choose_random_state = FAL startDate = as.character(random_start_date), endDate = as.character(end_date), statecode = state, - 
applyautoclean = FALSE + applyautoclean = FALSE, + ask = FALSE ) } diff --git a/man/TADA_DataRetrieval.Rd b/man/TADA_DataRetrieval.Rd index 24b30c87..bcc02ad1 100644 --- a/man/TADA_DataRetrieval.Rd +++ b/man/TADA_DataRetrieval.Rd @@ -23,6 +23,7 @@ TADA_DataRetrieval( project = "null", providers = "null", maxrecs = 250000, + ask = TRUE, applyautoclean = TRUE ) } @@ -61,6 +62,10 @@ TADA_DataRetrieval( \item{providers}{Leave blank to include all, or specify "STEWARDS", "STORET" (i.e., WQX), and/or "NWIS". See https://www.waterqualitydata.us/Codes/providers for options.} +\item{maxrecs}{Maximum number of records to query at once (i.e., without breaking into smaller queries).} + +\item{ask}{A logical value indicating whether the user should be asked for approval before downloads begin.} + \item{applyautoclean}{Logical, defaults to TRUE. Applies TADA_AutoClean function on the returned data profile. Suggest switching to FALSE for queries that are expected to be large.} } \value{ diff --git a/man/TADA_TribalOptions.Rd b/man/TADA_TribalOptions.Rd new file mode 100644 index 00000000..2cf6bb37 --- /dev/null +++ b/man/TADA_TribalOptions.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DataDiscoveryRetrieval.R +\name{TADA_TribalOptions} +\alias{TADA_TribalOptions} +\title{Access options available for querying tribal spatial data with \code{TADA_DataRetrieval()}.} +\usage{ +TADA_TribalOptions(tribal_area_type, return_sf = FALSE) +} +\arguments{ +\item{tribal_area_type}{A character string. Must be one of the six tribal +spatial layers: "Alaska Native Allotments", "Alaska Native Villages", +"American Indian Reservations", "Off-reservation Trust Lands", +"Oklahoma Tribal Statistical Areas", or "Virginia Federally Recognized Tribes".} + +\item{return_sf}{Logical. Should the function return the dataset as an \code{sf} +object (TRUE) or a data frame (FALSE)? 
Defaults to FALSE.} +} +\value{ +A data frame or \code{sf} object containing the specified layer from the EPA +Map Service. +} +\description{ +This function provides access to \href{https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer}{six layer datasets} +containing spatial data related to tribal lands: "Alaska Native Allotments", +"Alaska Native Villages", "American Indian Reservations", "Off-reservation Trust Lands", +"Oklahoma Tribal Statistical Areas", and "Virginia Federally Recognized Tribes". +These datasets are used by \code{TADA_DataRetrieval()} when retrieving spatial data +for tribal lands specified by the user. + +The purpose of \code{TADA_TribalOptions()} is to allow the user to review the available +data in those datasets and identify the records they would like to query with +\code{TADA_DataRetrieval()}. + +An interactive map of the six layer datasets is available on ArcGIS Online Map +Viewer here: https://www.arcgis.com/apps/mapviewer/index.html?url=https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer&source=sd +} +\note{ +Alaska Native Villages and Virginia Federally Recognized Tribes are point +geometries in the Map Service, not polygons. At the time of this writing they +do not return any data when used for WQP bbox queries. 
+} +\seealso{ +\code{\link[=TADA_DataRetrieval]{TADA_DataRetrieval()}} +} diff --git a/man/ask_user.Rd b/man/ask_user.Rd new file mode 100644 index 00000000..85380683 --- /dev/null +++ b/man/ask_user.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DataDiscoveryRetrieval.R +\name{ask_user} +\alias{ask_user} +\title{Ask user to approve WQP downloads} +\usage{ +ask_user(n_records) +} +\arguments{ +\item{n_records}{A numeric value indicating the number of records that will be downloaded from the WQP if the user decides to proceed.} +} +\description{ +Once record counts have been retrieved from the Water Quality Portal (WQP) for +a query, this function is used to prompt the user to decide (i.e., "yes"/"no") +whether the download should proceed. The user is also reminded of the limits of +Microsoft Excel for row counts as a comparison. +} diff --git a/tests/testthat/test-CensoredDataSuite.R b/tests/testthat/test-CensoredDataSuite.R index 8ad21839..644b99d4 100644 --- a/tests/testthat/test-CensoredDataSuite.R +++ b/tests/testthat/test-CensoredDataSuite.R @@ -1,10 +1,10 @@ test_that("TADA_IDCensoredData orphans", { - cens.check <- TADA_DataRetrieval(statecode = "CO", startDate = "2021-01-01", endDate = "2022-01-01", characteristicName = c("Phosphorus", "Nitrate")) + cens.check <- TADA_DataRetrieval(statecode = "CO", startDate = "2021-01-01", endDate = "2022-01-01", characteristicName = c("Phosphorus", "Nitrate"), ask = FALSE) expect_true(all(!is.na(cens.check$TADA.CensoredData.Flag))) }) test_that("TADA_SimpleCensoredMethods doesn't drop data", { - testdat <- TADA_DataRetrieval(statecode = "KS", startDate = "2021-01-01", endDate = "2022-01-01", characteristicName = c("Phosphorus", "Nitrate")) + testdat <- TADA_DataRetrieval(statecode = "KS", startDate = "2021-01-01", endDate = "2022-01-01", characteristicName = c("Phosphorus", "Nitrate"), ask = FALSE) cens.check <- TADA_SimpleCensoredMethods(testdat) 
expect_equal(dim(testdat)[1], dim(cens.check)[1]) }) @@ -41,7 +41,8 @@ test_that("TADA_IDCensoredData copies det lim values to result values if applica test_that("TADA_IDCensoredData correctly handles specific text values such as ND", { df <- TADA_DataRetrieval( startDate = "2022-12-19", - endDate = "2022-12-20" + endDate = "2022-12-20", + ask = FALSE ) df1 <- TADA_IDCensoredData(df) diff --git a/tests/testthat/test-DataDiscoveryRetrieval.R b/tests/testthat/test-DataDiscoveryRetrieval.R index 55ea2c37..bd96696a 100644 --- a/tests/testthat/test-DataDiscoveryRetrieval.R +++ b/tests/testthat/test-DataDiscoveryRetrieval.R @@ -270,7 +270,8 @@ test_that("TADA_DataRetrieval", { "USGS-054064785", "USGS-430305089260600" ), - characteristicName = "Phosphorus" + characteristicName = "Phosphorus", + ask = FALSE ) # you could just pick the important columns: expect_true(all(c( @@ -405,7 +406,8 @@ test_that("TADA_DataRetrieval", { check_autoclean_meters_works <- TADA_DataRetrieval( statecode = "UT", characteristicName = c("Ammonia", "Nitrate", "Nitrogen"), - startDate = "2021-01-01" + startDate = "2021-01-01", + ask = FALSE ) expect_false("meters" %in% check_autoclean_meters_works$TADA.ResultMeasure.MeasureUnitCode) }) @@ -413,22 +415,22 @@ test_that("TADA_DataRetrieval", { # Testing that regular and big data retrieval return the same number of rows on an identical query. 
# cm edited to include start date on 2/27/23 because without this it takes too long to run # these tests, and may time out -test_that("Reg&BigdataRetrieval", { - big <- TADA_BigDataRetrieval(characteristicName = "Algae, substrate rock/bank cover (choice list)", sampleMedia = "Water", siteType = "Stream", startDate = "2020-01-01", applyautoclean = FALSE) - reg <- TADA_DataRetrieval(characteristicName = "Algae, substrate rock/bank cover (choice list)", sampleMedia = "Water", siteType = "Stream", startDate = "2020-01-01", applyautoclean = FALSE) - - expect_equal(nrow(big), nrow(reg)) -}) +# test_that("Reg&BigdataRetrieval", { +# # big <- TADA_BigDataRetrieval(characteristicName = "Algae, substrate rock/bank cover (choice list)", sampleMedia = "Water", siteType = "Stream", startDate = "2020-01-01", applyautoclean = FALSE) +# reg <- TADA_DataRetrieval(characteristicName = "Algae, substrate rock/bank cover (choice list)", sampleMedia = "Water", siteType = "Stream", startDate = "2020-01-01", applyautoclean = FALSE, ask = FALSE) +# +# expect_equal(nrow(big), nrow(reg)) +# }) # Testing that dates work correctly in queries in big data retrieval -test_that("BigdataRetrieval_daterange", { - startDate <- "2018-10-01" - endDate <- "2021-09-30" - big <- TADA_BigDataRetrieval(startDate = startDate, endDate = endDate, huc = c("04030202", "04030201"), characteristicName = "Escherichia coli", siteType = "Stream") - logic <- min(big$ActivityStartDate) >= as.Date(startDate, format = "%Y-%m-%d") & max(big$ActivityStartDate) <= as.Date(endDate, format = "%Y-%m-%d") - - expect_true(logic) -}) +# test_that("BigdataRetrieval_daterange", { +# startDate <- "2018-10-01" +# endDate <- "2021-09-30" +# # big <- TADA_BigDataRetrieval(startDate = startDate, endDate = endDate, huc = c("04030202", "04030201"), characteristicName = "Escherichia coli", siteType = "Stream") +# logic <- min(big$ActivityStartDate) >= as.Date(startDate, format = "%Y-%m-%d") & max(big$ActivityStartDate) <= as.Date(endDate, 
format = "%Y-%m-%d") +# +# expect_true(logic) +# }) # Testing that the TADA_JoinWQPProfiles() function in DataDiscoveryRetrieval.R diff --git a/tests/testthat/test-ResultFlagsDependent.R b/tests/testthat/test-ResultFlagsDependent.R index 1758c9e4..7e763453 100644 --- a/tests/testthat/test-ResultFlagsDependent.R +++ b/tests/testthat/test-ResultFlagsDependent.R @@ -1,7 +1,7 @@ test_that("No NA's in dependent flag columns", { today <- Sys.Date() twoago <- as.character(today - 2 * 365) - testdat <- TADA_DataRetrieval(statecode = "UT", startDate = twoago, characteristicName = c("Nitrate", "Copper"), sampleMedia = "Water") + testdat <- TADA_DataRetrieval(statecode = "UT", startDate = twoago, characteristicName = c("Nitrate", "Copper"), sampleMedia = "Water", ask = FALSE) testdat <- TADA_ConvertResultUnits(testdat, transform = TRUE) testdat <- suppressWarnings(TADA_FlagFraction(testdat, clean = FALSE, flaggedonly = FALSE)) diff --git a/tests/testthat/test-Tables.R b/tests/testthat/test-Tables.R index 5528758e..49a30910 100644 --- a/tests/testthat/test-Tables.R +++ b/tests/testthat/test-Tables.R @@ -1,5 +1,5 @@ test_that("TADA_Stats suggestions complete", { - testdat <- TADA_DataRetrieval(statecode = "KS", startDate = "2021-01-01", endDate = "2022-01-01", characteristicName = c("Phosphorus", "Nitrate")) + testdat <- TADA_DataRetrieval(statecode = "KS", startDate = "2021-01-01", endDate = "2022-01-01", characteristicName = c("Phosphorus", "Nitrate"), ask = FALSE) check <- TADA_Stats(testdat) expect_true(all(!is.na(check$ND_Estimation_Method))) }) diff --git a/tests/testthat/test-UnitConversions.R b/tests/testthat/test-UnitConversions.R index 1a09abc7..5a12e63c 100644 --- a/tests/testthat/test-UnitConversions.R +++ b/tests/testthat/test-UnitConversions.R @@ -63,7 +63,8 @@ test_that("TADA_ConvertDepthUnits converts meters to m", { organization = "USGS-UT", characteristicName = c("Ammonia", "Nitrate", "Nitrogen"), startDate = "2023-01-01", - endDate = "2023-03-01" + endDate = 
"2023-03-01", + ask = FALSE ) check_depth_meters <- TADA_ConvertDepthUnits(check_depth_meters) expect_false("meters" %in% check_depth_meters$TADA.ActivityDepthHeightMeasure.MeasureUnitCode) diff --git a/vignettes/TADAModule1.Rmd b/vignettes/TADAModule1.Rmd index 23042ed5..8154041b 100644 --- a/vignettes/TADAModule1.Rmd +++ b/vignettes/TADAModule1.Rmd @@ -269,7 +269,7 @@ as a query filter. We can match the output of the two short examples above like ```{r TADA_DataRetrieval_spatial} TADAProfile_spatial <- TADA_DataRetrieval( - aoi_sf = tigris::native_areas() %>% filter(NAMELSAD == "Pueblo of Pojoaque"), + aoi_sf = tigris::native_areas() %>% dplyr::filter(NAMELSAD == "Pueblo of Pojoaque"), startDate = "2018-01-01", endDate = "2019-01-01", applyautoclean = FALSE, @@ -322,13 +322,13 @@ examples below can take multiple HOURS to run. The total run time depends on your query inputs. ```{r BigdataRetrieval, eval = FALSE, results = 'hide'} -# AK_AL_WaterTemp <- TADA_BigDataRetrieval(startDate = "2000-01-01", endDate = "2022-12-31", characteristicName = "Temperature, water", statecode = c("AK","AL")) +# AK_AL_WaterTemp <- TADA_DataRetrieval(startDate = "2000-01-01", endDate = "2022-12-31", characteristicName = "Temperature, water", statecode = c("AK","AL")) -# AllWaterTemp <- TADA_BigDataRetrieval(characteristicName = "Temperature, water") +# AllWaterTemp <- TADA_DataRetrieval(characteristicName = "Temperature, water") -# AllPhosphorus <- TADA_BigDataRetrieval(characteristicName = "Phosphorus") +# AllPhosphorus <- TADA_DataRetrieval(characteristicName = "Phosphorus") -# AllCT <- TADA_BigDataRetrieval(statecode = "CT") +# AllCT <- TADA_DataRetrieval(statecode = "CT") ``` ## Filter data based on media type diff --git a/vignettes/TADAModule2.Rmd b/vignettes/TADAModule2.Rmd index 97d4677d..a5a9d810 100644 --- a/vignettes/TADAModule2.Rmd +++ b/vignettes/TADAModule2.Rmd @@ -155,7 +155,8 @@ TADA_dataframe <- TADA_DataRetrieval( endDate = "2020-12-31", characteristicName = "pH", 
countycode = "US:08:069", - applyautoclean = TRUE + applyautoclean = TRUE, + ask = FALSE ) ``` From a0b1b4d7c204edefb5903ff81be20bb421d29c13 Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Wed, 22 Jan 2025 15:54:14 -0800 Subject: [PATCH 21/35] Run styler::style_pkg() --- R/DataDiscoveryRetrieval.R | 559 +++++++++---------- R/Maintenance.R | 8 +- tests/testthat/test-DataDiscoveryRetrieval.R | 4 +- tests/testthat/test-ResultFlagsIndependent.R | 32 +- 4 files changed, 299 insertions(+), 304 deletions(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index 8e8f9859..67606762 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -5,7 +5,7 @@ #' exceptions of endDate and startDate match the web service call format from the #' online WQP GUI. endDate and startDate match the format suggested in USGS's #' dataRetrieval package (endDate = "YYYY-MM-DD"), which is a more familiar date -#' format for R users than the WQP GUI's endDateHi = "MM-DD-YYYY". aoi_sf, +#' format for R users than the WQP GUI's endDateHi = "MM-DD-YYYY". aoi_sf, #' tribal_area_type, and tribe_name_parcel do not have corresponding inputs in #' the web service. #' @@ -19,7 +19,7 @@ #' of the query fields. #' characteristicName and Characteristic Group also work as an AND, therefore the #' characteristicName must fall within the Characteristic Group when both are entered. -#' +#' #' aoi_sf cannot be used with tribal_area_type. If countrycode, countycode, huc, #' siteid, or statecode are used with aoi_sf or tribal_area_type they will be ignored #' under the assumption that the sf object or tribal location are the intended @@ -65,7 +65,7 @@ #' geometries in the Map Service, not polygons. At the time of this writing they #' do not return any data when used for WQP bbox queries and so are set to return #' errors when used with this function. 
-#' +#' #' @export #' #' @examples @@ -199,13 +199,12 @@ TADA_DataRetrieval <- function(startDate = "null", maxrecs = 250000, ask = TRUE, applyautoclean = TRUE) { - # Check for incomplete or inconsistent inputs: - + # If both an sf object and tribe information are provided it's unclear what # the priority should be for the query - if( !is.null(aoi_sf) & - any( (tribal_area_type != "null") | (tribe_name_parcel != "null") )){ + if (!is.null(aoi_sf) & + any((tribal_area_type != "null") | (tribe_name_parcel != "null"))) { stop( paste0( "Both sf data and tribal information have been provided. ", @@ -213,29 +212,33 @@ TADA_DataRetrieval <- function(startDate = "null", ) ) } - + # Check for other arguments that indicate location. Function will ignore # these inputs but warn the user - if( + if ( # sf object provided - (!is.null(aoi_sf) & inherits(aoi_sf, "sf")) & - # with additional location info - any( (countrycode != "null"), (countycode != "null"), (huc != "null"), - (siteid != "null"), (statecode != "null") ) - ){ + (!is.null(aoi_sf) & inherits(aoi_sf, "sf")) & + # with additional location info + any( + (countrycode != "null"), (countycode != "null"), (huc != "null"), + (siteid != "null"), (statecode != "null") + ) + ) { warning( paste0( "Location information has been provided in addition to an sf object. ", "Only the sf object will be used in the query." ) ) - } else if( + } else if ( # Tribe info provided - (tribal_area_type != "null") & - # with additional location info - any( (countrycode != "null"), (countycode != "null"), (huc != "null"), - (siteid != "null"), (statecode != "null") ) - ){ + (tribal_area_type != "null") & + # with additional location info + any( + (countrycode != "null"), (countycode != "null"), (huc != "null"), + (siteid != "null"), (statecode != "null") + ) + ) { warning( paste0( "Location information has been provided in addition to tribal information. 
", @@ -243,21 +246,20 @@ TADA_DataRetrieval <- function(startDate = "null", ) ) } - + # Insufficient tribal info provided - if( (tribal_area_type == "null") & all(tribe_name_parcel != "null") ){ + if ((tribal_area_type == "null") & all(tribe_name_parcel != "null")) { stop("A tribal_area_type is required if tribe_name_parcel is provided.") } - + # If an sf object OR tribal info are provided they will be the basis of the query # (The tribal data handling uses sf objects as well) - if( (!is.null(aoi_sf) & inherits(aoi_sf, "sf")) | (tribal_area_type != "null") ){ - + if ((!is.null(aoi_sf) & inherits(aoi_sf, "sf")) | (tribal_area_type != "null")) { # Build the non-sf part of the query: - + # Set query parameters WQPquery <- list() - + # StartDate if (length(startDate) > 1) { if (is.na(suppressWarnings(lubridate::parse_date_time(startDate[1], orders = "ymd")))) { @@ -324,12 +326,11 @@ TADA_DataRetrieval <- function(startDate = "null", } WQPquery <- c(WQPquery, endDate = endDate) } - + # sf AOI prep for query - + # If tribe info is provided then grab the corresponding sf object: - if(tribal_area_type != "null"){ - + if (tribal_area_type != "null") { # Make a reference table for tribal area type + url matching # (options that don't return results are commented out for now) map_service_urls <- tibble::tribble( @@ -338,165 +339,167 @@ TADA_DataRetrieval <- function(startDate = "null", # "Alaska Native Villages", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/1", "American Indian Reservations", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/2", "Off-reservation Trust Lands", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/3", - "Oklahoma Tribal Statistical Areas", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/4"# , + "Oklahoma Tribal Statistical Areas", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/4" # , # "Virginia Federally Recognized Tribes", 
"https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/5" ) - + # Keep to a single type: - if(length(tribal_area_type) > 1){ + if (length(tribal_area_type) > 1) { stop("tribal_area_type must be of length 1.") } - + # These two layers will not return any data when used for bboxes - if(tribal_area_type == "Alaska Native Villages"){ + if (tribal_area_type == "Alaska Native Villages") { stop("Alaska Native Villages data are centroid points, not spatial boundaries.") - } else if(tribal_area_type == "Virginia Federally Recognized Tribes") { + } else if (tribal_area_type == "Virginia Federally Recognized Tribes") { stop("Federally recognized tribal entities in Virginia do not have any available spatial boundaries.") } - + # These area types allow filtering by TRIBE_NAME (unique within each type) - if(tribal_area_type %in% c( + if (tribal_area_type %in% c( # "Alaska Native Villages", "American Indian Reservations", "Off-reservation Trust Lands", - "Oklahoma Tribal Statistical Areas"#, + "Oklahoma Tribal Statistical Areas" # , # "Virginia Federally Recognized Tribes" ) - ){ - + ) { # Get the relevant url - aoi_sf <- dplyr::filter(map_service_urls, - tribal_area == tribal_area_type)$url %>% + aoi_sf <- dplyr::filter( + map_service_urls, + tribal_area == tribal_area_type + )$url %>% # Pull data arcgislayers::arc_open() %>% # Return sf arcgislayers::arc_select() %>% # If a value provided, then filter - {if (all(tribe_name_parcel != "null") ) { - dplyr::filter(., TRIBE_NAME %in% tribe_name_parcel) - } else { - . - }} - + { + if (all(tribe_name_parcel != "null")) { + dplyr::filter(., TRIBE_NAME %in% tribe_name_parcel) + } else { + . 
+ } + } + # Otherwise filter by PARCEL_NO (Note that values in this col are not unique) - } else if(tribal_area_type == "Alaska Native Allotments"){ - - aoi_sf <- dplyr::filter(map_service_urls, - tribal_area == tribal_area_type)$url %>% + } else if (tribal_area_type == "Alaska Native Allotments") { + aoi_sf <- dplyr::filter( + map_service_urls, + tribal_area == tribal_area_type + )$url %>% arcgislayers::arc_open() %>% arcgislayers::arc_select() %>% - {if (all(tribe_name_parcel != "null")) { - dplyr::filter(., PARCEL_NO %in% tribe_name_parcel) - } else { - . - }} - + { + if (all(tribe_name_parcel != "null")) { + dplyr::filter(., PARCEL_NO %in% tribe_name_parcel) + } else { + . + } + } } else { stop("Tribal area type not recognized. Refer to TADA_TribalOptions() for query options.") } - } - + # Check and/or fix geometry aoi_sf <- sf::st_make_valid(aoi_sf) - + # Match CRS - if(sf::st_crs(aoi_sf) != 4326){ + if (sf::st_crs(aoi_sf) != 4326) { aoi_sf <- sf::st_transform(aoi_sf, crs = 4326) } - + # Get bbox of the sf object input_bbox <- sf::st_bbox(aoi_sf) - + # Query info on available data within the bbox # Don't want to print every message that's returned by WQP quiet_whatWQPdata <- purrr::quietly(dataRetrieval::whatWQPdata) - + # Try getting WQP info quiet_bbox_avail <- quiet_whatWQPdata( WQPquery, bBox = c(input_bbox$xmin, input_bbox$ymin, input_bbox$xmax, input_bbox$ymax) ) - + # Alert & stop if an http error was received - if(is.null(quiet_bbox_avail$result)){ - + if (is.null(quiet_bbox_avail$result)) { stop_message <- quiet_bbox_avail$messages %>% grep(pattern = "failed|HTTP", x = ., ignore.case = FALSE, value = TRUE) %>% paste("\n", ., collapse = "") %>% paste("The WQP request returned a NULL with the following message(s): \n", - ., - collapse = "\n") - + ., + collapse = "\n" + ) + stop(stop_message) - } - + # Use result only bbox_avail <- quiet_bbox_avail$result - + # Check if any sites are within the aoi - if ( (nrow(bbox_avail) > 0 ) == FALSE) { + if 
((nrow(bbox_avail) > 0) == FALSE) { stop("No monitoring sites were returned within your area of interest (no data available).") } - - + + quiet_whatWQPsites <- purrr::quietly(dataRetrieval::whatWQPsites) - + quiet_bbox_sites <- quiet_whatWQPsites( siteid = bbox_avail$MonitoringLocationIdentifier ) - - if(is.null(quiet_bbox_sites$result)){ - + + if (is.null(quiet_bbox_sites$result)) { stop_message <- quiet_bbox_sites$messages %>% grep(pattern = "failed|HTTP", x = ., ignore.case = FALSE, value = TRUE) %>% paste("\n", ., collapse = "") %>% paste("The WQP request returned a NULL with the following message(s): \n", - ., - collapse = "\n") + ., + collapse = "\n" + ) stop(stop_message) - } - + # Reformat returned info as sf bbox_sites_sf <- TADA_MakeSpatial(quiet_bbox_sites$result, crs = 4326) - + # Subset sites to only within shapefile and get IDs clipped_sites_sf <- bbox_sites_sf[aoi_sf, ] - + clipped_site_ids <- clipped_sites_sf$MonitoringLocationIdentifier - + record_count <- bbox_avail %>% dplyr::filter(MonitoringLocationIdentifier %in% clipped_site_ids) %>% dplyr::pull(resultCount) %>% sum() - + # Should we proceed with downloads? If ask == TRUE then ask the user. - if(ask == TRUE){ + if (ask == TRUE) { user_decision <- ask_user(n_records = record_count) - + # Act on input - if(user_decision == "yes") { + if (user_decision == "yes") { print("Proceeding with download.") } else { stop("Cancelled by user.", call. = FALSE) } } - + # Continue now with site count site_count <- length(clipped_site_ids) - + # Check for either more than 300 sites or more records than max_recs. # If either is true then we'll approach the pull as a "big data" pull - if( site_count > 300 | record_count > maxrecs) { + if (site_count > 300 | record_count > maxrecs) { message( paste0( "The number of sites and/or records matched by the AOI and query terms is large, so the download may take some time. 
", "If your AOI is a county, state, country, or HUC boundary it would be more efficient to provide a code instead of an sf object." ) ) - + # Use helper function to download large data volume results.DR <- withCallingHandlers( TADA_BigDataHelper( @@ -509,13 +512,13 @@ TADA_DataRetrieval <- function(startDate = "null", ), message = function(m) message(m$message) ) - - + + rm(bbox_avail, bbox_sites_sf) gc() - + # Check if any results were returned - if ( (nrow(results.DR) > 0 ) == FALSE) { + if ((nrow(results.DR) > 0) == FALSE) { print( paste0( "Returning empty results dataframe: ", @@ -527,12 +530,11 @@ TADA_DataRetrieval <- function(startDate = "null", # Empty TADAprofile.clean <- results.DR } else { - # Get site metadata sites.DR <- clipped_sites_sf %>% dplyr::as_tibble() %>% dplyr::select(-geometry) - + # Get project metadata projects.DR <- suppressMessages( dataRetrieval::readWQPdata( @@ -542,7 +544,7 @@ TADA_DataRetrieval <- function(startDate = "null", service = "Project" ) ) - + # Join results, sites, projects TADAprofile <- TADA_JoinWQPProfiles( FullPhysChem = results.DR, @@ -551,26 +553,25 @@ TADA_DataRetrieval <- function(startDate = "null", ) %>% dplyr::mutate( dplyr::across(tidyselect::everything(), as.character) ) - + # run TADA_AutoClean function if (applyautoclean == TRUE) { print("Data successfully downloaded. Running TADA_AutoClean function.") - + TADAprofile.clean <- TADA_AutoClean(TADAprofile) } else { TADAprofile.clean <- TADAprofile } } - + return(TADAprofile.clean) - + # Doesn't meet "big data" threshold: } else { - # Retrieve all 3 profiles print("Downloading WQP query results. 
This may take some time depending upon the query size.") print(WQPquery) - + # Get results results.DR <- suppressMessages( dataRetrieval::readWQPdata( @@ -580,7 +581,7 @@ TADA_DataRetrieval <- function(startDate = "null", ignore_attributes = TRUE ) ) - + # Check if any results were returned if ((nrow(results.DR) > 0) == FALSE) { paste0( @@ -591,12 +592,11 @@ TADA_DataRetrieval <- function(startDate = "null", ) TADAprofile.clean <- results.DR } else { - # Get site metadata sites.DR <- clipped_sites_sf %>% dplyr::as_tibble() %>% dplyr::select(-geometry) - + # Get project metadata projects.DR <- suppressMessages( dataRetrieval::readWQPdata( @@ -606,7 +606,7 @@ TADA_DataRetrieval <- function(startDate = "null", service = "Project" ) ) - + # Join results, sites, projects TADAprofile <- TADA_JoinWQPProfiles( FullPhysChem = results.DR, @@ -615,27 +615,25 @@ TADA_DataRetrieval <- function(startDate = "null", ) %>% dplyr::mutate( dplyr::across(tidyselect::everything(), as.character) ) - + # Run TADA_AutoClean function if (applyautoclean == TRUE) { print("Data successfully downloaded. Running TADA_AutoClean function.") - + TADAprofile.clean <- TADA_AutoClean(TADAprofile) } else { TADAprofile.clean <- TADAprofile } } - + return(TADAprofile.clean) - } - + # If no sf object provided: - } else { - + } else { # Set query parameters WQPquery <- list() - + if (!"null" %in% statecode) { load(system.file("extdata", "statecodes_df.Rdata", package = "EPATADA")) statecode <- as.character(statecode) @@ -648,13 +646,13 @@ TADA_DataRetrieval <- function(startDate = "null", WQPquery <- c(WQPquery, statecode = list(statecd)) } } - + if (length(huc) > 1) { WQPquery <- c(WQPquery, huc = list(huc)) } else if (huc != "null") { WQPquery <- c(WQPquery, huc = huc) } - + if (length(startDate) > 1) { if (is.na(suppressWarnings(lubridate::parse_date_time(startDate[1], orders = "ymd")))) { stop("Incorrect date format. 
Please use the format YYYY-MM-DD.") @@ -666,67 +664,67 @@ TADA_DataRetrieval <- function(startDate = "null", } WQPquery <- c(WQPquery, startDate = startDate) } - + if (length(countrycode) > 1) { WQPquery <- c(WQPquery, countrycode = list(countrycode)) } else if (countrycode != "null") { WQPquery <- c(WQPquery, countrycode = countrycode) } - + if (length(countycode) > 1) { WQPquery <- c(WQPquery, countycode = list(countycode)) } else if (countycode != "null") { WQPquery <- c(WQPquery, countycode = countycode) } - + if (length(siteid) > 1) { WQPquery <- c(WQPquery, siteid = list(siteid)) } else if (siteid != "null") { WQPquery <- c(WQPquery, siteid = siteid) } - + if (length(siteType) > 1) { WQPquery <- c(WQPquery, siteType = list(siteType)) } else if (siteType != "null") { WQPquery <- c(WQPquery, siteType = siteType) } - + if (length(characteristicName) > 1) { WQPquery <- c(WQPquery, characteristicName = list(characteristicName)) } else if (characteristicName != "null") { WQPquery <- c(WQPquery, characteristicName = characteristicName) } - + if (length(characteristicType) > 1) { WQPquery <- c(WQPquery, characteristicType = list(characteristicType)) } else if (characteristicType != "null") { WQPquery <- c(WQPquery, characteristicType = characteristicType) } - + if (length(sampleMedia) > 1) { WQPquery <- c(WQPquery, sampleMedia = list(sampleMedia)) } else if (sampleMedia != "null") { WQPquery <- c(WQPquery, sampleMedia = sampleMedia) } - + if (length(project) > 1) { WQPquery <- c(WQPquery, project = list(project)) } else if (project != "null") { WQPquery <- c(WQPquery, project = project) } - + if (length(providers) > 1) { WQPquery <- c(WQPquery, providers = list(providers)) } else if (providers != "null") { WQPquery <- c(WQPquery, providers = providers) } - + if (length(organization) > 1) { WQPquery <- c(WQPquery, organization = list(organization)) } else if (organization != "null") { WQPquery <- c(WQPquery, organization = organization) } - + if (length(endDate) > 1) 
{ if (is.na(suppressWarnings(lubridate::parse_date_time(endDate[1], orders = "ymd")))) { stop("Incorrect date format. Please use the format YYYY-MM-DD.") @@ -738,53 +736,52 @@ TADA_DataRetrieval <- function(startDate = "null", } WQPquery <- c(WQPquery, endDate = endDate) } - + # Query info on available data # Don't want to print every message that's returned by WQP quiet_whatWQPdata <- purrr::quietly(dataRetrieval::whatWQPdata) - + quiet_query_avail <- quiet_whatWQPdata(WQPquery) - - if(is.null(quiet_query_avail$result)){ - + + if (is.null(quiet_query_avail$result)) { stop_message <- quiet_query_avail$messages %>% grep(pattern = "failed|HTTP", x = ., ignore.case = FALSE, value = TRUE) %>% paste("\n", ., collapse = "") %>% paste("The WQP request returned a NULL with the following message(s): \n", - ., - collapse = "\n") - + ., + collapse = "\n" + ) + stop(stop_message) - } - + query_avail <- quiet_query_avail$result - + site_count <- length(query_avail$MonitoringLocationIdentifier) - + record_count <- query_avail %>% dplyr::pull(resultCount) %>% sum() - + # Should we proceed with downloads? If ask == TRUE then ask the user. - if(ask == TRUE){ + if (ask == TRUE) { user_decision <- ask_user(n_records = record_count) - + # Act on input - if(user_decision == "yes") { + if (user_decision == "yes") { print("Proceeding with download.") } else { stop("Cancelled by user.", call. = FALSE) } } - + # Check for either more than 300 sites or more records than max_recs. # If either is true then we'll approach the pull as a "big data" pull - if(site_count > 300 | record_count > maxrecs) { + if (site_count > 300 | record_count > maxrecs) { message( "The number of sites and/or records matched by the query terms is large, so the download may take some time." 
) - + # Use helper function to download large data volume results.DR <- suppressMessages( TADA_BigDataHelper( @@ -795,17 +792,17 @@ TADA_DataRetrieval <- function(startDate = "null", maxsites = 300 ) ) - + rm(query_avail) gc() - + # Get site metadata sites.DR <- suppressMessages( dataRetrieval::whatWQPsites( siteid = unique(results.DR$MonitoringLocationIdentifier) ) ) - + # Get project metadata projects.DR <- suppressMessages( dataRetrieval::readWQPdata( @@ -815,7 +812,7 @@ TADA_DataRetrieval <- function(startDate = "null", service = "Project" ) ) - + # Join results, sites, projects TADAprofile <- TADA_JoinWQPProfiles( FullPhysChem = results.DR, @@ -824,18 +821,18 @@ TADA_DataRetrieval <- function(startDate = "null", ) %>% dplyr::mutate( dplyr::across(tidyselect::everything(), as.character) ) - + # run TADA_AutoClean function if (applyautoclean == TRUE) { print("Data successfully downloaded. Running TADA_AutoClean function.") - + TADAprofile.clean <- TADA_AutoClean(TADAprofile) } else { TADAprofile.clean <- TADAprofile } - + return(TADAprofile.clean) - + # If not a "big data" pull: } else { # Retrieve all 3 profiles @@ -843,25 +840,25 @@ TADA_DataRetrieval <- function(startDate = "null", print(WQPquery) results.DR <- suppressMessages( dataRetrieval::readWQPdata(WQPquery, - dataProfile = "resultPhysChem", - ignore_attributes = TRUE + dataProfile = "resultPhysChem", + ignore_attributes = TRUE ) ) - + # check if any results are available if ((nrow(results.DR) > 0) == FALSE) { print("Returning empty results dataframe: Your WQP query returned no results (no data available). Try a different query. 
Removing some of your query filters OR broadening your search area may help.") TADAprofile.clean <- results.DR } else { sites.DR <- suppressMessages(dataRetrieval::whatWQPsites(WQPquery)) - + projects.DR <- suppressMessages( dataRetrieval::readWQPdata(WQPquery, - ignore_attributes = TRUE, - service = "Project" + ignore_attributes = TRUE, + service = "Project" ) ) - + TADAprofile <- TADA_JoinWQPProfiles( FullPhysChem = results.DR, Sites = sites.DR, @@ -869,62 +866,61 @@ TADA_DataRetrieval <- function(startDate = "null", ) %>% dplyr::mutate( dplyr::across(tidyselect::everything(), as.character) ) - + # run TADA_AutoClean function if (applyautoclean == TRUE) { print("Data successfully downloaded. Running TADA_AutoClean function.") - + TADAprofile.clean <- TADA_AutoClean(TADAprofile) } else { TADAprofile.clean <- TADAprofile } } - + return(TADAprofile.clean) } } } #' Access options available for querying tribal spatial data with `TADA_DataRetrieval()`. -#' +#' #' @description #' This function provides access to [six layer datasets](https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer) #' containing spatial data related to tribal lands: "Alaska Native Allotments", #' "Alaska Native Villages", "American Indian Reservations", "Off-reservation Trust Lands", #' "Oklahoma Tribal Statistical Areas", and "Virginia Federally Recognized Tribes". #' These datasets are used by `TADA_DataRetrieval()` when retrieving spatial data -#' for tribal lands specified by the user. -#' +#' for tribal lands specified by the user. +#' #' The purpose of `TADA_TribalOptions()` is to allow the user to review the available #' data in those datasets and identify the records they would like to query with #' `TADA_DataRetrieval()`. 
-#' +#' #' An interactive map of the six layer datasets is available on ArcGIS Online Map #' Viewer here: https://www.arcgis.com/apps/mapviewer/index.html?url=https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer&source=sd -#' +#' #' @param tribal_area_type A character string. Must be one of the six tribal -#' spatial layers: "Alaska Native Allotments", "Alaska Native Villages", +#' spatial layers: "Alaska Native Allotments", "Alaska Native Villages", #' "American Indian Reservations", "Off-reservation Trust Lands", #' "Oklahoma Tribal Statistical Areas", or "Virginia Federally Recognized Tribes". -#' +#' #' @param return_sf Logical. Should the function return the dataset as an `sf` #' object (TRUE) or a data frame (FALSE)? Defaults to FALSE. -#' +#' #' @returns A data frame or `sf` object containing the specified layer from the EPA #' Map Service. -#' +#' #' @note #' Alaska Native Villages and Virginia Federally Recognized Tribes are point #' geometries in the Map Service, not polygons. At the time of this writing they #' do not return any data when used for WQP bbox queries. 
-#' +#' #' @export -#' +#' #' @seealso [TADA_DataRetrieval()] -#' +#' -TADA_TribalOptions <- function(tribal_area_type, return_sf = FALSE){ - +TADA_TribalOptions <- function(tribal_area_type, return_sf = FALSE) { # Make a reference table for tribal area type + url matching map_service_urls <- tibble::tribble( ~tribal_area, ~url, @@ -935,22 +931,24 @@ TADA_TribalOptions <- function(tribal_area_type, return_sf = FALSE){ "Oklahoma Tribal Statistical Areas", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/4", "Virginia Federally Recognized Tribes", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/5" ) - + # Confirm usable string provided - if( !(tribal_area_type %in% map_service_urls$tribal_area) ){ + if (!(tribal_area_type %in% map_service_urls$tribal_area)) { stop("tribal_area_type must match one of the six tribal spatial layer names.") } - + # Query Map Service - tribal_area_sf <- dplyr::filter(map_service_urls, - tribal_area == tribal_area_type)$url %>% + tribal_area_sf <- dplyr::filter( + map_service_urls, + tribal_area == tribal_area_type + )$url %>% arcgislayers::arc_open() %>% # Return sf arcgislayers::arc_select() %>% sf::st_make_valid() - + # Convert to df if needed, export - if(return_sf == FALSE){ + if (return_sf == FALSE) { return( as.data.frame(tribal_area_sf) %>% sf::st_drop_geometry() @@ -958,7 +956,6 @@ TADA_TribalOptions <- function(tribal_area_type, return_sf = FALSE){ } else { return(tribal_area_sf) } - } #' Read in WQP data using the Water Quality Portal (WQP) web services @@ -1015,25 +1012,25 @@ TADA_ReadWQPWebServices <- function(webservice) { # read in csv from WQP web service if (grepl("zip=yes", webservice)) { webservice <- stringr::str_replace(webservice, "zip=yes", "zip=no") - + # download data webservice <- data.table::fread(toString(webservice)) - + # if input df was not downloaded using USGS's dataRetrieval, then the # column names will include / separators instead of . and TADA uses . # (e.g. 
ResultMeasure/MeasureUnitCode vs. ResultMeasure.MeasureUnitCode) colnames(webservice) <- gsub("/", ".", colnames(webservice)) - + return(webservice) } else { # download data webservice <- data.table::fread(toString(webservice)) - + # if input df was not downloaded using USGS's dataRetrieval, then the # column names will include / separators instead of . and TADA uses . # (e.g. ResultMeasure/MeasureUnitCode vs. ResultMeasure.MeasureUnitCode) colnames(webservice) <- gsub("/", ".", colnames(webservice)) - + return(webservice) } } @@ -1053,23 +1050,22 @@ TADA_ReadWQPWebServices <- function(webservice) { #' @param maxsites Maximum number of sites to query at once. #' #' @return TADA-compatible dataframe -TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsites = 300){ - +TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsites = 300) { # Get total number of results per site and separate out sites with >maxrecs results tot_sites <- record_summary %>% dplyr::group_by(MonitoringLocationIdentifier) %>% dplyr::summarise(tot_n = sum(resultCount)) %>% dplyr::filter(tot_n > 0) %>% dplyr::arrange(tot_n) - + # Sites with less than/equal to maxrecs smallsites <- tot_sites %>% dplyr::filter(tot_n <= maxrecs) # Sites with more than maxrecs bigsites <- tot_sites %>% dplyr::filter(tot_n > maxrecs) - + df_small <- data.frame() df_big <- data.frame() - + # Work with small sites first: # Build download groups. Total record count limited to value of maxrecs. # Number of sites per download group limited to 300. @@ -1080,20 +1076,24 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi threshold = maxrecs, maxgroupsize = 300 )) - + # Status update to user print( - paste0("Downloading data from sites with fewer than ", - maxrecs, - " results by grouping them together.") + paste0( + "Downloading data from sites with fewer than ", + maxrecs, + " results by grouping them together." 
+ ) ) - + small_prog_bar <- txtProgressBar(min = 0, max = sum(smallsites$tot_n), style = 3) - + # Download the data for each group for (i in 1:max(smallsitesgrp$group)) { - small_site_chunk <- subset(smallsitesgrp$MonitoringLocationIdentifier, - smallsitesgrp$group == i) + small_site_chunk <- subset( + smallsitesgrp$MonitoringLocationIdentifier, + smallsitesgrp$group == i + ) # Query result data results_small <- suppressMessages( dataRetrieval::readWQPdata( @@ -1104,35 +1104,37 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi ) ) %>% dplyr::mutate(dplyr::across(everything(), as.character)) - + # If data is returned, stack with what's already been retrieved if (dim(results_small)[1] > 0) { df_small <- dplyr::bind_rows(df_small, results_small) } - + # Update progress setTxtProgressBar(pb = small_prog_bar, value = nrow(df_small)) } # Close progress bar when complete close(small_prog_bar) - + rm(smallsites, smallsitesgrp) gc() } - + # Large sites (>= maxrecs) next: if (dim(bigsites)[1] > 0) { print( - paste0("Downloading data from sites with greater than ", - maxrecs, - " results, chunking queries by site.") + paste0( + "Downloading data from sites with greater than ", + maxrecs, + " results, chunking queries by site." 
+ ) ) - + big_prog_bar <- txtProgressBar(min = 0, max = sum(bigsites$tot_n), style = 3) - + # Unique site IDs bsitesvec <- unique(bigsites$MonitoringLocationIdentifier) - + # For each site for (i in 1:length(bsitesvec)) { # Download each site's data individually @@ -1143,9 +1145,9 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi dataProfile = "resultPhysChem", ignore_attributes = TRUE ) - )%>% + ) %>% dplyr::mutate(dplyr::across(everything(), as.character)) - + if (dim(results_big)[1] > 0) { df_big <- dplyr::bind_rows(df_big, results_big) } @@ -1154,14 +1156,14 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi } # Close progress bar when complete close(big_prog_bar) - + rm(bigsites) gc() } - - + + df_out <- dplyr::bind_rows(df_small, df_big) - + return(df_out) } @@ -1248,11 +1250,11 @@ TADA_BigDataRetrieval <- function(startDate = "null", maxrecs = 250000, applyautoclean = FALSE) { start_T <- Sys.time() - + if (!"null" %in% statecode & !"null" %in% huc) { stop("Please provide either state code(s) OR huc(s) to proceed.") } - + if (!startDate == "null") { startDat <- lubridate::ymd(startDate) startYearLo <- lubridate::year(startDat) @@ -1261,7 +1263,7 @@ TADA_BigDataRetrieval <- function(startDate = "null", startDat <- lubridate::ymd(startDate) startYearLo <- lubridate::year(startDat) } - + # Logic: if the input endDate is not null, convert to date and obtain year # for summary if (!endDate == "null") { @@ -1272,7 +1274,7 @@ TADA_BigDataRetrieval <- function(startDate = "null", endDat <- lubridate::ymd(endDate) endYearHi <- lubridate::year(endDat) } - + # Create readWQPsummary query WQPquery <- list() if (length(characteristicName) > 1) { @@ -1290,7 +1292,7 @@ TADA_BigDataRetrieval <- function(startDate = "null", } else if (siteType != "null") { WQPquery <- c(WQPquery, siteType = siteType) } - + if (!"null" %in% statecode) { load(system.file("extdata", "statecodes_df.Rdata", package = "EPATADA")) 
statecode <- as.character(statecode) @@ -1308,45 +1310,45 @@ TADA_BigDataRetrieval <- function(startDate = "null", WQPquery <- c(WQPquery, statecode = statecd) } } - + if (length(huc) > 1) { WQPquery <- c(WQPquery, huc = list(huc)) } else if (huc != "null") { WQPquery <- c(WQPquery, huc = huc) } - + if (length(countrycode) > 1) { WQPquery <- c(WQPquery, countrycode = list(countrycode)) } else if (countrycode != "null") { WQPquery <- c(WQPquery, countrycode = countrycode) } - + if (length(countycode) > 1) { WQPquery <- c(WQPquery, countycode = list(countycode)) } else if (countycode != "null") { WQPquery <- c(WQPquery, countycode = countycode) } - + if (length(organization) > 1) { WQPquery <- c(WQPquery, organization = list(organization)) } else if (organization != "null") { WQPquery <- c(WQPquery, organization = organization) } - + # cut down on summary query time if possible based on big data query diffdat <- lubridate::time_length(difftime(Sys.Date(), startDat), "years") - + if (diffdat <= 1) { WQPquery <- c(WQPquery, summaryYears = 1) } - + if (diffdat > 1 & diffdat <= 5) { WQPquery <- c(WQPquery, summaryYears = 5) } - + print("Building site summary table for chunking result downloads...") df_summary <- dataRetrieval::readWQPsummary(WQPquery) - + ## NOTE: if query brings back no results, function returns empty # dataRetrieval profile, not empty summary if (nrow(df_summary) > 0) { @@ -1356,13 +1358,10 @@ TADA_BigDataRetrieval <- function(startDate = "null", YearSummarized >= startYearLo, YearSummarized <= endYearHi ) - + rm(df_summary) # if there are still site records when filtered to years of interest.... 
if (dim(sites)[1] > 0) { - - - # get total number of results per site and separate out sites with >250000 results tot_sites <- sites %>% dplyr::group_by(MonitoringLocationIdentifier) %>% @@ -1370,14 +1369,14 @@ TADA_BigDataRetrieval <- function(startDate = "null", dplyr::arrange(tot_n) smallsites <- tot_sites %>% dplyr::filter(tot_n <= maxrecs) bigsites <- tot_sites %>% dplyr::filter(tot_n > maxrecs) - + df <- data.frame() - + if (dim(smallsites)[1] > 0) { smallsitesgrp <- make_groups(smallsites, maxrecs) - + print(paste0("Downloading data from sites with fewer than ", maxrecs, " results by grouping them together.")) - + for (i in 1:max(smallsitesgrp$group)) { site_chunk <- subset(smallsitesgrp$MonitoringLocationIdentifier, smallsitesgrp$group == i) joins <- TADA_DataRetrieval( @@ -1393,29 +1392,29 @@ TADA_BigDataRetrieval <- function(startDate = "null", df <- dplyr::bind_rows(df, joins) } } - + rm(smallsites, smallsitesgrp) } - + if (dim(bigsites)[1] > 0) { print(paste0("Downloading data from sites with greater than ", maxrecs, " results, chunking queries by shorter time intervals...")) - + bsitesvec <- unique(bigsites$MonitoringLocationIdentifier) - + for (i in 1:length(bsitesvec)) { mlidsum <- subset(sites, sites$MonitoringLocationIdentifier == bsitesvec[i]) mlidsum <- mlidsum %>% dplyr::group_by(MonitoringLocationIdentifier, YearSummarized) %>% dplyr::summarise(tot_n = sum(ResultCount)) site_chunk <- unique(mlidsum$MonitoringLocationIdentifier) - + bigsitegrps <- make_groups(mlidsum, maxrecs) - + for (i in 1:max(bigsitegrps$group)) { yearchunk <- subset(bigsitegrps$YearSummarized, bigsitegrps$group == i) startD <- paste0(min(yearchunk), "-01-01") endD <- paste0(max(yearchunk), "-12-31") - + joins <- TADA_DataRetrieval( startDate = startD, endDate = endD, @@ -1425,7 +1424,7 @@ TADA_BigDataRetrieval <- function(startDate = "null", sampleMedia = sampleMedia, applyautoclean = FALSE ) - + if (dim(joins)[1] > 0) { df <- dplyr::bind_rows(df, joins) } @@ -1441,18 
+1440,18 @@ TADA_BigDataRetrieval <- function(startDate = "null", warning("Query returned no data. Function returns an empty dataframe.") return(df_summary) } - + df <- subset(df, as.Date(df$ActivityStartDate, "%Y-%m-%d") >= startDat & as.Date(df$ActivityStartDate, "%Y-%m-%d") <= endDat) - + if (applyautoclean == TRUE) { print("Applying TADA_AutoClean function...") df <- TADA_AutoClean(df) } - + # timing function for efficiency tests. difference <- difftime(Sys.time(), start_T, units = "mins") print(difference) - + return(df) } @@ -1489,20 +1488,20 @@ TADA_JoinWQPProfiles <- function(FullPhysChem = "null", Sites = "null", Projects = "null") { FullPhysChem.df <- FullPhysChem - + Sites.df <- Sites - + Projects.df <- Projects - + # Join station data to full phys/chem (FullPhysChem.df) if (length(Sites.df > 1)) { if (nrow(Sites.df) > 0) { join1 <- FullPhysChem.df %>% # join stations to results dplyr::left_join(Sites.df, - by = "MonitoringLocationIdentifier", - multiple = "all", - relationship = "many-to-many" + by = "MonitoringLocationIdentifier", + multiple = "all", + relationship = "many-to-many" ) %>% # remove ".x" suffix from column names dplyr::rename_at(dplyr::vars(dplyr::ends_with(".x")), ~ stringr::str_replace(., "\\..$", "")) %>% @@ -1514,8 +1513,8 @@ TADA_JoinWQPProfiles <- function(FullPhysChem = "null", } else { join1 <- FullPhysChem.df } - - + + # Add QAPP columns from project if (length(Projects.df) > 1) { if (nrow(Projects.df) > 0) { @@ -1541,20 +1540,19 @@ TADA_JoinWQPProfiles <- function(FullPhysChem = "null", } else { join2 <- join1 } - + return(join2) } #' Ask user to approve WQP downloads -#' +#' #' Once record counts have been retrieved from the Water Quality Portal (WQP) for #' a query, this function is used to prompt the user to decide (i.e., "yes"/"no") #' whether the download should proceed. The user is also reminded of the limits of #' Microsoft Excel for row counts as a comparison. 
-#' +#' #' @param n_records A numeric value indicating the number of records that will be downloaded from the WQP if the user decides to proceed. -ask_user <- function(n_records){ - +ask_user <- function(n_records) { # Text to show user user_prompt <- cat( "Your WQP query will return ", @@ -1563,9 +1561,9 @@ ask_user <- function(n_records){ "Would you like to continue with the download? [yes/no] ", sep = "" ) - + # Ask user if they want to continue & check for valid response - while(TRUE){ + while (TRUE) { user_input <- readline(prompt = user_prompt) # Convert response to lower and no whitespace user_input <- tolower(trimws(user_input)) @@ -1575,7 +1573,6 @@ ask_user <- function(n_records){ cat("Invalid input. Please enter 'yes' or 'no'.\n") } } - } @@ -1601,9 +1598,9 @@ make_groups <- function(x, maxrecs) { i <- i + 1 groupings <- plyr::rbind.fill(groupings, group) } - + x$group <- i - + groupings <- plyr::rbind.fill(groupings, x) } return(groupings) diff --git a/R/Maintenance.R b/R/Maintenance.R index 428f0a5f..b5f047fa 100644 --- a/R/Maintenance.R +++ b/R/Maintenance.R @@ -304,7 +304,7 @@ TADA_UpdateExampleData <- function() { # Type == "CharacteristicUnit", # Status == "Accepted" # ) -# +# # # find Characteristic/Source/Value.Unit combinations with more than one row # find.dups <- unit.ref %>% # dplyr::filter(Type == "CharacteristicUnit") %>% @@ -313,14 +313,12 @@ TADA_UpdateExampleData <- function() { # Max_n = length(unique(Maximum))) %>% # dplyr::filter(Min_n > 1 | # Max_n > 1) -# +# # # create download path # download.path <- file.path(Sys.getenv("USERPROFILE"), "Downloads", "WQXcharValRef_multiples.csv") -# +# # # create csv to send to WQX team and save in test results folder # readr::write_csv(find.dups, download.path) # # # review csv and send to WQX team to update the validation table # - - diff --git a/tests/testthat/test-DataDiscoveryRetrieval.R b/tests/testthat/test-DataDiscoveryRetrieval.R index bd96696a..8c2cb9f2 100644 --- 
a/tests/testthat/test-DataDiscoveryRetrieval.R +++ b/tests/testthat/test-DataDiscoveryRetrieval.R @@ -418,7 +418,7 @@ test_that("TADA_DataRetrieval", { # test_that("Reg&BigdataRetrieval", { # # big <- TADA_BigDataRetrieval(characteristicName = "Algae, substrate rock/bank cover (choice list)", sampleMedia = "Water", siteType = "Stream", startDate = "2020-01-01", applyautoclean = FALSE) # reg <- TADA_DataRetrieval(characteristicName = "Algae, substrate rock/bank cover (choice list)", sampleMedia = "Water", siteType = "Stream", startDate = "2020-01-01", applyautoclean = FALSE, ask = FALSE) -# +# # expect_equal(nrow(big), nrow(reg)) # }) @@ -428,7 +428,7 @@ test_that("TADA_DataRetrieval", { # endDate <- "2021-09-30" # # big <- TADA_BigDataRetrieval(startDate = startDate, endDate = endDate, huc = c("04030202", "04030201"), characteristicName = "Escherichia coli", siteType = "Stream") # logic <- min(big$ActivityStartDate) >= as.Date(startDate, format = "%Y-%m-%d") & max(big$ActivityStartDate) <= as.Date(endDate, format = "%Y-%m-%d") -# +# # expect_true(logic) # }) diff --git a/tests/testthat/test-ResultFlagsIndependent.R b/tests/testthat/test-ResultFlagsIndependent.R index 9cad1019..4782e61c 100644 --- a/tests/testthat/test-ResultFlagsIndependent.R +++ b/tests/testthat/test-ResultFlagsIndependent.R @@ -123,36 +123,36 @@ test_that("TADA_FindPotentialDuplicatsMultipleOrgs has non-NA values for each ro }) test_that("WQXcharValRef.csv contains only one row for each unique characteristic/source/unit combination for threshold functions", { - unit.ref <- utils::read.csv(system.file("extdata", "WQXcharValRef.csv", package = "EPATADA")) %>% - dplyr::filter( - Type == "CharacteristicUnit", - Status == "Accepted" - ) - + unit.ref <- utils::read.csv(system.file("extdata", "WQXcharValRef.csv", package = "EPATADA")) %>% + dplyr::filter( + Type == "CharacteristicUnit", + Status == "Accepted" + ) + find.dups <- unit.ref %>% dplyr::filter(Type == "CharacteristicUnit") %>% dplyr::group_by(Characteristic, Source, Value.Unit) %>% - dplyr::mutate(Min_n = length(unique(Minimum)), - Max_n =
length(unique(Maximum))) %>% + dplyr::mutate( + Min_n = length(unique(Minimum)), + Max_n = length(unique(Maximum)) + ) %>% dplyr::filter(Min_n > 1 | - Max_n > 1) - + Max_n > 1) + expect_true(nrow(find.dups) == 0) - }) +}) test_that("range flag functions work", { # use random data upper <- TADA_RandomTestingData() - + expect_no_error(TADA_FlagAboveThreshold(upper)) - expect_no_warning(TADA_FlagAboveThreshold(upper)) + expect_no_warning(TADA_FlagAboveThreshold(upper)) expect_no_message(TADA_FlagAboveThreshold(upper)) expect_no_condition(TADA_FlagAboveThreshold(upper)) - + expect_no_error(TADA_FlagBelowThreshold(upper)) - expect_no_warning(TADA_FlagBelowThreshold(upper)) + expect_no_warning(TADA_FlagBelowThreshold(upper)) expect_no_message(TADA_FlagBelowThreshold(upper)) expect_no_condition(TADA_FlagBelowThreshold(upper)) - }) From 8ab88578692e2ccbd7561a0759174d383779cdf5 Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Thu, 23 Jan 2025 08:40:03 -0800 Subject: [PATCH 22/35] Example housekeeping --- R/CriteriaComparison.R | 3 +- R/DataDiscoveryRetrieval.R | 29 ++++++++++++------- R/GeospatialFunctions.R | 15 ++++++---- R/ResultFlagsIndependent.R | 4 +-- R/Utilities.R | 6 ++-- man/TADA_DataRetrieval.Rd | 29 ++++++++++++------- ...ADA_FindPotentialDuplicatesMultipleOrgs.Rd | 2 +- man/TADA_FlagContinuousData.Rd | 2 +- man/TADA_GetATTAINS.Rd | 3 +- man/TADA_MakeSpatial.Rd | 3 +- man/TADA_PairForCriteriaCalc.Rd | 3 +- man/TADA_SubstituteDeprecatedChars.Rd | 6 ++-- man/TADA_ViewATTAINS.Rd | 3 +- man/fetchATTAINS.Rd | 3 +- man/fetchNHD.Rd | 3 +- 15 files changed, 70 insertions(+), 44 deletions(-) diff --git a/R/CriteriaComparison.R b/R/CriteriaComparison.R index 82634975..792dbf30 100644 --- a/R/CriteriaComparison.R +++ b/R/CriteriaComparison.R @@ -207,7 +207,8 @@ TADA_CreatePairRef <- function(.data, ph = TRUE, hardness = TRUE, temp = TRUE, #' AL_df <- TADA_DataRetrieval( #' startDate = "2010-11-30", #' endDate = "2010-12-01", 
-#' statecode = "AL" +#' statecode = "AL", +#' ask = FALSE #' ) #' #' AL_PairRef <- TADA_PairForCriteriaCalc(AL_df) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index 67606762..6688cd8c 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -73,24 +73,26 @@ #' # example for WI #' tada1 <- TADA_DataRetrieval( #' statecode = "WI", countycode = "Dane", -#' characteristicName = "Phosphorus" +#' characteristicName = "Phosphorus", +#' ask = FALSE #' ) #' #' # example for UT #' tada2 <- TADA_DataRetrieval( #' statecode = "UT", -#' characteristicName = c("Ammonia", "Nitrate", "Nitrogen") +#' characteristicName = c("Ammonia", "Nitrate", "Nitrogen"), +#' ask = FALSE #' ) #' #' # example for SC -#' tada3 <- TADA_DataRetrieval(statecode = "SC", countycode = "Abbeville") +#' tada3 <- TADA_DataRetrieval(statecode = "SC", countycode = "Abbeville", ask = FALSE) #' #' # example for CT -#' tada4 <- TADA_DataRetrieval(statecode = "CT", startDate = "2020-10-01") +#' tada4 <- TADA_DataRetrieval(statecode = "CT", startDate = "2020-10-01", ask = FALSE) #' #' #' # note that countycode queries require a statecode (see example below) -#' tada5 <- TADA_DataRetrieval(countycode = "US:02:020") +#' tada5 <- TADA_DataRetrieval(countycode = "US:02:020", ask = FALSE) #' #' # example for NM #' tada6 <- TADA_DataRetrieval( @@ -100,11 +102,12 @@ #' "Nitrate", #' "Nitrogen" #' ), -#' startDate = "2020-05-01" +#' startDate = "2020-05-01", +#' ask = FALSE #' ) #' #' # example for AK project -#' tada7 <- TADA_DataRetrieval(project = "Anchorage Bacteria 20-21") +#' tada7 <- TADA_DataRetrieval(project = "Anchorage Bacteria 20-21", ask = FALSE) #' #' # another example for AK #' tada8 <- TADA_DataRetrieval( @@ -117,7 +120,8 @@ #' "Nitrate", #' "Nitrogen" #' ), -#' startDate = "2018-05-01" +#' startDate = "2018-05-01", +#' ask = FALSE #' ) #' #' # example for tribes @@ -160,7 +164,8 @@ #' "CHOCNATWQX", #' "WNENVDPT_WQX", #' "PUEBLO_POJOAQUE" -#' )) +#' ), +#' ask = 
FALSE) #' #' # query only NWIS data for a 10 year period in CT #' tada10 <- TADA_DataRetrieval( @@ -169,13 +174,15 @@ #' sampleMedia = c("Water", "water"), #' statecode = "CT", # consider downloading only 1 state at a time #' providers = "NWIS", -#' applyautoclean = FALSE +#' applyautoclean = FALSE, +#' ask = FALSE #' ) #' #' # query by country code (e.g. Canada, countrycode = "CA") #' tada11 <- TADA_DataRetrieval( #' startDate = "2015-01-01", -#' countrycode = "CA" +#' countrycode = "CA", +#' ask = FALSE #' ) #' } #' diff --git a/R/GeospatialFunctions.R b/R/GeospatialFunctions.R index 85f9a9d2..79cd11e9 100644 --- a/R/GeospatialFunctions.R +++ b/R/GeospatialFunctions.R @@ -20,7 +20,8 @@ #' characteristicName = "pH", #' statecode = "SC", #' countycode = "Abbeville", -#' applyautoclean = TRUE +#' applyautoclean = TRUE, +#' ask = FALSE #' ) #' #' # make `tada_not_spatial` an sf object, projected in crs = 4269 (NAD83) @@ -127,7 +128,8 @@ TADA_MakeSpatial <- function(.data, crs = 4326) { #' endDate = "1995-12-31", #' characteristicName = "pH", #' statecode = "NV", -#' applyautoclean = TRUE +#' applyautoclean = TRUE, +#' ask = FALSE #' ) #' #' nv_attains_features <- fetchATTAINS(tada_data, catchments_only = FALSE) @@ -476,7 +478,8 @@ fetchATTAINS <- function(.data, catchments_only = FALSE) { #' endDate = "1990-01-15", #' characteristicName = "pH", #' statecode = "CO", -#' applyautoclean = TRUE +#' applyautoclean = TRUE, +#' ask = FALSE #' ) #' #' nhd_data <- fetchNHD(.data = tada_data, resolution = "Hi", features = c("catchments", "waterbodies", "flowlines")) @@ -949,7 +952,8 @@ fetchNHD <- function(.data, resolution = "Hi", features = "catchments") { #' endDate = "2018-07-31", #' characteristicName = "pH", #' statecode = "IL", -#' applyautoclean = TRUE +#' applyautoclean = TRUE, +#' ask = FALSE #' ) #' #' # note: these example ATTAINS data retrieval queries below may take a long time (10+ minutes) to run @@ -1373,7 +1377,8 @@ TADA_GetATTAINS <- function(.data, 
fill_catchments = FALSE, resolution = "Hi", r #' endDate = "1995-12-31", #' characteristicName = "pH", #' statecode = "NV", -#' applyautoclean = TRUE +#' applyautoclean = TRUE, +#' ask = FALSE #' ) #' #' attains_data <- TADA_GetATTAINS(tada_data, return_sf = TRUE) diff --git a/R/ResultFlagsIndependent.R b/R/ResultFlagsIndependent.R index 04c9f2c4..61c00580 100644 --- a/R/ResultFlagsIndependent.R +++ b/R/ResultFlagsIndependent.R @@ -171,7 +171,7 @@ TADA_FlagMethod <- function(.data, clean = TRUE, flaggedonly = FALSE) { #' #' @examples #' \dontrun{ -#' all_data <- TADA_DataRetrieval(project = c("Continuous LC1", "MA_Continuous", "Anchorage Bacteria 20-21")) +#' all_data <- TADA_DataRetrieval(project = c("Continuous LC1", "MA_Continuous", "Anchorage Bacteria 20-21"), ask = FALSE) #' #' # Flag continuous data in new column titled "TADA.ContinuousData.Flag" #' all_data_flags <- TADA_FlagContinuousData(all_data, clean = FALSE) @@ -1184,7 +1184,7 @@ TADA_FlagCoordinates <- function(.data, #' @examples #' \dontrun{ #' # Load dataset -#' dat <- TADA_DataRetrieval(startDate = "2022-09-01", endDate = "2023-05-01", statecode = "PA", sampleMedia = "Water") +#' dat <- TADA_DataRetrieval(startDate = "2022-09-01", endDate = "2023-05-01", statecode = "PA", sampleMedia = "Water", ask = FALSE) #' unique(dat$OrganizationIdentifier) #' # If duplicates across organizations exist, pick the result belonging to "21PA_WQX" if available. #' dat1 <- TADA_FindPotentialDuplicatesMultipleOrgs(dat, dist_buffer = 100, org_hierarchy = c("21PA_WQX")) diff --git a/R/Utilities.R b/R/Utilities.R index efa91f3b..45b139e1 100644 --- a/R/Utilities.R +++ b/R/Utilities.R @@ -627,7 +627,7 @@ TADA_ConvertSpecialChars <- function(.data, col, percent.ave = TRUE) { #' This function uses the WQX Characteristic domain table to substitute #' deprecated (i.e. retired and/or suspect) Characteristic Names with the new #' name in the TADA.CharacteristicName column. 
TADA_SubstituteDeprecatedChars is -#' run within TADA_AutoClean, which runs within TADA_DataRetreival and (if autoclean = TRUE) +#' run within TADA_AutoClean, which runs within TADA_DataRetrieval and (if autoclean = TRUE) #' in TADA_BigDataRetrieval. Therefore, deprecated characteristic names are #' harmonized to the new name automatically upon data retrieval. #' TADA_SubstituteDeprecatedChars can also be used by itself on a user supplied @@ -648,7 +648,7 @@ TADA_ConvertSpecialChars <- function(.data, col, percent.ave = TRUE) { #' @examples #' \dontrun{ #' # download nutrient data in MT from 2022 and set autoclean = FALSE -#' df <- TADA_DataRetrieval(startDate = "2022-01-01", endDate = "2022-12-31", characteristicType = "Nutrient", statecode = "MT", applyautoclean = FALSE) +#' df <- TADA_DataRetrieval(startDate = "2022-01-01", endDate = "2022-12-31", characteristicType = "Nutrient", statecode = "MT", applyautoclean = FALSE, ask = FALSE) #' df2 <- TADA_SubstituteDeprecatedChars(df) #' # in this example, "Inorganic nitrogen (nitrate and nitrite)" is a USGS NWIS characteristic that is #' # deprecated and "Phosphate-phosphorus***retired***use Total Phosphorus, mixed forms" is a deprecated WQX @@ -657,7 +657,7 @@ TADA_ConvertSpecialChars <- function(.data, col, percent.ave = TRUE) { #' unique(df2$CharacteristicName) #' unique(df2$TADA.CharacteristicName) #' -#' df3 <- TADA_DataRetrieval(startDate = "2022-01-01", endDate = "2022-12-31", characteristicType = "Nutrient", statecode = "WY", applyautoclean = FALSE) +#' df3 <- TADA_DataRetrieval(startDate = "2022-01-01", endDate = "2022-12-31", characteristicType = "Nutrient", statecode = "WY", applyautoclean = FALSE, ask = FALSE) #' df4 <- TADA_SubstituteDeprecatedChars(df3) #' unique(df4$CharacteristicName) #' unique(df4$TADA.CharacteristicName) diff --git a/man/TADA_DataRetrieval.Rd b/man/TADA_DataRetrieval.Rd index bcc02ad1..9701cf0c 100644 --- a/man/TADA_DataRetrieval.Rd +++ b/man/TADA_DataRetrieval.Rd @@ -121,24 +121,26 
@@ errors when used with this function. # example for WI tada1 <- TADA_DataRetrieval( statecode = "WI", countycode = "Dane", - characteristicName = "Phosphorus" + characteristicName = "Phosphorus", + ask = FALSE ) # example for UT tada2 <- TADA_DataRetrieval( statecode = "UT", - characteristicName = c("Ammonia", "Nitrate", "Nitrogen") + characteristicName = c("Ammonia", "Nitrate", "Nitrogen"), + ask = FALSE ) # example for SC -tada3 <- TADA_DataRetrieval(statecode = "SC", countycode = "Abbeville") +tada3 <- TADA_DataRetrieval(statecode = "SC", countycode = "Abbeville", ask = FALSE) # example for CT -tada4 <- TADA_DataRetrieval(statecode = "CT", startDate = "2020-10-01") +tada4 <- TADA_DataRetrieval(statecode = "CT", startDate = "2020-10-01", ask = FALSE) # note that countycode queries require a statecode (see example below) -tada5 <- TADA_DataRetrieval(countycode = "US:02:020") +tada5 <- TADA_DataRetrieval(countycode = "US:02:020", ask = FALSE) # example for NM tada6 <- TADA_DataRetrieval( @@ -148,11 +150,12 @@ tada6 <- TADA_DataRetrieval( "Nitrate", "Nitrogen" ), - startDate = "2020-05-01" + startDate = "2020-05-01", + ask = FALSE ) # example for AK project -tada7 <- TADA_DataRetrieval(project = "Anchorage Bacteria 20-21") +tada7 <- TADA_DataRetrieval(project = "Anchorage Bacteria 20-21", ask = FALSE) # another example for AK tada8 <- TADA_DataRetrieval( @@ -165,7 +168,8 @@ tada8 <- TADA_DataRetrieval( "Nitrate", "Nitrogen" ), - startDate = "2018-05-01" + startDate = "2018-05-01", + ask = FALSE ) # example for tribes @@ -208,7 +212,8 @@ tada9 <- TADA_DataRetrieval(organization = c( "CHOCNATWQX", "WNENVDPT_WQX", "PUEBLO_POJOAQUE" -)) +), +ask = FALSE) # query only NWIS data for a 10 year period in CT tada10 <- TADA_DataRetrieval( @@ -217,13 +222,15 @@ tada10 <- TADA_DataRetrieval( sampleMedia = c("Water", "water"), statecode = "CT", # consider downloading only 1 state at a time providers = "NWIS", - applyautoclean = FALSE + applyautoclean = FALSE, + ask = FALSE ) # 
query by country code (e.g. Canada, countrycode = "CA") tada11 <- TADA_DataRetrieval( startDate = "2015-01-01", - countrycode = "CA" + countrycode = "CA", + ask = FALSE ) } diff --git a/man/TADA_FindPotentialDuplicatesMultipleOrgs.Rd b/man/TADA_FindPotentialDuplicatesMultipleOrgs.Rd index 8130dcc1..f30abeda 100644 --- a/man/TADA_FindPotentialDuplicatesMultipleOrgs.Rd +++ b/man/TADA_FindPotentialDuplicatesMultipleOrgs.Rd @@ -53,7 +53,7 @@ TADA_FindPotentialDuplicatesSingleOrg. \examples{ \dontrun{ # Load dataset -dat <- TADA_DataRetrieval(startDate = "2022-09-01", endDate = "2023-05-01", statecode = "PA", sampleMedia = "Water") +dat <- TADA_DataRetrieval(startDate = "2022-09-01", endDate = "2023-05-01", statecode = "PA", sampleMedia = "Water", ask = FALSE) unique(dat$OrganizationIdentifier) # If duplicates across organizations exist, pick the result belonging to "21PA_WQX" if available. dat1 <- TADA_FindPotentialDuplicatesMultipleOrgs(dat, dist_buffer = 100, org_hierarchy = c("21PA_WQX")) diff --git a/man/TADA_FlagContinuousData.Rd b/man/TADA_FlagContinuousData.Rd index c4488a86..c0de68ab 100644 --- a/man/TADA_FlagContinuousData.Rd +++ b/man/TADA_FlagContinuousData.Rd @@ -55,7 +55,7 @@ attachment at the activity level. 
} \examples{ \dontrun{ -all_data <- TADA_DataRetrieval(project = c("Continuous LC1", "MA_Continuous", "Anchorage Bacteria 20-21")) +all_data <- TADA_DataRetrieval(project = c("Continuous LC1", "MA_Continuous", "Anchorage Bacteria 20-21"), ask = FALSE) # Flag continuous data in new column titled "TADA.ContinuousData.Flag" all_data_flags <- TADA_FlagContinuousData(all_data, clean = FALSE) diff --git a/man/TADA_GetATTAINS.Rd b/man/TADA_GetATTAINS.Rd index 4aaa743d..341d17d6 100644 --- a/man/TADA_GetATTAINS.Rd +++ b/man/TADA_GetATTAINS.Rd @@ -40,7 +40,8 @@ tada_data <- TADA_DataRetrieval( endDate = "2018-07-31", characteristicName = "pH", statecode = "IL", - applyautoclean = TRUE + applyautoclean = TRUE, + ask = FALSE ) # note: these example ATTAINS data retrieval queries below may take a long time (10+ minutes) to run diff --git a/man/TADA_MakeSpatial.Rd b/man/TADA_MakeSpatial.Rd index 54e9cb04..0a0735f1 100644 --- a/man/TADA_MakeSpatial.Rd +++ b/man/TADA_MakeSpatial.Rd @@ -27,7 +27,8 @@ tada_not_spatial <- TADA_DataRetrieval( characteristicName = "pH", statecode = "SC", countycode = "Abbeville", - applyautoclean = TRUE + applyautoclean = TRUE, + ask = FALSE ) # make `tada_not_spatial` an sf object, projected in crs = 4269 (NAD83) diff --git a/man/TADA_PairForCriteriaCalc.Rd b/man/TADA_PairForCriteriaCalc.Rd index 83e893ba..83520a48 100644 --- a/man/TADA_PairForCriteriaCalc.Rd +++ b/man/TADA_PairForCriteriaCalc.Rd @@ -36,7 +36,8 @@ combinations for hardness, pH, temperature, salinity and chloride will be used. AL_df <- TADA_DataRetrieval( startDate = "2010-11-30", endDate = "2010-12-01", - statecode = "AL" + statecode = "AL", + ask = FALSE ) AL_PairRef <- TADA_PairForCriteriaCalc(AL_df) diff --git a/man/TADA_SubstituteDeprecatedChars.Rd b/man/TADA_SubstituteDeprecatedChars.Rd index a53a3aa9..e5b06176 100644 --- a/man/TADA_SubstituteDeprecatedChars.Rd +++ b/man/TADA_SubstituteDeprecatedChars.Rd @@ -17,7 +17,7 @@ TADA.CharacteristicName column. 
Original columns are unchanged. This function uses the WQX Characteristic domain table to substitute deprecated (i.e. retired and/or suspect) Characteristic Names with the new name in the TADA.CharacteristicName column. TADA_SubstituteDeprecatedChars is -run within TADA_AutoClean, which runs within TADA_DataRetreival and (if autoclean = TRUE) +run within TADA_AutoClean, which runs within TADA_DataRetrieval and (if autoclean = TRUE) in TADA_BigDataRetrieval. Therefore, deprecated characteristic names are harmonized to the new name automatically upon data retrieval. TADA_SubstituteDeprecatedChars can also be used by itself on a user supplied @@ -32,7 +32,7 @@ deprecated names (Char_Flag). This can be used as a crosswalk between the deprec \examples{ \dontrun{ # download nutrient data in MT from 2022 and set autoclean = FALSE -df <- TADA_DataRetrieval(startDate = "2022-01-01", endDate = "2022-12-31", characteristicType = "Nutrient", statecode = "MT", applyautoclean = FALSE) +df <- TADA_DataRetrieval(startDate = "2022-01-01", endDate = "2022-12-31", characteristicType = "Nutrient", statecode = "MT", applyautoclean = FALSE, ask = FALSE) df2 <- TADA_SubstituteDeprecatedChars(df) # in this example, "Inorganic nitrogen (nitrate and nitrite)" is a USGS NWIS characteristic that is # deprecated and "Phosphate-phosphorus***retired***use Total Phosphorus, mixed forms" is a deprecated WQX @@ -41,7 +41,7 @@ df2 <- TADA_SubstituteDeprecatedChars(df) unique(df2$CharacteristicName) unique(df2$TADA.CharacteristicName) -df3 <- TADA_DataRetrieval(startDate = "2022-01-01", endDate = "2022-12-31", characteristicType = "Nutrient", statecode = "WY", applyautoclean = FALSE) +df3 <- TADA_DataRetrieval(startDate = "2022-01-01", endDate = "2022-12-31", characteristicType = "Nutrient", statecode = "WY", applyautoclean = FALSE, ask = FALSE) df4 <- TADA_SubstituteDeprecatedChars(df3) unique(df4$CharacteristicName) unique(df4$TADA.CharacteristicName) diff --git a/man/TADA_ViewATTAINS.Rd 
b/man/TADA_ViewATTAINS.Rd index 05f62503..adee99e9 100644 --- a/man/TADA_ViewATTAINS.Rd +++ b/man/TADA_ViewATTAINS.Rd @@ -29,7 +29,8 @@ tada_data <- TADA_DataRetrieval( endDate = "1995-12-31", characteristicName = "pH", statecode = "NV", - applyautoclean = TRUE + applyautoclean = TRUE, + ask = FALSE ) attains_data <- TADA_GetATTAINS(tada_data, return_sf = TRUE) diff --git a/man/fetchATTAINS.Rd b/man/fetchATTAINS.Rd index 802b055a..959fa750 100644 --- a/man/fetchATTAINS.Rd +++ b/man/fetchATTAINS.Rd @@ -24,7 +24,8 @@ tada_data <- TADA_DataRetrieval( endDate = "1995-12-31", characteristicName = "pH", statecode = "NV", - applyautoclean = TRUE + applyautoclean = TRUE, + ask = FALSE ) nv_attains_features <- fetchATTAINS(tada_data, catchments_only = FALSE) diff --git a/man/fetchNHD.Rd b/man/fetchNHD.Rd index e431af6e..9849185a 100644 --- a/man/fetchNHD.Rd +++ b/man/fetchNHD.Rd @@ -26,7 +26,8 @@ tada_data <- TADA_DataRetrieval( endDate = "1990-01-15", characteristicName = "pH", statecode = "CO", - applyautoclean = TRUE + applyautoclean = TRUE, + ask = FALSE ) nhd_data <- fetchNHD(.data = tada_data, resolution = "Hi", features = c("catchments", "waterbodies", "flowlines")) From dfd0b3c35f2533995e4ede558668b4f0b0ff9a6f Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Fri, 24 Jan 2025 10:20:27 -0800 Subject: [PATCH 23/35] Remove bigdataretrieval --- R/DataDiscoveryRetrieval.R | 289 ------------------------------------- 1 file changed, 289 deletions(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index 6688cd8c..b703efe1 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -1174,295 +1174,6 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi return(df_out) } -#' Large WQP data pulls using dataRetrieval -#' -#' This function does multiple synchronous data calls to the WQP -#' (waterqualitydata.us). 
It uses the WQP summary service to limit the amount -#' downloaded to only relevant data (based on user query), pulls back data for -#' 250000 records at a time, and then joins the data back together to produce a -#' single TADA compatible dataframe as the output. For large data sets, that can save a lot -#' of time and ultimately reduce the complexity of subsequent data processing. -#' Using this function, you will be able to download all data available from all -#' sites in the contiguous United States available for the time period, -#' characteristicName, and siteType requested. Computer memory may limit the -#' size of data frames that your R console will be able to hold in one session. -#' Function requires a characteristicName, siteType, statecode, huc, or start/ -#' end date input. The recommendation is to be as specific as you can with your -#' large data call. The function allows the user to run TADA_AutoClean on the data frame, -#' but this is not the default as checking large dataframes for exact duplicate -#' rows can be time consuming and is better performed on its own once the query is -#' completed. -#' -#' Some code for this function was adapted from this USGS Blog (Author: Aliesha Krall) -#' \href{https://waterdata.usgs.gov/blog/large_sample_pull/}{Large Sample Pull} -#' -#' See ?TADA_AutoClean documentation for more information on this optional input. -#' -#' Note: TADA_BigDataRetrieval (by leveraging USGS's dataRetrieval), automatically converts -#' the date times to UTC. It also automatically converts the data to dates, -#' datetimes, numerics based on a standard algorithm. See: ?dataRetrieval::readWQPdata -#' -#' @param startDate Start Date string in the format YYYY-MM-DD, for example, "2020-01-01" -#' @param endDate End Date string in the format YYYY-MM-DD, for example, "2020-01-01" -#' @param countrycode Code that identifies a country or ocean (e.g. countrycode = "CA" for Canada, countrycode = "OA" for Atlantic Ocean). 
See https://www.waterqualitydata.us/Codes/countrycode for options. -#' @param statecode FIPS state alpha code that identifies a state (e.g. statecode = "DE" for Delaware). See https://www.waterqualitydata.us/Codes/statecode for options. -#' @param countycode FIPS county name. Note that a state code must also be supplied (e.g. statecode = "AL", countycode = "Chilton"). See https://www.waterqualitydata.us/Codes/countycode for options. -#' @param huc A numeric code denoting a hydrologic unit. Example: "04030202". Different size hucs can be entered. See https://epa.maps.arcgis.com/home/item.html?id=796992f4588c401fabec7446ecc7a5a3 for a map with HUCS. Click on a HUC to find the associated code. -#' @param siteid Unique monitoring location identifier. -#' @param siteType Type of waterbody. See https://www.waterqualitydata.us/Codes/sitetype for options. -#' @param characteristicName Name of parameter. See https://www.waterqualitydata.us/Codes/characteristicName for options. -#' @param characteristicType Groups of environmental measurements/parameters. See https://www.waterqualitydata.us/Codes/characteristicType for options. -#' @param sampleMedia Sampling substrate such as water, air, or sediment. See https://www.waterqualitydata.us/Codes/sampleMedia for options. -#' @param organization A string of letters and/or numbers (some additional characters also possible) used to signify an organization with data in the Water Quality Portal. See https://www.waterqualitydata.us/Codes/organization for options. -#' @param maxrecs The maximum number of results queried within one call to dataRetrieval. -#' @param applyautoclean Logical, defaults to FALSE. If TRUE, runs TADA_AutoClean function on the returned data profile. 
-#' -#' @return TADA-compatible dataframe -#' -#' @export -#' -#' @examples -#' \dontrun{ -#' # takes approx 3 mins to run -#' tada1 <- TADA_BigDataRetrieval(startDate = "2019-01-01", endDate = "2021-12-31", characteristicName = "Temperature, water", statecode = c("AK", "AL")) -#' -#' # takes approx 21 mins -#' tada2 <- TADA_BigDataRetrieval(startDate = "2016-10-01", endDate = "2022-09-30", statecode = "UT") -#' -#' # takes seconds to run -#' tada3 <- TADA_BigDataRetrieval(huc = "04030202", characteristicName = "Escherichia coli") -#' -#' # takes approx 3 mins to run -#' tada4 <- TADA_BigDataRetrieval(startDate = "2004-01-01", countrycode = "CA") -#' -#' # takes seconds to run -#' tada5 <- TADA_BigDataRetrieval(startDate = "2018-01-01", statecode = "AL", countycode = "Chilton") -#' -#' # takes seconds to run -#' tada6 <- TADA_BigDataRetrieval(organization = "PUEBLOOFTESUQUE") -#' } -#' -TADA_BigDataRetrieval <- function(startDate = "null", - endDate = "null", - countrycode = "null", - statecode = "null", - countycode = "null", - huc = "null", - siteid = "null", - siteType = "null", - characteristicName = "null", - characteristicType = "null", - sampleMedia = "null", - organization = "null", - maxrecs = 250000, - applyautoclean = FALSE) { - start_T <- Sys.time() - - if (!"null" %in% statecode & !"null" %in% huc) { - stop("Please provide either state code(s) OR huc(s) to proceed.") - } - - if (!startDate == "null") { - startDat <- lubridate::ymd(startDate) - startYearLo <- lubridate::year(startDat) - } else { # else: pick a date before which any data are unlikely to be in WQP - startDate <- "1800-01-01" - startDat <- lubridate::ymd(startDate) - startYearLo <- lubridate::year(startDat) - } - - # Logic: if the input endDate is not null, convert to date and obtain year - # for summary - if (!endDate == "null") { - endDat <- lubridate::ymd(endDate) - endYearHi <- lubridate::year(endDat) - } else { # else: if not populated, default to using today's date/year for summary - 
endDate <- as.character(Sys.Date()) - endDat <- lubridate::ymd(endDate) - endYearHi <- lubridate::year(endDat) - } - - # Create readWQPsummary query - WQPquery <- list() - if (length(characteristicName) > 1) { - WQPquery <- c(WQPquery, characteristicName = list(characteristicName)) - } else if (characteristicName != "null") { - WQPquery <- c(WQPquery, characteristicName = characteristicName) - } - if (length(characteristicType) > 1) { - WQPquery <- c(WQPquery, characteristicType = list(characteristicType)) - } else if (characteristicType != "null") { - WQPquery <- c(WQPquery, characteristicType = characteristicType) - } - if (length(siteType) > 1) { - WQPquery <- c(WQPquery, siteType = list(siteType)) - } else if (siteType != "null") { - WQPquery <- c(WQPquery, siteType = siteType) - } - - if (!"null" %in% statecode) { - load(system.file("extdata", "statecodes_df.Rdata", package = "EPATADA")) - statecode <- as.character(statecode) - statecodes_sub <- statecodes_df %>% dplyr::filter(STUSAB %in% statecode) - statecd <- paste0("US:", statecodes_sub$STATE) - if (nrow(statecodes_sub) == 0) { - stop("State code is not valid. 
Check FIPS state/territory abbreviations.") - } - if (length(statecode) > 1) { - for (i in 1:length(statecode)) { - WQPquery <- c(WQPquery, statecode = list(statecd)) - } - WQPquery <- c(WQPquery, statecode = list(statecd)) - } else { - WQPquery <- c(WQPquery, statecode = statecd) - } - } - - if (length(huc) > 1) { - WQPquery <- c(WQPquery, huc = list(huc)) - } else if (huc != "null") { - WQPquery <- c(WQPquery, huc = huc) - } - - if (length(countrycode) > 1) { - WQPquery <- c(WQPquery, countrycode = list(countrycode)) - } else if (countrycode != "null") { - WQPquery <- c(WQPquery, countrycode = countrycode) - } - - if (length(countycode) > 1) { - WQPquery <- c(WQPquery, countycode = list(countycode)) - } else if (countycode != "null") { - WQPquery <- c(WQPquery, countycode = countycode) - } - - if (length(organization) > 1) { - WQPquery <- c(WQPquery, organization = list(organization)) - } else if (organization != "null") { - WQPquery <- c(WQPquery, organization = organization) - } - - # cut down on summary query time if possible based on big data query - diffdat <- lubridate::time_length(difftime(Sys.Date(), startDat), "years") - - if (diffdat <= 1) { - WQPquery <- c(WQPquery, summaryYears = 1) - } - - if (diffdat > 1 & diffdat <= 5) { - WQPquery <- c(WQPquery, summaryYears = 5) - } - - print("Building site summary table for chunking result downloads...") - df_summary <- dataRetrieval::readWQPsummary(WQPquery) - - ## NOTE: if query brings back no results, function returns empty - # dataRetrieval profile, not empty summary - if (nrow(df_summary) > 0) { - # narrow down to years of interest from summary - sites <- df_summary %>% - dplyr::filter( - YearSummarized >= startYearLo, - YearSummarized <= endYearHi - ) - - rm(df_summary) - # if there are still site records when filtered to years of interest.... 
- if (dim(sites)[1] > 0) { - # get total number of results per site and separate out sites with >250000 results - tot_sites <- sites %>% - dplyr::group_by(MonitoringLocationIdentifier) %>% - dplyr::summarise(tot_n = sum(ResultCount)) %>% - dplyr::arrange(tot_n) - smallsites <- tot_sites %>% dplyr::filter(tot_n <= maxrecs) - bigsites <- tot_sites %>% dplyr::filter(tot_n > maxrecs) - - df <- data.frame() - - if (dim(smallsites)[1] > 0) { - smallsitesgrp <- make_groups(smallsites, maxrecs) - - print(paste0("Downloading data from sites with fewer than ", maxrecs, " results by grouping them together.")) - - for (i in 1:max(smallsitesgrp$group)) { - site_chunk <- subset(smallsitesgrp$MonitoringLocationIdentifier, smallsitesgrp$group == i) - joins <- TADA_DataRetrieval( - startDate = startDate, - endDate = endDate, - siteid = site_chunk, - characteristicName = characteristicName, - characteristicType = characteristicType, - sampleMedia = sampleMedia, - applyautoclean = FALSE - ) - if (dim(joins)[1] > 0) { - df <- dplyr::bind_rows(df, joins) - } - } - - rm(smallsites, smallsitesgrp) - } - - if (dim(bigsites)[1] > 0) { - print(paste0("Downloading data from sites with greater than ", maxrecs, " results, chunking queries by shorter time intervals...")) - - bsitesvec <- unique(bigsites$MonitoringLocationIdentifier) - - for (i in 1:length(bsitesvec)) { - mlidsum <- subset(sites, sites$MonitoringLocationIdentifier == bsitesvec[i]) - mlidsum <- mlidsum %>% - dplyr::group_by(MonitoringLocationIdentifier, YearSummarized) %>% - dplyr::summarise(tot_n = sum(ResultCount)) - site_chunk <- unique(mlidsum$MonitoringLocationIdentifier) - - bigsitegrps <- make_groups(mlidsum, maxrecs) - - for (i in 1:max(bigsitegrps$group)) { - yearchunk <- subset(bigsitegrps$YearSummarized, bigsitegrps$group == i) - startD <- paste0(min(yearchunk), "-01-01") - endD <- paste0(max(yearchunk), "-12-31") - - joins <- TADA_DataRetrieval( - startDate = startD, - endDate = endD, - siteid = site_chunk, - 
characteristicName = characteristicName, - characteristicType = characteristicType, - sampleMedia = sampleMedia, - applyautoclean = FALSE - ) - - if (dim(joins)[1] > 0) { - df <- dplyr::bind_rows(df, joins) - } - } - } - rm(bigsites, bigsitegrps) - } - } else { - warning("Query returned no data. Function returns an empty dataframe.") - return(sites) - } - } else { - warning("Query returned no data. Function returns an empty dataframe.") - return(df_summary) - } - - df <- subset(df, as.Date(df$ActivityStartDate, "%Y-%m-%d") >= startDat & as.Date(df$ActivityStartDate, "%Y-%m-%d") <= endDat) - - if (applyautoclean == TRUE) { - print("Applying TADA_AutoClean function...") - df <- TADA_AutoClean(df) - } - - # timing function for efficiency tests. - difference <- difftime(Sys.time(), start_T, units = "mins") - print(difference) - - return(df) -} - - #' Join WQP Profiles #' From abe653fbc296a447c0504523bbcd3c01064fe0e1 Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Fri, 24 Jan 2025 10:37:25 -0800 Subject: [PATCH 24/35] Apply suggestions from code review Co-authored-by: B Steele <32140074+steeleb@users.noreply.github.com> --- R/DataDiscoveryRetrieval.R | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index b703efe1..b8a87e44 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -339,7 +339,7 @@ TADA_DataRetrieval <- function(startDate = "null", # If tribe info is provided then grab the corresponding sf object: if (tribal_area_type != "null") { # Make a reference table for tribal area type + url matching - # (options that don't return results are commented out for now) + # (options that don't return results are commented out) map_service_urls <- tibble::tribble( ~tribal_area, ~url, "Alaska Native Allotments", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/0", @@ -364,11 +364,9 @@ TADA_DataRetrieval <- 
function(startDate = "null", # These area types allow filtering by TRIBE_NAME (unique within each type) if (tribal_area_type %in% c( - # "Alaska Native Villages", "American Indian Reservations", "Off-reservation Trust Lands", - "Oklahoma Tribal Statistical Areas" # , - # "Virginia Federally Recognized Tribes" + "Oklahoma Tribal Statistical Areas" ) ) { # Get the relevant url @@ -405,7 +403,7 @@ TADA_DataRetrieval <- function(startDate = "null", } } } else { - stop("Tribal area type not recognized. Refer to TADA_TribalOptions() for query options.") + stop("Tribal area type or tribal name parcel not recognized. Refer to TADA_TribalOptions() for query options.") } } @@ -957,7 +955,7 @@ TADA_TribalOptions <- function(tribal_area_type, return_sf = FALSE) { # Convert to df if needed, export if (return_sf == FALSE) { return( - as.data.frame(tribal_area_sf) %>% + tribal_area_sf %>% sf::st_drop_geometry() ) } else { From acab23e03e3f1244d77b68786938d888fb383b2e Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Fri, 24 Jan 2025 11:58:55 -0800 Subject: [PATCH 25/35] Documentation and messaging --- R/DataDiscoveryRetrieval.R | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index b8a87e44..fe06f0d0 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -46,8 +46,8 @@ #' @param huc A numeric code denoting a hydrologic unit. Example: "04030202". Different size hucs can be entered. See https://epa.maps.arcgis.com/home/item.html?id=796992f4588c401fabec7446ecc7a5a3 for a map with HUCS. Click on a HUC to find the associated code. #' @param siteid Unique monitoring location identifier. #' @param siteType Type of waterbody. See https://www.waterqualitydata.us/Codes/sitetype for options. 
-#' @param tribal_area_type One of the six tribal spatial layers: "Alaska Native Allotments", "Alaska Native Villages", "American Indian Reservations", "Off-reservation Trust Lands", "Oklahoma Tribal Statistical Areas", or "Virginia Federally Recognized Tribes". -#' @param tribe_name_parcel The name of a tribe corresponding to an entry in the TRIBE_NAME field of the specified tribal_area_type. OR if the type is Alaska Native Allotments" then the corresponding PARCEL_NO. +#' @param tribal_area_type One of four tribal spatial layers: "Alaska Native Allotments", "American Indian Reservations", "Off-reservation Trust Lands", or "Oklahoma Tribal Statistical Areas". More info in TADA_TribalOptions(). Note that "Alaska Native Villages" and "Virginia Federally Recognized Tribes" layers will not return a successful query. +#' @param tribe_name_parcel The name of a tribe corresponding to an entry in the TRIBE_NAME field of the specified tribal_area_type. OR if the type is "Alaska Native Allotments" then the corresponding PARCEL_NO. More info in TADA_TribalOptions(). #' @param characteristicName Name of parameter. See https://www.waterqualitydata.us/Codes/characteristicName for options. #' @param characteristicType Groups of environmental measurements/parameters. See https://www.waterqualitydata.us/Codes/characteristicType for options. #' @param sampleMedia Sampling substrate such as water, air, or sediment. See https://www.waterqualitydata.us/Codes/sampleMedia for options. 
@@ -254,7 +254,12 @@ TADA_DataRetrieval <- function(startDate = "null", ) } - # Insufficient tribal info provided + # Insufficient tribal info provided: + # Type but no parcel + if ((tribal_area_type != "null") & all(tribe_name_parcel == "null")) { + stop("A tribe_name_parcel is required if tribal_area_type is provided.") + } + # Parcel but no type if ((tribal_area_type == "null") & all(tribe_name_parcel != "null")) { stop("A tribal_area_type is required if tribe_name_parcel is provided.") } @@ -423,6 +428,8 @@ TADA_DataRetrieval <- function(startDate = "null", quiet_whatWQPdata <- purrr::quietly(dataRetrieval::whatWQPdata) # Try getting WQP info + message("Checking what data is available. This may take a moment.") + quiet_bbox_avail <- quiet_whatWQPdata( WQPquery, bBox = c(input_bbox$xmin, input_bbox$ymin, input_bbox$xmax, input_bbox$ymax) @@ -743,6 +750,8 @@ TADA_DataRetrieval <- function(startDate = "null", } # Query info on available data + message("Checking what data is available. This may take a moment.") + # Don't want to print every message that's returned by WQP quiet_whatWQPdata <- purrr::quietly(dataRetrieval::whatWQPdata) From 4b3ec7f0b608c7ba0c05df16e4f72ab6c6ae728a Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Fri, 24 Jan 2025 12:14:56 -0800 Subject: [PATCH 26/35] Clarify tribe_name_parcel reqs --- R/DataDiscoveryRetrieval.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index fe06f0d0..5df7f847 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -47,7 +47,7 @@ #' @param siteid Unique monitoring location identifier. #' @param siteType Type of waterbody. See https://www.waterqualitydata.us/Codes/sitetype for options. 
#' @param tribal_area_type One of four tribal spatial layers: "Alaska Native Allotments", "American Indian Reservations", "Off-reservation Trust Lands", or "Oklahoma Tribal Statistical Areas". More info in TADA_TribalOptions(). Note that "Alaska Native Villages" and "Virginia Federally Recognized Tribes" layers will not return a successful query. -#' @param tribe_name_parcel The name of a tribe corresponding to an entry in the TRIBE_NAME field of the specified tribal_area_type. OR if the type is "Alaska Native Allotments" then the corresponding PARCEL_NO. More info in TADA_TribalOptions(). +#' @param tribe_name_parcel The name of one or more tribes corresponding to entries in the TRIBE_NAME field of the specified tribal_area_type. OR if the type is "Alaska Native Allotments" then the corresponding PARCEL_NO. More info in TADA_TribalOptions(). #' @param characteristicName Name of parameter. See https://www.waterqualitydata.us/Codes/characteristicName for options. #' @param characteristicType Groups of environmental measurements/parameters. See https://www.waterqualitydata.us/Codes/characteristicType for options. #' @param sampleMedia Sampling substrate such as water, air, or sediment. See https://www.waterqualitydata.us/Codes/sampleMedia for options. 
@@ -206,6 +206,12 @@ TADA_DataRetrieval <- function(startDate = "null", maxrecs = 250000, ask = TRUE, applyautoclean = TRUE) { + + # Require one tribal area type: + if (length(tribal_area_type == "null") >1) { + stop("The tribal_area_type argument only accepts a single value, but multiple have been provided.") + } + # Check for incomplete or inconsistent inputs: # If both an sf object and tribe information are provided it's unclear what @@ -263,7 +269,7 @@ TADA_DataRetrieval <- function(startDate = "null", if ((tribal_area_type == "null") & all(tribe_name_parcel != "null")) { stop("A tribal_area_type is required if tribe_name_parcel is provided.") } - + # If an sf object OR tribal info are provided they will be the basis of the query # (The tribal data handling uses sf objects as well) if ((!is.null(aoi_sf) & inherits(aoi_sf, "sf")) | (tribal_area_type != "null")) { From c3e40e31c56d29348f5da74986180b3b0710c747 Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Fri, 24 Jan 2025 12:19:22 -0800 Subject: [PATCH 27/35] Redo tribal_area_type check --- R/DataDiscoveryRetrieval.R | 303 ++++++++++++++++++------------------- 1 file changed, 149 insertions(+), 154 deletions(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index 5df7f847..c658aa5f 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -208,16 +208,16 @@ TADA_DataRetrieval <- function(startDate = "null", applyautoclean = TRUE) { # Require one tribal area type: - if (length(tribal_area_type == "null") >1) { - stop("The tribal_area_type argument only accepts a single value, but multiple have been provided.") + if (length(tribal_area_type) > 1) { + stop("tribal_area_type must be of length 1.") } # Check for incomplete or inconsistent inputs: - + # If both an sf object and tribe information are provided it's unclear what # the priority should be for the query if (!is.null(aoi_sf) & - any((tribal_area_type != "null") | 
(tribe_name_parcel != "null"))) { + any((tribal_area_type != "null") | (tribe_name_parcel != "null"))) { stop( paste0( "Both sf data and tribal information have been provided. ", @@ -225,17 +225,17 @@ TADA_DataRetrieval <- function(startDate = "null", ) ) } - + # Check for other arguments that indicate location. Function will ignore # these inputs but warn the user if ( # sf object provided (!is.null(aoi_sf) & inherits(aoi_sf, "sf")) & - # with additional location info - any( - (countrycode != "null"), (countycode != "null"), (huc != "null"), - (siteid != "null"), (statecode != "null") - ) + # with additional location info + any( + (countrycode != "null"), (countycode != "null"), (huc != "null"), + (siteid != "null"), (statecode != "null") + ) ) { warning( paste0( @@ -246,11 +246,11 @@ TADA_DataRetrieval <- function(startDate = "null", } else if ( # Tribe info provided (tribal_area_type != "null") & - # with additional location info - any( - (countrycode != "null"), (countycode != "null"), (huc != "null"), - (siteid != "null"), (statecode != "null") - ) + # with additional location info + any( + (countrycode != "null"), (countycode != "null"), (huc != "null"), + (siteid != "null"), (statecode != "null") + ) ) { warning( paste0( @@ -259,7 +259,7 @@ TADA_DataRetrieval <- function(startDate = "null", ) ) } - + # Insufficient tribal info provided: # Type but no parcel if ((tribal_area_type != "null") & all(tribe_name_parcel == "null")) { @@ -274,10 +274,10 @@ TADA_DataRetrieval <- function(startDate = "null", # (The tribal data handling uses sf objects as well) if ((!is.null(aoi_sf) & inherits(aoi_sf, "sf")) | (tribal_area_type != "null")) { # Build the non-sf part of the query: - + # Set query parameters WQPquery <- list() - + # StartDate if (length(startDate) > 1) { if (is.na(suppressWarnings(lubridate::parse_date_time(startDate[1], orders = "ymd")))) { @@ -344,9 +344,9 @@ TADA_DataRetrieval <- function(startDate = "null", } WQPquery <- c(WQPquery, endDate = endDate) 
} - + # sf AOI prep for query - + # If tribe info is provided then grab the corresponding sf object: if (tribal_area_type != "null") { # Make a reference table for tribal area type + url matching @@ -360,19 +360,14 @@ TADA_DataRetrieval <- function(startDate = "null", "Oklahoma Tribal Statistical Areas", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/4" # , # "Virginia Federally Recognized Tribes", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/5" ) - - # Keep to a single type: - if (length(tribal_area_type) > 1) { - stop("tribal_area_type must be of length 1.") - } - + # These two layers will not return any data when used for bboxes if (tribal_area_type == "Alaska Native Villages") { stop("Alaska Native Villages data are centroid points, not spatial boundaries.") } else if (tribal_area_type == "Virginia Federally Recognized Tribes") { stop("Federally recognized tribal entities in Virginia do not have any available spatial boundaries.") } - + # These area types allow filtering by TRIBE_NAME (unique within each type) if (tribal_area_type %in% c( "American Indian Reservations", @@ -397,7 +392,7 @@ TADA_DataRetrieval <- function(startDate = "null", . } } - + # Otherwise filter by PARCEL_NO (Note that values in this col are not unique) } else if (tribal_area_type == "Alaska Native Allotments") { aoi_sf <- dplyr::filter( @@ -417,22 +412,22 @@ TADA_DataRetrieval <- function(startDate = "null", stop("Tribal area type or tribal name parcel not recognized. 
Refer to TADA_TribalOptions() for query options.") } } - + # Check and/or fix geometry aoi_sf <- sf::st_make_valid(aoi_sf) - + # Match CRS if (sf::st_crs(aoi_sf) != 4326) { aoi_sf <- sf::st_transform(aoi_sf, crs = 4326) } - + # Get bbox of the sf object input_bbox <- sf::st_bbox(aoi_sf) - + # Query info on available data within the bbox # Don't want to print every message that's returned by WQP quiet_whatWQPdata <- purrr::quietly(dataRetrieval::whatWQPdata) - + # Try getting WQP info message("Checking what data is available. This may take a moment.") @@ -440,63 +435,63 @@ TADA_DataRetrieval <- function(startDate = "null", WQPquery, bBox = c(input_bbox$xmin, input_bbox$ymin, input_bbox$xmax, input_bbox$ymax) ) - + # Alert & stop if an http error was received if (is.null(quiet_bbox_avail$result)) { stop_message <- quiet_bbox_avail$messages %>% grep(pattern = "failed|HTTP", x = ., ignore.case = FALSE, value = TRUE) %>% paste("\n", ., collapse = "") %>% paste("The WQP request returned a NULL with the following message(s): \n", - ., - collapse = "\n" + ., + collapse = "\n" ) - + stop(stop_message) } - + # Use result only bbox_avail <- quiet_bbox_avail$result - + # Check if any sites are within the aoi if ((nrow(bbox_avail) > 0) == FALSE) { stop("No monitoring sites were returned within your area of interest (no data available).") } - - + + quiet_whatWQPsites <- purrr::quietly(dataRetrieval::whatWQPsites) - + quiet_bbox_sites <- quiet_whatWQPsites( siteid = bbox_avail$MonitoringLocationIdentifier ) - + if (is.null(quiet_bbox_sites$result)) { stop_message <- quiet_bbox_sites$messages %>% grep(pattern = "failed|HTTP", x = ., ignore.case = FALSE, value = TRUE) %>% paste("\n", ., collapse = "") %>% paste("The WQP request returned a NULL with the following message(s): \n", - ., - collapse = "\n" + ., + collapse = "\n" ) stop(stop_message) } - + # Reformat returned info as sf bbox_sites_sf <- TADA_MakeSpatial(quiet_bbox_sites$result, crs = 4326) - + # Subset sites to only 
within shapefile and get IDs clipped_sites_sf <- bbox_sites_sf[aoi_sf, ] - + clipped_site_ids <- clipped_sites_sf$MonitoringLocationIdentifier - + record_count <- bbox_avail %>% dplyr::filter(MonitoringLocationIdentifier %in% clipped_site_ids) %>% dplyr::pull(resultCount) %>% sum() - + # Should we proceed with downloads? If ask == TRUE then ask the user. if (ask == TRUE) { user_decision <- ask_user(n_records = record_count) - + # Act on input if (user_decision == "yes") { print("Proceeding with download.") @@ -504,10 +499,10 @@ TADA_DataRetrieval <- function(startDate = "null", stop("Cancelled by user.", call. = FALSE) } } - + # Continue now with site count site_count <- length(clipped_site_ids) - + # Check for either more than 300 sites or more records than max_recs. # If either is true then we'll approach the pull as a "big data" pull if (site_count > 300 | record_count > maxrecs) { @@ -517,7 +512,7 @@ TADA_DataRetrieval <- function(startDate = "null", "If your AOI is a county, state, country, or HUC boundary it would be more efficient to provide a code instead of an sf object." 
) ) - + # Use helper function to download large data volume results.DR <- withCallingHandlers( TADA_BigDataHelper( @@ -530,11 +525,11 @@ TADA_DataRetrieval <- function(startDate = "null", ), message = function(m) message(m$message) ) - - + + rm(bbox_avail, bbox_sites_sf) gc() - + # Check if any results were returned if ((nrow(results.DR) > 0) == FALSE) { print( @@ -552,7 +547,7 @@ TADA_DataRetrieval <- function(startDate = "null", sites.DR <- clipped_sites_sf %>% dplyr::as_tibble() %>% dplyr::select(-geometry) - + # Get project metadata projects.DR <- suppressMessages( dataRetrieval::readWQPdata( @@ -562,7 +557,7 @@ TADA_DataRetrieval <- function(startDate = "null", service = "Project" ) ) - + # Join results, sites, projects TADAprofile <- TADA_JoinWQPProfiles( FullPhysChem = results.DR, @@ -571,25 +566,25 @@ TADA_DataRetrieval <- function(startDate = "null", ) %>% dplyr::mutate( dplyr::across(tidyselect::everything(), as.character) ) - + # run TADA_AutoClean function if (applyautoclean == TRUE) { print("Data successfully downloaded. Running TADA_AutoClean function.") - + TADAprofile.clean <- TADA_AutoClean(TADAprofile) } else { TADAprofile.clean <- TADAprofile } } - + return(TADAprofile.clean) - + # Doesn't meet "big data" threshold: } else { # Retrieve all 3 profiles print("Downloading WQP query results. 
This may take some time depending upon the query size.") print(WQPquery) - + # Get results results.DR <- suppressMessages( dataRetrieval::readWQPdata( @@ -599,7 +594,7 @@ TADA_DataRetrieval <- function(startDate = "null", ignore_attributes = TRUE ) ) - + # Check if any results were returned if ((nrow(results.DR) > 0) == FALSE) { paste0( @@ -614,7 +609,7 @@ TADA_DataRetrieval <- function(startDate = "null", sites.DR <- clipped_sites_sf %>% dplyr::as_tibble() %>% dplyr::select(-geometry) - + # Get project metadata projects.DR <- suppressMessages( dataRetrieval::readWQPdata( @@ -624,7 +619,7 @@ TADA_DataRetrieval <- function(startDate = "null", service = "Project" ) ) - + # Join results, sites, projects TADAprofile <- TADA_JoinWQPProfiles( FullPhysChem = results.DR, @@ -633,25 +628,25 @@ TADA_DataRetrieval <- function(startDate = "null", ) %>% dplyr::mutate( dplyr::across(tidyselect::everything(), as.character) ) - + # Run TADA_AutoClean function if (applyautoclean == TRUE) { print("Data successfully downloaded. Running TADA_AutoClean function.") - + TADAprofile.clean <- TADA_AutoClean(TADAprofile) } else { TADAprofile.clean <- TADAprofile } } - + return(TADAprofile.clean) } - + # If no sf object provided: } else { # Set query parameters WQPquery <- list() - + if (!"null" %in% statecode) { load(system.file("extdata", "statecodes_df.Rdata", package = "EPATADA")) statecode <- as.character(statecode) @@ -664,13 +659,13 @@ TADA_DataRetrieval <- function(startDate = "null", WQPquery <- c(WQPquery, statecode = list(statecd)) } } - + if (length(huc) > 1) { WQPquery <- c(WQPquery, huc = list(huc)) } else if (huc != "null") { WQPquery <- c(WQPquery, huc = huc) } - + if (length(startDate) > 1) { if (is.na(suppressWarnings(lubridate::parse_date_time(startDate[1], orders = "ymd")))) { stop("Incorrect date format. 
Please use the format YYYY-MM-DD.") @@ -682,67 +677,67 @@ TADA_DataRetrieval <- function(startDate = "null", } WQPquery <- c(WQPquery, startDate = startDate) } - + if (length(countrycode) > 1) { WQPquery <- c(WQPquery, countrycode = list(countrycode)) } else if (countrycode != "null") { WQPquery <- c(WQPquery, countrycode = countrycode) } - + if (length(countycode) > 1) { WQPquery <- c(WQPquery, countycode = list(countycode)) } else if (countycode != "null") { WQPquery <- c(WQPquery, countycode = countycode) } - + if (length(siteid) > 1) { WQPquery <- c(WQPquery, siteid = list(siteid)) } else if (siteid != "null") { WQPquery <- c(WQPquery, siteid = siteid) } - + if (length(siteType) > 1) { WQPquery <- c(WQPquery, siteType = list(siteType)) } else if (siteType != "null") { WQPquery <- c(WQPquery, siteType = siteType) } - + if (length(characteristicName) > 1) { WQPquery <- c(WQPquery, characteristicName = list(characteristicName)) } else if (characteristicName != "null") { WQPquery <- c(WQPquery, characteristicName = characteristicName) } - + if (length(characteristicType) > 1) { WQPquery <- c(WQPquery, characteristicType = list(characteristicType)) } else if (characteristicType != "null") { WQPquery <- c(WQPquery, characteristicType = characteristicType) } - + if (length(sampleMedia) > 1) { WQPquery <- c(WQPquery, sampleMedia = list(sampleMedia)) } else if (sampleMedia != "null") { WQPquery <- c(WQPquery, sampleMedia = sampleMedia) } - + if (length(project) > 1) { WQPquery <- c(WQPquery, project = list(project)) } else if (project != "null") { WQPquery <- c(WQPquery, project = project) } - + if (length(providers) > 1) { WQPquery <- c(WQPquery, providers = list(providers)) } else if (providers != "null") { WQPquery <- c(WQPquery, providers = providers) } - + if (length(organization) > 1) { WQPquery <- c(WQPquery, organization = list(organization)) } else if (organization != "null") { WQPquery <- c(WQPquery, organization = organization) } - + if (length(endDate) > 1) 
{ if (is.na(suppressWarnings(lubridate::parse_date_time(endDate[1], orders = "ymd")))) { stop("Incorrect date format. Please use the format YYYY-MM-DD.") @@ -754,39 +749,39 @@ TADA_DataRetrieval <- function(startDate = "null", } WQPquery <- c(WQPquery, endDate = endDate) } - + # Query info on available data message("Checking what data is available. This may take a moment.") # Don't want to print every message that's returned by WQP quiet_whatWQPdata <- purrr::quietly(dataRetrieval::whatWQPdata) - + quiet_query_avail <- quiet_whatWQPdata(WQPquery) - + if (is.null(quiet_query_avail$result)) { stop_message <- quiet_query_avail$messages %>% grep(pattern = "failed|HTTP", x = ., ignore.case = FALSE, value = TRUE) %>% paste("\n", ., collapse = "") %>% paste("The WQP request returned a NULL with the following message(s): \n", - ., - collapse = "\n" + ., + collapse = "\n" ) - + stop(stop_message) } - + query_avail <- quiet_query_avail$result - + site_count <- length(query_avail$MonitoringLocationIdentifier) - + record_count <- query_avail %>% dplyr::pull(resultCount) %>% sum() - + # Should we proceed with downloads? If ask == TRUE then ask the user. if (ask == TRUE) { user_decision <- ask_user(n_records = record_count) - + # Act on input if (user_decision == "yes") { print("Proceeding with download.") @@ -794,14 +789,14 @@ TADA_DataRetrieval <- function(startDate = "null", stop("Cancelled by user.", call. = FALSE) } } - + # Check for either more than 300 sites or more records than max_recs. # If either is true then we'll approach the pull as a "big data" pull if (site_count > 300 | record_count > maxrecs) { message( "The number of sites and/or records matched by the query terms is large, so the download may take some time." 
) - + # Use helper function to download large data volume results.DR <- suppressMessages( TADA_BigDataHelper( @@ -812,17 +807,17 @@ TADA_DataRetrieval <- function(startDate = "null", maxsites = 300 ) ) - + rm(query_avail) gc() - + # Get site metadata sites.DR <- suppressMessages( dataRetrieval::whatWQPsites( siteid = unique(results.DR$MonitoringLocationIdentifier) ) ) - + # Get project metadata projects.DR <- suppressMessages( dataRetrieval::readWQPdata( @@ -832,7 +827,7 @@ TADA_DataRetrieval <- function(startDate = "null", service = "Project" ) ) - + # Join results, sites, projects TADAprofile <- TADA_JoinWQPProfiles( FullPhysChem = results.DR, @@ -841,18 +836,18 @@ TADA_DataRetrieval <- function(startDate = "null", ) %>% dplyr::mutate( dplyr::across(tidyselect::everything(), as.character) ) - + # run TADA_AutoClean function if (applyautoclean == TRUE) { print("Data successfully downloaded. Running TADA_AutoClean function.") - + TADAprofile.clean <- TADA_AutoClean(TADAprofile) } else { TADAprofile.clean <- TADAprofile } - + return(TADAprofile.clean) - + # If not a "big data" pull: } else { # Retrieve all 3 profiles @@ -860,25 +855,25 @@ TADA_DataRetrieval <- function(startDate = "null", print(WQPquery) results.DR <- suppressMessages( dataRetrieval::readWQPdata(WQPquery, - dataProfile = "resultPhysChem", - ignore_attributes = TRUE + dataProfile = "resultPhysChem", + ignore_attributes = TRUE ) ) - + # check if any results are available if ((nrow(results.DR) > 0) == FALSE) { print("Returning empty results dataframe: Your WQP query returned no results (no data available). Try a different query. 
Removing some of your query filters OR broadening your search area may help.") TADAprofile.clean <- results.DR } else { sites.DR <- suppressMessages(dataRetrieval::whatWQPsites(WQPquery)) - + projects.DR <- suppressMessages( dataRetrieval::readWQPdata(WQPquery, - ignore_attributes = TRUE, - service = "Project" + ignore_attributes = TRUE, + service = "Project" ) ) - + TADAprofile <- TADA_JoinWQPProfiles( FullPhysChem = results.DR, Sites = sites.DR, @@ -886,17 +881,17 @@ TADA_DataRetrieval <- function(startDate = "null", ) %>% dplyr::mutate( dplyr::across(tidyselect::everything(), as.character) ) - + # run TADA_AutoClean function if (applyautoclean == TRUE) { print("Data successfully downloaded. Running TADA_AutoClean function.") - + TADAprofile.clean <- TADA_AutoClean(TADAprofile) } else { TADAprofile.clean <- TADAprofile } } - + return(TADAprofile.clean) } } @@ -951,12 +946,12 @@ TADA_TribalOptions <- function(tribal_area_type, return_sf = FALSE) { "Oklahoma Tribal Statistical Areas", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/4", "Virginia Federally Recognized Tribes", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/5" ) - + # Confirm usable string provided if (!(tribal_area_type %in% map_service_urls$tribal_area)) { stop("tribal_area_type must match one of the six tribal spatial layer names.") } - + # Query Map Service tribal_area_sf <- dplyr::filter( map_service_urls, @@ -966,7 +961,7 @@ TADA_TribalOptions <- function(tribal_area_type, return_sf = FALSE) { # Return sf arcgislayers::arc_select() %>% sf::st_make_valid() - + # Convert to df if needed, export if (return_sf == FALSE) { return( @@ -1032,25 +1027,25 @@ TADA_ReadWQPWebServices <- function(webservice) { # read in csv from WQP web service if (grepl("zip=yes", webservice)) { webservice <- stringr::str_replace(webservice, "zip=yes", "zip=no") - + # download data webservice <- data.table::fread(toString(webservice)) - + # if input df was not downloaded using USGS's 
dataRetrieval, then the # column names will include / separators instead of . and TADA uses . # (e.g. ResultMeasure/MeasureUnitCode vs. ResultMeasure.MeasureUnitCode) colnames(webservice) <- gsub("/", ".", colnames(webservice)) - + return(webservice) } else { # download data webservice <- data.table::fread(toString(webservice)) - + # if input df was not downloaded using USGS's dataRetrieval, then the # column names will include / separators instead of . and TADA uses . # (e.g. ResultMeasure/MeasureUnitCode vs. ResultMeasure.MeasureUnitCode) colnames(webservice) <- gsub("/", ".", colnames(webservice)) - + return(webservice) } } @@ -1077,15 +1072,15 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi dplyr::summarise(tot_n = sum(resultCount)) %>% dplyr::filter(tot_n > 0) %>% dplyr::arrange(tot_n) - + # Sites with less than/equal to maxrecs smallsites <- tot_sites %>% dplyr::filter(tot_n <= maxrecs) # Sites with more than maxrecs bigsites <- tot_sites %>% dplyr::filter(tot_n > maxrecs) - + df_small <- data.frame() df_big <- data.frame() - + # Work with small sites first: # Build download groups. Total record count limited to value of maxrecs. # Number of sites per download group limited to 300. @@ -1096,7 +1091,7 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi threshold = maxrecs, maxgroupsize = 300 )) - + # Status update to user print( paste0( @@ -1105,9 +1100,9 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi " results by grouping them together." 
) ) - + small_prog_bar <- txtProgressBar(min = 0, max = sum(smallsites$tot_n), style = 3) - + # Download the data for each group for (i in 1:max(smallsitesgrp$group)) { small_site_chunk <- subset( @@ -1124,22 +1119,22 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi ) ) %>% dplyr::mutate(dplyr::across(everything(), as.character)) - + # If data is returned, stack with what's already been retrieved if (dim(results_small)[1] > 0) { df_small <- dplyr::bind_rows(df_small, results_small) } - + # Update progress setTxtProgressBar(pb = small_prog_bar, value = nrow(df_small)) } # Close progress bar when complete close(small_prog_bar) - + rm(smallsites, smallsitesgrp) gc() } - + # Large sites (>= maxrecs) next: if (dim(bigsites)[1] > 0) { print( @@ -1149,12 +1144,12 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi " results, chunking queries by site." ) ) - + big_prog_bar <- txtProgressBar(min = 0, max = sum(bigsites$tot_n), style = 3) - + # Unique site IDs bsitesvec <- unique(bigsites$MonitoringLocationIdentifier) - + # For each site for (i in 1:length(bsitesvec)) { # Download each site's data individually @@ -1167,7 +1162,7 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi ) ) %>% dplyr::mutate(dplyr::across(everything(), as.character)) - + if (dim(results_big)[1] > 0) { df_big <- dplyr::bind_rows(df_big, results_big) } @@ -1176,14 +1171,14 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi } # Close progress bar when complete close(big_prog_bar) - + rm(bigsites) gc() } - - + + df_out <- dplyr::bind_rows(df_small, df_big) - + return(df_out) } @@ -1219,20 +1214,20 @@ TADA_JoinWQPProfiles <- function(FullPhysChem = "null", Sites = "null", Projects = "null") { FullPhysChem.df <- FullPhysChem - + Sites.df <- Sites - + Projects.df <- Projects - + # Join station data to full phys/chem (FullPhysChem.df) if (length(Sites.df > 1)) { if 
(nrow(Sites.df) > 0) { join1 <- FullPhysChem.df %>% # join stations to results dplyr::left_join(Sites.df, - by = "MonitoringLocationIdentifier", - multiple = "all", - relationship = "many-to-many" + by = "MonitoringLocationIdentifier", + multiple = "all", + relationship = "many-to-many" ) %>% # remove ".x" suffix from column names dplyr::rename_at(dplyr::vars(dplyr::ends_with(".x")), ~ stringr::str_replace(., "\\..$", "")) %>% @@ -1244,8 +1239,8 @@ TADA_JoinWQPProfiles <- function(FullPhysChem = "null", } else { join1 <- FullPhysChem.df } - - + + # Add QAPP columns from project if (length(Projects.df) > 1) { if (nrow(Projects.df) > 0) { @@ -1271,7 +1266,7 @@ TADA_JoinWQPProfiles <- function(FullPhysChem = "null", } else { join2 <- join1 } - + return(join2) } @@ -1292,7 +1287,7 @@ ask_user <- function(n_records) { "Would you like to continue with the download? [yes/no] ", sep = "" ) - + # Ask user if they want to continue & check for valid response while (TRUE) { user_input <- readline(prompt = user_prompt) @@ -1329,9 +1324,9 @@ make_groups <- function(x, maxrecs) { i <- i + 1 groupings <- plyr::rbind.fill(groupings, group) } - + x$group <- i - + groupings <- plyr::rbind.fill(groupings, x) } return(groupings) From f0de6b161e7b651944ec8b492eca54f22ee24fab Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Fri, 24 Jan 2025 15:36:10 -0800 Subject: [PATCH 28/35] Catch clips with no sites --- R/DataDiscoveryRetrieval.R | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index c658aa5f..0a98c553 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -261,7 +261,7 @@ TADA_DataRetrieval <- function(startDate = "null", } # Insufficient tribal info provided: - # Type but no parcel + # Type but no name or parcel if ((tribal_area_type != "null") & all(tribe_name_parcel == "null")) { stop("A tribe_name_parcel is required if 
tribal_area_type is provided.") } @@ -412,7 +412,7 @@ TADA_DataRetrieval <- function(startDate = "null", stop("Tribal area type or tribal name parcel not recognized. Refer to TADA_TribalOptions() for query options.") } } - + # Check and/or fix geometry aoi_sf <- sf::st_make_valid(aoi_sf) @@ -457,7 +457,6 @@ TADA_DataRetrieval <- function(startDate = "null", stop("No monitoring sites were returned within your area of interest (no data available).") } - quiet_whatWQPsites <- purrr::quietly(dataRetrieval::whatWQPsites) quiet_bbox_sites <- quiet_whatWQPsites( @@ -483,6 +482,11 @@ TADA_DataRetrieval <- function(startDate = "null", clipped_site_ids <- clipped_sites_sf$MonitoringLocationIdentifier + # Check if any sites are within the clip + if ((length(clipped_site_ids) > 0) == FALSE) { + stop("No monitoring sites were returned within your area of interest (no data available).") + } + record_count <- bbox_avail %>% dplyr::filter(MonitoringLocationIdentifier %in% clipped_site_ids) %>% dplyr::pull(resultCount) %>% @@ -837,7 +841,7 @@ TADA_DataRetrieval <- function(startDate = "null", dplyr::across(tidyselect::everything(), as.character) ) - # run TADA_AutoClean function + # Run TADA_AutoClean function if (applyautoclean == TRUE) { print("Data successfully downloaded. Running TADA_AutoClean function.") @@ -860,7 +864,7 @@ TADA_DataRetrieval <- function(startDate = "null", ) ) - # check if any results are available + # Check if any results are available if ((nrow(results.DR) > 0) == FALSE) { print("Returning empty results dataframe: Your WQP query returned no results (no data available). Try a different query. 
Removing some of your query filters OR broadening your search area may help.") TADAprofile.clean <- results.DR From cd05dfd4fd07102587ef5ec192148f68839f3d78 Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Mon, 27 Jan 2025 13:08:43 -0800 Subject: [PATCH 29/35] More quietly --- R/DataDiscoveryRetrieval.R | 113 ++++++++++++++++++++++++++----------- vignettes/TADAModule1.Rmd | 38 +++++++------ 2 files changed, 100 insertions(+), 51 deletions(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index 0a98c553..5e0490de 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -270,6 +270,12 @@ TADA_DataRetrieval <- function(startDate = "null", stop("A tribal_area_type is required if tribe_name_parcel is provided.") } + # Before proceeding make quiet wrappers for dataRetrieval functions for later + # use in if/else processes + quiet_whatWQPsites <- purrr::quietly(dataRetrieval::whatWQPsites) + quiet_whatWQPdata <- purrr::quietly(dataRetrieval::whatWQPdata) + quiet_readWQPdata <- purrr::quietly(dataRetrieval::readWQPdata) + # If an sf object OR tribal info are provided they will be the basis of the query # (The tribal data handling uses sf objects as well) if ((!is.null(aoi_sf) & inherits(aoi_sf, "sf")) | (tribal_area_type != "null")) { @@ -412,7 +418,7 @@ TADA_DataRetrieval <- function(startDate = "null", stop("Tribal area type or tribal name parcel not recognized. Refer to TADA_TribalOptions() for query options.") } } - + # Check and/or fix geometry aoi_sf <- sf::st_make_valid(aoi_sf) @@ -425,12 +431,11 @@ TADA_DataRetrieval <- function(startDate = "null", input_bbox <- sf::st_bbox(aoi_sf) # Query info on available data within the bbox - # Don't want to print every message that's returned by WQP - quiet_whatWQPdata <- purrr::quietly(dataRetrieval::whatWQPdata) # Try getting WQP info message("Checking what data is available. 
This may take a moment.") + # Don't want to print every message that's returned by WQP quiet_bbox_avail <- quiet_whatWQPdata( WQPquery, bBox = c(input_bbox$xmin, input_bbox$ymin, input_bbox$xmax, input_bbox$ymax) @@ -457,8 +462,6 @@ TADA_DataRetrieval <- function(startDate = "null", stop("No monitoring sites were returned within your area of interest (no data available).") } - quiet_whatWQPsites <- purrr::quietly(dataRetrieval::whatWQPsites) - quiet_bbox_sites <- quiet_whatWQPsites( siteid = bbox_avail$MonitoringLocationIdentifier ) @@ -553,15 +556,27 @@ TADA_DataRetrieval <- function(startDate = "null", dplyr::select(-geometry) # Get project metadata - projects.DR <- suppressMessages( - dataRetrieval::readWQPdata( - siteid = clipped_site_ids, - WQPquery, - ignore_attributes = TRUE, - service = "Project" - ) + quiet_projects.DR <- quiet_readWQPdata( + siteid = clipped_site_ids, + WQPquery, + ignore_attributes = TRUE, + service = "Project" ) + if (is.null(quiet_projects.DR$result)) { + stop_message <- quiet_projects.DR$messages %>% + grep(pattern = "failed|HTTP", x = ., ignore.case = FALSE, value = TRUE) %>% + paste("\n", ., collapse = "") %>% + paste("The WQP request returned a NULL with the following message(s): \n", + ., + collapse = "\n" + ) + + stop(stop_message) + } + + projects.DR <- quiet_projects.DR$result + # Join results, sites, projects TADAprofile <- TADA_JoinWQPProfiles( FullPhysChem = results.DR, @@ -615,15 +630,27 @@ TADA_DataRetrieval <- function(startDate = "null", dplyr::select(-geometry) # Get project metadata - projects.DR <- suppressMessages( - dataRetrieval::readWQPdata( - siteid = clipped_site_ids, - WQPquery, - ignore_attributes = TRUE, - service = "Project" - ) + quiet_projects.DR <- quiet_readWQPdata( + siteid = clipped_site_ids, + WQPquery, + ignore_attributes = TRUE, + service = "Project" ) + if (is.null(quiet_projects.DR$result)) { + stop_message <- quiet_projects.DR$messages %>% + grep(pattern = "failed|HTTP", x = ., ignore.case = 
FALSE, value = TRUE) %>% + paste("\n", ., collapse = "") %>% + paste("The WQP request returned a NULL with the following message(s): \n", + ., + collapse = "\n" + ) + + stop(stop_message) + } + + projects.DR <- quiet_projects.DR$result + # Join results, sites, projects TADAprofile <- TADA_JoinWQPProfiles( FullPhysChem = results.DR, @@ -758,8 +785,6 @@ TADA_DataRetrieval <- function(startDate = "null", message("Checking what data is available. This may take a moment.") # Don't want to print every message that's returned by WQP - quiet_whatWQPdata <- purrr::quietly(dataRetrieval::whatWQPdata) - quiet_query_avail <- quiet_whatWQPdata(WQPquery) if (is.null(quiet_query_avail$result)) { @@ -816,22 +841,44 @@ TADA_DataRetrieval <- function(startDate = "null", gc() # Get site metadata - sites.DR <- suppressMessages( - dataRetrieval::whatWQPsites( - siteid = unique(results.DR$MonitoringLocationIdentifier) - ) - ) + quiet_sites.DR <- quiet_whatWQPsites(siteid = unique(results.DR$MonitoringLocationIdentifier)) + + if (is.null(quiet_sites.DR$result)) { + stop_message <- quiet_sites.DR$messages %>% + grep(pattern = "failed|HTTP", x = ., ignore.case = FALSE, value = TRUE) %>% + paste("\n", ., collapse = "") %>% + paste("The WQP request returned a NULL with the following message(s): \n", + ., + collapse = "\n" + ) + + stop(stop_message) + } + + sites.DR <- quiet_sites.DR$result # Get project metadata - projects.DR <- suppressMessages( - dataRetrieval::readWQPdata( - siteid = unique(results.DR$MonitoringLocationIdentifier), - WQPquery, - ignore_attributes = TRUE, - service = "Project" - ) + quiet_projects.DR <- quiet_readWQPdata( + siteid = unique(results.DR$MonitoringLocationIdentifier), + WQPquery, + ignore_attributes = TRUE, + service = "Project" ) + if (is.null(quiet_projects.DR$result)) { + stop_message <- quiet_projects.DR$messages %>% + grep(pattern = "failed|HTTP", x = ., ignore.case = FALSE, value = TRUE) %>% + paste("\n", ., collapse = "") %>% + paste("The WQP request 
returned a NULL with the following message(s): \n", + ., + collapse = "\n" + ) + + stop(stop_message) + } + + projects.DR <- quiet_projects.DR$result + # Join results, sites, projects TADAprofile <- TADA_JoinWQPProfiles( FullPhysChem = results.DR, diff --git a/vignettes/TADAModule1.Rmd b/vignettes/TADAModule1.Rmd index 8154041b..42a6e93b 100644 --- a/vignettes/TADAModule1.Rmd +++ b/vignettes/TADAModule1.Rmd @@ -264,8 +264,9 @@ TADAProfile_tribal <- TADA_DataRetrieval( all.equal(data.frame(TADAProfile_single), data.frame(TADAProfile_tribal)) ``` -Additionally, the aoi_sf argument can be used to provide an sf spatial object -as a query filter. We can match the output of the two short examples above like so: +Additionally, the aoi_sf argument can be used to provide an sf spatial +object as a query filter. We can match the output of the two short +examples above like so: ```{r TADA_DataRetrieval_spatial} TADAProfile_spatial <- TADA_DataRetrieval( @@ -279,11 +280,11 @@ TADAProfile_spatial <- TADA_DataRetrieval( all.equal(data.frame(TADAProfile_single), data.frame(TADAProfile_spatial)) ``` -**Note**: In this example the output data is identical from these three input -methods. However, in some instances this may not be the case. This is because -the tribal_area_type method is based on spatial data and so spatial boundaries -must be taken into account when comparing query results. The same applies when -using aoi_sf results. +**Note**: In this example the output data is identical from these three +input methods. However, in some instances this may not be the case. This +is because the tribal_area_type method is based on spatial data and so +spatial boundaries must be taken into account when comparing query +results. The same applies when using aoi_sf results. ## USGS dataRetrieval @@ -301,10 +302,11 @@ as part of the data retrieval process. 
## Big Data Queries If you need to download a large amount of data from across a large area, -the TADA_DataRetrieval function now handles this automatically. Whereas in the -past there was a second function (TADA_BigDataRetrieval) to do this, the -standard TADA_DataRetrieval function now checks the number of results in each -query and uses similar methods as TADA_BigDataRetrieval when necessary. +the TADA_DataRetrieval function now handles this automatically. Whereas +in the past there was a second function (TADA_BigDataRetrieval) to do +this, the standard TADA_DataRetrieval function now checks the number of +results in each query and uses similar methods as TADA_BigDataRetrieval +when necessary. The function does multiple synchronous data calls to the WQP (waterqualitydata.us). It uses the WQP summary service to limit the @@ -312,14 +314,14 @@ sites downloaded to only those with relevant data. It pulls back data from set number of stations at a time and then joins the data back together to produce a single TADA compatible dataframe as the output. -TADA_DataRetrieval now also prompts the user (when ask = TRUE) to confirm that -they want to download the dataset. As part of this prompt the expected number -of rows of data are provided to help in making the decision. As the downloads -occur, a progress bar is shown as well. +TADA_DataRetrieval now also prompts the user (when ask = TRUE) to +confirm that they want to download the dataset. As part of this prompt +the expected number of rows of data are provided to help in making the +decision. As the downloads occur, a progress bar is shown as well. -See ?TADA_DataRetrieval for more details. WARNING, some of the -examples below can take multiple HOURS to run. The total run time -depends on your query inputs. +See ?TADA_DataRetrieval for more details. WARNING, some of the examples +below can take multiple HOURS to run. The total run time depends on your +query inputs. 
```{r BigdataRetrieval, eval = FALSE, results = 'hide'} # AK_AL_WaterTemp <- TADA_DataRetrieval(startDate = "2000-01-01", endDate = "2022-12-31", characteristicName = "Temperature, water", statecode = c("AK","AL")) From 820f762b5f37d69f5dff2d4126bf585d5962e3e6 Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Mon, 27 Jan 2025 13:21:51 -0800 Subject: [PATCH 30/35] tigris --- DESCRIPTION | 5 +++-- vignettes/TADAModule1.Rmd | 6 +++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ce8f7442..5e9e3645 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -84,13 +84,14 @@ Suggests: tidyverse, knitr, testthat, - testthis, + testthis, usethis, devtools, pkgdown, spelling, htmltools, - yaml + yaml, + tigris VignetteBuilder: knitr, rmarkdown Language: en-US Config/testthat/edition: 3 diff --git a/vignettes/TADAModule1.Rmd b/vignettes/TADAModule1.Rmd index 42a6e93b..2aa6c07b 100644 --- a/vignettes/TADAModule1.Rmd +++ b/vignettes/TADAModule1.Rmd @@ -134,6 +134,9 @@ geospatial-related filters that are not included in the dataRetrieval - tribe_name_parcel +The TADA_TribalOptions function can be used to narrow down options for +use with tribe_name_parcel. See ?TADA_TribalOptions for more info. + The default TADA_DataRetrieval function automatically runs the **TADA_AutoClean** function. In this example, we will set **TADA_AutoClean = FALSE** and run it as a separate step in the work @@ -266,7 +269,8 @@ all.equal(data.frame(TADAProfile_single), data.frame(TADAProfile_tribal)) Additionally, the aoi_sf argument can be used to provide an sf spatial object as a query filter. 
We can match the output of the two short -examples above like so: +examples above, using tigris::native_areas to acquire Census Bureau +spatial data: ```{r TADA_DataRetrieval_spatial} TADAProfile_spatial <- TADA_DataRetrieval( From 77357beb2f318ddb8976b8c1f7ddcf28995734ee Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Mon, 27 Jan 2025 14:52:27 -0800 Subject: [PATCH 31/35] Recheck/rebuild --- NAMESPACE | 1 - man/TADA_BigDataRetrieval.Rd | 105 ----------------------------------- man/TADA_DataRetrieval.Rd | 4 +- 3 files changed, 2 insertions(+), 108 deletions(-) delete mode 100644 man/TADA_BigDataRetrieval.Rd diff --git a/NAMESPACE b/NAMESPACE index 0b286c45..5e76429c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,7 +5,6 @@ export(TADA_AggregateMeasurements) export(TADA_AnalysisDataFilter) export(TADA_AutoClean) export(TADA_AutoFilter) -export(TADA_BigDataRetrieval) export(TADA_Boxplot) export(TADA_CalculateTotalNP) export(TADA_CharStringRemoveNA) diff --git a/man/TADA_BigDataRetrieval.Rd b/man/TADA_BigDataRetrieval.Rd deleted file mode 100644 index 67de5705..00000000 --- a/man/TADA_BigDataRetrieval.Rd +++ /dev/null @@ -1,105 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/DataDiscoveryRetrieval.R -\name{TADA_BigDataRetrieval} -\alias{TADA_BigDataRetrieval} -\title{Large WQP data pulls using dataRetrieval} -\usage{ -TADA_BigDataRetrieval( - startDate = "null", - endDate = "null", - countrycode = "null", - statecode = "null", - countycode = "null", - huc = "null", - siteid = "null", - siteType = "null", - characteristicName = "null", - characteristicType = "null", - sampleMedia = "null", - organization = "null", - maxrecs = 250000, - applyautoclean = FALSE -) -} -\arguments{ -\item{startDate}{Start Date string in the format YYYY-MM-DD, for example, "2020-01-01"} - -\item{endDate}{End Date string in the format YYYY-MM-DD, for example, "2020-01-01"} - -\item{countrycode}{Code that 
identifies a country or ocean (e.g. countrycode = "CA" for Canada, countrycode = "OA" for Atlantic Ocean). See https://www.waterqualitydata.us/Codes/countrycode for options.} - -\item{statecode}{FIPS state alpha code that identifies a state (e.g. statecode = "DE" for Delaware). See https://www.waterqualitydata.us/Codes/statecode for options.} - -\item{countycode}{FIPS county name. Note that a state code must also be supplied (e.g. statecode = "AL", countycode = "Chilton"). See https://www.waterqualitydata.us/Codes/countycode for options.} - -\item{huc}{A numeric code denoting a hydrologic unit. Example: "04030202". Different size hucs can be entered. See https://epa.maps.arcgis.com/home/item.html?id=796992f4588c401fabec7446ecc7a5a3 for a map with HUCS. Click on a HUC to find the associated code.} - -\item{siteid}{Unique monitoring location identifier.} - -\item{siteType}{Type of waterbody. See https://www.waterqualitydata.us/Codes/sitetype for options.} - -\item{characteristicName}{Name of parameter. See https://www.waterqualitydata.us/Codes/characteristicName for options.} - -\item{characteristicType}{Groups of environmental measurements/parameters. See https://www.waterqualitydata.us/Codes/characteristicType for options.} - -\item{sampleMedia}{Sampling substrate such as water, air, or sediment. See https://www.waterqualitydata.us/Codes/sampleMedia for options.} - -\item{organization}{A string of letters and/or numbers (some additional characters also possible) used to signify an organization with data in the Water Quality Portal. See https://www.waterqualitydata.us/Codes/organization for options.} - -\item{maxrecs}{The maximum number of results queried within one call to dataRetrieval.} - -\item{applyautoclean}{Logical, defaults to FALSE. If TRUE, runs TADA_AutoClean function on the returned data profile.} -} -\value{ -TADA-compatible dataframe -} -\description{ -This function does multiple synchronous data calls to the WQP -(waterqualitydata.us). 
It uses the WQP summary service to limit the amount -downloaded to only relevant data (based on user query), pulls back data for -250000 records at a time, and then joins the data back together to produce a -single TADA compatible dataframe as the output. For large data sets, that can save a lot -of time and ultimately reduce the complexity of subsequent data processing. -Using this function, you will be able to download all data available from all -sites in the contiguous United States available for the time period, -characteristicName, and siteType requested. Computer memory may limit the -size of data frames that your R console will be able to hold in one session. -Function requires a characteristicName, siteType, statecode, huc, or start/ -end date input. The recommendation is to be as specific as you can with your -large data call. The function allows the user to run TADA_AutoClean on the data frame, -but this is not the default as checking large dataframes for exact duplicate -rows can be time consuming and is better performed on its own once the query is -completed. -} -\details{ -Some code for this function was adapted from this USGS Blog (Author: Aliesha Krall) -\href{https://waterdata.usgs.gov/blog/large_sample_pull/}{Large Sample Pull} - -See ?TADA_AutoClean documentation for more information on this optional input. - -Note: TADA_BigDataRetrieval (by leveraging USGS's dataRetrieval), automatically converts -the date times to UTC. It also automatically converts the data to dates, -datetimes, numerics based on a standard algorithm. 
See: ?dataRetrieval::readWQPdata -} -\examples{ -\dontrun{ -# takes approx 3 mins to run -tada1 <- TADA_BigDataRetrieval(startDate = "2019-01-01", endDate = "2021-12-31", characteristicName = "Temperature, water", statecode = c("AK", "AL")) - -# takes approx 21 mins -tada2 <- TADA_BigDataRetrieval(startDate = "2016-10-01", endDate = "2022-09-30", statecode = "UT") - -# takes seconds to run -tada3 <- TADA_BigDataRetrieval(huc = "04030202", characteristicName = "Escherichia coli") - -# takes approx 3 mins to run -tada4 <- TADA_BigDataRetrieval(startDate = "2004-01-01", countrycode = "CA") - -# takes seconds to run -tada5 <- TADA_BigDataRetrieval(startDate = "2018-01-01", statecode = "AL", countycode = "Chilton") - -# takes seconds to run -tada6 <- TADA_BigDataRetrieval(organization = "PUEBLOOFTESUQUE") -} - -} diff --git a/man/TADA_DataRetrieval.Rd b/man/TADA_DataRetrieval.Rd index 9701cf0c..377ca521 100644 --- a/man/TADA_DataRetrieval.Rd +++ b/man/TADA_DataRetrieval.Rd @@ -44,9 +44,9 @@ TADA_DataRetrieval( \item{siteType}{Type of waterbody. See https://www.waterqualitydata.us/Codes/sitetype for options.} -\item{tribal_area_type}{One of the six tribal spatial layers: "Alaska Native Allotments", "Alaska Native Villages", "American Indian Reservations", "Off-reservation Trust Lands", "Oklahoma Tribal Statistical Areas", or "Virginia Federally Recognized Tribes".} +\item{tribal_area_type}{One of four tribal spatial layers: "Alaska Native Allotments", "American Indian Reservations", "Off-reservation Trust Lands", or "Oklahoma Tribal Statistical Areas". More info in TADA_TribalOptions(). Note that "Alaska Native Villages" and "Virginia Federally Recognized Tribes" layers will not return a successful query.} -\item{tribe_name_parcel}{The name of a tribe corresponding to an entry in the TRIBE_NAME field of the specified tribal_area_type. 
OR if the type is Alaska Native Allotments" then the corresponding PARCEL_NO.} +\item{tribe_name_parcel}{The name of one or more tribes corresponding to entries in the TRIBE_NAME field of the specified tribal_area_type. OR if the type is "Alaska Native Allotments" then the corresponding PARCEL_NO. More info in TADA_TribalOptions().} \item{characteristicName}{Name of parameter. See https://www.waterqualitydata.us/Codes/characteristicName for options.} From 8e4b98ad94d8444ae2de577547067b6b574fd36f Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Mon, 27 Jan 2025 14:54:15 -0800 Subject: [PATCH 32/35] Style --- R/DataDiscoveryRetrieval.R | 381 +++++++++++++++++++------------------ 1 file changed, 191 insertions(+), 190 deletions(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index 5e0490de..aa93f03a 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -148,24 +148,26 @@ #' # Wyandotte Nation "WNENVDPT_WQX" #' # Pueblo of Pojoaque "PUEBLO_POJOAQUE" #' -#' tada9 <- TADA_DataRetrieval(organization = c( -#' "SFNOES_WQX", -#' "CPNWATER", -#' "DELAWARENATION", -#' "HVTEPA_WQX", -#' "O_MTRIBE_WQX", -#' "FONDULAC_WQX", -#' "SANILDEFONSODECP", -#' "PUEBLO_SANTAANA", -#' "PUEBLOOFTESUQUE", -#' "REDLAKE_WQX", -#' "SCEQ", -#' "CNENVSER", -#' "CHOCNATWQX", -#' "WNENVDPT_WQX", -#' "PUEBLO_POJOAQUE" -#' ), -#' ask = FALSE) +#' tada9 <- TADA_DataRetrieval( +#' organization = c( +#' "SFNOES_WQX", +#' "CPNWATER", +#' "DELAWARENATION", +#' "HVTEPA_WQX", +#' "O_MTRIBE_WQX", +#' "FONDULAC_WQX", +#' "SANILDEFONSODECP", +#' "PUEBLO_SANTAANA", +#' "PUEBLOOFTESUQUE", +#' "REDLAKE_WQX", +#' "SCEQ", +#' "CNENVSER", +#' "CHOCNATWQX", +#' "WNENVDPT_WQX", +#' "PUEBLO_POJOAQUE" +#' ), +#' ask = FALSE +#' ) #' #' # query only NWIS data for a 10 year period in CT #' tada10 <- TADA_DataRetrieval( @@ -206,18 +208,17 @@ TADA_DataRetrieval <- function(startDate = "null", maxrecs = 250000, ask = TRUE, 
applyautoclean = TRUE) { - # Require one tribal area type: if (length(tribal_area_type) > 1) { stop("tribal_area_type must be of length 1.") } - + # Check for incomplete or inconsistent inputs: - + # If both an sf object and tribe information are provided it's unclear what # the priority should be for the query if (!is.null(aoi_sf) & - any((tribal_area_type != "null") | (tribe_name_parcel != "null"))) { + any((tribal_area_type != "null") | (tribe_name_parcel != "null"))) { stop( paste0( "Both sf data and tribal information have been provided. ", @@ -225,17 +226,17 @@ TADA_DataRetrieval <- function(startDate = "null", ) ) } - + # Check for other arguments that indicate location. Function will ignore # these inputs but warn the user if ( # sf object provided (!is.null(aoi_sf) & inherits(aoi_sf, "sf")) & - # with additional location info - any( - (countrycode != "null"), (countycode != "null"), (huc != "null"), - (siteid != "null"), (statecode != "null") - ) + # with additional location info + any( + (countrycode != "null"), (countycode != "null"), (huc != "null"), + (siteid != "null"), (statecode != "null") + ) ) { warning( paste0( @@ -246,11 +247,11 @@ TADA_DataRetrieval <- function(startDate = "null", } else if ( # Tribe info provided (tribal_area_type != "null") & - # with additional location info - any( - (countrycode != "null"), (countycode != "null"), (huc != "null"), - (siteid != "null"), (statecode != "null") - ) + # with additional location info + any( + (countrycode != "null"), (countycode != "null"), (huc != "null"), + (siteid != "null"), (statecode != "null") + ) ) { warning( paste0( @@ -259,7 +260,7 @@ TADA_DataRetrieval <- function(startDate = "null", ) ) } - + # Insufficient tribal info provided: # Type but no name or parcel if ((tribal_area_type != "null") & all(tribe_name_parcel == "null")) { @@ -269,21 +270,21 @@ TADA_DataRetrieval <- function(startDate = "null", if ((tribal_area_type == "null") & all(tribe_name_parcel != "null")) { stop("A 
tribal_area_type is required if tribe_name_parcel is provided.") } - + # Before proceeding make quiet wrappers for dataRetrieval functions for later # use in if/else processes quiet_whatWQPsites <- purrr::quietly(dataRetrieval::whatWQPsites) quiet_whatWQPdata <- purrr::quietly(dataRetrieval::whatWQPdata) quiet_readWQPdata <- purrr::quietly(dataRetrieval::readWQPdata) - + # If an sf object OR tribal info are provided they will be the basis of the query # (The tribal data handling uses sf objects as well) if ((!is.null(aoi_sf) & inherits(aoi_sf, "sf")) | (tribal_area_type != "null")) { # Build the non-sf part of the query: - + # Set query parameters WQPquery <- list() - + # StartDate if (length(startDate) > 1) { if (is.na(suppressWarnings(lubridate::parse_date_time(startDate[1], orders = "ymd")))) { @@ -350,9 +351,9 @@ TADA_DataRetrieval <- function(startDate = "null", } WQPquery <- c(WQPquery, endDate = endDate) } - + # sf AOI prep for query - + # If tribe info is provided then grab the corresponding sf object: if (tribal_area_type != "null") { # Make a reference table for tribal area type + url matching @@ -366,19 +367,19 @@ TADA_DataRetrieval <- function(startDate = "null", "Oklahoma Tribal Statistical Areas", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/4" # , # "Virginia Federally Recognized Tribes", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/5" ) - + # These two layers will not return any data when used for bboxes if (tribal_area_type == "Alaska Native Villages") { stop("Alaska Native Villages data are centroid points, not spatial boundaries.") } else if (tribal_area_type == "Virginia Federally Recognized Tribes") { stop("Federally recognized tribal entities in Virginia do not have any available spatial boundaries.") } - + # These area types allow filtering by TRIBE_NAME (unique within each type) if (tribal_area_type %in% c( "American Indian Reservations", "Off-reservation Trust Lands", - "Oklahoma Tribal 
Statistical Areas" + "Oklahoma Tribal Statistical Areas" ) ) { # Get the relevant url @@ -398,7 +399,7 @@ TADA_DataRetrieval <- function(startDate = "null", . } } - + # Otherwise filter by PARCEL_NO (Note that values in this col are not unique) } else if (tribal_area_type == "Alaska Native Allotments") { aoi_sf <- dplyr::filter( @@ -418,87 +419,87 @@ TADA_DataRetrieval <- function(startDate = "null", stop("Tribal area type or tribal name parcel not recognized. Refer to TADA_TribalOptions() for query options.") } } - + # Check and/or fix geometry aoi_sf <- sf::st_make_valid(aoi_sf) - + # Match CRS if (sf::st_crs(aoi_sf) != 4326) { aoi_sf <- sf::st_transform(aoi_sf, crs = 4326) } - + # Get bbox of the sf object input_bbox <- sf::st_bbox(aoi_sf) - + # Query info on available data within the bbox - + # Try getting WQP info message("Checking what data is available. This may take a moment.") - + # Don't want to print every message that's returned by WQP quiet_bbox_avail <- quiet_whatWQPdata( WQPquery, bBox = c(input_bbox$xmin, input_bbox$ymin, input_bbox$xmax, input_bbox$ymax) ) - + # Alert & stop if an http error was received if (is.null(quiet_bbox_avail$result)) { stop_message <- quiet_bbox_avail$messages %>% grep(pattern = "failed|HTTP", x = ., ignore.case = FALSE, value = TRUE) %>% paste("\n", ., collapse = "") %>% paste("The WQP request returned a NULL with the following message(s): \n", - ., - collapse = "\n" + ., + collapse = "\n" ) - + stop(stop_message) } - + # Use result only bbox_avail <- quiet_bbox_avail$result - + # Check if any sites are within the aoi if ((nrow(bbox_avail) > 0) == FALSE) { stop("No monitoring sites were returned within your area of interest (no data available).") } - + quiet_bbox_sites <- quiet_whatWQPsites( siteid = bbox_avail$MonitoringLocationIdentifier ) - + if (is.null(quiet_bbox_sites$result)) { stop_message <- quiet_bbox_sites$messages %>% grep(pattern = "failed|HTTP", x = ., ignore.case = FALSE, value = TRUE) %>% paste("\n", ., 
collapse = "") %>% paste("The WQP request returned a NULL with the following message(s): \n", - ., - collapse = "\n" + ., + collapse = "\n" ) stop(stop_message) } - + # Reformat returned info as sf bbox_sites_sf <- TADA_MakeSpatial(quiet_bbox_sites$result, crs = 4326) - + # Subset sites to only within shapefile and get IDs clipped_sites_sf <- bbox_sites_sf[aoi_sf, ] - + clipped_site_ids <- clipped_sites_sf$MonitoringLocationIdentifier - + # Check if any sites are within the clip if ((length(clipped_site_ids) > 0) == FALSE) { stop("No monitoring sites were returned within your area of interest (no data available).") } - + record_count <- bbox_avail %>% dplyr::filter(MonitoringLocationIdentifier %in% clipped_site_ids) %>% dplyr::pull(resultCount) %>% sum() - + # Should we proceed with downloads? If ask == TRUE then ask the user. if (ask == TRUE) { user_decision <- ask_user(n_records = record_count) - + # Act on input if (user_decision == "yes") { print("Proceeding with download.") @@ -506,10 +507,10 @@ TADA_DataRetrieval <- function(startDate = "null", stop("Cancelled by user.", call. = FALSE) } } - + # Continue now with site count site_count <- length(clipped_site_ids) - + # Check for either more than 300 sites or more records than max_recs. # If either is true then we'll approach the pull as a "big data" pull if (site_count > 300 | record_count > maxrecs) { @@ -519,7 +520,7 @@ TADA_DataRetrieval <- function(startDate = "null", "If your AOI is a county, state, country, or HUC boundary it would be more efficient to provide a code instead of an sf object." 
) ) - + # Use helper function to download large data volume results.DR <- withCallingHandlers( TADA_BigDataHelper( @@ -532,11 +533,11 @@ TADA_DataRetrieval <- function(startDate = "null", ), message = function(m) message(m$message) ) - - + + rm(bbox_avail, bbox_sites_sf) gc() - + # Check if any results were returned if ((nrow(results.DR) > 0) == FALSE) { print( @@ -554,7 +555,7 @@ TADA_DataRetrieval <- function(startDate = "null", sites.DR <- clipped_sites_sf %>% dplyr::as_tibble() %>% dplyr::select(-geometry) - + # Get project metadata quiet_projects.DR <- quiet_readWQPdata( siteid = clipped_site_ids, @@ -562,21 +563,21 @@ TADA_DataRetrieval <- function(startDate = "null", ignore_attributes = TRUE, service = "Project" ) - + if (is.null(quiet_projects.DR$result)) { stop_message <- quiet_projects.DR$messages %>% grep(pattern = "failed|HTTP", x = ., ignore.case = FALSE, value = TRUE) %>% paste("\n", ., collapse = "") %>% paste("The WQP request returned a NULL with the following message(s): \n", - ., - collapse = "\n" + ., + collapse = "\n" ) - + stop(stop_message) } - + projects.DR <- quiet_projects.DR$result - + # Join results, sites, projects TADAprofile <- TADA_JoinWQPProfiles( FullPhysChem = results.DR, @@ -585,25 +586,25 @@ TADA_DataRetrieval <- function(startDate = "null", ) %>% dplyr::mutate( dplyr::across(tidyselect::everything(), as.character) ) - + # run TADA_AutoClean function if (applyautoclean == TRUE) { print("Data successfully downloaded. Running TADA_AutoClean function.") - + TADAprofile.clean <- TADA_AutoClean(TADAprofile) } else { TADAprofile.clean <- TADAprofile } } - + return(TADAprofile.clean) - + # Doesn't meet "big data" threshold: } else { # Retrieve all 3 profiles print("Downloading WQP query results. 
This may take some time depending upon the query size.") print(WQPquery) - + # Get results results.DR <- suppressMessages( dataRetrieval::readWQPdata( @@ -613,7 +614,7 @@ TADA_DataRetrieval <- function(startDate = "null", ignore_attributes = TRUE ) ) - + # Check if any results were returned if ((nrow(results.DR) > 0) == FALSE) { paste0( @@ -628,7 +629,7 @@ TADA_DataRetrieval <- function(startDate = "null", sites.DR <- clipped_sites_sf %>% dplyr::as_tibble() %>% dplyr::select(-geometry) - + # Get project metadata quiet_projects.DR <- quiet_readWQPdata( siteid = clipped_site_ids, @@ -636,21 +637,21 @@ TADA_DataRetrieval <- function(startDate = "null", ignore_attributes = TRUE, service = "Project" ) - + if (is.null(quiet_projects.DR$result)) { stop_message <- quiet_projects.DR$messages %>% grep(pattern = "failed|HTTP", x = ., ignore.case = FALSE, value = TRUE) %>% paste("\n", ., collapse = "") %>% paste("The WQP request returned a NULL with the following message(s): \n", - ., - collapse = "\n" + ., + collapse = "\n" ) - + stop(stop_message) } - + projects.DR <- quiet_projects.DR$result - + # Join results, sites, projects TADAprofile <- TADA_JoinWQPProfiles( FullPhysChem = results.DR, @@ -659,25 +660,25 @@ TADA_DataRetrieval <- function(startDate = "null", ) %>% dplyr::mutate( dplyr::across(tidyselect::everything(), as.character) ) - + # Run TADA_AutoClean function if (applyautoclean == TRUE) { print("Data successfully downloaded. 
Running TADA_AutoClean function.") - + TADAprofile.clean <- TADA_AutoClean(TADAprofile) } else { TADAprofile.clean <- TADAprofile } } - + return(TADAprofile.clean) } - + # If no sf object provided: } else { # Set query parameters WQPquery <- list() - + if (!"null" %in% statecode) { load(system.file("extdata", "statecodes_df.Rdata", package = "EPATADA")) statecode <- as.character(statecode) @@ -690,13 +691,13 @@ TADA_DataRetrieval <- function(startDate = "null", WQPquery <- c(WQPquery, statecode = list(statecd)) } } - + if (length(huc) > 1) { WQPquery <- c(WQPquery, huc = list(huc)) } else if (huc != "null") { WQPquery <- c(WQPquery, huc = huc) } - + if (length(startDate) > 1) { if (is.na(suppressWarnings(lubridate::parse_date_time(startDate[1], orders = "ymd")))) { stop("Incorrect date format. Please use the format YYYY-MM-DD.") @@ -708,67 +709,67 @@ TADA_DataRetrieval <- function(startDate = "null", } WQPquery <- c(WQPquery, startDate = startDate) } - + if (length(countrycode) > 1) { WQPquery <- c(WQPquery, countrycode = list(countrycode)) } else if (countrycode != "null") { WQPquery <- c(WQPquery, countrycode = countrycode) } - + if (length(countycode) > 1) { WQPquery <- c(WQPquery, countycode = list(countycode)) } else if (countycode != "null") { WQPquery <- c(WQPquery, countycode = countycode) } - + if (length(siteid) > 1) { WQPquery <- c(WQPquery, siteid = list(siteid)) } else if (siteid != "null") { WQPquery <- c(WQPquery, siteid = siteid) } - + if (length(siteType) > 1) { WQPquery <- c(WQPquery, siteType = list(siteType)) } else if (siteType != "null") { WQPquery <- c(WQPquery, siteType = siteType) } - + if (length(characteristicName) > 1) { WQPquery <- c(WQPquery, characteristicName = list(characteristicName)) } else if (characteristicName != "null") { WQPquery <- c(WQPquery, characteristicName = characteristicName) } - + if (length(characteristicType) > 1) { WQPquery <- c(WQPquery, characteristicType = list(characteristicType)) } else if 
(characteristicType != "null") { WQPquery <- c(WQPquery, characteristicType = characteristicType) } - + if (length(sampleMedia) > 1) { WQPquery <- c(WQPquery, sampleMedia = list(sampleMedia)) } else if (sampleMedia != "null") { WQPquery <- c(WQPquery, sampleMedia = sampleMedia) } - + if (length(project) > 1) { WQPquery <- c(WQPquery, project = list(project)) } else if (project != "null") { WQPquery <- c(WQPquery, project = project) } - + if (length(providers) > 1) { WQPquery <- c(WQPquery, providers = list(providers)) } else if (providers != "null") { WQPquery <- c(WQPquery, providers = providers) } - + if (length(organization) > 1) { WQPquery <- c(WQPquery, organization = list(organization)) } else if (organization != "null") { WQPquery <- c(WQPquery, organization = organization) } - + if (length(endDate) > 1) { if (is.na(suppressWarnings(lubridate::parse_date_time(endDate[1], orders = "ymd")))) { stop("Incorrect date format. Please use the format YYYY-MM-DD.") @@ -780,37 +781,37 @@ TADA_DataRetrieval <- function(startDate = "null", } WQPquery <- c(WQPquery, endDate = endDate) } - + # Query info on available data message("Checking what data is available. This may take a moment.") - + # Don't want to print every message that's returned by WQP quiet_query_avail <- quiet_whatWQPdata(WQPquery) - + if (is.null(quiet_query_avail$result)) { stop_message <- quiet_query_avail$messages %>% grep(pattern = "failed|HTTP", x = ., ignore.case = FALSE, value = TRUE) %>% paste("\n", ., collapse = "") %>% paste("The WQP request returned a NULL with the following message(s): \n", - ., - collapse = "\n" + ., + collapse = "\n" ) - + stop(stop_message) } - + query_avail <- quiet_query_avail$result - + site_count <- length(query_avail$MonitoringLocationIdentifier) - + record_count <- query_avail %>% dplyr::pull(resultCount) %>% sum() - + # Should we proceed with downloads? If ask == TRUE then ask the user. 
if (ask == TRUE) { user_decision <- ask_user(n_records = record_count) - + # Act on input if (user_decision == "yes") { print("Proceeding with download.") @@ -818,14 +819,14 @@ TADA_DataRetrieval <- function(startDate = "null", stop("Cancelled by user.", call. = FALSE) } } - + # Check for either more than 300 sites or more records than max_recs. # If either is true then we'll approach the pull as a "big data" pull if (site_count > 300 | record_count > maxrecs) { message( "The number of sites and/or records matched by the query terms is large, so the download may take some time." ) - + # Use helper function to download large data volume results.DR <- suppressMessages( TADA_BigDataHelper( @@ -836,27 +837,27 @@ TADA_DataRetrieval <- function(startDate = "null", maxsites = 300 ) ) - + rm(query_avail) gc() - + # Get site metadata quiet_sites.DR <- quiet_whatWQPsites(siteid = unique(results.DR$MonitoringLocationIdentifier)) - + if (is.null(quiet_sites.DR$result)) { stop_message <- quiet_sites.DR$messages %>% grep(pattern = "failed|HTTP", x = ., ignore.case = FALSE, value = TRUE) %>% paste("\n", ., collapse = "") %>% paste("The WQP request returned a NULL with the following message(s): \n", - ., - collapse = "\n" + ., + collapse = "\n" ) - + stop(stop_message) } - + sites.DR <- quiet_sites.DR$result - + # Get project metadata quiet_projects.DR <- quiet_readWQPdata( siteid = unique(results.DR$MonitoringLocationIdentifier), @@ -864,21 +865,21 @@ TADA_DataRetrieval <- function(startDate = "null", ignore_attributes = TRUE, service = "Project" ) - + if (is.null(quiet_projects.DR$result)) { stop_message <- quiet_projects.DR$messages %>% grep(pattern = "failed|HTTP", x = ., ignore.case = FALSE, value = TRUE) %>% paste("\n", ., collapse = "") %>% paste("The WQP request returned a NULL with the following message(s): \n", - ., - collapse = "\n" + ., + collapse = "\n" ) - + stop(stop_message) } - + projects.DR <- quiet_projects.DR$result - + # Join results, sites, projects 
TADAprofile <- TADA_JoinWQPProfiles( FullPhysChem = results.DR, @@ -887,18 +888,18 @@ TADA_DataRetrieval <- function(startDate = "null", ) %>% dplyr::mutate( dplyr::across(tidyselect::everything(), as.character) ) - + # Run TADA_AutoClean function if (applyautoclean == TRUE) { print("Data successfully downloaded. Running TADA_AutoClean function.") - + TADAprofile.clean <- TADA_AutoClean(TADAprofile) } else { TADAprofile.clean <- TADAprofile } - + return(TADAprofile.clean) - + # If not a "big data" pull: } else { # Retrieve all 3 profiles @@ -906,25 +907,25 @@ TADA_DataRetrieval <- function(startDate = "null", print(WQPquery) results.DR <- suppressMessages( dataRetrieval::readWQPdata(WQPquery, - dataProfile = "resultPhysChem", - ignore_attributes = TRUE + dataProfile = "resultPhysChem", + ignore_attributes = TRUE ) ) - + # Check if any results are available if ((nrow(results.DR) > 0) == FALSE) { print("Returning empty results dataframe: Your WQP query returned no results (no data available). Try a different query. Removing some of your query filters OR broadening your search area may help.") TADAprofile.clean <- results.DR } else { sites.DR <- suppressMessages(dataRetrieval::whatWQPsites(WQPquery)) - + projects.DR <- suppressMessages( dataRetrieval::readWQPdata(WQPquery, - ignore_attributes = TRUE, - service = "Project" + ignore_attributes = TRUE, + service = "Project" ) ) - + TADAprofile <- TADA_JoinWQPProfiles( FullPhysChem = results.DR, Sites = sites.DR, @@ -932,17 +933,17 @@ TADA_DataRetrieval <- function(startDate = "null", ) %>% dplyr::mutate( dplyr::across(tidyselect::everything(), as.character) ) - + # run TADA_AutoClean function if (applyautoclean == TRUE) { print("Data successfully downloaded. 
Running TADA_AutoClean function.") - + TADAprofile.clean <- TADA_AutoClean(TADAprofile) } else { TADAprofile.clean <- TADAprofile } } - + return(TADAprofile.clean) } } @@ -997,12 +998,12 @@ TADA_TribalOptions <- function(tribal_area_type, return_sf = FALSE) { "Oklahoma Tribal Statistical Areas", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/4", "Virginia Federally Recognized Tribes", "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/5" ) - + # Confirm usable string provided if (!(tribal_area_type %in% map_service_urls$tribal_area)) { stop("tribal_area_type must match one of the six tribal spatial layer names.") } - + # Query Map Service tribal_area_sf <- dplyr::filter( map_service_urls, @@ -1012,7 +1013,7 @@ TADA_TribalOptions <- function(tribal_area_type, return_sf = FALSE) { # Return sf arcgislayers::arc_select() %>% sf::st_make_valid() - + # Convert to df if needed, export if (return_sf == FALSE) { return( @@ -1078,25 +1079,25 @@ TADA_ReadWQPWebServices <- function(webservice) { # read in csv from WQP web service if (grepl("zip=yes", webservice)) { webservice <- stringr::str_replace(webservice, "zip=yes", "zip=no") - + # download data webservice <- data.table::fread(toString(webservice)) - + # if input df was not downloaded using USGS's dataRetrieval, then the # column names will include / separators instead of . and TADA uses . # (e.g. ResultMeasure/MeasureUnitCode vs. ResultMeasure.MeasureUnitCode) colnames(webservice) <- gsub("/", ".", colnames(webservice)) - + return(webservice) } else { # download data webservice <- data.table::fread(toString(webservice)) - + # if input df was not downloaded using USGS's dataRetrieval, then the # column names will include / separators instead of . and TADA uses . # (e.g. ResultMeasure/MeasureUnitCode vs. 
ResultMeasure.MeasureUnitCode) colnames(webservice) <- gsub("/", ".", colnames(webservice)) - + return(webservice) } } @@ -1123,15 +1124,15 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi dplyr::summarise(tot_n = sum(resultCount)) %>% dplyr::filter(tot_n > 0) %>% dplyr::arrange(tot_n) - + # Sites with less than/equal to maxrecs smallsites <- tot_sites %>% dplyr::filter(tot_n <= maxrecs) # Sites with more than maxrecs bigsites <- tot_sites %>% dplyr::filter(tot_n > maxrecs) - + df_small <- data.frame() df_big <- data.frame() - + # Work with small sites first: # Build download groups. Total record count limited to value of maxrecs. # Number of sites per download group limited to 300. @@ -1142,7 +1143,7 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi threshold = maxrecs, maxgroupsize = 300 )) - + # Status update to user print( paste0( @@ -1151,9 +1152,9 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi " results by grouping them together." ) ) - + small_prog_bar <- txtProgressBar(min = 0, max = sum(smallsites$tot_n), style = 3) - + # Download the data for each group for (i in 1:max(smallsitesgrp$group)) { small_site_chunk <- subset( @@ -1170,22 +1171,22 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi ) ) %>% dplyr::mutate(dplyr::across(everything(), as.character)) - + # If data is returned, stack with what's already been retrieved if (dim(results_small)[1] > 0) { df_small <- dplyr::bind_rows(df_small, results_small) } - + # Update progress setTxtProgressBar(pb = small_prog_bar, value = nrow(df_small)) } # Close progress bar when complete close(small_prog_bar) - + rm(smallsites, smallsitesgrp) gc() } - + # Large sites (>= maxrecs) next: if (dim(bigsites)[1] > 0) { print( @@ -1195,12 +1196,12 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi " results, chunking queries by site." 
) ) - + big_prog_bar <- txtProgressBar(min = 0, max = sum(bigsites$tot_n), style = 3) - + # Unique site IDs bsitesvec <- unique(bigsites$MonitoringLocationIdentifier) - + # For each site for (i in 1:length(bsitesvec)) { # Download each site's data individually @@ -1213,7 +1214,7 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi ) ) %>% dplyr::mutate(dplyr::across(everything(), as.character)) - + if (dim(results_big)[1] > 0) { df_big <- dplyr::bind_rows(df_big, results_big) } @@ -1222,14 +1223,14 @@ TADA_BigDataHelper <- function(record_summary, WQPquery, maxrecs = 250000, maxsi } # Close progress bar when complete close(big_prog_bar) - + rm(bigsites) gc() } - - + + df_out <- dplyr::bind_rows(df_small, df_big) - + return(df_out) } @@ -1265,20 +1266,20 @@ TADA_JoinWQPProfiles <- function(FullPhysChem = "null", Sites = "null", Projects = "null") { FullPhysChem.df <- FullPhysChem - + Sites.df <- Sites - + Projects.df <- Projects - + # Join station data to full phys/chem (FullPhysChem.df) if (length(Sites.df > 1)) { if (nrow(Sites.df) > 0) { join1 <- FullPhysChem.df %>% # join stations to results dplyr::left_join(Sites.df, - by = "MonitoringLocationIdentifier", - multiple = "all", - relationship = "many-to-many" + by = "MonitoringLocationIdentifier", + multiple = "all", + relationship = "many-to-many" ) %>% # remove ".x" suffix from column names dplyr::rename_at(dplyr::vars(dplyr::ends_with(".x")), ~ stringr::str_replace(., "\\..$", "")) %>% @@ -1290,8 +1291,8 @@ TADA_JoinWQPProfiles <- function(FullPhysChem = "null", } else { join1 <- FullPhysChem.df } - - + + # Add QAPP columns from project if (length(Projects.df) > 1) { if (nrow(Projects.df) > 0) { @@ -1317,7 +1318,7 @@ TADA_JoinWQPProfiles <- function(FullPhysChem = "null", } else { join2 <- join1 } - + return(join2) } @@ -1338,7 +1339,7 @@ ask_user <- function(n_records) { "Would you like to continue with the download? 
[yes/no] ", sep = "" ) - + # Ask user if they want to continue & check for valid response while (TRUE) { user_input <- readline(prompt = user_prompt) @@ -1375,9 +1376,9 @@ make_groups <- function(x, maxrecs) { i <- i + 1 groupings <- plyr::rbind.fill(groupings, group) } - + x$group <- i - + groupings <- plyr::rbind.fill(groupings, x) } return(groupings) From fb1fc4977939a6eecf7b2a4aa3a89b17f8b5998b Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Mon, 27 Jan 2025 17:40:08 -0800 Subject: [PATCH 33/35] Update R/DataDiscoveryRetrieval.R Co-authored-by: B Steele <32140074+steeleb@users.noreply.github.com> --- R/DataDiscoveryRetrieval.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index aa93f03a..b88ccc63 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -434,7 +434,7 @@ TADA_DataRetrieval <- function(startDate = "null", # Query info on available data within the bbox # Try getting WQP info - message("Checking what data is available. This may take a moment.") + message("Checking for available data. 
This may take a moment.") # Don't want to print every message that's returned by WQP quiet_bbox_avail <- quiet_whatWQPdata( From 131073633a1a9272558e8d6742aaf1554060e200 Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Mon, 27 Jan 2025 17:41:21 -0800 Subject: [PATCH 34/35] Apply suggestions from code review Co-authored-by: B Steele <32140074+steeleb@users.noreply.github.com> --- R/DataDiscoveryRetrieval.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index b88ccc63..74401fec 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -449,6 +449,7 @@ TADA_DataRetrieval <- function(startDate = "null", paste("\n", ., collapse = "") %>% paste("The WQP request returned a NULL with the following message(s): \n", ., + "The bounding box is likely too large for this process. Reduce your area of interest and try again.", collapse = "\n" ) From 7602fac4dcb9439b308ee2cbcdeebc24469bfa20 Mon Sep 17 00:00:00 2001 From: Matt Brousil <37380883+mbrousil@users.noreply.github.com> Date: Mon, 27 Jan 2025 17:59:52 -0800 Subject: [PATCH 35/35] Update DataDiscoveryRetrieval.R --- R/DataDiscoveryRetrieval.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R index 74401fec..2b6288d9 100644 --- a/R/DataDiscoveryRetrieval.R +++ b/R/DataDiscoveryRetrieval.R @@ -449,7 +449,7 @@ TADA_DataRetrieval <- function(startDate = "null", paste("\n", ., collapse = "") %>% paste("The WQP request returned a NULL with the following message(s): \n", ., - "The bounding box is likely too large for this process. Reduce your area of interest and try again.", + "The bounding box may be too large for this process. Reduce your area of interest and try again.", collapse = "\n" )