From 8ad186ba727376e03e1d63e88dd3063323c9bf89 Mon Sep 17 00:00:00 2001 From: adelegem Date: Wed, 30 Oct 2024 13:17:35 +1100 Subject: [PATCH] optimise spectral metrics using data.table --- R/calculate_spectral_metrics.R | 224 +++++++++++++++------------------ R/create_masked_raster.R | 13 +- R/find_optimum_thresholds.R | 1 - 3 files changed, 109 insertions(+), 129 deletions(-) diff --git a/R/calculate_spectral_metrics.R b/R/calculate_spectral_metrics.R index 53b272f..c311f12 100644 --- a/R/calculate_spectral_metrics.R +++ b/R/calculate_spectral_metrics.R @@ -1,85 +1,75 @@ -#' @title Calculate spectral metrics -#' @description Calculates continuous spectral metrics (CV, SV, CHV) based on input pixel value df -#' @param df description -#' @return -#' @examples -#' @export -#' @import data.table -#' @import geometry -#' @import vegan -#' @import tidyverse - -## CV, CHV, SV metric functions appropriated from https://github.com/ALCrofts/CABO_SVH_Forest_Sites/tree/v1.0 - -# cv function with rarefaction option -calculate_cv <- function(pixel_values_df, # Dataframe with spectral reflectance values - aoi_id, # What you want to calculate spectral diversity for - wavelengths, # Cols where spectral reflectance values are - rarefaction = F, - n = NULL, # Number of random resampling events, if rarefraction = T. - min_points = NULL # minimum number of pixels (ie. the min # of pixels in any subplot) -){ - # validate inputs if rarefaction = T - if (rarefaction == T) { - if (is.null(n) || is.null(min_points)) { - stop("Please provide both 'n' and 'min_points' when rarefaction is TRUE.") - } - - # convert to datatable (more efficient performance) - setDT(spectral_df) - - # create a list to store CV values for each replication +#rarefraction cv function +calculate_cv <- function(pixel_values_df, + subplots = 'subplot_id', + wavelengths, + rarefaction = TRUE, + min_points = NULL, + n = 999) { + + # Convert the dataframe to a data.table for efficiency + setDT(pixel_values_df) + + if (rarefaction) { + # Initialize a list to store CV values for each replication cv_list <- vector("list", n) - # b) calculate CV for each resampling event + # Resample and calculate CV for each iteration for (i in seq_len(n)) { - # sample to the minimum number of points per subplot - sampled_df <- spectral_df[, .SD[sample(.N, min_points)], by = aoi_id, .SDcols = wavelengths] + # Sample to the minimum number of points per subplot + sampled_df <- pixel_values_df[, .SD[sample(.N, min_points)], by = subplots, .SDcols = wavelengths] - # calculate CV for each wavelength within each subplot - cv_data <- sampled_df[, lapply(.SD, function(x) sd(x) / abs(mean(x, na.rm = TRUE))), by = aoi_id] + # Calculate CV for each wavelength within each subplot + cv_data <- sampled_df[, lapply(.SD, function(x) sd(x) / abs(mean(x, na.rm = TRUE))), by = subplots] - # sum across wavelengths and normalize by the number of bands (ignoring NAs) + # Sum across wavelengths and normalize by the number of bands (ignoring NAs) cv_data[, CV := rowSums(.SD, na.rm = TRUE) / (length(wavelengths) - rowSums(is.na(.SD))), .SDcols = wavelengths] - # store cv values - cv_list[[i]] <- cv_data[, .(CV), by = aoi_id] + # Store CV values along with subplots + cv_list[[i]] <- cv_data[, .(subplot_id = subplots, CV), by = subplots] # Ensure subplot_id is included } - # c) Collapse list of CV data tables into a single data table and calculate average CV for each area of interest - CV <- rbindlist(cv_list)[, .(CV = mean(CV, na.rm = TRUE)), by = areas_of_interest] + # Collapse the list of CV data tables into a single data table and calculate the average CV for each subplot + cv <- rbindlist(cv_list)[, .(CV = mean(CV, na.rm = TRUE)), by = subplots] - return(CV) - } - if (rarefaction == F){ - pixel_values_df %>% - select(c({{aoi_id}}, {{wavelengths}})) %>% - group_by({{aoi_id}}) %>% - summarise_all(~sd(.)/abs(mean(.))) %>% - rowwise({{aoi_id}}) %>% - summarise(CV = sum(c_across(cols = everything()), na.rm = T) / (ncol(.) - sum(is.na(c_across(everything()))))) + } else { + # If rarefaction is FALSE, directly calculate the CV without resampling + cv <- pixel_values_df[, lapply(.SD, function(x) sd(x) / abs(mean(x, na.rm = TRUE))), by = subplots, .SDcols = wavelengths] - return(cv) - } + # Sum across wavelengths and normalize by the number of bands (ignoring NAs) + cv[, CV := rowSums(.SD, na.rm = TRUE) / (length(wavelengths) - rowSums(is.na(.SD))), .SDcols = wavelengths] + # Ensure subplot_id is included in the output + cv <- cv[, .(subplot_id = subplots, CV)] + } + + return(cv) } + + # sv function -calculate_sv <- function(pixel_values_df, aoi_id, wavelengths) { - spectral_points <- pixel_values_df %>% - group_by({{aoi_id}}) %>% - summarise(points = n()) - - sv <- pixel_values_df %>% - select(c({{wavelengths}}, {{aoi_id}})) %>% - group_by({{aoi_id}}) %>% - summarise_all(~sum((.x - mean(.x))^2)) %>% - rowwise({{aoi_id}}) %>% - summarise(SS = sum(c_across(cols = everything()))) %>% - left_join(spectral_points) %>% - summarise(SV = SS / (points - 1)) - - return(sv) +calculate_sv <- function(pixel_values_df, subplots = 'subplot_id', wavelengths) { + # Convert pixel_values_df to data.table for better performance + setDT(pixel_values_df) + + # Calculate the number of points per subplot + spectral_points <- pixel_values_df[, .(points = .N), by = subplots] + + # Calculate spectral variance (SV) + sv <- pixel_values_df[, lapply(.SD, function(x) sum((x - mean(x, na.rm = TRUE))^2)), + .SDcols = wavelengths, + by = subplots] + + # Sum across wavelengths + sv[, SS := rowSums(.SD), .SDcols = wavelengths] + + # Join with the spectral points + sv <- sv[spectral_points, on = subplots] + + # Calculate SV + sv[, SV := SS / (points - 1)] + + return(sv[, .(subplot_id = get(subplots), SV)]) } # chv function @@ -96,96 +86,82 @@ calculate_chv <- function(df, dim) { } # function to calculate chv for each subplot -calculate_chv_for_aoi <- function(df, wavelengths, dim = 3, aoi_id = 'aoi_id', rarefraction = TRUE, n = 999) { - results <- tibble(aoi_id = character(), CHV = double()) - - # Perform PCA for specified wavelengths - PCA <- df %>% - select(all_of(wavelengths)) %>% - vegan::rda(scale = FALSE) +calculate_chv_nopca <- function(df, + wavelengths, + subplots = 'subplot_id', + rarefaction = TRUE, + min_points = NULL, + n = 999) { - # Add subplot id as column to PCA df - pca_results <- data.frame(PCA$CA$u) %>% - bind_cols(aoi_id = df[[aoi_id]]) + # Convert df to data.table for better performance + setDT(df) - # Compute the minimum number of points across all subplots - min_points <- pca_results %>% - group_by(aoi_id) %>% - summarise(points = n()) %>% - summarise(min_points = min(points)) %>% - pull(min_points) + results <- data.table(subplot_id = character(), CHV_nopca = double()) # Loop through each subplot - for (aoi in unique(df[[aoi_id]])) { + for (subplot in unique(df[[subplots]])) { # Subset data for current subplot - subplot_sample <- pca_results %>% - filter(aoi_id == aoi) + subplot_sample <- df[get(subplots) == subplot, ..wavelengths] - if (rarefraction) { + if (rarefaction) { # Resample CHV n times and calculate the mean chv_values <- replicate(n, { - resampled <- aoi_sample %>% - select(-aoi_id) %>% - sample_n(min_points, replace = FALSE) + resampled <- subplot_sample[sample(.N, min_points, replace = FALSE)] + + # Convert to matrix for convex hull calculation + CHV_matrix <- as.matrix(resampled) - chv_out <- calculate_chv(resampled, dim = dim) + # Calculate CHV + chv_out <- geometry::convhulln(CHV_matrix, option = "FA") return(chv_out$vol) }) mean_chv <- mean(chv_values) } else { - # calculate CHV without resampling - chv_out <- calculate_chv(aoi_sample, dim = dim) + # Calculate CHV without resampling + CHV_matrix <- as.matrix(subplot_sample) + chv_out <- geometry::convhulln(CHV_matrix, option = "FA") mean_chv <- chv_out$vol } - # store results - results <- results %>% - add_row(aoi_id = aoi, CHV = mean_chv) + # Store results in data.table + results <- rbind(results, data.table(subplot_id = subplot, CHV_nopca = mean_chv), fill = TRUE) } return(results) } - ## FUNCTION FOR CALCULATING ALL METRICS - -calculate_spectral_metrics <- function(pixel_values_df, masked = TRUE, wavelengths) { +calculate_spectral_metrics <- function(pixel_values_df, + masked = TRUE, + wavelengths, + min_points, + n = 999, + rarefaction = TRUE) { # Add rarefaction here results <- list() - # loop through each site (represented as 'identifier' from file name) - for (site_name in unique(pixel_values_df$site_name)) { - - # filter pixel values for the current site - site_pixel_values <- pixel_values_df %>% filter(site_name == !!site_name) + for (identifier in unique(pixel_values_df$identifier)) { + site_pixel_values <- pixel_values_df %>% filter(identifier == !!identifier) - # calculate metrics (CV, SV, CHV) - cv <- calculate_cv(site_pixel_values, aoi_id, wavelengths) - sv <- calculate_sv(site_pixel_values, aoi_id, wavelengths) - chv <- calculate_chv_for_aoi(site_pixel_values, wavelengths) + # Calculate metrics, pass rarefaction where needed + cv <- calculate_cv(site_pixel_values, wavelengths = wavelengths, rarefaction = rarefaction, n = n, min_points = min_points) + sv <- calculate_sv(site_pixel_values, wavelengths = wavelengths) + chv <- calculate_chv_nopca(site_pixel_values, wavelengths, rarefaction = rarefaction, min_points = min_points) - # store results - results[[site_name]] <- list(CV = cv, SV = sv, CHV = chv, CHV_nopca = chv_nopca) + results[[identifier]] <- list(CV = cv, SV = sv, CHV = chv) } - # combine metrics into data frames - combined_cv <- bind_rows(lapply(results, function(x) x$CV), .id = 'site_name') - combined_sv <- bind_rows(lapply(results, function(x) x$SV), .id = 'site_name') - combined_chv <- bind_rows(lapply(results, function(x) x$CHV), .id = 'site_name') + combined_cv <- bind_rows(lapply(results, function(x) x$CV), .id = 'identifier') + combined_sv <- bind_rows(lapply(results, function(x) x$SV), .id = 'identifier') + combined_chv <- bind_rows(lapply(results, function(x) x$CHV), .id = 'identifier') # create a data frame for combined metrics combined_metrics <- combined_cv %>% - left_join(combined_sv, by = c("site_name", "aoi_id")) %>% - left_join(combined_chv, by = c("site_name", "aoi_id")) + left_join(combined_sv, by = c("identifier", "subplot_id")) %>% + left_join(combined_chv, by = c("identifier", "subplot_id")) - # add image_type column based on masked argument - if (masked) { - combined_metrics <- combined_metrics %>% - mutate(image_type = 'masked') - } else { - combined_metrics <- combined_metrics %>% - mutate(image_type = 'unmasked') - } + combined_metrics <- combined_metrics %>% + mutate(image_type = ifelse(masked, 'masked', 'unmasked')) return(combined_metrics) } diff --git a/R/create_masked_raster.R b/R/create_masked_raster.R index 7fdcdd8..3f10a85 100644 --- a/R/create_masked_raster.R +++ b/R/create_masked_raster.R @@ -18,8 +18,8 @@ #'NIR_band_index = 5) #' @export #' @import terra +#' @import raster #' @import tools -#' @import stringr # CREATE_MASKED_RASTER FUNCTION #input can be directory with a number of files, a single file, or string of files. @@ -51,7 +51,7 @@ create_masked_raster <- function(input, output_dir, for (file in files) { # extract the site identifier from file name - file_name <- strsplit(basename(file)) + file_name <- basename(file) if (!is.null(ndvi_threshold_df)) { @@ -60,7 +60,8 @@ create_masked_raster <- function(input, output_dir, ndvi_threshold <- ndvi_threshold_df$threshold[ndvi_threshold_df$site == site_id] - } else { + } + if (length(ndvi_threshold) == 0 ) { stop(paste("No NDVI threshold values found for file", file_name)) } @@ -71,7 +72,8 @@ create_masked_raster <- function(input, output_dir, site_id <- nir_threshold_df$site[grepl(nir_threshold_df$site, file_name)] nir_threshold <- nir_threshold_df$threshold[nir_threshold_df$site == site_id] - } else { + } + if (length(nir_threshold) == 0) { stop(paste("No NIR threshold values found for file", file_name)) } @@ -99,3 +101,6 @@ create_masked_raster <- function(input, output_dir, } } +library(tools) +create_masked_raster('C:/Users/adele/Documents/fg_spectral_diversity/data_out/combined_rasters/2024/NSABHC0009_combined_image.tif', 'test', ndvi_threshold = 0.02, nir_threshold = 0.04) + diff --git a/R/find_optimum_thresholds.R b/R/find_optimum_thresholds.R index 335b7ed..c6ec985 100644 --- a/R/find_optimum_thresholds.R +++ b/R/find_optimum_thresholds.R @@ -12,7 +12,6 @@ #' @import pROC #' @import dplyr - # add a for loop so nir and ndvi ground truth values can be given in the same df instead of seperately :) # FIND OPTIMUM THRESHOLDS FUNCTION