From 8ad186ba727376e03e1d63e88dd3063323c9bf89 Mon Sep 17 00:00:00 2001
From: adelegem <adelegemmell@hotmail.com>
Date: Wed, 30 Oct 2024 13:17:35 +1100
Subject: [PATCH] optimise spectral metrics using data.table

---
 R/calculate_spectral_metrics.R | 224 +++++++++++++++------------------
 R/create_masked_raster.R       |  13 +-
 R/find_optimum_thresholds.R    |   1 -
 3 files changed, 109 insertions(+), 129 deletions(-)

diff --git a/R/calculate_spectral_metrics.R b/R/calculate_spectral_metrics.R
index 53b272f..c311f12 100644
--- a/R/calculate_spectral_metrics.R
+++ b/R/calculate_spectral_metrics.R
@@ -1,85 +1,75 @@
-#' @title Calculate spectral metrics
-#' @description Calculates continuous spectral metrics (CV, SV, CHV) based on input pixel value df
-#' @param df description
-#' @return
-#' @examples
-#' @export
-#' @import data.table
-#' @import geometry
-#' @import vegan
-#' @import tidyverse
-
-## CV, CHV, SV metric functions appropriated from https://github.com/ALCrofts/CABO_SVH_Forest_Sites/tree/v1.0
-
-# cv function with rarefaction option
-calculate_cv <- function(pixel_values_df, # Dataframe with spectral reflectance values
-                  aoi_id, # What you want to calculate spectral diversity for
-                  wavelengths, # Cols where spectral reflectance values are
-                  rarefaction = F,
-                  n = NULL, # Number of random resampling events, if rarefraction = T.
-                  min_points = NULL # minimum number of pixels (ie. the min # of pixels in any subplot)
-){
-  # validate inputs if rarefaction = T
-  if (rarefaction == T) {
-    if (is.null(n) || is.null(min_points)) {
-      stop("Please provide both 'n' and 'min_points' when rarefaction is TRUE.")
-    }
-
-    # convert to datatable (more efficient performance)
-    setDT(spectral_df)
-
-    # create a list to store CV values for each replication
+#rarefraction cv function
+calculate_cv <- function(pixel_values_df,
+                         subplots = 'subplot_id',
+                         wavelengths,
+                         rarefaction = TRUE,
+                         min_points = NULL,
+                         n = 999) {
+
+  # Convert the dataframe to a data.table for efficiency
+  setDT(pixel_values_df)
+
+  if (rarefaction) {
+    # Initialize a list to store CV values for each replication
     cv_list <- vector("list", n)
 
-    # b) calculate CV for each resampling event
+    # Resample and calculate CV for each iteration
     for (i in seq_len(n)) {
-      # sample to the minimum number of points per subplot
-      sampled_df <- spectral_df[, .SD[sample(.N, min_points)], by = aoi_id, .SDcols = wavelengths]
+      # Sample to the minimum number of points per subplot
+      sampled_df <- pixel_values_df[, .SD[sample(.N, min_points)], by = subplots, .SDcols = wavelengths]
 
-      # calculate CV for each wavelength within each subplot
-      cv_data <- sampled_df[, lapply(.SD, function(x) sd(x) / abs(mean(x, na.rm = TRUE))), by = aoi_id]
+      # Calculate CV for each wavelength within each subplot
+      cv_data <- sampled_df[, lapply(.SD, function(x) sd(x) / abs(mean(x, na.rm = TRUE))), by = subplots]
 
-      # sum across wavelengths and normalize by the number of bands (ignoring NAs)
+      # Sum across wavelengths and normalize by the number of bands (ignoring NAs)
       cv_data[, CV := rowSums(.SD, na.rm = TRUE) / (length(wavelengths) - rowSums(is.na(.SD))), .SDcols = wavelengths]
 
-      # store cv values
-      cv_list[[i]] <- cv_data[, .(CV), by = aoi_id]
+      # Store CV values along with subplots
+      cv_list[[i]] <- cv_data[, .(subplot_id = subplots, CV), by = subplots]  # Ensure subplot_id is included
     }
 
-    # c) Collapse list of CV data tables into a single data table and calculate average CV for each area of interest
-    CV <- rbindlist(cv_list)[, .(CV = mean(CV, na.rm = TRUE)), by = areas_of_interest]
+    # Collapse the list of CV data tables into a single data table and calculate the average CV for each subplot
+    cv <- rbindlist(cv_list)[, .(CV = mean(CV, na.rm = TRUE)), by = subplots]
 
-    return(CV)
-  }
- if (rarefaction == F){
-   pixel_values_df %>%
-     select(c({{aoi_id}}, {{wavelengths}})) %>%
-     group_by({{aoi_id}}) %>%
-     summarise_all(~sd(.)/abs(mean(.))) %>%
-     rowwise({{aoi_id}}) %>%
-     summarise(CV = sum(c_across(cols = everything()), na.rm = T) / (ncol(.) - sum(is.na(c_across(everything())))))
+  } else {
+    # If rarefaction is FALSE, directly calculate the CV without resampling
+    cv <- pixel_values_df[, lapply(.SD, function(x) sd(x) / abs(mean(x, na.rm = TRUE))), by = subplots, .SDcols = wavelengths]
 
-   return(cv)
- }
+    # Sum across wavelengths and normalize by the number of bands (ignoring NAs)
+    cv[, CV := rowSums(.SD, na.rm = TRUE) / (length(wavelengths) - rowSums(is.na(.SD))), .SDcols = wavelengths]
 
+    # Ensure subplot_id is included in the output
+    cv <- cv[, .(subplot_id = subplots, CV)]
+  }
+
+  return(cv)
 }
 
+
+
 # sv function
-calculate_sv <- function(pixel_values_df, aoi_id, wavelengths) {
-  spectral_points <- pixel_values_df %>%
-    group_by({{aoi_id}}) %>%
-    summarise(points = n())
-
-  sv <- pixel_values_df %>%
-    select(c({{wavelengths}}, {{aoi_id}})) %>%
-    group_by({{aoi_id}}) %>%
-    summarise_all(~sum((.x - mean(.x))^2)) %>%
-    rowwise({{aoi_id}}) %>%
-    summarise(SS = sum(c_across(cols = everything()))) %>%
-    left_join(spectral_points) %>%
-    summarise(SV = SS / (points - 1))
-
-  return(sv)
+calculate_sv <- function(pixel_values_df, subplots = 'subplot_id', wavelengths) {
+  # Convert pixel_values_df to data.table for better performance
+  setDT(pixel_values_df)
+
+  # Calculate the number of points per subplot
+  spectral_points <- pixel_values_df[, .(points = .N), by = subplots]
+
+  # Calculate spectral variance (SV)
+  sv <- pixel_values_df[, lapply(.SD, function(x) sum((x - mean(x, na.rm = TRUE))^2)),
+                        .SDcols = wavelengths,
+                        by = subplots]
+
+  # Sum across wavelengths
+  sv[, SS := rowSums(.SD), .SDcols = wavelengths]
+
+  # Join with the spectral points
+  sv <- sv[spectral_points, on = subplots]
+
+  # Calculate SV
+  sv[, SV := SS / (points - 1)]
+
+  return(sv[, .(subplot_id = get(subplots), SV)])
 }
 
 # chv function
@@ -96,96 +86,82 @@ calculate_chv <- function(df, dim) {
 }
 
 # function to calculate chv for each subplot
-calculate_chv_for_aoi <- function(df, wavelengths, dim = 3, aoi_id = 'aoi_id', rarefraction = TRUE, n = 999) {
-  results <- tibble(aoi_id = character(), CHV = double())
-
-  # Perform PCA for specified wavelengths
-  PCA <- df %>%
-    select(all_of(wavelengths)) %>%
-    vegan::rda(scale = FALSE)
+calculate_chv_nopca <- function(df,
+                                wavelengths,
+                                subplots = 'subplot_id',
+                                rarefaction = TRUE,
+                                min_points = NULL,
+                                n = 999) {
 
-  # Add subplot id as column to PCA df
-  pca_results <- data.frame(PCA$CA$u) %>%
-    bind_cols(aoi_id = df[[aoi_id]])
+  # Convert df to data.table for better performance
+  setDT(df)
 
-  # Compute the minimum number of points across all subplots
-  min_points <- pca_results %>%
-    group_by(aoi_id) %>%
-    summarise(points = n()) %>%
-    summarise(min_points = min(points)) %>%
-    pull(min_points)
+  results <- data.table(subplot_id = character(), CHV_nopca = double())
 
   # Loop through each subplot
-  for (aoi in unique(df[[aoi_id]])) {
+  for (subplot in unique(df[[subplots]])) {
     # Subset data for current subplot
-    subplot_sample <- pca_results %>%
-      filter(aoi_id == aoi)
+    subplot_sample <- df[get(subplots) == subplot, ..wavelengths]
 
-    if (rarefraction) {
+    if (rarefaction) {
       # Resample CHV n times and calculate the mean
       chv_values <- replicate(n, {
-        resampled <- aoi_sample %>%
-          select(-aoi_id) %>%
-          sample_n(min_points, replace = FALSE)
+        resampled <- subplot_sample[sample(.N, min_points, replace = FALSE)]
+
+        # Convert to matrix for convex hull calculation
+        CHV_matrix <- as.matrix(resampled)
 
-        chv_out <- calculate_chv(resampled, dim = dim)
+        # Calculate CHV
+        chv_out <- geometry::convhulln(CHV_matrix, option = "FA")
         return(chv_out$vol)
       })
 
       mean_chv <- mean(chv_values)
     } else {
-      # calculate CHV without resampling
-      chv_out <- calculate_chv(aoi_sample, dim = dim)
+      # Calculate CHV without resampling
+      CHV_matrix <- as.matrix(subplot_sample)
+      chv_out <- geometry::convhulln(CHV_matrix, option = "FA")
       mean_chv <- chv_out$vol
     }
 
-    # store results
-    results <- results %>%
-      add_row(aoi_id = aoi, CHV = mean_chv)
+    # Store results in data.table
+    results <- rbind(results, data.table(subplot_id = subplot, CHV_nopca = mean_chv), fill = TRUE)
   }
 
   return(results)
 }
 
-
 ## FUNCTION FOR CALCULATING ALL METRICS
-
-calculate_spectral_metrics <- function(pixel_values_df, masked = TRUE, wavelengths) {
+calculate_spectral_metrics <- function(pixel_values_df,
+                                       masked = TRUE,
+                                       wavelengths,
+                                       min_points,
+                                       n = 999,
+                                       rarefaction = TRUE) {  # Add rarefaction here
   results <- list()
 
-  # loop through each site (represented as 'identifier' from file name)
-  for (site_name in unique(pixel_values_df$site_name)) {
-
-    # filter pixel values for the current site
-    site_pixel_values <- pixel_values_df %>% filter(site_name == !!site_name)
+  for (identifier in unique(pixel_values_df$identifier)) {
+    site_pixel_values <- pixel_values_df %>% filter(identifier == !!identifier)
 
-    # calculate metrics (CV, SV, CHV)
-    cv <- calculate_cv(site_pixel_values, aoi_id, wavelengths)
-    sv <- calculate_sv(site_pixel_values, aoi_id, wavelengths)
-    chv <- calculate_chv_for_aoi(site_pixel_values, wavelengths)
+    # Calculate metrics, pass rarefaction where needed
+    cv <- calculate_cv(site_pixel_values, wavelengths = wavelengths, rarefaction = rarefaction, n = n, min_points = min_points)
+    sv <- calculate_sv(site_pixel_values, wavelengths = wavelengths)
+    chv <- calculate_chv_nopca(site_pixel_values, wavelengths, rarefaction = rarefaction, min_points = min_points)
 
-    # store results
-    results[[site_name]] <- list(CV = cv, SV = sv, CHV = chv, CHV_nopca = chv_nopca)
+    results[[identifier]] <- list(CV = cv, SV = sv, CHV = chv)
   }
 
-  # combine metrics into data frames
-  combined_cv <- bind_rows(lapply(results, function(x) x$CV), .id = 'site_name')
-  combined_sv <- bind_rows(lapply(results, function(x) x$SV), .id = 'site_name')
-  combined_chv <- bind_rows(lapply(results, function(x) x$CHV), .id = 'site_name')
+  combined_cv <- bind_rows(lapply(results, function(x) x$CV), .id = 'identifier')
+  combined_sv <- bind_rows(lapply(results, function(x) x$SV), .id = 'identifier')
+  combined_chv <- bind_rows(lapply(results, function(x) x$CHV), .id = 'identifier')
 
   # create a data frame for combined metrics
   combined_metrics <- combined_cv %>%
-    left_join(combined_sv, by = c("site_name", "aoi_id")) %>%
-    left_join(combined_chv, by = c("site_name", "aoi_id"))
+    left_join(combined_sv, by = c("identifier", "subplot_id")) %>%
+    left_join(combined_chv, by = c("identifier", "subplot_id"))
 
-  # add image_type column based on masked argument
-  if (masked) {
-    combined_metrics <- combined_metrics %>%
-      mutate(image_type = 'masked')
-  } else {
-    combined_metrics <- combined_metrics %>%
-      mutate(image_type = 'unmasked')
-  }
+  combined_metrics <- combined_metrics %>%
+    mutate(image_type = ifelse(masked, 'masked', 'unmasked'))
 
   return(combined_metrics)
 }
diff --git a/R/create_masked_raster.R b/R/create_masked_raster.R
index 7fdcdd8..3f10a85 100644
--- a/R/create_masked_raster.R
+++ b/R/create_masked_raster.R
@@ -18,8 +18,8 @@
 #'NIR_band_index = 5)
 #' @export
 #' @import terra
+#' @import raster
 #' @import tools
-#' @import stringr
 
 # CREATE_MASKED_RASTER FUNCTION
 #input can be directory with a number of files, a single file, or string of files.
@@ -51,7 +51,7 @@ create_masked_raster <- function(input, output_dir,
 
   for (file in files) {
     # extract the site identifier from file name
-    file_name <- strsplit(basename(file))
+    file_name <- basename(file)
 
     if (!is.null(ndvi_threshold_df)) {
 
@@ -60,7 +60,8 @@ create_masked_raster <- function(input, output_dir,
 
 
         ndvi_threshold <- ndvi_threshold_df$threshold[ndvi_threshold_df$site == site_id]
-      } else {
+      }
+    if (length(ndvi_threshold) == 0 ) {
         stop(paste("No NDVI threshold values found for file", file_name))
     }
 
@@ -71,7 +72,8 @@ create_masked_raster <- function(input, output_dir,
       site_id <- nir_threshold_df$site[grepl(nir_threshold_df$site, file_name)]
 
         nir_threshold <- nir_threshold_df$threshold[nir_threshold_df$site == site_id]
-      } else {
+      }
+    if (length(nir_threshold) == 0) {
         stop(paste("No NIR threshold values found for file", file_name))
     }
 
@@ -99,3 +101,6 @@ create_masked_raster <- function(input, output_dir,
   }
 }
 
+library(tools)
+create_masked_raster('C:/Users/adele/Documents/fg_spectral_diversity/data_out/combined_rasters/2024/NSABHC0009_combined_image.tif', 'test', ndvi_threshold = 0.02, nir_threshold = 0.04)
+
diff --git a/R/find_optimum_thresholds.R b/R/find_optimum_thresholds.R
index 335b7ed..c6ec985 100644
--- a/R/find_optimum_thresholds.R
+++ b/R/find_optimum_thresholds.R
@@ -12,7 +12,6 @@
 #' @import pROC
 #' @import dplyr
 
-
 # add a for loop so nir and ndvi ground truth values can be given in the same df instead of seperately :)
 # FIND OPTIMUM THRESHOLDS FUNCTION