#######################################################################
# Script accompanying Hoylman et al., 2022, Nat. Comm.
# Drought assessment has been outpaced by climate change:
# empirical arguments for a paradigm shift
# https://doi.org/10.1038/s41467-022-30316-5
# Author: Dr. Zachary Hoylman
# Contact: [email protected]
#######################################################################
# This script computes the drought metric bias analysis.
# NOTE: This script is relatively resource intensive. For reference,
# the compute environment used in this analysis had 32 cores and 256 GB RAM.
# This analysis is based on a moving window approach.
# The moving window is 30 years in length and only uses data from the past
# with respect to the day of interest. For example, if the analysis is running for
# August 1, 2015, data will only be evaluated for 2015 and before.
# Data from the future should not be used, as practitioners do not have
# information from the future to assess today.
# This analysis computes SPI for "valid" GHCN sites
# as defined in ~/drought-year-sensitivity/R/valid_stations_precip.R
# for the longest period of record possible and for
# the previous 30 year moving window for years with complete data.
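# Illustration only (hypothetical dates, not executed as part of the analysis):
# for a day of interest of August 1, 2015 and a 30 day time scale, the
# accumulation window and the two reference periods would be roughly:
if(FALSE){
  accumulation_window = seq(as.Date('2015-07-03'), as.Date('2015-08-01'), by = 'day') # 30 days ending on the day of interest
  contemporary_years = 1986:2015 # trailing 30 year window ("contemporary" climatology)
  # historical climatology = every complete year in the station record up to and including 2015
}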
#################################################
########### Drought Metric Bias (SPI) ###########
#################################################
library(rnoaa)
library(tidyverse)
library(lubridate)
library(magrittr)
library(lmomco)
library(doParallel)
library(ggplot2)
library(foreach)
library(sf)
library(automap)
library(gstat)
library(spdplyr)
options(dplyr.summarise.inform = FALSE)
# define base parameters
# ID to define time scale, months of interest and minimum
# number of records, corresponding to "complete data".
# time_scale_id below is the only parameter that needs to be changed
# to run different timescales: 1 = 30 day, 2 = 60 day, 3 = 90 day.
time_scale_id = 1
time_scale = list(30,60,90)
months_of_interest = list(c(5,6,7,8),
c(4,5,6,7,8),
c(3,4,5,6,7,8))
n_minimum = list(123,153,184)
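# For reference, with time_scale_id = 1 the lists above resolve to a 30 day
# accumulation, May - August data, and 123 required observations per "complete"
# summer year (123 days = May 1 - Aug 31; likewise 153 = Apr 1 - Aug 31 for the
# 60 day scale and 184 = Mar 1 - Aug 31 for the 90 day scale).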
# define a handy special operator: "not in"
`%notin%` = Negate(`%in%`)
# import states to filter and for plotting
states = st_read('~/drought-year-sensitivity/data/shp/conus_states.shp') %>%
st_geometry()
#read in dataframe of valid stations
valid_stations = readRDS('~/drought-year-sensitivity/data/valid_stations_70year_summer_baseline.RDS')
# function to compute SPI
source('~/drought-year-sensitivity/R/funs/gamma_fit_spi.R')
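# gamma_fit_spi() itself lives in the sourced file above. As a rough, hypothetical
# sketch only (simplified, e.g. it ignores any zero-precipitation handling the real
# function may perform), the general approach is an L-moment gamma fit followed by
# a normal-quantile transform:
if(FALSE){
  gamma_fit_spi_sketch = function(x, output = 'SPI'){
    x = x[!is.na(x)]
    #fit gamma distribution parameters via L-moments (lmomco)
    para = lmomco::pargam(lmomco::lmoms(x))
    if(output == 'params') return(para)
    #cumulative probabilities under the fitted gamma, mapped to standard normal deviates (SPI)
    qnorm(lmomco::cdfgam(x, para))
  }
}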
# wrapper function for gamma_fit_spi that processes precip data and
# computes SPI for different time periods. The principal purpose
# of this function is to properly subset the record (past data only),
# accumulate precipitation over the time scale, and compute SPI for both
# the full period of record and the trailing 30 year window.
daily_spi = function(data, time_scale, index_of_interest){
output.df = tryCatch({
#define some date based variables
data$day = yday(data$time)
data$mday = day(data$time)
data$year = year(data$time)
data$month = month(data$time)
#define some meta data on the index of interest
max_year = data$year[index_of_interest]
mday_of_interest = data$mday[index_of_interest]
month_of_interest = data$month[index_of_interest]
#filter data to be no newer than the index of interest.
#this is important to emulate practical constraints on
#drought monitoring: it is impossible to use future
#data to compute the probability distribution in practice. That said,
#it should be noted that it is common to
#back calculate drought metrics using reference periods that
#include data that is in the future with respect to the time
#period of interest. For example, SPI for 1/1/2015 could be computed
#using the 1991 - 2020 reference period.
precip_data = data %>%
filter(year <= max_year)
#calculate index vectors defining the breaks used in the summation procedure.
#the important step here is to compute the indices for the start
#and end of each summation period.
first_date_breaks = which(precip_data$mday == mday_of_interest & precip_data$month == month_of_interest)
second_date_breaks = first_date_breaks-(time_scale-1)
#if there are non-positive indices, remove those years (incomplete accumulation window early in the record)
pos_index = which(second_date_breaks > 0)
first_date_breaks = first_date_breaks[c(pos_index)]
second_date_breaks = second_date_breaks[c(pos_index)]
#create slice vectors and group by vectors, this will be used for the
#summation procedure below. importantly, this is used instead of year
# identifiers (or other date based identifiers) because summation periods
#can span multiple years (not in this case but indeed in other cases).
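# Toy illustration of the loop below: with time_scale = 3 and
# first_date_breaks = c(5, 10), second_date_breaks = c(3, 8), so
# slice_vec = c(3,4,5, 8,9,10) and group_by_vec = c(1,1,1, 2,2,2),
# i.e. each group of rows gets summed into one accumulation value.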
for(j in 1:length(first_date_breaks)){
if(j == 1){
slice_vec = seq(second_date_breaks[j],first_date_breaks[j], by = 1)
group_by_vec = rep(j,(first_date_breaks[j] - second_date_breaks[j]+1))
}
else{
slice_vec = append(slice_vec, seq(second_date_breaks[j],first_date_breaks[j], by = 1))
group_by_vec = append(group_by_vec, rep(j,(first_date_breaks[j] - second_date_breaks[j]+1)))
}
}
#perform the summation procedure. this has a few steps.
data_time_filter = precip_data %>%
#slice data for appropriate periods
slice(slice_vec) %>%
#add the group_by_vec(tor) to the tibble
tibble::add_column(group_by_vec = group_by_vec)%>%
#group by the group_by_vec
group_by(group_by_vec)%>%
#summation of data (and convert to mm)
dplyr::summarise(sum = sum(data/10, na.rm = T),
#add the year identifier here,
#this can be done for this special
#case because summation does not
#span multiple years. the group by/
#slice by methodology generalizes to cases where it does
year = median(year))
#compute date time for day/year of interest
date_time = precip_data$time[first_date_breaks[length(first_date_breaks)]] %>% as.Date()
#30 year moving window filter to compute "contemporary" SPI values
contempary_data = data_time_filter %>%
#this is computed for the most current 30 year time period
filter(year >= max_year - 29)
#store params
params_contempary = gamma_fit_spi(contempary_data$sum, 'params')
params_historical = gamma_fit_spi(data_time_filter$sum, 'params')
#generate storage data frame
if(anyNA(params_contempary) == T | anyNA(params_historical) == T){
output.df = data.frame(time = NA,
spi_historical = NA,
shape_historical = NA,
rate_historical = NA,
n_historical = NA,
spi_contemporary = NA,
shape_contemporary = NA,
rate_contemporary = NA,
n_contemporary = NA) %>%
`rownames<-`(1)
}
if(anyNA(params_contempary) == F & anyNA(params_historical) == F){
#define output dataframe and conduct the SPI calculation. SPI is computed using the
#afore-defined gamma_fit_spi
output.df = data.frame(time = date_time,
#compute spi using l-moments and the gamma distribution
#for the historical time period (longest period of record)
spi_historical = gamma_fit_spi(data_time_filter$sum, 'SPI'),
#store parameters
shape_historical = params_historical$para[1],
rate_historical = 1/params_historical$para[2],
#report the number of years in this SPI calculation
n_historical = length(data_time_filter$sum),
#compute SPI using the most current 30 years of data
spi_contemporary = gamma_fit_spi(contempary_data$sum, 'SPI'),
#store parameters
shape_contemporary = params_contempary$para[1],
rate_contemporary = 1/params_contempary$para[2],
#report number of years in the SPI calculation
n_contemporary = length(contempary_data$sum))%>%
`rownames<-`(1)
}
#return the populated dataframe as the value of the tryCatch expression
output.df
#basic error handling. generally used if a gamma fit cannot be obtained;
#there is additional error handling in gamma_fit_spi above
}, error=function(cond) {
#if there is an error, return an empty but equivalent dataframe (placeholder)
data.frame(time = NA,
spi_historical = NA,
shape_historical = NA,
rate_historical = NA,
n_historical = NA,
spi_contemporary = NA,
shape_contemporary = NA,
rate_contemporary = NA,
n_contemporary = NA)%>%
`rownames<-`(1)
})
return(output.df)
}
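# Hypothetical usage sketch (not executed): daily_spi() expects a dataframe with
# a `time` (Date) column and a `data` (precip, tenths of mm) column, a time scale
# in days, and the row index of the day of interest. Assuming a prepared dataframe
# like the `data_filtered` object built inside the foreach loop below:
if(FALSE){
  example_spi = daily_spi(data = data_filtered,
                          time_scale = time_scale[[time_scale_id]],
                          index_of_interest = nrow(data_filtered))
}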
#generate the list to contain results
spi_comparison = list()
#rev up a cluster for parallel computing
cl = makeCluster(detectCores()-1)
#register the cluster for doPar
registerDoParallel(cl)
#time parallel run
tictoc::tic()
#process all data in parallel (31 cores ~ 2.5 hrs), parallel processing by station
spi_comparison = foreach(s = 1:length(valid_stations$id),
.packages = c('rnoaa', 'tidyverse', 'lubridate', 'magrittr',
'lmomco', 'sf')) %dopar% {
#basic error handling
tryCatch(
{
#pull in raw GHCN data
data_raw = ghcnd_search(
valid_stations$id[s],
date_min = NULL,
date_max = NULL,
var = "PRCP"
)
#compute years with complete data. the restrictions defining what counts
#as a complete year change depending on the time scale.
#for example, 30 day time scales only require complete data between May 1 - Aug. 31
#because the period of interest is June 1 - Aug. 31. However a 60 day time scale
#requires data back to April 1 and 90 day back to March 1.
complete_years = data_raw %$%
#it's a list, select the precip element
prcp %>%
#drop nas
drop_na() %>%
#add some time meta data for sorting/slicing/filtering
mutate(year = year(date),
month = month(date)) %>%
#select vars of interest
dplyr::select(date, prcp, year, month)%>%
#rename the variables for proper function ingestion
rename(time = date, data = prcp)%>%
#filter data for the time period of interest
filter(month %in% months_of_interest[[time_scale_id]]) %>%
#group by the year ID
group_by(year) %>%
#summarize the number of observations in the data
summarise(n = length(data)) %>%
#filter for complete datasets
filter(n == n_minimum[[time_scale_id]]) # defines number of obs per time scale for complete data by year
#filter it for variable, time, completeness, etc
data_filtered = data_raw %$%
#it's a list, select the precip element
prcp %>%
#add some time meta data for sorting/slicing/filtering
mutate(year = year(date),
month = month(date)) %>%
#select vars of interest
dplyr::select(date, prcp, year, month)%>%
rename(time = date, data = prcp)%>%
#filter for the time period of interest (months) and completeness
filter(month %in% months_of_interest[[time_scale_id]],
year %in% complete_years$year)
#define the indices of interest, June - August, 1991 - 2020
indicies_of_interest = which(data_filtered$month %in% c(6,7,8) & data_filtered$year > 1990 & data_filtered$year <= 2020)
# map the spi calculation through the indices of interest
temp = indicies_of_interest %>%
purrr::map(function(i){
export = daily_spi(data_filtered, time_scale[[time_scale_id]], i)
return(export)
})
#merge (rbind) the results and order them by time # bind_rows - dplyr
spi_merged = temp %>%
bind_rows() %>%
.[order(.$time),] %>%
as_tibble() %>%
filter_all(any_vars(!is.na(.)))
},
#basic error handling: return an equivalent placeholder dataframe,
#which becomes the value of the tryCatch (and therefore of this foreach task)
error = function(e){
data.frame(time = NA,
spi_historical = NA,
n_historical = NA,
spi_contemporary = NA,
n_contemporary = NA)
})
#the value of the tryCatch above (spi_merged on success, the placeholder on error)
#is the result returned for this station
}
tictoc::toc()
#stop cluster
stopCluster(cl)
# save out big list - Define where you want this to be saved,
# I typically use a folder called "temp-drought" in my home directory,
# but this can be changed to anywhere here and the line below.
saveRDS(spi_comparison, paste0('~/temp-drought', '/spi_comparision_moving_window_with_params_30year_', time_scale[[time_scale_id]], '_days.RDS'))
#read in big list if already processed
#spi_comparison = readRDS(paste0('~/temp-drought', '/spi_comparision_moving_window_with_params_30year_', time_scale[[time_scale_id]], '_days.RDS'))
#drought breaks to compute bias based on different classes
generalized_dryness = c(Inf, 2, 1, -1, -2, -Inf) %>% rev
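# For example (illustration only), .bincode(c(-2.5, -1.5, 0, 1.5, 2.5), generalized_dryness)
# returns 1 2 3 4 5, i.e. the classes run from very dry (1) to very wet (5);
# these integer codes are relabeled inside dryness_class_bias() below.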
# define function to compute bias for different dryness classes
# which will be mapped across the list of stations output above.
# dryness classes are defined using the longest period of record SPI values
# this analysis is for Daily Values between June 1 - Aug 31
dryness_class_bias = function(x){
#dummy data frame to ensure all levels are present.
#we will bind this to the final summary data to ensure continuity
dummy_df = x[1:5,]
dummy_df[1:5,] = NA
dummy_df = dummy_df%>%
mutate(drought = as.factor(c('SPI > 2', '1 > SPI > 2', '1 > SPI > -1', '-1 > SPI > -2', '-2 > SPI')),
month = NA,
year = NA,
diff = NA)
#summarise
temp = x %>%
#define some time meta data
mutate(month = month(time),
year = year(time),
#compute the difference in historical (longest clim)
#and the contemporary data (last 30 years of data)
diff = spi_historical - spi_contemporary)%>%
#filter for time period of interest (shouldn't be necessary, but double filtering for sanity)
filter(month %in% c(6,7,8),
#filter for a 25 year minimum climatology in the contemporary data
n_contemporary >= 25,
#filter for a 70 year minimum climatology in the historical data
n_historical >= 70)%>%
#compute the grouping bins
mutate(drought = .bincode(spi_historical, generalized_dryness)) %>%
#drop any missing data
tidyr::drop_na() %>%
#revalue the data to be human interpretable
mutate(drought = drought %>% as.factor(),
drought = plyr::revalue(drought, c(`1` = '-2 > SPI',
`2` = '-1 > SPI > -2',
`3` = '1 > SPI > -1',
`4` = '1 > SPI > 2',
`5` = 'SPI > 2'),warn_missing=F)) %>%
#convert to tibble
as_tibble()%>%
#rbind our dummy dataframe to ensure all levels are present
rbind(., dummy_df) %>%
#define the group_by structure by drought classes, don't drop missing classes
group_by(drought, .drop = FALSE) %>%
#compute the median difference
summarise(bias = median(diff, na.rm = T))
#reorganize the final data frame and rename data
temp = temp$bias[1:5]
export = data.frame(temp) %>% t %>% as.data.frame()
colnames(export) = c('SPI > 2', '1 > SPI > 2', '1 > SPI > -1', '-1 > SPI > -2', '-2 > SPI') %>% rev
rownames(export) = NULL
return(export)
}
#define function to compute bias for all time periods which will be mapped across list of stations output above.
bias_all = function(x){
#summarise data
temp = x %>%
#define some time meta data
mutate(month = month(time),
year = year(time),
#compute the difference in the historical and contemporary data
diff = spi_historical - spi_contemporary)%>%
#filter the data for months of interest (again, redundant)
filter(month %in% c(6,7,8),
#filter for a 25 year minimum climatology in the contemporary data
n_contemporary >= 25,
#filter for a 70 year minimum climatology in the historical data
n_historical >= 70)%>%
#drop nas
tidyr::drop_na() %>%
#convert to tibble
as_tibble()%>%
#compute the median difference in values
summarise(bias = median(diff, na.rm = T))
return(temp$bias)
}
# compute average bias for all data together (wet and dry)
# lapply will loop through the list of stations
bias = lapply(spi_comparison, bias_all) %>%
unlist()
# compute bias for all dryness classes broken out
# lapply will loop through the list of stations
dryness_class = lapply(spi_comparison, dryness_class_bias) %>%
# merge together and convert to tibble
data.table::rbindlist(.) %>%
as_tibble()
#merge into single dataframe that summarizes all results
valid_stations_filtered = valid_stations %>%
mutate(`Average Bias` = bias,
`-2 > SPI` = dryness_class$`-2 > SPI`,
`-1 > SPI > -2` = dryness_class$`-1 > SPI > -2`,
`1 > SPI > -1` = dryness_class$`1 > SPI > -1`,
`1 > SPI > 2` = dryness_class$`1 > SPI > 2`,
`SPI > 2` = dryness_class$`SPI > 2`)
# compute the percent of sites with dry bias (during dry times)
print(paste0('Negative Bias during SPI < -2: ',
(sum(valid_stations_filtered$`-2 > SPI` < 0, na.rm = T)/
sum(!is.na(valid_stations_filtered$`-2 > SPI`)))*100))
# compute the percent of sites with wet bias (during wet times)
print(paste0('Positive Bias during SPI > 2: ',
(sum(valid_stations_filtered$`SPI > 2` > 0, na.rm = T)/
sum(!is.na(valid_stations_filtered$`SPI > 2`)))*100))
#################################################
################ Plot the Results ###############
#################################################
# define plotting parameters
# color ramp
col = colorRampPalette((c('darkred', 'red', 'white', 'blue', 'darkblue')))
#classes to loop through for plotting and kriging
classes = c('Average Bias',
'-2 > SPI', '-1 > SPI > -2', '1 > SPI > -1',
'1 > SPI > 2', 'SPI > 2')
# define the lay descriptions of the classes
layman = c('Average Bias',
'Very Dry Conditions', 'Dry Conditions', 'Normal Conditions',
'Wet Conditions', 'Very Wet Conditions')
# for loop for each class
for(c in 1:length(classes)){
#define the temp stations associated with the class of interest
temp_stations = valid_stations_filtered %>%
tidyr::drop_na(classes[c]) %>%
st_as_sf
#first plot the point data itself using the color scaling defined above
pts_plot = ggplot(temp_stations)+
geom_sf(data = states)+
geom_sf(aes(color = get(classes[c])))+
scale_color_gradientn(colours = col(100), breaks = c(-0.5, 0, 0.5), limits = c(-0.5, 0.5),
labels = c('-0.5 (Dry Bias)', '0 (No Bias)', '0.5 (Wet Bias)'), name = "",
oob = scales::squish, guide = F)+
theme_bw(base_size = 15)+
ggtitle(paste0('Daily Summer Bias (',layman[c], ', ', classes[c], ')\n', time_scale[[time_scale_id]], ' Day SPI (June 1 - August 31, 1991-2020)'))+
theme(legend.position = 'none',
legend.key.width=unit(2,"cm"),
plot.title = element_text(hjust = 0.5))
#krige the results by autofitting the variogram (package automap)
#using a template raster to interpolate over (1/3 degree res)
template = raster::raster(resolution=c(1/3,1/3),
crs = sp::CRS("+init=epsg:4326")) %>%
raster::crop(., as(states, 'Spatial')) %>%
raster::rasterToPoints() %>%
as.data.frame() %>%
st_as_sf(., coords = c('x', 'y'))
st_crs(template) = st_crs(4326)
#fit the variogram
vgm = autofitVariogram(temp_stations[classes[c]] %>% data.frame() %>% .[,1] ~ 1, as(temp_stations, 'Spatial'))
#krige the variogram results
krig = krige(temp_stations[classes[c]] %>% data.frame() %>% .[,1] ~ 1, temp_stations, template, model=vgm$var_model) %>%
st_intersection(states)
#convert to a point system for tile plotting
krig_pts = st_coordinates(krig) %>%
as_tibble() %>%
mutate(val = krig$var1.pred)
#define the kriged map plot
krig_plot = ggplot(krig)+
geom_tile(data = krig_pts, aes(x = X, y = Y, fill = val), width = 1/3, height = 1/3)+
geom_sf(data = states, fill = 'transparent', color = 'black')+
labs(x = "", y = "")+
ggtitle(NULL)+
scale_fill_gradientn(colours = col(100), breaks = c(-0.5, 0, 0.5), limits = c(-0.5, 0.5),
labels = c('-0.5 (Dry Bias)', '0 (No Bias)', '0.5 (Wet Bias)'), name = "",
oob = scales::squish)+
theme_bw(base_size = 15)+
theme(legend.position = 'bottom',
legend.key.width=unit(2,"cm"),
legend.text=element_text(size=15))
#generate the final plot by doing a plot_grid call, first
#align the plots and shrink the space between them by using an empty plot
#and reducing the relative height of the middle plot to a negative value
final = cowplot::plot_grid(pts_plot, NULL, krig_plot, ncol = 1, rel_heights = c(1,-0.17,1), align = 'v')
#save it out
ggsave(final, file = paste0('~/drought-year-sensitivity/figs/moving_window_bias/spi_bias_maps_',classes[c],'_',time_scale[[time_scale_id]],'day_timescale_June1-Aug31.png'), width = 7, height = 10, units = 'in')
#fin
}