Add example of compression, chunking by trajectory #152

Merged · 4 commits · Dec 3, 2024
66 changes: 66 additions & 0 deletions examples/example_compress.py
"""
Examples of compressing data when saving to .nc
==============================================================================
"""

# %%

import xarray as xr
from trajan.readers.omb import read_omb_csv
from pathlib import Path
import os

# %%

path_to_test_data = Path.cwd().parent / "tests" / "test_data" / "csv" / "omb_large.csv"
xr_buoys = read_omb_csv(path_to_test_data)

# %%

# by default, to_netcdf does not perform any compression
xr_buoys.to_netcdf("no_compression.nc")

# on my machine, this is around 33MB
print(f"size no compression: {round(os.stat('no_compression.nc').st_size/(pow(1024,2)), 2)} MB")

# %%

# compression can be enabled by passing an explicit per-variable encoding.
# note that the best way to compress may depend on your dataset, the access
# pattern you want to be fastest, etc. - be aware of memory layout and
# performance!

# a simple compression scheme, on a per-trajectory basis: each trajectory is
# stored as a single chunk. This makes it fast to retrieve one full
# trajectory, but slow to retrieve e.g. the 5th point of all trajectories.

# choose the encoding chunking - this may be application dependent, here
# chunk trajectory as a whole
def generate_chunksize(var):
    """Chunk sizes for `var`: one chunk per trajectory."""
    dims = xr_buoys[var].dims
    shape = list(xr_buoys[var].shape)

    # store each trajectory as its own chunk; a variable without a
    # trajectory dimension keeps its full shape as a single chunk
    if "trajectory" in dims:
        shape[dims.index("trajectory")] = 1

    return tuple(shape)


# set the encoding for each variable
encoding = {
    var: {"zlib": True, "complevel": 5, "chunksizes": generate_chunksize(var)}
    for var in xr_buoys.data_vars
}

# the encoding looks like:
for var in encoding:
print(f"{var}: {encoding[var] = }")
print("")

# save, this time with compression
xr_buoys.to_netcdf("trajectory_compression.nc", encoding=encoding)

# on my machine, this is around 5.6MB
print(f"size with compression: {round(os.stat('trajectory_compression.nc').st_size/(pow(1024,2)), 2)} MB")

# %%
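As a counterpoint to the per-trajectory chunking above, here is a minimal sketch of the inverse layout: one chunk per observation index across all trajectories, which makes "the 5th point of every trajectory" a single-chunk read at the cost of slower whole-trajectory reads. The helper name `chunk_per_obs` and the `"obs"` dimension name are illustrative assumptions; it operates on plain `(dims, shape)` pairs so it can be tried without loading a dataset.

```python
# Hypothetical counterpart to generate_chunksize: reduce the "obs"
# (observation) dimension to 1 instead of the trajectory dimension.
# Each chunk then holds one observation index across all trajectories.

def chunk_per_obs(dims, shape):
    """Chunk sizes for a variable: one chunk per observation index."""
    shape = list(shape)

    # a variable without an obs dimension keeps its full shape as one chunk
    if "obs" in dims:
        shape[dims.index("obs")] = 1

    return tuple(shape)


# example: a (trajectory, obs) variable with 10 trajectories of 500 points
print(chunk_per_obs(("trajectory", "obs"), (10, 500)))  # -> (10, 1)
```

Such chunk sizes would be passed through the same `encoding` dict as above; which layout wins depends entirely on the dominant access pattern.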