diff --git a/pyproject.toml b/pyproject.toml index 8c5fc18..f64d968 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "czpeedy" -version = "0.1.0" +version = "0.2.0" description = "A command-line tool used to determine the tensorstore settings which yield the fastest write speed on a given machine." authors = [ { name = "Seth Hinz", email = "sethhinz@me.com" } diff --git a/src/czpeedy/czpeedy.py b/src/czpeedy/czpeedy.py index a00192f..92bf944 100644 --- a/src/czpeedy/czpeedy.py +++ b/src/czpeedy/czpeedy.py @@ -210,6 +210,11 @@ def main() -> None: type=list_type(zarr_version), help="The version of zarr to use. (Supported: 2, 3.)", ) + parser.add_argument( + "--fullxy", + action='store_true', + help="If specified, the chunk size will be the full x and y dimensions of the data. Useful if you plan to stream xy slices over the network.", + ) args = parser.parse_args() if args.dest: @@ -219,7 +224,7 @@ def main() -> None: data = load_input(args.source, args.shape, args.dtype) if args.chunk_size is None: args.chunk_size = ParameterSpace.suggest_chunk_sizes( - data.shape, data.itemsize + data.shape, data.itemsize, args.fullxy ) parameter_space = ParameterSpace( diff --git a/src/czpeedy/parameter_space.py b/src/czpeedy/parameter_space.py index b641c8d..629dcdb 100644 --- a/src/czpeedy/parameter_space.py +++ b/src/czpeedy/parameter_space.py @@ -176,10 +176,22 @@ def to_trial_parameters( def suggest_chunk_sizes( shape: ArrayLike, itemsize: int, - max_bytes=2**31 - 17, - size_ratio=1.5, - volume_ratio=1.5, + full_xy: bool = False, + max_bytes: int = 2**31 - 17, + size_ratio: float | None = None, + volume_ratio: float | None = None, ) -> list[list[int]]: + + # This is just heuristic - for full xy frames, we have fewer variables to play with (usually just the z + # axis chunk length). So, to give the user several chunk options, we make the geometric sequence a bit + # tighter. + if full_xy: + size_ratio = size_ratio or 1.25 + volume_ratio = volume_ratio or 1.25 + else: + size_ratio = size_ratio or 1.5 + volume_ratio = volume_ratio or 1.5 + # Concept: The smallest size we reasonably want along an axis is min(axis_size, 100) - 100 is small, # so we use 100 as minimum unless axis_size is even smaller. # Figure out an integer n such that 100 ~= axis_size / n. Then compute the sequence @@ -230,7 +242,11 @@ def break_axis(axis: int) -> list[int]: return chunk_lengths - chunks = list(product(*[break_axis(axis) for axis in shape])) + if full_xy: + chunks = list(product(*[break_axis(axis) for axis in shape[:-2]], [shape[-2]], [shape[-1]])) + else: + chunks = list(product(*[break_axis(axis) for axis in shape])) + chunks_with_volumes = map(lambda chunk: (chunk, np.prod(chunk)), chunks) chunks_with_volumes = sorted(chunks_with_volumes, key=lambda item: item[1]) @@ -262,6 +278,5 @@ def break_axis(axis: int) -> list[int]: # return np.prod(grid_size) - np.prod(shape) # for chunk in suggested_chunks: - # print(100 % waste(shape, chunk) / np.prod(shape)) - + # print(100 % waste(shape, chunk) / np.prod(shape))) return suggested_chunks