Skip to content

Commit

Permalink
format with ruff
Browse files Browse the repository at this point in the history
  • Loading branch information
shinzlet committed Jul 18, 2024
1 parent 8f1351c commit f65249b
Show file tree
Hide file tree
Showing 7 changed files with 278 additions and 98 deletions.
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ authors = [
dependencies = [
"numpy>=2.0.0",
"termcolor>=2.4.0",
"tensorstore>=0.1.63"
"tensorstore>=0.1.63",
"ruff>=0.5.3",
]
readme = "README.md"
requires-python = ">= 3.12"
Expand Down
2 changes: 1 addition & 1 deletion requirements-dev.lock
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ numpy==2.0.0
# via czpeedy
# via ml-dtypes
# via tensorstore
pyyaml==6.0.1
ruff==0.5.3
# via czpeedy
tensorstore==0.1.63
# via czpeedy
Expand Down
2 changes: 1 addition & 1 deletion requirements.lock
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ numpy==2.0.0
# via czpeedy
# via ml-dtypes
# via tensorstore
pyyaml==6.0.1
ruff==0.5.3
# via czpeedy
tensorstore==0.1.63
# via czpeedy
Expand Down
180 changes: 144 additions & 36 deletions src/czpeedy/czpeedy.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,16 @@
from czpeedy.runner import Runner
from czpeedy.parameter_space import ParameterSpace


# An argument type for argparse that ensures the argument is a valid directory or a directory that could be created.
# Used to specify the output folder.
def dir_or_nonexistent(path: str) -> Path:
if os.path.isfile(path):
raise argparse.ArgumentTypeError(f"{path} is a file, not a directory.")

return Path(path)


# An argument type for argparse that ensures the argument is an existent directory or file.
# Used to ensure that the input source exists.
def dir_or_file(path: str) -> Path:
Expand All @@ -25,35 +27,46 @@ def dir_or_file(path: str) -> Path:
else:
raise argparse.ArgumentTypeError(f"{path} does not exist in the filesystem.")


# An argument type for argparse that ensures the argument is a valid shape for a numpy array (i.e. of form "a,b,c,d,..." where abcd are integers)
# Used to specify the input shape if needed.
def numpy_shape(text: str) -> Tuple[int, ...]:
axes = text.split('x')
axes = text.split("x")

try:
return tuple(int(ax) for ax in axes)
except ValueError:
raise argparse.ArgumentTypeError(f"`{text}` is not an x-delimited list of integers.")
raise argparse.ArgumentTypeError(
f"`{text}` is not an x-delimited list of integers."
)


# An argument type for argparse that ensures the argument is a valid numpy array data type.
def numpy_dtype(text: str) -> np.dtype:
try:
return np.dtype(text)
except TypeError:
raise argparse.ArgumentTypeError(f"Provided dtype string \"{text}\" is not a valid numpy data type.")
raise argparse.ArgumentTypeError(
f'Provided dtype string "{text}" is not a valid numpy data type.'
)


def filepath(text: str) -> Path:
try:
return Path(text)
except Exception:
raise argparse.ArgumentTypeError(f"Failed to convert \"{text}\" to a filepath.")
raise argparse.ArgumentTypeError(f'Failed to convert "{text}" to a filepath.')


def endianness(text: str) -> int:
endiannesses = ParameterSpace.ENDIANNESSES
if text in endiannesses:
return text
else:
raise argparse.ArgumentTypeError(f"Failed to convert \"{text}\" to an endianness. Valid endiannesses: {", ".join(endiannesses.keys())}")
raise argparse.ArgumentTypeError(
f"Failed to convert \"{text}\" to an endianness. Valid endiannesses: {", ".join(endiannesses.keys())}"
)


def clevel(text: str) -> int:
try:
Expand All @@ -62,35 +75,51 @@ def clevel(text: str) -> int:
raise ValueError()
return level
except Exception:
raise argparse.ArgumentTypeError(f"Failed to convert \"{text}\" to a compression level. A valid compression level is an integer from 0 to 9 inclusive.")
raise argparse.ArgumentTypeError(
f'Failed to convert "{text}" to a compression level. A valid compression level is an integer from 0 to 9 inclusive.'
)


def compressor(text: str) -> str:
compressors = ParameterSpace.ALL_COMPRESSORS
if text in compressors:
return text
raise argparse.ArgumentTypeError(f"\"{text}\" is not a valid compressor. Valid compressors: {", ".join(compressors)}.")
raise argparse.ArgumentTypeError(
f"\"{text}\" is not a valid compressor. Valid compressors: {", ".join(compressors)}."
)


def shuffle_type(text: str) -> str:
shuffles = ParameterSpace.SHUFFLE_TYPES
if text in shuffles:
return text
raise argparse.ArgumentTypeError(f"\"{text}\" is not a valid shuffle type. Valid shuffle types: {", ".join(shuffles.keys())}")
raise argparse.ArgumentTypeError(
f"\"{text}\" is not a valid shuffle type. Valid shuffle types: {", ".join(shuffles.keys())}"
)


# Takes all the information that the user provided about the input source and attempts to load it into a numpy array.
# Currently, only raw numpy data files are supported.
def load_input(source: Path, shape: list[int] | None = None, dtype: np.dtype | None = None) -> np.ndarray:
def load_input(
source: Path, shape: list[int] | None = None, dtype: np.dtype | None = None
) -> np.ndarray:
if source.is_file:
# Raw numpy data dump (or known type):
print(f"{colored("Reading input file", "green")} as raw numpy dump")
if shape is None:
raise ValueError("Cannot read from a raw numpy data file without knowing the intended data shape. To resolve this, use the --shape flag (--shape x,y,z)")
raise ValueError(
"Cannot read from a raw numpy data file without knowing the intended data shape. To resolve this, use the --shape flag (--shape x,y,z)"
)
if dtype is None:
raise ValueError("Cannot read from a raw numpy data file without knowing the intended data type. To resolve this, use the --dtype flag (--dtype uint32)")
with open(source, 'rb') as f:
raise ValueError(
"Cannot read from a raw numpy data file without knowing the intended data type. To resolve this, use the --dtype flag (--dtype uint32)"
)
with open(source, "rb") as f:
return np.fromfile(f, dtype=dtype).reshape(shape)
else:
raise NotImplementedError("Loading from zarr is not yet supported.")


# Given a callable that can be used as a type in argparse (i.e. it can convert a string to a more specific type),
# produces a wrapper that accepts a *list* of that type. i.e. if the type 'endianness' works as follows:
# --endianness big => args.endianness = +1
Expand All @@ -100,52 +129,130 @@ def load_input(source: Path, shape: list[int] | None = None, dtype: np.dtype | N
# This is useful for specifying a set of parameters for each given property.
def list_type(element_type: Callable[[str], Any]) -> set[Any]:
def parser(text: str) -> list[Any]:
return {element_type(part) for part in text.split(',')}
return {element_type(part) for part in text.split(",")}

return parser


def zarr_version(text: str) -> int:
try:
version = int(text)
if version < 2 or version > 3:
raise ValueError()
return version
except ValueError:
raise argparse.ArgumentTypeError(f"Failed to convert \"{text}\" to a zarr version. Valid zarr versions: 2, 3.")

raise argparse.ArgumentTypeError(
f'Failed to convert "{text}" to a zarr version. Valid zarr versions: 2, 3.'
)


# Runs the main CLI. Currently, only write testing is implemented.
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("source", type=dir_or_file, help="The input dataset used in benchmarking. If write benchmarking, this is the data that will be written to disk.")
parser.add_argument("--dest", type=dir_or_nonexistent, help="The destination where write testing will occur. A directory will be created inside, called 'czpeedy'. Each write test will delete and recreate the `czpeedy` folder.")
parser.add_argument("--savecsv", type=filepath, help="The destination to save test results to in csv format. Will overwrite the named file if it exists already.")
parser.add_argument("--repetitions", type=int, default=3, help="The number of times to test each configuration. This increases confidence that speeds are repeatable, but takes a while. (default: 3)")
parser.add_argument("--dtype", type=numpy_dtype, help="If your data source is a raw numpy array dump, you must provide its dtype (i.e. --dtype uint32)")
parser.add_argument("--shape", type=numpy_shape, help="If your data source is a raw numpy array dump, you must provide the shape (i.e. --shape 1920x1080x1024). Ignored if the data source has a shape in its metadata.")
parser.add_argument("--clevel", type=list_type(clevel), help="The endianness you want to write your data as (can be big, little, or none). \"none\" is only an acceptable endianness if the dtype is 1 byte.")
parser.add_argument("--compressor", type=list_type(compressor), help=f"The compressor id you want to use with blosc. Valid compressors: {", ".join(ParameterSpace.ALL_COMPRESSORS)}.")
parser.add_argument("--shuffle", type=list_type(shuffle_type), help=f"The shuffle mode you want to use with blosc compression. Valid shuffle types: {", ".join(ParameterSpace.SHUFFLE_TYPES.keys())}")
parser.add_argument("--chunk-size", type=list_type(numpy_shape), help="The chunk size that tensorstore should use when writing data. i.e. --chunk-size 100x100x100. Must have the same number of dimensions as the source data.")
parser.add_argument("--endianness", type=list_type(endianness), help="The endianness you want to write your data as (can be big, little, or none). \"none\" is only an acceptable endianness if the dtype is 1 byte.")
parser.add_argument("--zarr-version", type=list_type(zarr_version), help="The version of zarr to use. (Supported: 2, 3.)")
parser.add_argument(
"source",
type=dir_or_file,
help="The input dataset used in benchmarking. If write benchmarking, this is the data that will be written to disk.",
)
parser.add_argument(
"--dest",
type=dir_or_nonexistent,
help="The destination where write testing will occur. A directory will be created inside, called 'czpeedy'. Each write test will delete and recreate the `czpeedy` folder.",
)
parser.add_argument(
"--savecsv",
type=filepath,
help="The destination to save test results to in csv format. Will overwrite the named file if it exists already.",
)
parser.add_argument(
"--repetitions",
type=int,
default=3,
help="The number of times to test each configuration. This increases confidence that speeds are repeatable, but takes a while. (default: 3)",
)
parser.add_argument(
"--dtype",
type=numpy_dtype,
help="If your data source is a raw numpy array dump, you must provide its dtype (i.e. --dtype uint32)",
)
parser.add_argument(
"--shape",
type=numpy_shape,
help="If your data source is a raw numpy array dump, you must provide the shape (i.e. --shape 1920x1080x1024). Ignored if the data source has a shape in its metadata.",
)
parser.add_argument(
"--clevel",
type=list_type(clevel),
help='The endianness you want to write your data as (can be big, little, or none). "none" is only an acceptable endianness if the dtype is 1 byte.',
)
parser.add_argument(
"--compressor",
type=list_type(compressor),
help=f"The compressor id you want to use with blosc. Valid compressors: {", ".join(ParameterSpace.ALL_COMPRESSORS)}.",
)
parser.add_argument(
"--shuffle",
type=list_type(shuffle_type),
help=f"The shuffle mode you want to use with blosc compression. Valid shuffle types: {", ".join(ParameterSpace.SHUFFLE_TYPES.keys())}",
)
parser.add_argument(
"--chunk-size",
type=list_type(numpy_shape),
help="The chunk size that tensorstore should use when writing data. i.e. --chunk-size 100x100x100. Must have the same number of dimensions as the source data.",
)
parser.add_argument(
"--endianness",
type=list_type(endianness),
help='The endianness you want to write your data as (can be big, little, or none). "none" is only an acceptable endianness if the dtype is 1 byte.',
)
parser.add_argument(
"--zarr-version",
type=list_type(zarr_version),
help="The version of zarr to use. (Supported: 2, 3.)",
)
args = parser.parse_args()

if args.dest:
print(f"{colored("Beginning write testing", "green")} (from {args.source} to {args.dest})")
print(
f"{colored("Beginning write testing", "green")} (from {args.source} to {args.dest})"
)
data = load_input(args.source, args.shape, args.dtype)
if args.chunk_size is None:
args.chunk_size = ParameterSpace.suggest_chunk_sizes(data.shape, data.itemsize)

parameter_space = ParameterSpace(data.shape, args.chunk_size, data.dtype, args.zarr_version, args.clevel, args.compressor, args.shuffle, args.endianness)
args.chunk_size = ParameterSpace.suggest_chunk_sizes(
data.shape, data.itemsize
)

parameter_space = ParameterSpace(
data.shape,
args.chunk_size,
data.dtype,
args.zarr_version,
args.clevel,
args.compressor,
args.shuffle,
args.endianness,
)
parameter_space.summarize()

args.dest.mkdir(parents=True, exist_ok=True)
runner = Runner(parameter_space.all_combinations(), data, args.dest, args.repetitions, parameter_space.num_combinations)
runner = Runner(
parameter_space.all_combinations(),
data,
args.dest,
args.repetitions,
parameter_space.num_combinations,
)
try:
runner.run_all()
except KeyboardInterrupt:
print(colored("\nCtrl-C detected mid-test - Printing partial results and terminating.", "black", "on_red"))

print(
colored(
"\nCtrl-C detected mid-test - Printing partial results and terminating.",
"black",
"on_red",
)
)

print()
print(colored("Fastest Specs:", "green"))
runner.print_results()
Expand All @@ -155,5 +262,6 @@ def main() -> None:
else:
print("Read testing is not yet supported")


if __name__ == "__main__":
main()
Loading

0 comments on commit f65249b

Please sign in to comment.