From 53519146e7a7c16a62a2be0e0a19bcbf27ae2f07 Mon Sep 17 00:00:00 2001
From: Michael Hanke
Date: Wed, 16 Oct 2024 20:23:47 +0200
Subject: [PATCH 01/17] feat: `@datalad_command` decorator

This decorator can equip a function with parameter validation and result
post-processing. It fully replaces the command interface used in legacy
DataLad, where a combination of a class with a number of static methods
and multiple decorators has been used to achieve a similar goal, but at
the expense of a substantial amount of boilerplate code.

Importantly, result post-processing is fully factored out into a
`ResultHandler`. This means that the typical behavior and configurability
of a DataLad command are now fully defined by a result handler (class),
which is now configurable/replaceable. In contrast to legacy DataLad,
this even allows for commands that are not generators of result records,
and that may not make sense to run via a CLI.

A `PassthroughHandler` and a `StandardResultHandler` are included as
examples. The latter is structurally identical to the result handling
implementation used in legacy DataLad (minus result hook support). It
is, however, incomplete in that only the flow control (`on_failure`) is
actually implemented.
---
 datalad_core/commands/__init__.py             |  53 +++++
 datalad_core/commands/decorator.py            | 216 ++++++++++++++++++
 .../commands/default_result_handler.py        | 122 ++++++++++
 datalad_core/commands/result_handler.py       |  52 +++++
 datalad_core/commands/tests/__init__.py       |   0
 datalad_core/commands/tests/test_cmd.py       | 128 +++++++++++
 docs/index.rst                                |   1 +
 7 files changed, 572 insertions(+)
 create mode 100644 datalad_core/commands/__init__.py
 create mode 100644 datalad_core/commands/decorator.py
 create mode 100644 datalad_core/commands/default_result_handler.py
 create mode 100644 datalad_core/commands/result_handler.py
 create mode 100644 datalad_core/commands/tests/__init__.py
 create mode 100644 datalad_core/commands/tests/test_cmd.py

diff --git a/datalad_core/commands/__init__.py b/datalad_core/commands/__init__.py
new file mode 100644
index 0000000..46a963a
--- /dev/null
+++ b/datalad_core/commands/__init__.py
@@ -0,0 +1,53 @@
+"""Components for implementing DataLad commands
+
+At the very core, a DataLad command is a function that documents and validates
+its parameters in a particular way, and yields return values that follow
+particular conventions.
+
+In its simplest form, a DataLad command can be created like this:
+
+    >>> @datalad_command()
+    ... def my_command(some_arg):
+    ...     # do processing ...
+    ...     return  # typically one or more results are returned/yielded
+
+The :func:`datalad_command` decorator wraps a function, and automatically
+establishes the conventions expected for a DataLad command (see its
+documentation for details).
+
+Beyond this simplest use, the decorator can be used to customize parameter
+validation and result handling. Both aspects are handled by customizable
+classes.
+
+.. currentmodule:: datalad_core.commands
+.. 
autosummary:: + :toctree: generated + + datalad_command + ResultHandler + StandardResultHandler + PassthroughHandler + get_default_result_handler + set_default_result_handler +""" + +__all__ = [ + 'StandardResultHandler', + 'ResultHandler', + 'PassthroughHandler', + 'datalad_command', + 'get_default_result_handler', + 'set_default_result_handler', +] + + +from .decorator import datalad_command +from .default_result_handler import ( + StandardResultHandler, + get_default_result_handler, + set_default_result_handler, +) +from .result_handler import ( + PassthroughHandler, + ResultHandler, +) diff --git a/datalad_core/commands/decorator.py b/datalad_core/commands/decorator.py new file mode 100644 index 0000000..b0f0ac7 --- /dev/null +++ b/datalad_core/commands/decorator.py @@ -0,0 +1,216 @@ +from __future__ import annotations + +from functools import ( + partial, + wraps, +) +from inspect import ( + Parameter, + signature, +) +from typing import ( + TYPE_CHECKING, + Any, +) + +from datalad_core.commands.default_result_handler import get_default_result_handler + +if TYPE_CHECKING: + from datalad_core.commands.preproc import ParamProcessor + from datalad_core.commands.result_handler import ResultHandler + + +# this could be a function decorator, but we use a class, because +# it is less confusing to read (compared to the alternative decorator +# factory +class datalad_command: # noqa: N801 MIH wants lower-case + """Wrap a callable with parameter preprocessing and result post-processing + + This decorator can handle a wide range of callables. However, they must not + have any positional-only parameters. + + Before a wrapped callable is executed, an optional parameter preprocessor + ``preproc`` is applied. Any implementation that matches the + :class:`ParamProcessor` interface can be used, and arbitrary preprocessing + steps, like type-coercion or validation can be performed. + + The return value of the wrapped callable is post-processed with an instance + of the class returned by :func:`get_default_result_handler`, or that given + as ``postproc_cls`` parameter of the decorator. Any implementation + of the :class:`ResultHandler` interface can be used to apply arbitrary + return value post-processing. An instance of the given result handler class + is created, and the full parameterization is passed to the constructor + as an argument, such that a result handler can inspect and tune itself, + based on a concrete parameterization. + + If a given :class:`ResultHandler` implementation extends the signature of + the wrapped callable with additional keyword-arguments, default values + deviating from the result handler implementation defaults can be supplied + via the ``extra_kwarg_defaults`` parameter for a particular wrapped + callable. + + All parameters given to the decorator are attached as attributes of the + callable returned by the decorator, under the same name. For example, a + parameter preprocessor is accessible as ``.preproc``. 
+ """ + + def __init__( + self, + *, + preproc: ParamProcessor | None = None, + postproc_cls: ResultHandler | None = None, + extra_kwarg_defaults: dict[str, Any] | None = None, + ): + self.preproc = preproc + self.postproc_cls = postproc_cls or get_default_result_handler() + self.extra_kwargs_defaults = extra_kwarg_defaults or {} + + def __call__(self, wrapped): + @wraps(wrapped) + def command_wrapper(*args, **kwargs): + # TODO: there is no reason why `preproc` should not also be + # able to add kwargs + extra_kwarg_specs = self.postproc_cls.get_extra_kwargs() + + # join the given command kwargs with the extra kwargs defined + # by the result handler + kwargs = update_with_extra_kwargs( + extra_kwarg_specs, + self.extra_kwargs_defaults, + **kwargs, + ) + # produce a single dict (parameter name -> parameter value) + # from all parameter sources + allkwargs, params_at_default = get_allargs_as_kwargs( + wrapped, + args, + kwargs, + extra_kwarg_specs, + ) + # preprocess the ful parameterization + if self.preproc is not None: + allkwargs = self.preproc( + allkwargs, + at_default=params_at_default, + ) + + # create a result handler instance for this particular + # parameterization + result_handler = self.postproc_cls( + cmd_kwargs=allkwargs, + ) + + # let the result handler drive the underlying command and + # let it do whatever it considers best to do with the + # results + return result_handler( + # we wrap the result generating function into + # a partial to get an argumentless callable + # that provides an iterable. partial is a misnomer + # here, all necessary parameters are given + partial( + wrapped, + **{ + k: v for k, v in allkwargs.items() if k not in extra_kwarg_specs + }, + ), + ) + + # update the signatured of the wrapped function to include + # the common kwargs + sig = signature(wrapped) + sig = sig.replace( + parameters=( + *sig.parameters.values(), + *self.postproc_cls.get_extra_kwargs().values(), + ), + ) + command_wrapper.__signature__ = sig + + # make decorator parameterization accessible to postprocessing + # consumers + command_wrapper.preproc = self.preproc + command_wrapper.extra_kwargs_defaults = self.extra_kwargs_defaults + # we make the result handler for a command known, so that + # some kind of API could see whether it can further wrap a command + # with another handler for something, and based on the type of + # handler used here, it would be able to rely on particular behaviors, + # or even refuse to consider a particular command + command_wrapper.postproc_cls = self.postproc_cls + return command_wrapper + + +def update_with_extra_kwargs( + handler_kwarg_specs: dict[str, Parameter], + deco_kwargs: dict[str, Any], + **call_kwargs, +) -> dict[str, Any]: + """Helper to update command kwargs with additional arguments + + Two sets of additional arguments are supported: + + - kwargs specifications (result handler provided) to extend the signature + of an underlying command + - ``extra_kwarg_defaults`` (given to the ``datalad_command`` decorator) + that override the defaults of handler-provided kwarg specifications + """ + # retrieve common options from kwargs, and fall back on the command + # class attributes, or general defaults if needed + updated_kwargs = { + p_name: call_kwargs.get( + # go with any explicitly given value + p_name, + # otherwise ifall back on what the command has been decorated with + deco_kwargs.get( + p_name, + # or lastly go with the implementation default + param.default, + ), + ) + for p_name, param in handler_kwarg_specs.items() + } + return 
dict(call_kwargs, **updated_kwargs) + + +def get_allargs_as_kwargs(call, args, kwargs, extra_kwarg_specs): + """Generate a kwargs dict from a call signature and actual parameters + + The first return value is a mapping of all argument names to their + respective values. + + The second return value is a set of argument names for which the effective + value is identical to the default declared in the signature of the + callable (or extra kwarg specification). + """ + # we base the parsing off of the callables signature + params = dict(signature(call).parameters.items()) + # and also add the common parameter definitions to get a joint + # parameter set inspection + params.update(extra_kwarg_specs) + + args = list(args) + allkwargs = {} + at_default = set() + missing_args = [] + for pname, param in params.items(): + val = args.pop(0) if len(args) else kwargs.get(pname, param.default) + allkwargs[pname] = val + if val == param.default: + at_default.add(pname) + if val is param.empty: + missing_args.append(pname) + + if missing_args: + ma = missing_args + multi_ma = len(ma) > 1 + # imitate standard TypeError message + msg = ( + f'{call.__name__}() missing {len(ma)} required ' + f'positional argument{"s" if multi_ma else ""}: ' + f'{", ".join(repr(a) for a in ma[:-1 if multi_ma else None])}' + ) + if multi_ma: + msg += f' and {ma[-1]!r}' + raise TypeError(msg) + + return allkwargs, at_default diff --git a/datalad_core/commands/default_result_handler.py b/datalad_core/commands/default_result_handler.py new file mode 100644 index 0000000..980a591 --- /dev/null +++ b/datalad_core/commands/default_result_handler.py @@ -0,0 +1,122 @@ +from collections.abc import Generator +from inspect import Parameter +from types import MappingProxyType +from typing import ( + Any, + Callable, +) + +from datalad_core.commands.result_handler import ResultHandler + + +class ResultError(RuntimeError): + """Exception raise when error results have been observed""" + + def __init__(self, failed=None, msg=None): + super().__init__(msg) + self.failed = failed + + +class StandardResultHandler(ResultHandler): + """Result handler commonly used by commands + + .. note:: + + This is largely a no-op implementation for now. It will be co-developed + with actual commands. + + Command must return an iterable (typically a generator) of + result instances. 
+ """ + + @classmethod + def get_extra_kwargs(cls) -> MappingProxyType[str, Parameter]: + kwargs = { + 'on_failure': Parameter( + name='on_failure', + kind=Parameter.KEYWORD_ONLY, + default='continue', + annotation=str, + ), + } + return MappingProxyType(kwargs) + + def __call__(self, producer: Callable): + """ """ + # book-keeping whether to raise an exception + error_results: list[dict] = [] + on_failure = self._cmd_kwargs['on_failure'] + + for res in producer(): + if not res or 'action' not in res: + continue + + self.log_result(res) + self.render_result(res) + + if on_failure in ('continue', 'stop') and res['status'] in ( + 'impossible', + 'error', + ): + error_results.append(res) + if on_failure == 'stop': + # first fail -> that's it + # raise will happen after the loop + break + + # TODO: implement result filtering + # if not self.keep_result(res): + # continue + + yield from self.transform_result(res) + + # TODO: implement result summary rendering + # self.render_result_summary() + + if error_results: + msg = 'Command did not complete successfully' + raise ResultError(failed=error_results, msg=msg) + + def log_result(self, result: dict) -> None: + """ """ + + # TODO: implement result summary rendering + # def want_custom_result_summary(self, mode: str) -> bool: + # """ """ + # return False + + def render_result(self, result: dict) -> None: + """ """ + + # TODO: implement result summary rendering + # def render_result_summary(self) -> None: + # """ """ + + def transform_result(self, res) -> Generator[Any, None, None]: + """ """ + yield res + + # TODO: implement result filtering + # def keep_result(self, res) -> bool: + # """ """ + # return True + + +__the_default_result_handler_class: type[ResultHandler] = StandardResultHandler + + +def set_default_result_handler(handler_cls: type[ResultHandler]): + """Set a default result handler class for use by ``@datalad_command`` + + This must be a class implementing the :class:`ResultHandler` interface. + """ + global __the_default_result_handler_class # noqa: PLW0603 + __the_default_result_handler_class = handler_cls + + +def get_default_result_handler() -> type[ResultHandler]: + """Get the default result handler class used by ``@datalad_command`` + + See :func:`set_default_result_handler` for more information. + """ + return __the_default_result_handler_class diff --git a/datalad_core/commands/result_handler.py b/datalad_core/commands/result_handler.py new file mode 100644 index 0000000..32b6b43 --- /dev/null +++ b/datalad_core/commands/result_handler.py @@ -0,0 +1,52 @@ +from __future__ import annotations + +from abc import ( + ABC, + abstractmethod, +) +from types import MappingProxyType +from typing import ( + TYPE_CHECKING, + Any, + Callable, +) + +if TYPE_CHECKING: + from inspect import Parameter + + +class ResultHandler(ABC): + """Abstract base class for result handlers + + Any subclass must implement ``__call__``. This method will be executed + with a callable as the only argument. The callable is fully parameterized + and takes no parameters. The ``__call__`` implementation must run this + callable to trigger execution and receive and post-process results + in whatever fashion. 
+ """ + + def __init__( + self, + cmd_kwargs, + ): + self._cmd_kwargs = cmd_kwargs + + @classmethod + def get_extra_kwargs(cls) -> MappingProxyType[str, Parameter]: + """Returns a mapping with specifications of extra command parameters""" + return MappingProxyType({}) + + @abstractmethod + def __call__(self, producer: Callable) -> Any: + """Implement to run commands and post-process return values""" + + +class PassthroughHandler(ResultHandler): + """Minimal handler that relays any return value unmodified + + This handler reports no extra keyword arguments via its + :meth:`get_extra_kwargs`. + """ + + def __call__(self, producer: Callable) -> Any: + return producer() diff --git a/datalad_core/commands/tests/__init__.py b/datalad_core/commands/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/datalad_core/commands/tests/test_cmd.py b/datalad_core/commands/tests/test_cmd.py new file mode 100644 index 0000000..1573e56 --- /dev/null +++ b/datalad_core/commands/tests/test_cmd.py @@ -0,0 +1,128 @@ +from inspect import Parameter, signature +from types import MappingProxyType + +import pytest + +from datalad_core.commands.decorator import datalad_command +from datalad_core.commands.default_result_handler import ( + ResultError, + get_default_result_handler, + set_default_result_handler, +) +from datalad_core.commands.result_handler import ( + PassthroughHandler, + ResultHandler, +) + + +def test_testcmd_minimal(): + # test thinnest possible wrapper -- no result processing whatsoever, + # just pass-through. + # this can be useful, when only parameter validation is desired + magic_value = 5 + + @datalad_command(postproc_cls=PassthroughHandler) + def test_command(): + """EXPLAIN IT ALL""" + return magic_value + + assert test_command() == magic_value + assert 'EXPLAIN IT ALL' in test_command.__doc__ + + +def test_testcmd_plain(): + class TestResultHandler(ResultHandler): + @classmethod + def get_extra_kwargs(cls) -> MappingProxyType[str, Parameter]: + kwargs = { + 'on_failure': Parameter( + name='on_failure', + kind=Parameter.KEYWORD_ONLY, + default='continue', + annotation=str, + ), + } + return MappingProxyType(kwargs) + + def __call__(self, get_results): + return list(get_results()) + + @datalad_command( + postproc_cls=TestResultHandler, + ) + def test_command( + parg1, + parg2, # noqa: ARG001 + *, + kwarg1=None, # noqa: ARG001 + kwarg2=True, # noqa: ARG001 + ): + yield parg1 + + # common args are registered in the signature + assert 'on_failure' in str(signature(test_command)) + # decorator args are attached to function as attributes + assert test_command.preproc is None + assert test_command.extra_kwargs_defaults == {} + assert test_command.postproc_cls is TestResultHandler + + with pytest.raises(TypeError, match='missing.*required.*parg1.*parg2'): + test_command() + test_command('some_parg1', 'some_parg2') + + +def test_testcmd_default_result_handler_on_failure(): + test_results = [ + {'action': 'test', 'status': 'ok'}, + {'action': 'test', 'status': 'notneeded'}, + {'action': 'test', 'status': 'impossible'}, + {'action': 'test', 'status': 'error'}, + ] + success = [tr for tr in test_results if tr['status'] in ('ok', 'notneeded')] + failed = [tr for tr in test_results if tr['status'] in ('impossible', 'error')] + + @datalad_command() + def test_command(): + yield + yield from test_results + + assert ( + test_command.postproc_cls.get_extra_kwargs()['on_failure'].default == 'continue' + ) + + res = [] + # we need to have a for-loop here to get as many results out + # as 
possible before the exception kicks in + with pytest.raises(ResultError) as e: # noqa: PT012 + # `yield None` is ignored + for r in test_command(): + res.append(r) # noqa: PERF402 + + # all results are yielded, also the error ones + assert res == test_results + # the error ones are collected and attached to the exception + assert e.value.failed == failed + + res = [] + with pytest.raises(ResultError) as e: # noqa: PT012 + # `yield None` is ignored + for r in test_command(on_failure='stop'): + res.append(r) + + # only success results are yielded this time + assert res == success + # stopped on first error, hence only one error communicated + assert e.value.failed == failed[:1] + + assert list(test_command(on_failure='ignore')) == test_results + + +def test_default_handler_set(): + dhdlr = get_default_result_handler() + try: + test_handler_cls = PassthroughHandler + set_default_result_handler(test_handler_cls) + assert get_default_result_handler() is test_handler_cls + finally: + # restore + set_default_result_handler(dhdlr) diff --git a/docs/index.rst b/docs/index.rst index 3d6d59c..16e836a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,6 +10,7 @@ Also see the :ref:`modindex`. .. autosummary:: :toctree: generated + commands config constraints consts From 060a97526c964ec047d6aa92c08aa15ff7deabea Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Fri, 25 Oct 2024 10:10:06 +0200 Subject: [PATCH 02/17] doc: render documentation for `__call__()` methods also The most valuable addition to the rendered documentation is the signature of the method, together with its type annotations. --- docs/_templates/autosummary/class.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/_templates/autosummary/class.rst b/docs/_templates/autosummary/class.rst index f9d0589..3bfb788 100644 --- a/docs/_templates/autosummary/class.rst +++ b/docs/_templates/autosummary/class.rst @@ -7,4 +7,4 @@ :inherited-members: :undoc-members: :show-inheritance: - :special-members: __eq__ + :special-members: __eq__,__call__ From 4b46830825eadb7a83268923c811b650ccec6029 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Fri, 18 Oct 2024 11:01:16 +0200 Subject: [PATCH 03/17] feat: have flyweight be robust to (root-)path resolution Previously, if a `Flyweighted` subclass constructor would somehow alter the given representative path, the flyweight would not catch this, even if to instances alter the path to the exact same value. This change adjust the flyweight logic by double-checking against an instances `.path` property after creation, and return any existing instance matching that path, before keeping a new instance around. I added a TODO, because I suspect this could be done cleaner, by moving the path resolution to an earlier stage. I could not immediately figure this out. It would still need to be subclass specific. I believe, however, that doing this in this particular fashion is not too bad performance-wise, and the complexity of adding this feature is certainly minimal in the current form. 
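
For illustration, a sketch of the intended effect, using `Worktree`
(which gains such root-path resolution in a later changeset of this
series; `root_path` stands for a hypothetical existing worktree root)::

    >>> wt_a = Worktree(root_path)
    >>> wt_b = Worktree(root_path / 'subdir')  # constructor resolves to the root
    >>> wt_b is wt_a  # flyweight returns the already known instance
    True
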
--- datalad_core/repo/flyweight.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/datalad_core/repo/flyweight.py b/datalad_core/repo/flyweight.py index 0b5b639..9202bbb 100644 --- a/datalad_core/repo/flyweight.py +++ b/datalad_core/repo/flyweight.py @@ -50,9 +50,21 @@ def __call__(cls, path: Path): # let generic code fiddle with them to preserve any and # all semantics of the instantiated class instance = type.__call__(cls, path) + # a particular class might resolve the input path parameter + # to a different one (think root path resolution). in this + # case we want to root the instance for that resolved path + # (if we have one), regardless of the given `id_`. + # TODO: ideally such a resolved path would already be given + # to this method. However, MIH does not immediately know + # how to wield the __new__ magic to achieve that. Doing the + # hack below should be rather cheap...for now. + if instance.path in cls._unique_instances: # type: ignore + return cls._unique_instances[instance.path] # type: ignore # ignore typing, because MIH does not know how to say that - # `cls` is required to have a particular class attribute - cls._unique_instances[id_] = instance # type: ignore + # `cls` is required to have a particular class attribute. + # we use the (resolved) path of the created instance as an + # ID for future comparison. + cls._unique_instances[instance.path] = instance # type: ignore return instance @@ -82,3 +94,8 @@ def flyweight_valid(self): This test runs on every object creation and should be kept as cheap as possible. """ + + @property + @abstractmethod + def path(self) -> Path: + """Absolute path representative of the instance""" From 4917a2ae48ff2294ba9554d049db9acd206efcb4 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Fri, 18 Oct 2024 11:05:36 +0200 Subject: [PATCH 04/17] feat: ensure `(Repo|Worktree).path` points to the respective root directory Previously, more or less any path would have been accepted (at lest initially), including non-existing path. This change introduces the cheapest tests for making sure that `Repo` and `Worktree` instances actually point to something existing. The particular implementation in this changeset ensures that any given path is (resolved to) the respective root directory of an entity. With my choice of tests, this additional property comes at no additional cost. --- datalad_core/repo/repo.py | 5 ++++ datalad_core/repo/tests/test_repo.py | 12 +++++++++ datalad_core/repo/tests/test_worktree.py | 22 +++++++++++++++- datalad_core/repo/worktree.py | 33 +++++++++++++++++++----- 4 files changed, 64 insertions(+), 8 deletions(-) diff --git a/datalad_core/repo/repo.py b/datalad_core/repo/repo.py index 84b783a..1d0e1e3 100644 --- a/datalad_core/repo/repo.py +++ b/datalad_core/repo/repo.py @@ -33,6 +33,11 @@ def __init__(self, path: Path): """ ``path`` is the path to an existing repository (Git dir). 
""" + # perform a cheap test whether this even could be a Git repo + if not path.is_dir() or not (path / 'HEAD').exists(): + msg = f'{path} does not point to an existing Git repository' + raise ValueError(msg) + super().__init__(path) self.reset() diff --git a/datalad_core/repo/tests/test_repo.py b/datalad_core/repo/tests/test_repo.py index 9dc2247..65e4f1f 100644 --- a/datalad_core/repo/tests/test_repo.py +++ b/datalad_core/repo/tests/test_repo.py @@ -22,6 +22,18 @@ def test_repo(baregitrepo): assert repo.git_common_dir == baregitrepo +def test_repo_error(tmp_path): + err_match = 'not point to an existing Git' + with pytest.raises(ValueError, match=err_match): + Repo(tmp_path) + with pytest.raises(ValueError, match=err_match): + Repo(tmp_path / 'notexist') + test_file = tmp_path / 'afile' + test_file.touch() + with pytest.raises(ValueError, match=err_match): + Repo(test_file) + + def test_repo_vanish(baregitrepo): repo = Repo(baregitrepo) assert repo.flyweight_valid() diff --git a/datalad_core/repo/tests/test_worktree.py b/datalad_core/repo/tests/test_worktree.py index 30f7f71..26d99d5 100644 --- a/datalad_core/repo/tests/test_worktree.py +++ b/datalad_core/repo/tests/test_worktree.py @@ -22,9 +22,29 @@ def test_worktree(gitrepo): # to the worktree, not the repo directly) assert wt.config is not wt.repo.config assert wt.config['core.bare'].value is False - assert wt.path is gitrepo + assert wt.path == gitrepo assert wt.repo.path != wt.path + # test resolve to root + test_path = wt.path / 'subdir' + test_path.mkdir() + wt_sub = Worktree(test_path) + assert wt_sub.path == wt.path + # and actually + assert wt_sub is wt + + +def test_work_error(tmp_path): + err_match = 'not point to an existing Git worktree/checkout' + with pytest.raises(ValueError, match=err_match): + Worktree(tmp_path) + with pytest.raises(ValueError, match=err_match): + Worktree(tmp_path / 'notexist') + test_file = tmp_path / 'afile' + test_file.touch() + with pytest.raises(ValueError, match=err_match): + Worktree(test_file) + def test_secondary_worktree(gitrepo): test_key = 'brand.new.key' diff --git a/datalad_core/repo/worktree.py b/datalad_core/repo/worktree.py index dda6667..5ad9542 100644 --- a/datalad_core/repo/worktree.py +++ b/datalad_core/repo/worktree.py @@ -1,12 +1,8 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from pathlib import Path from weakref import WeakValueDictionary -if TYPE_CHECKING: - from pathlib import Path - - from datalad_core.config import ( ConfigItem, ConfigManager, @@ -17,7 +13,11 @@ from datalad_core.repo.gitmanaged import GitManaged from datalad_core.repo.repo import Repo from datalad_core.repo.utils import init_annex_at -from datalad_core.runners import call_git +from datalad_core.runners import ( + CommandError, + call_git, + call_git_oneline, +) class Worktree(GitManaged): @@ -35,7 +35,26 @@ def __init__( self, path: Path, ): - super().__init__(path) + """ + ``path`` is the path to an existing repository worktree/checkout. 
+ """ + # test and resolve to worktree root + try: + resolved_root = Path( + call_git_oneline( + [ + '-C', + str(path), + 'rev-parse', + '--path-format=absolute', + '--show-toplevel', + ], + ) + ) + except CommandError as e: + msg = f'{path} does not point to an existing Git worktree/checkout' + raise ValueError(msg) from e + super().__init__(resolved_root) self.reset() def reset(self) -> None: From 1741cc3fbd8eda530bca03231b2e5a2559113627 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Fri, 18 Oct 2024 13:34:32 +0200 Subject: [PATCH 05/17] feat: `Dataset` class to represent a command's `dataset` parameter This may be the most substantial departure from legacy DataLad, where `Dataset` is a central convenience type, to which the majority of all commands bind as methods. Here we are no planing to have such a class that encourages maximization of the depencency footprint, and rather see this as a feature of a separate convenience package for interactive Python API usage. The `Dataset` class in this changeset is conceptualize similar to the `DatasetParameter` class in datalad-next. However, in comparision, `Dataset` has been enriched to provide standardized path resolution for existing and non-existing datasets via its `path` attributes. It also provides access to an existing datasets `worktree` and `repo`, thereby allowing for the distinction of bare and non-bare repository based dataset installations. Most importantly, a `pristine_spec` property gives access to the unaltered input value of any path/repo/worktree resolution process, and can thereby aid commands that want to support additional means of identifying/resolving datasets without having to use another parameter type. --- datalad_core/commands/__init__.py | 3 + datalad_core/commands/dataset.py | 173 ++++++++++++++++++++ datalad_core/commands/tests/test_dataset.py | 69 ++++++++ 3 files changed, 245 insertions(+) create mode 100644 datalad_core/commands/dataset.py create mode 100644 datalad_core/commands/tests/test_dataset.py diff --git a/datalad_core/commands/__init__.py b/datalad_core/commands/__init__.py index 46a963a..9cd068e 100644 --- a/datalad_core/commands/__init__.py +++ b/datalad_core/commands/__init__.py @@ -24,6 +24,7 @@ :toctree: generated datalad_command + Dataset ResultHandler StandardResultHandler PassthroughHandler @@ -32,6 +33,7 @@ """ __all__ = [ + 'Dataset', 'StandardResultHandler', 'ResultHandler', 'PassthroughHandler', @@ -41,6 +43,7 @@ ] +from .dataset import Dataset from .decorator import datalad_command from .default_result_handler import ( StandardResultHandler, diff --git a/datalad_core/commands/dataset.py b/datalad_core/commands/dataset.py new file mode 100644 index 0000000..17e9624 --- /dev/null +++ b/datalad_core/commands/dataset.py @@ -0,0 +1,173 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING + +from datalad_core.repo import ( + Repo, + Worktree, +) + + +class Dataset: + """Dataset parameter type for DataLad command implementations + + Many DataLad commands operate on datasets, which are typically Git + repositories. This class provides a type to represent this parameter. + + The main purpose of this class is to relay the semantics of the original + parameter specification all the way to the implementation of a particular + command. A dataset may be identified in a variety of ways, including + auto-discovery based on a working directory. Individual commands may want + to behave differently depending on how a dataset was identified, or if at + all. 
+ + A second use case are commands that can work on bare repositories and + worktrees alike. This class is a single type from which the presence of + both entities can be discovered without code duplication. + + A third use case are to-be-created datasets for which no repository or + worktree exist on the file system yet, and consequently the + :class:`~datalad_core.repo.Repo` and :class:`~datalad_core.repo.Worktree` + classes cannot be used directly. + + .. note:: + + Despite the name, this class is very different from the ``Dataset`` + class in legacy DataLad. This is not a convenience interface + for DataLad commands that operate on datasets. Instead, it is + merely a type to be used for implementing individual DataLad commands, + with uniform semantics for this key parameter. + """ + + def __init__( + self, + spec: str | Path | Repo | Worktree | None, + ): + """ + A ``spec`` is required, even if the given value is ``None``. + """ + self._spec = spec + self._repo: Repo | None = None + self._worktree: Worktree | None = None + self._path: Path | None = None + + @property + def pristine_spec(self) -> str | Path | Repo | Worktree | None: + """Returns the unaltered specification of the dataset + + This is the exact value that has been given to the constructor. + """ + return self._spec + + @property + def path(self) -> Path: + """Returns the local path associated with any (non-)existing dataset + + If an associated Git repository exists on the file system, the return + path is the worktree root path for non-bare repositories and their + worktree, or the repository root path for bare repositories. + + If no repository exists, the path is derived from the given ``spec`` + regardless of a corresponding directory existing on the file system. + + If the spec is ``None``, the returned path will be the process working + directory. + """ + if self._path is not None: + return self._path + + if self._spec is None: + self._path = Path.cwd() + return self._path + + # use the (resolved) path of a worktree or repo, + # if they exist. + # this gives an absolute path + self._path = ( + self.worktree.path + if self.worktree + else self.repo.path + if self.repo + else None + ) + + if self._path is not None: + return self._path + + # there is nothing on the filesystem, we can only work with the + # pristine_spec as-is + ps = self.pristine_spec + if isinstance(ps, Path): + self._path = ps + else: + if TYPE_CHECKING: + assert isinstance(ps, (Path, str)) + # could be a str-path or some magic label. + # for now we only support a path specification + self._path = Path(ps) + return self._path + + @property + def repo(self) -> Repo | None: + """Returns a repository associated with the dataset (if one exists) + + This property is mostly useful for datasets without a worktree. + For datasets with a worktree it is generally more appropriate + to access the ``repo`` property of the :attr:`worktree` property. + + Returns ``None`` if there is no associated repository. This may + happen, if a repository is yet to be created. + """ + # short cut + ps = self.pristine_spec + + if self._repo is not None: + return self._repo + + if self.worktree is not None: + self._repo = self.worktree.repo + elif isinstance(ps, Repo): + self._repo = ps + elif isinstance(ps, Path): + self._repo = get_gitmanaged_from_pathlike(Repo, ps) + elif isinstance(ps, str): + # could be a str-path or some magic label. 
+ # for now we only support a path specification + self._repo = get_gitmanaged_from_pathlike(Repo, ps) + return self._repo + + @property + def worktree(self) -> Worktree | None: + """Returns a worktree associated with the dataset (if one exists) + + Returns ``None`` if there is no associated worktree. This may + happen, if the dataset is associated with a bare Git repository, + or if the worktree (and repository) is yet to be created. + """ + if self._worktree is None: + ps = self.pristine_spec + if isinstance(ps, Worktree): + # we can take this right away + self._worktree = ps + elif isinstance(ps, Path): + self._worktree = get_gitmanaged_from_pathlike(Worktree, ps) + elif isinstance(ps, str): + # could be a str-path or some magic label. + # for now we only support a path specification + self._worktree = get_gitmanaged_from_pathlike(Worktree, ps) + return self._worktree + + def __repr__(self) -> str: + return f'{self.__class__.__name__}({self.pristine_spec!r})' + + +def get_gitmanaged_from_pathlike(cls, path): + if not isinstance(path, Path): + path = Path(path) + # the constructor will tell us, if this an instance of the + # requested class + try: + return cls(path) + except ValueError: + return None diff --git a/datalad_core/commands/tests/test_dataset.py b/datalad_core/commands/tests/test_dataset.py new file mode 100644 index 0000000..06beca4 --- /dev/null +++ b/datalad_core/commands/tests/test_dataset.py @@ -0,0 +1,69 @@ +from pathlib import Path + +from datalad_core.commands.dataset import Dataset +from datalad_core.repo import ( + Repo, + Worktree, +) + + +def test_nonexisting_dataset_from_nothing(): + spec = None + ds = Dataset(spec) + _assert_no_underlying_git(spec, ds) + assert ds.path == Path.cwd() + + +def test_nonexisting_dataset_from_str(): + spec = 'whatever' + ds = Dataset(spec) + _assert_no_underlying_git(spec, ds) + assert ds.path == Path(spec) + + +def test_nonexisting_dataset_from_path(): + spec = Path('whatever') + ds = Dataset(spec) + _assert_no_underlying_git(spec, ds) + assert ds.path is spec + + +def test_existing_dataset_from_worktree(gitrepo): + wt = Worktree(gitrepo) + ds = Dataset(wt.path) + assert ds.path == wt.path + assert ds.worktree is wt + assert ds.repo is wt.repo + # cached retrieval + assert ds.repo is wt.repo + + ds = Dataset(wt) + assert ds.worktree is wt + assert ds.repo is wt.repo + + ds = Dataset(wt.repo) + assert ds.repo is wt.repo + + +def test_existing_dataset_from_barerepo(baregitrepo): + repo = Repo(baregitrepo) + ds = Dataset(repo.path) + assert ds.path == repo.path + assert ds.worktree is None + assert ds.repo is repo + # cached retrieval + assert ds.repo is repo + + ds = Dataset(repo) + assert ds.path == repo.path + assert ds.worktree is None + assert ds.repo is repo + + +def _assert_no_underlying_git(spec, ds): + assert ds.pristine_spec is spec + assert repr(ds) == f'Dataset({spec!r})' + assert ds.repo is None + assert ds.worktree is None + # under all circumstances we have a path associated with a dataset + assert ds.path is not None From 343e8c7668418fa8c07ebdef9c09f600ebdd854c Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Fri, 18 Oct 2024 19:37:01 +0200 Subject: [PATCH 06/17] feat: allow for `Constraint` subclasses with no `input_description` This change adds a default implementation that simple returns `input_synopsis`. Some things are so simple that the short version is also the full version. 
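
A minimal constraint can now be as simple as this (hypothetical
`EnsureAnything` shown only for illustration)::

    >>> class EnsureAnything(Constraint):
    ...     input_synopsis = 'any value'
    ...     def __call__(self, value):
    ...         return value
    >>> EnsureAnything().input_description
    'any value'
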
--- datalad_core/constraints/constraint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datalad_core/constraints/constraint.py b/datalad_core/constraints/constraint.py index 13f37a7..0fd0733 100644 --- a/datalad_core/constraints/constraint.py +++ b/datalad_core/constraints/constraint.py @@ -59,7 +59,6 @@ def input_synopsis(self) -> str: """ @property - @abstractmethod def input_description(self) -> str: """Returns full description of valid input for a constraint @@ -79,6 +78,7 @@ def input_description(self) -> str: data types. Tailored documentation can be provided via the ``WithDescription`` wrapper. """ + return self.input_synopsis # TODO: also have these for AnyOf and AllOf # def for_dataset(self, dataset: DatasetParameter) -> Constraint: From 93c5443265495c513cfe74ef24d1a75bf2f5149c Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Fri, 18 Oct 2024 20:33:27 +0200 Subject: [PATCH 07/17] feat: add `Constraint.for_dataset()` to tailor constraints for a context This introduces a flexible constraint transformation mechanism. It may be best illustrated by an example: If a command has a parameter that takes the name of an existing Git remote, a simple constraint could only do minimal validation (syntax compliance), because without knowing which dataset to query for existing remote, a full validation is impossible. The implementation possibilities are intentionally kept very flexible. For example, it is not required that `for_dataset()` returns a `Constraint` of the same type. --- datalad_core/constraints/constraint.py | 34 ++++++++++++++----- .../constraints/tests/test_constraint.py | 34 +++++++++++++++++++ 2 files changed, 59 insertions(+), 9 deletions(-) diff --git a/datalad_core/constraints/constraint.py b/datalad_core/constraints/constraint.py index 0fd0733..1815c4c 100644 --- a/datalad_core/constraints/constraint.py +++ b/datalad_core/constraints/constraint.py @@ -6,10 +6,16 @@ ABC, abstractmethod, ) -from typing import Any +from typing import ( + TYPE_CHECKING, + Any, +) from datalad_core.constraints.exceptions import ConstraintError +if TYPE_CHECKING: + from datalad_core.commands import Dataset + class Constraint(ABC): """Base class for value coercion/validation. @@ -80,14 +86,16 @@ def input_description(self) -> str: """ return self.input_synopsis - # TODO: also have these for AnyOf and AllOf - # def for_dataset(self, dataset: DatasetParameter) -> Constraint: - # """Return a constraint-variant for a specific dataset context - # - # The default implementation returns the unmodified, identical - # constraint. However, subclasses can implement different behaviors. - # """ - # return self + # this is called 'for_dataset' and not 'for_submodule' or 'for_repo' + # or 'for_worktree' to give a lot of flexibility re what semantics + # constraints can apply to a transformation like this + def for_dataset(self, dataset: Dataset) -> Constraint: # noqa: ARG002 + """Return a constraint-variant for a specific dataset context + + The default implementation returns the unmodified, identical + constraint. However, subclasses can implement different behaviors. + """ + return self @abstractmethod def __call__(self, value: Any): @@ -121,6 +129,14 @@ def _get_description(self, attr: str, operation: str) -> str: # dont fiddle with the single item, just take it return doc + def for_dataset(self, dataset: Dataset) -> Constraint: + """Return a constraint-variant for a specific dataset context + + The default implementation returns the unmodified, identical + constraint. 
However, subclasses can implement different behaviors. + """ + return self.__class__(*(c.for_dataset(dataset) for c in self.constraints)) + class AnyOf(_MultiConstraint): """Logical OR for constraints. diff --git a/datalad_core/constraints/tests/test_constraint.py b/datalad_core/constraints/tests/test_constraint.py index bd44bb9..6e9e782 100644 --- a/datalad_core/constraints/tests/test_constraint.py +++ b/datalad_core/constraints/tests/test_constraint.py @@ -1,5 +1,6 @@ import pytest +from datalad_core.commands import Dataset from datalad_core.constraints.constraint import ( AllOf, Constraint, @@ -107,3 +108,36 @@ def test_constraint_allof(): # the current implementation is not sufficient anyways assert EnsureInt().input_description in int5.input_description assert eq5.input_description in int5.input_description + + +def test_constraint_for_dataset(): + class BecomesB(Constraint): + input_synopsis = 'B' + + def __call__(self, value): # noqa: ARG002 + return 'B' + + class WeirdOne(Constraint): + input_synopsis = 'shapeshifter' + + def __call__(self, value): # noqa: ARG002 + return 'A' + + def for_dataset(self, dataset): # noqa: ARG002 + return BecomesB() + + c1 = WeirdOne() + assert c1('anything') == 'A' + c2 = c1.for_dataset(Dataset(None)) + assert c2.input_description == 'B' + assert c2('anything') == 'B' + + # same when put into a MultiConstraint + ca = AllOf(WeirdOne()) + assert ca('anything') == 'A' + cb = ca.for_dataset(Dataset(None)) + assert cb('anything') == 'B' + + # without a dedicated implementation there is no transformation + c = BecomesB() + assert c.for_dataset(Dataset(None)) is c From b9caddfb91f4a5ea4ccc8bf8241fc007e8033495 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Sat, 19 Oct 2024 06:05:45 +0200 Subject: [PATCH 08/17] feat: `NoConstraint`, a constraint for representing no constraint This can be used in all places that require a constraint, but none has been given or applies. 
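
For example (mirroring the accompanying test)::

    >>> c = NoConstraint()
    >>> c(5)  # passes any value through unchanged
    5
    >>> c.input_synopsis
    ''
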
--- datalad_core/constraints/__init__.py | 5 +++++ datalad_core/constraints/basic.py | 14 ++++++++++++++ datalad_core/constraints/tests/test_basic.py | 8 ++++++++ 3 files changed, 27 insertions(+) create mode 100644 datalad_core/constraints/basic.py create mode 100644 datalad_core/constraints/tests/test_basic.py diff --git a/datalad_core/constraints/__init__.py b/datalad_core/constraints/__init__.py index 2dacf5c..3b75466 100644 --- a/datalad_core/constraints/__init__.py +++ b/datalad_core/constraints/__init__.py @@ -39,6 +39,7 @@ AnyOf ConstraintError WithDescription + NoConstraint """ __all__ = [ @@ -47,9 +48,13 @@ 'AnyOf', 'ConstraintError', 'WithDescription', + 'NoConstraint', ] +from .basic import ( + NoConstraint, +) from .constraint import ( AllOf, AnyOf, diff --git a/datalad_core/constraints/basic.py b/datalad_core/constraints/basic.py new file mode 100644 index 0000000..16ceecd --- /dev/null +++ b/datalad_core/constraints/basic.py @@ -0,0 +1,14 @@ +from __future__ import annotations + +from datalad_core.constraints.constraint import Constraint + + +class NoConstraint(Constraint): + """A constraint that represents no constraints""" + + @property + def input_synopsis(self): + return '' + + def __call__(self, value): + return value diff --git a/datalad_core/constraints/tests/test_basic.py b/datalad_core/constraints/tests/test_basic.py new file mode 100644 index 0000000..871973c --- /dev/null +++ b/datalad_core/constraints/tests/test_basic.py @@ -0,0 +1,8 @@ +from datalad_core.constraints.basic import NoConstraint + + +def test_noconstraint(): + c = NoConstraint() + assert not c.input_synopsis + test_val = 5 + assert c(test_val) is test_val From a6d3ab97b9dc73bcc7870dc9650f3552d17ec707 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Sat, 19 Oct 2024 07:03:51 +0200 Subject: [PATCH 09/17] feat: `EnsureMappingHasKeys` constraint --- datalad_core/constraints/basic.py | 33 ++++++++++++++++++++ datalad_core/constraints/tests/test_basic.py | 24 +++++++++++++- 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/datalad_core/constraints/basic.py b/datalad_core/constraints/basic.py index 16ceecd..d954a3d 100644 --- a/datalad_core/constraints/basic.py +++ b/datalad_core/constraints/basic.py @@ -1,5 +1,8 @@ from __future__ import annotations +from collections.abc import Mapping +from typing import Any + from datalad_core.constraints.constraint import Constraint @@ -12,3 +15,33 @@ def input_synopsis(self): def __call__(self, value): return value + + +class EnsureMappingtHasKeys(Constraint): + """Ensure a mapping has all given keys""" + + def __init__(self, required_keys: tuple | list): + self._required_keys = required_keys + + @property + def input_synopsis(self): + return ( + 'mapping with required keys {self._required_keys!r}' + if self._required_keys + else 'mapping' + ) + + def __call__(self, value: Any) -> dict: + if not isinstance(value, Mapping): + self.raise_for( + value, + 'not a mapping', + ) + missing = tuple(a for a in self._required_keys if a not in value) + if missing: + self.raise_for( + value, + 'missing keys {missing!r}', + missing=missing, + ) + return value diff --git a/datalad_core/constraints/tests/test_basic.py b/datalad_core/constraints/tests/test_basic.py index 871973c..b9ef1a0 100644 --- a/datalad_core/constraints/tests/test_basic.py +++ b/datalad_core/constraints/tests/test_basic.py @@ -1,4 +1,10 @@ -from datalad_core.constraints.basic import NoConstraint +import pytest + +from datalad_core.constraints.basic import ( + EnsureMappingtHasKeys, + NoConstraint, 
+) +from datalad_core.constraints.exceptions import ConstraintError def test_noconstraint(): @@ -6,3 +12,19 @@ def test_noconstraint(): assert not c.input_synopsis test_val = 5 assert c(test_val) is test_val + + +def test_ensuremappinghaskeys(): + c = EnsureMappingtHasKeys(()) + assert c.input_synopsis == 'mapping' + with pytest.raises(ConstraintError, match='not a mapping'): + c(5) + test_mapping = {5: 'some', 'A': 'some', None: 'some'} + assert c(test_mapping) is test_mapping + + c = EnsureMappingtHasKeys((None, 5)) + assert c(test_mapping) is test_mapping + + c = EnsureMappingtHasKeys(('exotic', 'bogus')) + with pytest.raises(ConstraintError, match='missing.*exotic.*bogus'): + c(test_mapping) From 6a6045d1444349e59ba0f694b38a41129f0ae258 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Tue, 22 Oct 2024 07:39:17 +0200 Subject: [PATCH 10/17] feat: have `UnsetValue` be available from `consts` It makes sense to promote this special type in a central place to avoid proliferation of equivalent alternatives. Also see: https://github.com/datalad/datasalad/issues/53 Also add an `__all__`. --- datalad_core/consts/__init__.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/datalad_core/consts/__init__.py b/datalad_core/consts/__init__.py index c1e0b1b..ef0fd97 100644 --- a/datalad_core/consts/__init__.py +++ b/datalad_core/consts/__init__.py @@ -1,7 +1,16 @@ """Assorted common constants""" +__all__ = [ + 'UnsetValue', + 'DATALAD_DOTDIR_RELPATH', + 'DATALAD_BRANCH_CONFIG_RELPATH', + 'PRE_INIT_COMMIT_SHA', +] + from os.path import join as opj +from datasalad.settings import UnsetValue + DATALAD_DOTDIR_RELPATH = '.datalad' """Path to dataset directory with committed datalad-specific information From 35a5eb984e84da2b58f52170159d5615c70a3eaa Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Wed, 23 Oct 2024 09:37:53 +0200 Subject: [PATCH 11/17] feat: `EnsureDataset` constraint A constraint to yield a `Dataset` instance from a wide range of inputs. This class is envisioned to be a standard ingredient to the majority of to-be-implemented commands. --- datalad_core/commands/dataset.py | 75 ++++++++++++++++++++ datalad_core/commands/tests/test_dataset.py | 78 ++++++++++++++++++++- 2 files changed, 152 insertions(+), 1 deletion(-) diff --git a/datalad_core/commands/dataset.py b/datalad_core/commands/dataset.py index 17e9624..d1fcde7 100644 --- a/datalad_core/commands/dataset.py +++ b/datalad_core/commands/dataset.py @@ -3,6 +3,7 @@ from pathlib import Path from typing import TYPE_CHECKING +from datalad_core.constraints import Constraint from datalad_core.repo import ( Repo, Worktree, @@ -162,6 +163,80 @@ def __repr__(self) -> str: return f'{self.__class__.__name__}({self.pristine_spec!r})' +class EnsureDataset(Constraint): + """Ensure an absent/present `Dataset` from any path or Dataset instance + + Regardless of the nature of the input (`Dataset` instance or local path) + a resulting instance (if it can be created) is optionally tested for + absence or presence on the local file system. + + Due to the particular nature of the `Dataset` class (the same instance + is used for a unique path), this constraint returns a `DatasetParameter` + rather than a `Dataset` directly. Consuming commands can discover + the original parameter value via its `original` property, and access a + `Dataset` instance via its `ds` property. + + In addition to any value representing an explicit path, this constraint + also recognizes the special value `None`. 
This instructs the implementation + to find a dataset that contains the process working directory (PWD). + Such a dataset need not have its root at PWD, but could be located in + any parent directory too. If no such dataset can be found, PWD is used + directly. Tests for ``installed`` are performed in the same way as with + an explicit dataset location argument. If `None` is given and + ``installed=True``, but no dataset is found, an exception is raised + (this is the behavior of the ``required_dataset()`` function in + the DataLad core package). With ``installed=False`` no exception is + raised and a dataset instances matching PWD is returned. + """ + + def __init__(self, installed: bool | str | None = None): + """ + Parameters + ---------- + installed: bool, optional + If given, a dataset will be verified to be installed or not. + Otherwise the installation-state will not be inspected. + """ + self._installed = installed + super().__init__() + + @property + def input_synopsis(self) -> str: + return '(path to) {}dataset'.format( + 'an existing ' + if self._installed + else 'a non-existing ' + if self._installed is False + else 'a ' + ) + + def __call__(self, value) -> Dataset: + ds = Dataset(value) + try: + # resolve + ds.path # noqa: B018 + except (ValueError, TypeError) as e: + self.raise_for( + value, + 'cannot create Dataset from {type}: {__caused_by__}', + type=type(value), + __caused_by__=e, + ) + if self._installed is False and (ds.worktree or ds.repo): + self.raise_for(ds, 'already exists locally') + if self._installed and not (ds.worktree or ds.repo): + self.raise_for(ds, 'not installed') + if self._installed != 'with-id': + return ds + + to_query = ds.worktree or ds.repo + if TYPE_CHECKING: + assert to_query is not None + if 'datalad.dataset.id' not in to_query.config.sources['datalad-branch']: + self.raise_for(ds, 'does not have a datalad-id') + return ds + + def get_gitmanaged_from_pathlike(cls, path): if not isinstance(path, Path): path = Path(path) diff --git a/datalad_core/commands/tests/test_dataset.py b/datalad_core/commands/tests/test_dataset.py index 06beca4..18f596c 100644 --- a/datalad_core/commands/tests/test_dataset.py +++ b/datalad_core/commands/tests/test_dataset.py @@ -1,6 +1,13 @@ from pathlib import Path -from datalad_core.commands.dataset import Dataset +import pytest + +from datalad_core.commands.dataset import ( + Dataset, + EnsureDataset, +) +from datalad_core.config import ConfigItem +from datalad_core.constraints import ConstraintError from datalad_core.repo import ( Repo, Worktree, @@ -67,3 +74,72 @@ def _assert_no_underlying_git(spec, ds): assert ds.worktree is None # under all circumstances we have a path associated with a dataset assert ds.path is not None + + +def test_ensuredataset(tmp_path): + assert EnsureDataset().input_synopsis == '(path to) a dataset' + assert ( + EnsureDataset(installed=True).input_synopsis == '(path to) an existing dataset' + ) + assert ( + EnsureDataset(installed=False).input_synopsis + == '(path to) a non-existing dataset' + ) + + with pytest.raises( + ConstraintError, + match="cannot create Dataset from ", + ): + EnsureDataset()(5) + + with pytest.raises( + ConstraintError, + match='not installed', + ): + EnsureDataset(installed=True)(tmp_path) + + +def test_ensuredataset_bare(baregitrepo): + c = EnsureDataset() + for spec in (baregitrepo,): + assert c(spec).repo.path == baregitrepo + + with pytest.raises( + ConstraintError, + match='already exists locally', + ): + EnsureDataset(installed=False)(baregitrepo) + + +def 
test_ensuredataset_nonbare(gitrepo): + c = EnsureDataset() + for spec in (gitrepo,): + assert c(spec).worktree.path == gitrepo + + with pytest.raises( + ConstraintError, + match='already exists locally', + ): + EnsureDataset(installed=False)(gitrepo) + + +def test_ensuredataset_with_id(tmp_path, gitrepo): + c = EnsureDataset(installed='with-id') + with pytest.raises( + ConstraintError, + match='not installed', + ): + c(tmp_path) + + with pytest.raises( + ConstraintError, + match='does not have a datalad-id', + ): + c(gitrepo) + + # now set an ID in the branch config and satisfy the test + wt = Worktree(gitrepo) + wt.config.sources['datalad-branch']['datalad.dataset.id'] = ConfigItem( + '9557edc6-910e-11ef-bcf8-23e60847c95d' + ) + assert c(gitrepo).path == gitrepo From c8b55cc744f76fae360c5aa2877fdb80278ee302 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Thu, 24 Oct 2024 16:39:55 +0200 Subject: [PATCH 12/17] feat: `EnsureChoice` constraint This is a straight port of the datalad-next implementation. A difference is the added `choices` property to report the configured choices. The previously private attribute was accessed in many client implementations. --- datalad_core/constraints/__init__.py | 3 ++ datalad_core/constraints/basic.py | 30 +++++++++++++++++++- datalad_core/constraints/tests/test_basic.py | 15 ++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/datalad_core/constraints/__init__.py b/datalad_core/constraints/__init__.py index 3b75466..89a5109 100644 --- a/datalad_core/constraints/__init__.py +++ b/datalad_core/constraints/__init__.py @@ -40,6 +40,7 @@ ConstraintError WithDescription NoConstraint + EnsureChoice """ __all__ = [ @@ -47,12 +48,14 @@ 'AllOf', 'AnyOf', 'ConstraintError', + 'EnsureChoice', 'WithDescription', 'NoConstraint', ] from .basic import ( + EnsureChoice, NoConstraint, ) from .constraint import ( diff --git a/datalad_core/constraints/basic.py b/datalad_core/constraints/basic.py index d954a3d..2e99fa5 100644 --- a/datalad_core/constraints/basic.py +++ b/datalad_core/constraints/basic.py @@ -1,6 +1,8 @@ from __future__ import annotations -from collections.abc import Mapping +from collections.abc import ( + Mapping, +) from typing import Any from datalad_core.constraints.constraint import Constraint @@ -17,6 +19,32 @@ def __call__(self, value): return value +class EnsureChoice(Constraint): + """Ensure an input is element of a set of possible values""" + + def __init__(self, *values: Any): + self._choices = tuple(values) + super().__init__() + + @property + def choices(self) -> tuple[Any]: + """Returns the possible choice values""" + return self._choices + + def __call__(self, value): + if value not in self.choices: + self.raise_for( + value, + 'is not one of {allowed}', + allowed=self.choices, + ) + return value + + @property + def input_synopsis(self): + return f"one of {{{','.join([repr(c) for c in self.choices])}}}" + + class EnsureMappingtHasKeys(Constraint): """Ensure a mapping has all given keys""" diff --git a/datalad_core/constraints/tests/test_basic.py b/datalad_core/constraints/tests/test_basic.py index b9ef1a0..2fdc895 100644 --- a/datalad_core/constraints/tests/test_basic.py +++ b/datalad_core/constraints/tests/test_basic.py @@ -1,6 +1,7 @@ import pytest from datalad_core.constraints.basic import ( + EnsureChoice, EnsureMappingtHasKeys, NoConstraint, ) @@ -14,6 +15,20 @@ def test_noconstraint(): assert c(test_val) is test_val +def test_ensurechoice(): + c = EnsureChoice('choice1', 'choice2', None) + assert c.input_synopsis == 
"one of {'choice1','choice2',None}" + assert str(c) == f'Constraint[{c.input_synopsis}]' + # this should always work + assert c('choice1') == 'choice1' + assert c(None) is None + # this should always fail + with pytest.raises(ValueError, match='is not one of'): + c('fail') + with pytest.raises(ValueError, match='is not one of'): + c('None') + + def test_ensuremappinghaskeys(): c = EnsureMappingtHasKeys(()) assert c.input_synopsis == 'mapping' From 5d2b494a39be7c42392f15c1384fcd241213ad67 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Tue, 22 Oct 2024 07:27:38 +0200 Subject: [PATCH 13/17] feat: parameter preprocessor (for `@datalad_command`) This is a clarified and streamlines implementation migrated from datalad-next. `JointParamProcessor` is no longer a `Constraint` subclass, but a standalone implementation of the `ParamProcessor` interface. --- datalad_core/commands/__init__.py | 21 +- datalad_core/commands/exceptions.py | 198 +++++++++++ datalad_core/commands/param_constraint.py | 67 ++++ datalad_core/commands/preproc.py | 366 ++++++++++++++++++++ datalad_core/commands/tests/test_cmd.py | 64 ++++ datalad_core/commands/tests/test_preproc.py | 257 ++++++++++++++ 6 files changed, 972 insertions(+), 1 deletion(-) create mode 100644 datalad_core/commands/exceptions.py create mode 100644 datalad_core/commands/param_constraint.py create mode 100644 datalad_core/commands/preproc.py create mode 100644 datalad_core/commands/tests/test_preproc.py diff --git a/datalad_core/commands/__init__.py b/datalad_core/commands/__init__.py index 9cd068e..2a2528a 100644 --- a/datalad_core/commands/__init__.py +++ b/datalad_core/commands/__init__.py @@ -24,18 +24,28 @@ :toctree: generated datalad_command - Dataset ResultHandler StandardResultHandler PassthroughHandler get_default_result_handler set_default_result_handler + Dataset + ParamProcessor + JointParamProcessor + ParamConstraintContext + ParamErrors + ParamSetConstraint """ __all__ = [ 'Dataset', 'StandardResultHandler', 'ResultHandler', + 'ParamProcessor', + 'JointParamProcessor', + 'ParamConstraintContext', + 'ParamErrors', + 'ParamSetConstraint', 'PassthroughHandler', 'datalad_command', 'get_default_result_handler', @@ -50,6 +60,15 @@ get_default_result_handler, set_default_result_handler, ) +from .exceptions import ( + ParamConstraintContext, + ParamErrors, +) +from .param_constraint import ParamSetConstraint +from .preproc import ( + JointParamProcessor, + ParamProcessor, +) from .result_handler import ( PassthroughHandler, ResultHandler, diff --git a/datalad_core/commands/exceptions.py b/datalad_core/commands/exceptions.py new file mode 100644 index 0000000..b0c3df2 --- /dev/null +++ b/datalad_core/commands/exceptions.py @@ -0,0 +1,198 @@ +from __future__ import annotations + +from dataclasses import dataclass +from textwrap import indent +from types import MappingProxyType +from typing import ( + TYPE_CHECKING, + Any, +) + +if TYPE_CHECKING: + from collections.abc import Mapping + +from datalad_core.constraints import ConstraintError +from datalad_core.consts import UnsetValue + + +# TODO: Consider moving that to datalad_core for reuse. +# In here it is only an internal base class for ParamErrors +class ConstraintErrors(ConstraintError): # noqa: N818 + """Exception representing context-specific ConstraintError instances + + This class enables the association of a context in which any particular + constraint was violated. 
This is done by passing a mapping of a context
+    identifier (e.g., a label) to the particular ``ConstraintError`` that
+    occurred in this context, to the constructor.
+
+    This is a generic implementation with no requirements regarding the
+    nature of the context identifiers (except for being hashable). See
+    ``CommandParametrizationError`` for a specialization.
+    """
+
+    def __init__(self, exceptions: Mapping[Any, ConstraintError]):
+        super().__init__(
+            # this is the main payload, the base class expects a Constraint
+            # but only stores it
+            constraint=exceptions,
+            # all values are already on record in the respective exceptions
+            # no need to pass again
+            value=None,
+            # no support for a dedicated message here (yet?), pass an empty
+            # string to match assumptions
+            msg='',
+            # and no context
+            ctx=None,
+        )
+
+    @property
+    def errors(self) -> Mapping[Any, ConstraintError]:
+        # read-only access
+        return MappingProxyType(self.args[1])
+
+
+@dataclass(frozen=True)
+class ParamConstraintContext:
+    """Representation of a parameter constraint context
+
+    This type is used for the keys in the error map of
+    ``ParamErrors``. Its purpose is to clearly identify which
+    parameter combination (and its nature) led to a `ConstraintError`.
+
+    An error context comprises two components: 1) the names of the parameters
+    that were considered, and 2) a description of the particular aspect of the
+    parameter values that is the focus of the constraint. In the simple case
+    of an error occurring in the context of a single parameter, the second
+    component is superfluous.
+
+    Example:
+
+    A command has two parameters `p1` and `p2`. They may also have respective
+    individual constraints, but importantly they 1) must not have identical
+    values, and 2) their sum must be larger than 3. If the command is called
+    with ``cmd(p1=1, p2=1)``, both conditions are violated. The reporting may
+    be implemented using the following ``ParamConstraintContext`` and
+    ``ConstraintError`` instances::
+
+        ParamConstraintContext(('p1', 'p2'), 'inequality'):
+        ConstraintError(EnsureValue(True), False, )
+
+        ParamConstraintContext(('p1', 'p2'), 'sum'):
+        ConstraintError(EnsureRange(min=3), False, )
+
+    where the ``ConstraintError`` instances are generated by standard
+    ``Constraint`` implementations. For the second error, this could look
+    like::
+
+        EnsureRange(min=3)(params['p1'] + params['p2'])
+    """
+
+    param_names: tuple[str, ...]
+    description: str | None = None
+
+    def __str__(self):
+        return f'Context[{self.label}]'
+
+    @property
+    def label(self) -> str:
+        """A concise summary of the context
+
+        This label will be as compact as possible.
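+        For example, a context created as
+        ``ParamConstraintContext(('p1', 'p2'), 'identity')`` is rendered
+        as the label ``'p1, p2 (identity)'``.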
+ """ + # TODO: this could be __str__ but its intended usage for rendering + # a text description of all errors would seemingly forbid adding + # type information -- which OTOH seems to be desirable for __str__ + return '{param}{descr}'.format( + param=', '.join(self.param_names), + descr=f' ({self.description})' if self.description else '', + ) + + def get_label_with_parameter_values(self, values: dict) -> str: + """Like ``.label`` but each parameter will also state a value""" + # TODO: truncate the values after repr() to ensure a somewhat compact + # output + return '{param}{descr}'.format( + param=', '.join( + f'{p}=' + if isinstance(values[p], UnsetValue) + else f'{p}={values[p]!r}' + for p in self.param_names + ), + descr=f' ({self.description})' if self.description else '', + ) + + +class ParamErrors(ConstraintErrors): + """Exception for command parameter constraint violations + + This is a ``ConstraintErrors`` variant that uses + :class:`ParamConstraintContext` (i.e, parameter names) as context + identifiers. + + The main purpose of this class is to enable structured reporting + of individual errors via its :attr:`errors` property. + . + + There is also a generic :meth:`__str__` implementation that can + render an itemization of errors with some context information. + + >>> pe = ParamErrors( + ... { + ... ParamConstraintContext(('p1', 'p2'), 'range'): ConstraintError( + ... '', + ... {'p1': 4, 'p2': 3}, + ... 'p1 must not be larger than p2', + ... ), + ... } + ... ) + >>> print(pe) + 1 parameter constraint violation + p1=4, p2=3 (range) + p1 must not be larger than p2 + + Key components of an error message are available via dedicated convenience + accessors: + + >>> pe.context_labels + ('p1, p2 (range)',) + >>> pe.messages + ('p1 must not be larger than p2',) + """ + + def __init__( + self, + exceptions: Mapping[ParamConstraintContext, ConstraintError], + ): + """ """ + super().__init__(exceptions) + + @property + def messages(self) -> tuple[str]: + return tuple(e.msg for e in self.errors.values()) + + @property + def context_labels(self): + """Some""" + return tuple(e.label for e in self.errors) + + def __str__(self): + return self._render_violations_as_indented_text_list('parameter') + + def _render_violations_as_indented_text_list(self, violation_subject): + violations = len(self.errors) + + return '{ne} {vs}constraint violation{p}\n{el}'.format( + ne=violations, + vs=f'{violation_subject} ' if violation_subject else '', + p='s' if violations > 1 else '', + el='\n'.join( + '{ctx}\n{msg}'.format( + ctx=ctx.get_label_with_parameter_values( + c.value + if isinstance(c.value, dict) + else {ctx.param_names[0]: c.value} + ), + msg=indent(str(c), ' '), + ) + for ctx, c in self.errors.items() + ), + ) diff --git a/datalad_core/commands/param_constraint.py b/datalad_core/commands/param_constraint.py new file mode 100644 index 0000000..224fb16 --- /dev/null +++ b/datalad_core/commands/param_constraint.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +from abc import abstractmethod +from typing import ( + TYPE_CHECKING, + Any, +) + +if TYPE_CHECKING: + from collections.abc import Mapping + +from datalad_core.commands.exceptions import ParamConstraintContext +from datalad_core.constraints import Constraint + + +class ParamSetConstraint(Constraint): + """Base class for constraints of command parameter sets + + A parameter set is a mapping of parameter names (``str``) to arbitrary + values. 
+ + This :class:`Constraint` type can be used to implement arbitrary + validation, and/or transformation of values in a parameters set. + This includes changing the set of returned parameters compared to + those passed to the constraint (e.g., adding parameters, moving + values from one to another). This flexibility allows for implementing + parameter deprecation and transitions with this constraint type. + + In additions to a basic :class:`Constraint`, this type supports + reporting more complex context information via its :attr:`context` + property. This context includes the parameter names, given to the + constructor. In addition it includes a custom "aspect" description, + which can further identify which particular facet of the parameter + set is the subject of a particular constraint implementation, + such as "identity", "range", or "type". + """ + + def __init__( + self, + param_names: tuple[str, ...], + *, + aspect: str | None = None, + ): + self._context = ParamConstraintContext( + param_names=param_names, + description=aspect, + ) + + @property + def param_names(self) -> tuple[str, ...]: + """Names of parameters processed by the constraint""" + return self._context.param_names + + @property + def context(self) -> ParamConstraintContext: + """:class:`ParamConstraintContext` with names and aspect description""" + return self._context + + @abstractmethod + def __call__(self, val: Mapping[str, Any]) -> Mapping[str, Any]: + """To be implemented by subclasses to perform parameter set processing + + An implementation must return a mapping of parameter names to their + processed values. + + On error, implementation should call :meth:`raise_for`. + """ diff --git a/datalad_core/commands/preproc.py b/datalad_core/commands/preproc.py new file mode 100644 index 0000000..f589b8f --- /dev/null +++ b/datalad_core/commands/preproc.py @@ -0,0 +1,366 @@ +"""Constraints for command/function parameters""" + +from __future__ import annotations + +from abc import ( + ABC, + abstractmethod, +) +from itertools import chain +from typing import ( + TYPE_CHECKING, + Any, +) + +if TYPE_CHECKING: + from collections.abc import ( + Container, + Iterable, + Mapping, + ) + + from datalad_core.commands.param_constraint import ParamSetConstraint + +from datalad_core.commands.dataset import Dataset +from datalad_core.commands.exceptions import ( + ParamConstraintContext, + ParamErrors, +) +from datalad_core.config import ( + ConfigItem, + get_defaults, + get_manager, +) +from datalad_core.constraints import ( + Constraint, + ConstraintError, + EnsureChoice, +) +from datalad_core.constraints.basic import ( + NoConstraint, +) + +# register defaults of configuration supported by the code in this module +defaults = get_defaults() +defaults['datalad.runtime.parameter-violation'] = ConfigItem( + 'raise-early', + coercer=EnsureChoice('raise-early', 'raise-at-end'), +) + + +class ParamProcessor(ABC): + """Abstract base class for parameter processors + + Derived classes must implement `__call__()`, which receives two parameters: + + - ``kwargs``: a mapping of ``str`` parameter names to arbitrary values + - ``at_default``: a ``set`` of parameter names, where the value given via + ``kwargs`` is identical to the respective implementation default (i.e., + the default set in a function's signature. 
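+
+    A minimal sketch of a custom processor (purely illustrative, the class
+    name and behavior are not part of this module) that upper-cases all
+    string values could look like this::
+
+        class UpperCaseStrings(ParamProcessor):
+            def __call__(self, kwargs, at_default=None):
+                # pass non-str values through unchanged
+                return {
+                    k: v.upper() if isinstance(v, str) else v
+                    for k, v in kwargs.items()
+                }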
+    """
+
+    @abstractmethod
+    def __call__(
+        self,
+        kwargs: Mapping[str, Any],
+        at_default: set[str] | None = None,
+    ) -> Mapping[str, Any]:
+        """ """
+
+
+class JointParamProcessor(ParamProcessor):
+    """Parameter preprocessor with support for multi-parameter set handling
+
+    Typically, this processor is used with particular value :class:`Constraint`
+    instances for individual parameters. For example, declaring that a ``mode``
+    parameter must be one of a set of possible choices may look like this::
+
+    >>> pp = JointParamProcessor({'mode': EnsureChoice('slow', 'fast')})
+    >>> pp({'mode': 'slow'})
+    {'mode': 'slow'}
+
+    The respective parameter values will be passed "through" their constraint.
+    When no constraint is given for a particular parameter, any input value
+    is passed on as-is.
+
+    There is one exception to this rule: When a parameter value is declared
+    identical to its default value (e.g., as declared in the command signature)
+    via the ``at_default`` argument of ``__call__()``, this default value is
+    also passed on as-is.
+
+    An important consequence of this behavior is that assigned constraints
+    generally need not cover a default value. For example, a parameter
+    constraint for ``mode=None`` -- where ``None`` is a special value used to
+    indicate an optional and unset value, but actually only specific ``str``
+    labels are acceptable input values -- can simply use something like
+    ``EnsureChoice('a', 'b')``, and it is not necessary to do something like
+    ``EnsureChoice('a', 'b', None)``.
+
+    However, this class can also be specifically instructed to perform
+    validation of defaults for individual parameters, by including any
+    parameter name in the ``proc_defaults`` parameter. A common use case is
+    the auto-discovery of datasets, where often ``None`` is the default value
+    of a `dataset` parameter (to make it optional), and an
+    :class:`EnsureDataset` constraint is used. This constraint can perform the
+    auto-discovery (with the ``None`` value indicating that), but validation of
+    defaults must be turned on for the ``dataset`` parameter in order to do
+    that.
+
+    With ``on_error``, the error handling during processing can be configured.
+    When set to ``'raise-early'``, processing stops when encountering the first
+    error and a :class:`ParamErrors` exception is raised. With
+    ``'raise-at-end'``, processing continues (as far as possible), and potential
+    additional errors are collected and raised jointly with a
+    :class:`ParamErrors`. When set to ``None``, the error reporting mode is
+    taken from the configuration setting
+    ``datalad.runtime.parameter-violation``, which can take any of the
+    aforementioned mode labels.
+
+    A raised instance of the :class:`ParamErrors` exception can report all
+    individual issues found for a given parameter set in a structured
+    fashion. This helps with reporting such errors in a UI-appropriate manner.
+    The gathering and structured reporting of errors is only supported for
+    :class:`ConstraintError` exceptions raised by individual constraints.
+    Any other exception is raised immediately.
+
+    Higher-order parameter constraints
+    ----------------------------------
+
+    After per-parameter constraint processing, this class can perform
+    additional (optional) higher-order parameter set constraint processing.
+    This is a flexible mechanism that can be used to investigate the values of
+    multiple parameters simultaneously.
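+
+    For illustration, a (hypothetical) parameter set constraint that forbids
+    two parameters ``p1`` and ``p2`` from carrying the same value could be
+    implemented and used like this::
+
+        class NotEqual(ParamSetConstraint):
+            input_synopsis = 'p1 and p2 must differ'
+
+            def __call__(self, val):
+                vals = [val[p] for p in self.param_names]
+                if len(vals) != len(set(vals)):
+                    self.raise_for(val, 'not all unique')
+                return val
+
+        JointParamProcessor(
+            {},
+            paramset_constraints=[NotEqual(('p1', 'p2'), aspect='identity')],
+        )
+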
This feature can also be used to + implement API transitions, because values can be moved from one parameter + to another, and/or deprecation messages can be issued. + + Such multi-parameter processing is enabled by passing instances of + :class:`ParamSetConstraint` to the ``paramset_constraints`` + keyword-argument. Each instance declares which parameters it will + process and will receive the associated values as a mapping to its + ``__call__()`` method. + + The order of :class:`ParamSetConstraint` instances given to + ``paramset_constraints`` is significant. Later instances will receive the + outcomes of the processing of earlier instances. + + Error handling for :class:`ParamSetConstraint` instances follows the + same rules as described above, and ``on_error`` is honored in the same + way. + + Automated tailoring of constraints to a specific dataset + -------------------------------------------------------- + + With ``tailor_for_dataset``, constraints for individual parameters can be + automatically transformed to apply to the scope of a particular dataset. + The value given to ``tailor_for_dataset`` has to be a mapping of names of + parameters whose constraints should be tailored to a particular dataset, to + the respective names parameters identifying these datasets. + + The dataset-providing parameter constraints will be evaluated first, and + the resulting :class:`Dataset` instances are used to tailor the constraints + that require a dataset-context, by calling their + :meth:`~Constraint.for_dataset` method. Tailoring is performed if, and only + if, the dataset-providing parameter actually evaluated to a `Dataset` + instance. The non-tailored constraint is used otherwise. + """ + + def __init__( + self, + param_constraints: Mapping[str, Constraint], + *, + proc_defaults: Container[str] | None = None, + paramset_constraints: Iterable[ParamSetConstraint] | None = None, + tailor_for_dataset: Mapping[str, str] | None = None, + on_error: str | None = None, + ): + super().__init__() + self._param_constraints = param_constraints + self._paramset_constraints = paramset_constraints + self._proc_defaults = proc_defaults or set() + self._tailor_for_dataset = tailor_for_dataset or {} + if on_error is None: + # consider configuration for immediate vs exhaustive processing + on_error = ( + get_manager() + .get( + 'datalad.runtime.parameter-violation', + 'raise-early', + ) + .value + ) + # TODO: migrate to coercer for config options + if on_error not in ('raise-early', 'raise-at-end'): + msg = "`on_error` must be 'raise-early' or 'raise-at-end'" + raise ValueError(msg) + self._on_error = on_error + + def __call__( + self, + kwargs: Mapping[str, Any], + at_default: set[str] | None = None, + ) -> Mapping[str, Any]: + """Performs the configured processing on the given parameter values + + The ``kwargs`` mapping specifies the to-be-processed parameters and + their values. Upon successful completion, the method will return + such a mapping again. + + With ``at_default``, a subset of parameters can be identified by name, + whose values are to be considered at their respective default values. + This is only relevant for deciding whether to exclude particular + values from constraint processing (see the ``proc_defaults`` + constructor argument). + + A :class:`ParamErrors` exception is raised whenever one or more + :class:`ConstraintError` exceptions have been caught during processing. 
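+
+        A minimal sketch of the ``at_default`` effect, using a hypothetical
+        ``Make5`` constraint that coerces any input to ``5``::
+
+            pp = JointParamProcessor({'p1': Make5()})
+            pp({'p1': 6})                     # -> {'p1': 5}
+            pp({'p1': 6}, at_default={'p1'})  # -> {'p1': 6}, left at default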
+ """ + on_error = self._on_error + + exceptions: dict[ParamConstraintContext, ConstraintError] = {} + + # names of parameters we need to process + to_process = set(kwargs) + # check for any dataset that are required for tailoring other parameters + ds_provider_params = set(self._tailor_for_dataset.values()) + # take these out of the set of parameters to validate, because we need + # to process them first. + # the approach is to simply sort them first, but otherwise apply standard + # handling + to_process.difference_update(ds_provider_params) + # strip all args provider args that have not been provided + ds_provider_params.intersection_update(kwargs) + + processed: dict[str, Any] = {} + failed_to_process = set() + # process all parameters. starts with those that are needed as + # dependencies for others. + # this dependency-based sorting is very crude for now. it does not + # consider possible dependencies within `ds_provider_params` at all + for pname in chain(ds_provider_params, to_process): + try: + processed[pname] = self._proc_param( + name=pname, + value=kwargs[pname], + at_default=at_default is not None and pname in at_default, + processed=processed, + ) + # we catch only ConstraintError -- only these exceptions have what + # we need for reporting. If any validator chooses to raise + # something else, we do not handle it here, but let it bubble up. + # it may be an indication of something being wrong with processing + # itself + except ConstraintError as e: + # standard exception type, record and proceed + exceptions[ParamConstraintContext((pname,))] = e + if on_error == 'raise-early': + raise ParamErrors(exceptions) from e + # we record this for an easy check whether it is worth proceeding + # with higher order processing + failed_to_process.add(pname) + + # do not bother with joint processing when the set of expected + # arguments is not complete + expected_for_proc_param_sets: set[str] = set() + for c in self._paramset_constraints or []: + expected_for_proc_param_sets.update(c.param_names) + + if expected_for_proc_param_sets.intersection(failed_to_process): + raise ParamErrors(exceptions) + + try: + # call (subclass) method to perform holistic, cross-parameter + # processing of the full parameterization + final = self._proc_param_sets(processed, on_error) + # ATTN: we do not want to test for equivalence of keys in + # `processed` and `final`. And is desirable for a preprocessor + # to add or remove parameters from the parameter set that goes + # on to an eventually callable. + except ParamErrors as e: + # we can simply suck in the reports, the context keys do not + # overlap, unless the provided validators want that for some + # reason + exceptions.update(e.errors) + + if exceptions: + raise ParamErrors(exceptions) + + return final + + def _proc_param( + self, + name: str, + value: Any, + at_default: bool, # noqa: FBT001 + processed: Mapping[str, Any], + ) -> Any: + if at_default and name not in self._proc_defaults: + # do not validate any parameter where the value matches the + # default declared in the signature. Often these are just + # 'do-nothing' settings or have special meaning that need + # not be communicated to a user. 
Not validating them has + # two consequences: + # - the condition can simply be referred to as "default + # behavior" regardless of complexity + # - a command implementation must always be able to handle + # its own defaults directly, and cannot delegate a + # default value handling to a constraint + # + # we must nevertheless pass any such default value through + # to make/keep them accessible to the general result handling + # code + return value + + # look-up validator for this parameter, if there is none use + # NoConstraint to avoid complex conditionals in the code below + validator = self._param_constraints.get(name, NoConstraint()) + + # do we need to tailor this constraint for a specific dataset? + # only do if instructed AND the respective other parameter + # processed to a Dataset instance. Any such parameter was sorted + # to be processed first in this loop, so the outcome of that is + # already available + tailor_for = self._tailor_for_dataset.get(name) + if tailor_for and isinstance(processed.get(tailor_for), Dataset): + validator = validator.for_dataset(processed[tailor_for]) + + return validator(value) + + def _proc_param_sets(self, params: dict, on_error: str) -> dict: + exceptions = {} + processed = params.copy() + + for constraint in self._paramset_constraints or []: + ctx = constraint.context + # what the validator will produce + res = None + try: + # call the validator with the parameters given in the context + # and only with those, to make sure the context is valid + # and not an underspecification. + # pull the values form `processed` to be able to benefit + # from incremental coercing done in individual checks + res = constraint({p: processed[p] for p in ctx.param_names}) + except ConstraintError as e: + if ( + not isinstance(e.value, dict) + or set(ctx.param_names) != e.value.keys() + ): # pragma: no cover + msg = ( + 'on raising a ConstraintError the joint validator ' + f'{constraint} did not report ' + 'a mapping of parameter name to (violating) value ' + 'comprising all constraint context parameters. ' + 'This is a software defect of the joint validator. ' + 'Please report!' 
+ ) + raise RuntimeError(msg) from e + exceptions[ctx] = e + if on_error == 'raise-early': + raise ParamErrors(exceptions) from e + if res is not None: + processed.update(**res) + + if exceptions: + raise ParamErrors(exceptions) + + return processed diff --git a/datalad_core/commands/tests/test_cmd.py b/datalad_core/commands/tests/test_cmd.py index 1573e56..45baedd 100644 --- a/datalad_core/commands/tests/test_cmd.py +++ b/datalad_core/commands/tests/test_cmd.py @@ -9,10 +9,16 @@ get_default_result_handler, set_default_result_handler, ) +from datalad_core.commands.param_constraint import ParamSetConstraint +from datalad_core.commands.preproc import JointParamProcessor from datalad_core.commands.result_handler import ( PassthroughHandler, ResultHandler, ) +from datalad_core.constraints import ( + Constraint, + ConstraintError, +) def test_testcmd_minimal(): @@ -117,6 +123,64 @@ def test_command(): assert list(test_command(on_failure='ignore')) == test_results +def test_testcmd_preproc(): + class EnsureInt(Constraint): + input_synopsis = 'make INT' + + def __call__(self, val): + try: + return int(val) + except (TypeError, ValueError) as e: + self.raise_for(val, str(e)) + + class NotEqual(ParamSetConstraint): + input_synopsis = 'not all equal' + + def __call__(self, val): + vals = [val[p] for p in self.param_names] + if len(vals) != len(set(vals)): + self.raise_for( + val, + 'not all unique', + ) + return val + + @datalad_command( + preproc=JointParamProcessor( + { + 'p2': EnsureInt(), + }, + paramset_constraints=[NotEqual(('p1', 'p2'), aspect='identity')], + ), + postproc_cls=PassthroughHandler, + ) + def test_command(p1, p2): + return (p1, p2) + + test_case = ('5', '6') + assert test_command(*test_case) == ('5', 6) + + # does not somehow shadow missing args + with pytest.raises( + TypeError, + match='missing 1 required positional argument.*p2', + ): + test_command('5') + + with pytest.raises( + ConstraintError, + match='not all unique', + ): + # will be equal after first-level constraint processing + test_command(5, '5') + + with pytest.raises( + ConstraintError, + match='invalid literal for int', + ): + test_command('5', 'five') + + def test_default_handler_set(): dhdlr = get_default_result_handler() try: diff --git a/datalad_core/commands/tests/test_preproc.py b/datalad_core/commands/tests/test_preproc.py new file mode 100644 index 0000000..ac5ae83 --- /dev/null +++ b/datalad_core/commands/tests/test_preproc.py @@ -0,0 +1,257 @@ +import pytest + +from datalad_core.commands.dataset import ( + Dataset, + EnsureDataset, +) +from datalad_core.commands.exceptions import ( + ParamConstraintContext, + ParamErrors, +) +from datalad_core.commands.param_constraint import ParamSetConstraint +from datalad_core.commands.preproc import JointParamProcessor +from datalad_core.constraints import ( + Constraint, + ConstraintError, + NoConstraint, +) + + +def test_paramcontext(): + ctx = ParamConstraintContext( + param_names=('p1', 'p2'), + description='key-aspect', + ) + assert ctx.label == 'p1, p2 (key-aspect)' + assert str(ctx) == f'Context[{ctx.label}]' + + +def test_paramprocessor_run_empty(): + pp = JointParamProcessor({}) + assert pp({}) == {} + + pp = JointParamProcessor({}) + params = {'something': 'value', 5: 'nonstr-key'} + assert pp(params) == params + + with pytest.raises(ValueError, match="must be 'raise-early' or 'raise-at-end'"): + JointParamProcessor({}, on_error='explode') + + +def test_paramprocessor_individual_param(): + class Not5(Constraint): + input_synopsis = 'not 5' + + def 
__call__(self, val): + if val != 5: # noqa: PLR2004 + self.raise_for(val, self.input_synopsis) + return val + + pp = JointParamProcessor( + { + 'p1': Not5(), + } + ) + ok_case = {'p1': 5} + assert ok_case == pp(ok_case) + + # TODO: add match= + with pytest.raises(ParamErrors) as e: + pp({'p1': 6}) + assert len(e.value.errors) == 1 + assert 'not 5' in str(e.value.errors[ParamConstraintContext(('p1',))]) + + +def test_paramprocessor_multiparam_failure(): + class Err(Constraint): + def __call__(self, val): + self.raise_for(val, self.input_synopsis) + + class Err1(Err): + input_synopsis = 'err1' + + class Err2(Err): + input_synopsis = 'err2' + + class NeverReached(ParamSetConstraint): + input_synopsis = 'irrelevant' + + def __call__(self, val): # pragma: no cover + # here could be test code + return val + + pspecs = { + 'p1': Err1(), + 'p2': Err2(), + } + pp = JointParamProcessor( + pspecs, + paramset_constraints=(NeverReached(tuple(pspecs.keys())),), + on_error='raise-at-end', + ) + with pytest.raises(ParamErrors) as e: + pp({'p1': 'ignored', 'p2': 'ignored'}) + assert len(e.value.errors) == len(pspecs) + # smoke test + assert repr(e.value.errors).startswith('mappingproxy(') + assert set(e.value.messages) == {'err1', 'err2'} + assert set(e.value.context_labels) == set(pspecs) + exc_str = str(e.value) + assert '2 parameter constraint violations' in exc_str + assert 'err1' in exc_str + assert 'err2' in exc_str + + +def test_paramprocessor_paramsets(): + class NotEqual(ParamSetConstraint): + input_synopsis = 'parameters must not be all equal' + + def __call__(self, val): + vals = [val[p] for p in self.param_names] + if len(vals) != len(set(vals)): + self.raise_for( + val, + 'no all unique', + ) + return val + + class OnlyCheck(ParamSetConstraint): + input_synopsis = 'some potential checks' + + def __call__(self, val): + # here could be test code + return val + + pp_kwargs = { + 'param_constraints': {}, + 'paramset_constraints': ( + NotEqual(('p1', 'p2'), aspect='identity'), + OnlyCheck(('p1', 'p2', 'p3')), + ), + } + pp = JointParamProcessor(**pp_kwargs) + ok_case = {'p1': 5, 'p2': 4, 'p3': 3} + fail_case = {'p1': 4, 'p2': 4, 'p3': 3} + assert ok_case == pp(ok_case) + + pp = JointParamProcessor(on_error='raise-early', **pp_kwargs) + with pytest.raises(ParamErrors) as e: + pp(fail_case) + + # check composition of error + exc_str = str(e.value) + # summary + assert '1 parameter constraint violation' in exc_str + # subject of the error + assert 'p1=4, p2=4 (identity)' in exc_str + # nature of the error + assert 'no all unique' in exc_str + + pp = JointParamProcessor(on_error='raise-at-end', **pp_kwargs) + with pytest.raises(ParamErrors) as e: + pp(fail_case) + + # composition of error does not change + assert exc_str == str(e.value) + + +def test_paramprocessor_broken_constrainterror(): + class Broken(ParamSetConstraint): + input_synopsis = 'cannot be satisfied' + + def __call__(self, val): # noqa: ARG002 + # do not use raise_for() and intentionally + # implement a broken replacement + raise ConstraintError(self, 5, msg='broken') + + pp = JointParamProcessor( + param_constraints={}, + paramset_constraints=(Broken(('p1', 'p2')),), + ) + with pytest.raises(RuntimeError, match='software defect'): + pp({'p1': 5, 'p2': None}) + + +def test_paramprocessor_no_constrainterror(): + class Broken(Constraint): + input_synopsis = 'raises alternative exception' + + def __call__(self, val): # noqa: ARG002 + # any non-ConstraintError exception raised + # would be an indication of an unexpected event, + # even when 
validating + msg = 'unexpected' + raise AttributeError(msg) + + pp = JointParamProcessor( + param_constraints={'p1': Broken()}, + ) + # exception is not hidden or transformed + with pytest.raises(AttributeError, match='unexpected'): + pp({'p1': 5}) + + +def test_paramprocessor_no_proc_default(): + class Make5(Constraint): + input_synopsis = 'turns everything to `5`' + + def __call__(self, val): # noqa: ARG002 + return 5 + + pp = JointParamProcessor( + param_constraints={'p1': Make5()}, + ) + assert pp({'p1': 6}) == {'p1': 5} + assert pp({'p1': 6}, at_default={'p1'}) == {'p1': 6} + + pp = JointParamProcessor( + param_constraints={'p1': Make5()}, + proc_defaults={'p1'}, + ) + assert pp({'p1': 6}, at_default={'p1'}) == {'p1': 5} + + +def test_paramprocessor_tailor_for_dataset(gitrepo): + class DsSubDir(Constraint): + input_synopsis = 'give path in data' + + def __init__(self, ds): + self._ds = ds + + def __call__(self, val): + return self._ds.path / val + + # no need to be a NoConstraint, could also be EnsurePath, or any other + # intermmediate validation step, but this is mostly for documentation. + # So it would typically come wrapped into WithDescription + class UntailoredDsSubDir(NoConstraint): + input_synopsis = 'will produce a constraint that can do the job' + + def for_dataset(self, dataset): + return DsSubDir(dataset) + + pp_kwargs = { + 'param_constraints': { + 'dataset': EnsureDataset(), + 'path': UntailoredDsSubDir(), + }, + 'tailor_for_dataset': {'path': 'dataset'}, + } + pp = JointParamProcessor(**pp_kwargs) + + res = pp({'dataset': gitrepo, 'path': 'dummy'}) + assert res['dataset'].path == gitrepo + assert res['path'] == gitrepo / 'dummy' + + # there is no tailoring, when 'dataset' does not come out as + # a `Dataset` instance. Here we do not process the default value + # so it is not happening + res = pp({'dataset': None, 'path': 'dummy'}, at_default={'dataset'}) + assert res['dataset'] is None + assert res['path'] == 'dummy' + + # but if we whitelist a parameter for default processing explicitly, + # thing work again + pp = JointParamProcessor(proc_defaults={'dataset'}, **pp_kwargs) + res = pp({'dataset': None, 'path': 'dummy'}, at_default={'dataset'}) + assert res['path'] == Dataset(None).path / 'dummy' From b69476c300c6a04c5f2f17f56c44b4828acaa5db Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Fri, 25 Oct 2024 10:38:29 +0200 Subject: [PATCH 14/17] test: use a dedicated path a worktree, no re-use This is cleaner and helps avoid undesired side-effects that have nothing to do with the test subject itself. --- datalad_core/repo/tests/test_worktree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datalad_core/repo/tests/test_worktree.py b/datalad_core/repo/tests/test_worktree.py index 26d99d5..9d31714 100644 --- a/datalad_core/repo/tests/test_worktree.py +++ b/datalad_core/repo/tests/test_worktree.py @@ -46,11 +46,11 @@ def test_work_error(tmp_path): Worktree(test_file) -def test_secondary_worktree(gitrepo): +def test_secondary_worktree(tmp_path, gitrepo): test_key = 'brand.new.key' test_key2 = 'other.brand.new.key' branch = 'dummy' - wt_path = gitrepo.parent / branch + wt_path = tmp_path / branch call_git( [ '-C', From 257f35ed212b9589e40779eb40b39769174f397d Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Sun, 27 Oct 2024 16:35:22 +0100 Subject: [PATCH 15/17] feat: `EnsurePath` and `EnsureDatasetPath` This duo of constraints has a key role for resolving paths with and without an optional dataset context. 
In legacy DataLad, a similar role was filled by a set of helper functions (e.g., `require_dataset()`, and `resolve_path()`). These are no migrated, because of the complex interdependencies. I envision for them to be entirely replaced by these two constraints. The next commit will have dedicated documentation on the targeted implementation pattern. --- datalad_core/constraints/__init__.py | 8 + datalad_core/constraints/path.py | 202 ++++++++++++++++++++ datalad_core/constraints/tests/test_path.py | 85 ++++++++ 3 files changed, 295 insertions(+) create mode 100644 datalad_core/constraints/path.py create mode 100644 datalad_core/constraints/tests/test_path.py diff --git a/datalad_core/constraints/__init__.py b/datalad_core/constraints/__init__.py index 89a5109..e8ea630 100644 --- a/datalad_core/constraints/__init__.py +++ b/datalad_core/constraints/__init__.py @@ -41,6 +41,8 @@ WithDescription NoConstraint EnsureChoice + EnsureDatasetPath + EnsurePath """ __all__ = [ @@ -51,6 +53,8 @@ 'EnsureChoice', 'WithDescription', 'NoConstraint', + 'EnsureDatasetPath', + 'EnsurePath', ] @@ -66,4 +70,8 @@ from .exceptions import ( ConstraintError, ) +from .path import ( + EnsureDatasetPath, + EnsurePath, +) from .wrapper import WithDescription diff --git a/datalad_core/constraints/path.py b/datalad_core/constraints/path.py new file mode 100644 index 0000000..e62884d --- /dev/null +++ b/datalad_core/constraints/path.py @@ -0,0 +1,202 @@ +from __future__ import annotations + +import contextlib +from pathlib import ( + Path, + PurePath, +) +from typing import ( + TYPE_CHECKING, + Any, + Callable, +) + +if TYPE_CHECKING: + from datalad_core.commands import Dataset + +from datalad_core.constraints.constraint import Constraint +from datalad_core.consts import UnsetValue +from datalad_core.repo import Worktree + + +class EnsurePath(Constraint): + """Convert input to a (platform) path and ensures select properties + + Optionally, the path can be tested for existence and whether it is absolute + or relative. + """ + + def __init__( + self, + *, + path_type: type = Path, + is_format: str | None = None, + lexists: bool | None = None, + is_mode: Callable | None = None, + ref: Path | None = None, + ref_is: str = 'parent-or-same-as', + ) -> None: + """ + Parameters + ---------- + path_type: + Specific pathlib type to convert the input to. The default is `Path`, + i.e. the platform's path type. Not all pathlib Path types can be + instantiated on all platforms, and not all checks are possible with + all path types. + is_format: {'absolute', 'relative'} or None + If not None, the path is tested whether it matches being relative or + absolute. + lexists: + If not None, the path is tested to confirmed exists or not. A symlink + need not point to an existing path to fulfil the "exists" condition. + is_mode: + If set, this callable will receive the path's `.lstat().st_mode`, + and an exception is raised, if the return value does not evaluate + to `True`. Typical callables for this feature are provided by the + `stat` module, e.g. `S_ISDIR()` + ref: + If set, defines a reference Path any given path is compared to. The + comparison operation is given by `ref_is`. + ref_is: {'parent-or-same-as', 'parent-of'} + Comparison operation to perform when `ref` is given. 
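+
+        A few illustrative calls (paths are examples only)::
+
+            EnsurePath()('some/dir')                  # -> Path('some/dir')
+            EnsurePath(is_format='absolute')('rel')   # raises ConstraintError
+            EnsurePath(ref=Path('/data'))('/data/sub')  # OK, under the reference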
+ """ + super().__init__() + self._path_type = path_type + self._is_format = is_format + self._lexists = lexists + self._is_mode = is_mode + self._ref = ref + self._ref_is = ref_is + if self._ref_is not in ('parent-or-same-as', 'parent-of'): + msg = f'unrecognized `ref_is` operation label: {self._ref_is}' + raise ValueError(msg) + + def __call__(self, value: Any) -> PurePath | Path: + # turn it into the target type to make everything below + # more straightforward + path = get_path_instance(self, value) + + # we are testing the format first, because resolve_path() + # will always turn things into absolute paths + if self._is_format is not None: + is_abs = path.is_absolute() + if self._is_format == 'absolute' and not is_abs: + self.raise_for(path, 'is not an absolute path') + elif self._is_format == 'relative' and is_abs: + self.raise_for(path, 'is not a relative path') + + mode = None + if self._lexists is not None or self._is_mode is not None: + with contextlib.suppress(FileNotFoundError): + # error would be OK, handled below + mode = path.lstat().st_mode if hasattr(path, 'lstat') else UnsetValue + if self._lexists is not None: + if self._lexists and mode is None: + self.raise_for(path, 'does not exist') + elif not self._lexists and mode is not None: + self.raise_for(path, 'does (already) exist') + if self._is_mode is not None: + if mode is UnsetValue: + self.raise_for(path, 'cannot check mode, PurePath given') + elif not self._is_mode(mode): + self.raise_for(path, 'does not match desired mode') + if self._ref: + ok = True + if self._ref_is == 'parent-or-same-as': + ok = path == self._ref or self._ref in path.parents + elif self._ref_is == 'parent-of': + ok = self._ref in path.parents + else: # pragma: no cover + # this code cannot be reached with normal usage. + # it is prevented by an assertion in __init__() + msg = f'unknown `ref_is` operation label {self._ref_is!r}' + raise RuntimeError(msg) + + if not ok: + self.raise_for( + path, + '{ref} is not {ref_is} {path}', + ref=self._ref, + ref_is=self._ref_is, + ) + return path + + @property + def input_synopsis(self): + return '{}{}path{}'.format( + 'existing ' if self._lexists else 'non-existing ' if self._lexists else '', + 'absolute ' + if self._is_format == 'absolute' + else 'relative' + if self._is_format == 'relative' + else '', + f' that is {self._ref_is} {self._ref}' if self._ref else '', + ) + + def for_dataset(self, dataset: Dataset) -> Constraint: + """Return an identically parametrized variant that resolves + paths against a given dataset. + """ + return EnsureDatasetPath(self, dataset) + + +class EnsureDatasetPath(Constraint): + def __init__( + self, + path_constraint: EnsurePath, + dataset: Dataset, + ): + """Resolves a path in the context of a particular dataset + + This constraint behaves exactly like the :class:`EnsurePath` + constraint it is parameterized with, except for two conditions: + + 1. When called with ``None``, it will process the path associated + with the :class:`Dataset` instance given to the ``dataset`` + parameter. + 2. When called with a relative path and the :class:`Dataset` + was created from a :class:`Worktree` instance, the relative + path will be considered relative to the worktree root. + + Otherwise, all given paths are interpreted as-is, or relative to + the current working directory (CWD). 
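+
+        For illustration (``wt_path`` stands in for a worktree root and is
+        not an actual variable)::
+
+            c = EnsurePath().for_dataset(Dataset(Worktree(wt_path)))
+            c('relpath')  # -> wt_path / 'relpath'
+            c(None)       # -> wt_path, the dataset's own path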
+ """ + self._path_constraint = path_constraint + self._dataset = dataset + + def __call__(self, value: Any) -> PurePath | Path: + # only if the Dataset instance was created from a Worktree + # instance we deviate from EnsurePath() + if value is not None and not isinstance(self._dataset.pristine_spec, Worktree): + return self._path_constraint(value) + + path = ( + self._dataset.path + if value is None + else get_path_instance(self._path_constraint, value) + ) + # when dataset is based on a Worktree instance and we received + # a relative path, only then interpret the path as relative + # to the worktree. Always relative to CWD otherwise. + if not path.is_absolute(): + path = self._dataset.path / path + return self._path_constraint(path) + + @property + def input_synopsis(self): + return self._path_constraint.input_synopsis + + +def get_path_instance( + origin_constraint: EnsurePath, + value: Any, +) -> PurePath | Path: + try: + path = origin_constraint._path_type(value) # noqa: SLF001 + except (ValueError, TypeError) as e: + origin_constraint.raise_for( + value, + str(e), + ) + return path diff --git a/datalad_core/constraints/tests/test_path.py b/datalad_core/constraints/tests/test_path.py new file mode 100644 index 0000000..b168bdd --- /dev/null +++ b/datalad_core/constraints/tests/test_path.py @@ -0,0 +1,85 @@ +from pathlib import ( + Path, + PurePath, +) + +import pytest + +from datalad_core.commands import Dataset +from datalad_core.constraints import ConstraintError +from datalad_core.repo import Worktree + +from ..path import EnsurePath + + +def test_EnsurePath_nopath(): + c = EnsurePath() + with pytest.raises(ConstraintError, match='not.*int'): + c(5) + + +def test_EnsurePath(tmp_path): + target = Path(tmp_path) + + assert EnsurePath()(tmp_path) == target + assert EnsurePath(lexists=True)(tmp_path) == target + with pytest.raises(ConstraintError): + EnsurePath(lexists=False)(tmp_path) + with pytest.raises(ConstraintError): + EnsurePath(lexists=True)(tmp_path / 'nothere') + assert EnsurePath(is_format='absolute')(tmp_path) == target + with pytest.raises(ConstraintError): + EnsurePath(is_format='relative')(tmp_path) + with pytest.raises(ConstraintError): + EnsurePath(is_format='absolute')(tmp_path.name) + from stat import S_ISDIR, S_ISREG + + assert EnsurePath(is_mode=S_ISDIR)(tmp_path) == target + with pytest.raises(ConstraintError): + EnsurePath(is_mode=S_ISREG)(tmp_path) + # give particular path type + assert EnsurePath(path_type=PurePath)(tmp_path) == PurePath(tmp_path) + # not everything is possible, this is known and OK + with pytest.raises(ConstraintError, match='cannot check mode'): + EnsurePath( + path_type=PurePath, + is_mode=S_ISREG, + )(tmp_path) + assert EnsurePath().input_synopsis == 'path' + assert EnsurePath(is_format='absolute').input_synopsis == 'absolute path' + # default comparison mode is parent-or-same-as + c = EnsurePath(ref=target) + assert c(target) == target + assert c(target / 'some') == target / 'some' + with pytest.raises(ConstraintError): + assert c(target.parent) + c = EnsurePath(ref=target, ref_is='parent-of') + assert c(target / 'some') == target / 'some' + with pytest.raises(ConstraintError): + assert c(target) + assert c.input_synopsis == f'path that is parent-of {target}' + with pytest.raises(ValueError, match='unrecognized'): + c = EnsurePath(ref=target, ref_is='stupid') + + +def test_EnsurePath_fordataset(gitrepo): + test_relpath = Path('relpath') + # standard: relative in, relative out + c = EnsurePath() + assert c('relpath') == test_relpath + # 
tailor constraint for our dataset + # (this is what would be done by EnsureCommandParameterization + # 1. dataset given as a path -- resolve against CWD + # output is always absolute + tc = c.for_dataset(Dataset(gitrepo)) + # the description stays the same + assert c.input_synopsis == tc.input_synopsis + assert tc('relpath') == test_relpath + # 2. dataset is given as a worktree object + tc = c.for_dataset(Dataset(Worktree(gitrepo))) + assert tc('relpath') == (gitrepo / 'relpath') + # no change for any absolute path + assert tc(Path.cwd()) == Path.cwd() + + # returns the dataset path for `None` + assert tc(None) == gitrepo From a380d68f0d0857bf1a2ff2263d3d7e96cee9c383 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Sun, 27 Oct 2024 16:43:15 +0100 Subject: [PATCH 16/17] doc: (dataset-)path resolution pattern for command implementations This aims to illustrate the necessary boilerplate to add the conventional DataLad path resolution behavior to a command. Importantly, this is done completely outside the implementation of the actual command. It is all parameterization of the `@datalad_command` decorator. --- docs/index.rst | 10 ++++-- docs/patterns/dataset_paths.rst | 57 +++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 docs/patterns/dataset_paths.rst diff --git a/docs/index.rst b/docs/index.rst index 16e836a..3788a63 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,8 +1,8 @@ The `datalad-core` documentation ================================ -Package overview ----------------- +Package API overview +-------------------- Also see the :ref:`modindex`. @@ -17,6 +17,12 @@ Also see the :ref:`modindex`. repo runners +Implementation patterns +----------------------- + +.. toctree:: + patterns/dataset_paths + Indices and tables ================== diff --git a/docs/patterns/dataset_paths.rst b/docs/patterns/dataset_paths.rst new file mode 100644 index 0000000..c0c2406 --- /dev/null +++ b/docs/patterns/dataset_paths.rst @@ -0,0 +1,57 @@ +(Dataset-specific) path resolution +---------------------------------- + +If a command operates on an item on the file system that is identified by its +path, and can (optionally) operate in the context of a specific dataset, this +is achieved with two arguments: ``dataset``, and ``path``. It can be the case +that only one of these parameters is needed (and occasionally even neither one +is required) for a specific command invocation, hence they are typically both +used with ``None`` default values. + +The following parameter preprocessor setup for ``@datalad_command`` can be used +to guarantee that the function body of a command will always receive: + +- a ``pathlib`` path object instance via the ``path`` argument +- a ``Dataset`` instance via the ``dataset`` argument from which an + (optional) dataset context can be identified. + + >>> from datalad_core.commands import ( + ... EnsureDataset, + ... JointParamProcessor, + ... ) + >>> from datalad_core.constraints import EnsurePath + >>> preproc=JointParamProcessor( + ... { + ... 'dataset': EnsureDataset(), + ... 'path': EnsurePath() + ... }, + ... proc_defaults={'dataset', 'path'}, + ... tailor_for_dataset={ + ... 'path': 'dataset', + ... } + ... ), + +The above pattern takes care of resolving any path given to ``path`` +(including ``None``) to an actual path. This path is adequately determined, +also considering the value of any ``dataset`` parameter. 
This follows +the conventions of path resolution in DataLad: + +- any absolute path is taken as-is +- any relative path is interpreted as relative to CWD, unless there is + a dataset context based on a :class:`Worktree` instance. Only in the + latter case a relative path is interpreted as relative to the + worktree root + +Technically, this pattern works, because + +- even ``None`` defaults will be processed for ``dataset`` and ``path`` +- the ``tailored_for_dataset`` mapping causes the ``dataset`` argument + to be resolved first (before ``path``) +- and the :class:`EnsurePath` constraint is tailored to any :class:`Dataset` + given to ``dataset``, by generating a matching :class:`EnsureDatasetPath` + constraint for the ``path`` parameter internally + +Taken together, this pattern simplifies command implementation, because +a number of checks are factored out into common implementation following +accepted conventions. Command implementations can inspect the specifics +of a user-provided dataset context via :attr:`Dataset.pristine_spec`. From 6393bec6c25243cad3c888f1fcac64644065319b Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Sun, 27 Oct 2024 16:45:51 +0100 Subject: [PATCH 17/17] feat: expose `EnsureDataset` in the `datalad_core.commands` module --- datalad_core/commands/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/datalad_core/commands/__init__.py b/datalad_core/commands/__init__.py index 2a2528a..dfa3c76 100644 --- a/datalad_core/commands/__init__.py +++ b/datalad_core/commands/__init__.py @@ -30,6 +30,7 @@ get_default_result_handler set_default_result_handler Dataset + EnsureDataset ParamProcessor JointParamProcessor ParamConstraintContext @@ -39,6 +40,7 @@ __all__ = [ 'Dataset', + 'EnsureDataset', 'StandardResultHandler', 'ResultHandler', 'ParamProcessor', @@ -53,7 +55,10 @@ ] -from .dataset import Dataset +from .dataset import ( + Dataset, + EnsureDataset, +) from .decorator import datalad_command from .default_result_handler import ( StandardResultHandler,
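For orientation, the following is a minimal sketch of a command implementation
that combines the pieces introduced in this series; the command name
``locate`` and its result payload are illustrative only and not part of these
patches::

    from datalad_core.commands import (
        EnsureDataset,
        JointParamProcessor,
        PassthroughHandler,
        datalad_command,
    )
    from datalad_core.constraints import EnsurePath

    @datalad_command(
        preproc=JointParamProcessor(
            {
                'dataset': EnsureDataset(),
                'path': EnsurePath(),
            },
            proc_defaults={'dataset', 'path'},
            tailor_for_dataset={'path': 'dataset'},
        ),
        postproc_cls=PassthroughHandler,
    )
    def locate(path=None, dataset=None):
        # `path` arrives resolved following the dataset-path conventions
        # documented in this series; `dataset` is a `Dataset` instance
        return {'path': path, 'dataset': dataset}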