diff --git a/README.md b/README.md
index bba455e..ed7acbf 100644
--- a/README.md
+++ b/README.md
@@ -74,11 +74,6 @@
 EOF
 > datalad save -m "add 'one-to-many' remake method"
 ```
-Create a `datalad-remake` git-annex special remote:
-```bash
-> git annex initremote datalad-remake encryption=none type=external externaltype=datalad-remake allow-untrusted-execution=true
-```
-
 Execute a computation and save the result:
 ```bash
 > datalad make -p first=bob -p second=alice -p output=name -o name-1.txt \
@@ -95,8 +90,25 @@
 content: bob
 content: alice
 ```
 
-Drop the content of `name-1.txt`, verify it is gone, recreate it via
-`datalad get`, which "fetches" it from the `datalad-remake` remote:
+### Recomputation
+
+DataLad REMAKE can recompute dropped content. To demonstrate this, we will
+drop a file and then recreate it via `datalad get`. Before we can do that in
+this example, we have to make a small adjustment, because this example uses
+"untrusted" execution, which keeps the example simple (no signing keys are
+required). However, the git-annex special remote that was created by the
+`datalad make` command does not allow untrusted execution by default (for
+security reasons we never automatically create a datalad-remake remote that
+supports untrusted execution). To instruct the special remote to allow
+untrusted execution, we have to reconfigure it. This can be done with the
+following command:
+
+```bash
+> git annex enableremote datalad-remake-auto allow-untrusted-execution=true
+```
+
+Now we drop the content of `name-1.txt`, verify it is gone, and recreate it via
+`datalad get`, which "fetches" it from the `datalad-remake` remote:
 
 ```bash
 > datalad drop name-1.txt
@@ -107,28 +119,9 @@ Drop the content of `name-1.txt`, verify it is gone, recreate it via
 ### Prospective computation
 
 The `datalad make` command can also be used to perform a *prospective
-computation*. To use this feature, the following configuration value
-has to be set ():
-
-```bash
-> git config remote.datalad-remake.annex-security-allow-unverified-downloads ACKTHPPT
-```
+computation*.
 
-<details>
-<summary>Why does the configuration variable have to be set?</summary>
-
-This setting allows git-annex to download files from the special remote `datalad-remake`
-although git-annex cannot check a hash to verify that the content is correct.
-Because the computation was never performed, there is no hash available for content
-verification of an output file yet.
-
-For more information see the description of
-`remote.<name>.annex-security-allow-unverified-downloads` and of
-`annex.security.allow-unverified-downloads` at
-https://git-annex.branchable.com/git-annex/.
-</details>
-
-Afterwards, a prospective computation can be initiated by using the
+A prospective computation can be initiated by using the
 `--prospective-execution` option:
 
 ```bash
@@ -165,6 +158,25 @@
 time!) based on the specified instructions:
 content: john
 ```
 
+Please note that, to support this feature, the configuration variable
+`remote.datalad-remake-auto.annex-security-allow-unverified-downloads` is set
+to `ACKTHPPT` for each automatically created git-annex special remote.
+
+<details>
+<summary>Why does the configuration variable have to be set?</summary>
+
+This setting allows git-annex to download files from the special remote `datalad-remake`
+although git-annex cannot check a hash to verify that the content is correct.
+Because the computation was never performed, there is no hash available for content
+verification of an output file yet.
+
+For more information see the description of
+`remote.<name>.annex-security-allow-unverified-downloads` and of
+`annex.security.allow-unverified-downloads` at
+https://git-annex.branchable.com/git-annex/.
+</details>
+
 
 Additional examples can be found in the
 [examples](https://github.com/datalad/datalad-remake/tree/main/examples)
 directory.
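Editor's note: the README walkthrough above drives everything through the command line. Below is a minimal sketch of the same recomputation round-trip via the Python API; the dataset path is hypothetical, and it assumes the `datalad-remake-auto` remote has already been reconfigured for untrusted execution as shown in the Recomputation section. This is an illustration, not part of the patch.

```python
from datalad_next.datasets import Dataset

# Hypothetical clone of the dataset built in the README walkthrough.
ds = Dataset('/tmp/remake-demo')

# Drop the locally present file content, then let `get` recompute it through
# the `datalad-remake-auto` special remote.
ds.drop('name-1.txt')
ds.get('name-1.txt')
```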
diff --git a/datalad_remake/__init__.py b/datalad_remake/__init__.py
index 451e6c9..01bf4b0 100644
--- a/datalad_remake/__init__.py
+++ b/datalad_remake/__init__.py
@@ -6,6 +6,7 @@
 __all__ = [
     '__version__',
+    'auto_remote_name',
     'command_suite',
     'priority_config_key',
     'specification_dir',
@@ -52,3 +53,4 @@
 specification_dir = '.datalad/make/specifications'
 trusted_keys_config_key = 'datalad.make.trusted-keys'
 priority_config_key = 'datalad.make.priority'
+auto_remote_name = 'datalad-remake-auto'
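Editor's note: a minimal sketch (not part of the patch) of how the new `auto_remote_name` export is meant to be consumed; the derived configuration key matches the one set by `update_config_for_remake()` in the new `remake_remote.py` module below.

```python
from datalad_remake import auto_remote_name

# The name under which `datalad make` registers its special remote.
assert auto_remote_name == 'datalad-remake-auto'

# The git config key that permits unverified downloads from that remote:
key = f'remote.{auto_remote_name}.annex-security-allow-unverified-downloads'
```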
diff --git a/datalad_remake/commands/make_cmd.py b/datalad_remake/commands/make_cmd.py
index eeedf40..bdd04ce 100644
--- a/datalad_remake/commands/make_cmd.py
+++ b/datalad_remake/commands/make_cmd.py
@@ -44,6 +44,7 @@
 from datalad_remake.utils.compute import compute
 from datalad_remake.utils.getkeys import get_trusted_keys
 from datalad_remake.utils.glob import resolve_patterns
+from datalad_remake.utils.remake_remote import add_remake_remote
 from datalad_remake.utils.verify import verify_file
 
 if TYPE_CHECKING:
@@ -259,6 +260,8 @@
     else:
         resolved_output = set(output_pattern)
 
+    initialize_remotes(ds, resolved_output)
+
     for out in resolved_output:
         url = add_url(ds, out, url_base, url_only=prospective_execution)
         yield get_status_dict(
@@ -466,6 +469,10 @@
 ) -> set[str]:
     output = resolve_patterns(root_dir=worktree, patterns=output_pattern)
 
+    # Ensure that all subdatasets that are touched by paths in `output` are
+    # installed.
+    install_containing_subdatasets(dataset, output)
+
     # Unlock output files in the dataset-directory and copy the result
     unlock_files(dataset, output)
     for o in output:
@@ -479,6 +486,57 @@
     return output
 
 
+def install_containing_subdatasets(dataset: Dataset, files: Iterable[str]) -> None:
+    """Install all subdatasets that contain a file from `files`."""
+
+    # Determine the relative paths of all known subdatasets and whether they
+    # are installed. Walk the path prefixes of each file in `files` from the
+    # root of `dataset` downward, and install any subdataset whose path
+    # matches a prefix but is not yet installed.
+
+    # Get the relative paths of all known subdatasets
+    subdataset_infos = {
+        Path(result['path']).relative_to(Path(result['parentds'])): result['state']
+        == 'present'
+        for result in dataset.subdatasets(recursive=True)
+    }
+
+    # Get the prefixes of all required paths sorted by length
+    required_paths = sorted(
+        {
+            prefix
+            for file in files
+            for prefix in Path(file).parents
+            if prefix != Path('.')
+        },
+        key=lambda p: len(p.parts),
+    )
+
+    for path in required_paths:
+        if path in subdataset_infos and not subdataset_infos[path]:
+            dataset.install(path=str(path), result_renderer='disabled')
+            # Update `subdataset_infos` to pick up newly installed subdatasets.
+            subdataset_infos = {
+                Path(result['path']).relative_to(Path(result['parentds'])): result[
+                    'state'
+                ]
+                == 'present'
+                for result in dataset.subdatasets(recursive=True)
+            }
+
+
+def initialize_remotes(dataset: Dataset, files: Iterable[str]) -> None:
+    """Add a remake remote to all datasets that are touched by the files."""
+
+    # Get the subdatasets that contain generated files
+    touched_dataset_dirs = {
+        get_file_dataset(dataset.pathobj / file)[0] for file in files
+    }
+
+    for dataset_dir in touched_dataset_dirs:
+        add_remake_remote(str(dataset_dir), allow_untrusted_execution=False)
+
+
 def unlock_files(dataset: Dataset, files: Iterable[str]) -> None:
     """Use datalad to resolve subdatasets and unlock files in the dataset."""
     # TODO: for some reason `dataset unlock` does not operate in the
diff --git a/datalad_remake/commands/provision_cmd.py b/datalad_remake/commands/provision_cmd.py
index 8845d95..a6cf125 100644
--- a/datalad_remake/commands/provision_cmd.py
+++ b/datalad_remake/commands/provision_cmd.py
@@ -232,7 +232,7 @@
     This method will resolve relative path-patterns in the dataset. It will
     install all subdatasets that are matched by the patterns. Pattern are
-    described as outline in `glob.glob`. The method support recursive globbing
+    described as outlined in `glob.glob`. The method supports recursive globbing
     of zero or more directories with the pattern: `**`.
 
     Parameters
@@ -328,7 +328,8 @@
     # Match all elements at the current position with the first part of the
     # pattern.
     for rec_match in glob(
-        '*' if pattern[0] == '**' else pattern[0], root_dir=root.pathobj / position
+        '*' if pattern[0] == '**' else pattern[0],
+        root_dir=root.pathobj / position,
     ):
         match = position / rec_match
diff --git a/datalad_remake/commands/tests/create_datasets.py b/datalad_remake/commands/tests/create_datasets.py
index 9f70d80..36ae6be 100644
--- a/datalad_remake/commands/tests/create_datasets.py
+++ b/datalad_remake/commands/tests/create_datasets.py
@@ -3,39 +3,9 @@
 from pathlib import Path
 
 from datalad_next.datasets import Dataset
-from datalad_next.runners import call_git_success
 
 from datalad_remake import template_dir
-
-
-def update_config_for_remake(dataset: Dataset):
-    # set annex security related variables to allow remake-URLs
-    dataset.configuration(
-        action='set',
-        scope='local',
-        recursive=True,
-        spec=[('remote.remake.annex-security-allow-unverified-downloads', 'ACKTHPPT')],
-        result_renderer='disabled',
-    )
-
-
-def add_remake_remote(dataset: Dataset, signing_key: str | None = None):
-    aue = 'false' if signing_key else 'true'
-    call_git_success(
-        [
-            '-C',
-            dataset.path,
-            'annex',
-            'initremote',
-            'remake',
-            'type=external',
-            'externaltype=datalad-remake',
-            'encryption=none',
-            f'allow-untrusted-execution={aue}',
-        ],
-        capture_output=True,
-    )
-    update_config_for_remake(dataset)
+from datalad_remake.utils.remake_remote import add_remake_remote
 
 
 def create_ds_hierarchy(
@@ -77,13 +47,13 @@
     root_dataset.get(recursive=True, result_renderer='disabled')
 
     # Add datalad-remake remotes to the root dataset and all subdatasets
-    add_remake_remote(root_dataset, signing_key)
+    add_remake_remote(root_dataset.path, allow_untrusted_execution=signing_key is None)
 
     subdataset_path = Path()
     for index in range(subdataset_levels):
         subdataset_path /= f'{name}_subds{index}'
         add_remake_remote(
-            Dataset(root_dataset.pathobj / subdataset_path),
-            signing_key,
+            str(root_dataset.pathobj / subdataset_path),
+            allow_untrusted_execution=signing_key is None,
         )
 
     return datasets
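Editor's note: the hunk above switches `add_remake_remote` from a `Dataset`-plus-signing-key signature to a dataset-root path plus a keyword flag. A short sketch of the new calling convention (the dataset path is hypothetical; the `signing_key is None` expression mirrors the call sites above):

```python
from __future__ import annotations

from datalad_remake.utils.remake_remote import add_remake_remote

signing_key: str | None = None  # illustrative: no signing key configured

# Untrusted execution is only acceptable when results are not signed, i.e.
# when no signing key is available.
add_remake_remote('/tmp/example-ds', allow_untrusted_execution=signing_key is None)
```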
diff --git a/datalad_remake/utils/remake_remote.py b/datalad_remake/utils/remake_remote.py
new file mode 100644
index 0000000..8e0d255
--- /dev/null
+++ b/datalad_remake/utils/remake_remote.py
@@ -0,0 +1,70 @@
+from __future__ import annotations
+
+import logging
+
+from datalad_next.datasets import Dataset
+
+from datalad_remake import auto_remote_name
+
+logger = logging.getLogger('datalad.remake.utils.remake_remote')
+
+
+def add_remake_remote(
+    dataset_root: str,
+    *,
+    allow_untrusted_execution: bool = False,
+):
+    aue = 'true' if allow_untrusted_execution else 'false'
+    options = [
+        'type=external',
+        'externaltype=datalad-remake',
+        'encryption=none',
+        'autoenable=true',
+        f'allow-untrusted-execution={aue}',
+    ]
+
+    # Create a `Dataset` instance to use the `AnnexRepo` methods for special
+    # remote handling.
+    dataset = Dataset(dataset_root)
+
+    # If no `datalad-remake` remote exists, create a new one. Do not touch
+    # existing `datalad-remake` remotes.
+    if not get_remake_auto_remote(dataset):
+        dataset.repo.init_remote(auto_remote_name, options)
+    else:
+        logger.info(
+            'Found an existing `%s` remote in %s. '
+            'Leaving it unmodified; please check its configuration.',
+            auto_remote_name,
+            dataset_root,
+        )
+
+    # Update the configuration to allow unverified downloads from the remake
+    # remote. This is necessary for prospective computation.
+    update_config_for_remake(dataset_root, auto_remote_name)
+
+
+def get_remake_auto_remote(dataset: Dataset) -> list:
+    return [
+        remote_info
+        for remote_info in dataset.repo.get_special_remotes().values()
+        if remote_info['type'] == 'external'
+        and remote_info['externaltype'] == 'datalad-remake'
+    ]
+
+
+def update_config_for_remake(dataset_root: str, remote_name: str) -> None:
+    # Set annex-security-related variables to allow remake-URLs in prospective
+    # computations.
+    dataset = Dataset(dataset_root)
+    dataset.configuration(
+        action='set',
+        scope='local',
+        spec=[
+            (
+                f'remote.{remote_name}.annex-security-allow-unverified-downloads',
+                'ACKTHPPT',
+            ),
+        ],
+        result_renderer='disabled',
+    )
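Editor's note: a short usage sketch of the new helper module, illustrating the create-once behavior implemented above (the dataset path is hypothetical; this is an illustration, not part of the patch):

```python
from datalad_remake.utils.remake_remote import add_remake_remote

# First call on a fresh dataset: creates the `datalad-remake-auto` special
# remote with `allow-untrusted-execution=false` and sets the
# `annex-security-allow-unverified-downloads` config to `ACKTHPPT`, which is
# required for prospective computation.
add_remake_remote('/tmp/example-ds')

# Second call: the existing remote is detected and left unmodified (only an
# informational message is logged), while the config update is re-applied.
add_remake_remote('/tmp/example-ds')
```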