diff --git a/README.md b/README.md
index bba455e..ed7acbf 100644
--- a/README.md
+++ b/README.md
@@ -74,11 +74,6 @@ EOF
> datalad save -m "add 'one-to-many' remake method"
```
-Create a `datalad-remake` git-annex special remote:
-```bash
-> git annex initremote datalad-remake encryption=none type=external externaltype=datalad-remake allow-untrusted-execution=true
-```
-
Execute a computation and save the result:
```bash
> datalad make -p first=bob -p second=alice -p output=name -o name-1.txt \
@@ -95,8 +90,25 @@ content: bob
content: alice
```
-Drop the content of `name-1.txt`, verify it is gone, recreate it via
-`datalad get`, which "fetches" it from the `datalad-remake` remote:
+### Recomputation
+
+DataLad REMAKE can recompute dropped content. To demonstrate this, we will
+drop a file and then recreate it via `datalad get`. Before we can do that in
+this example, we have to make a small adjustment, because the example uses
+"untrusted" execution. That keeps the example simple, since no signing keys
+are required. However, the git-annex special remote that was created by the
+`datalad make` command does not allow untrusted execution by default (for
+security reasons we never automatically create a datalad-remake remote that
+supports untrusted execution). To allow untrusted execution, the special
+remote has to be reconfigured with the following command:
+
+```bash
+> git annex enableremote datalad-remake-auto allow-untrusted-execution=true
+```
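+
+If you want to check the result: git-annex records the configuration of all
+special remotes, including the `allow-untrusted-execution` setting, in the
+file `remote.log` on the `git-annex` branch. It can be displayed like this
+(an optional sanity check):
+
+```bash
+> git cat-file -p git-annex:remote.log
+```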
+
+Now we drop the content of `name-1.txt`, verify it is gone, and recreate it via
+`datalad get`, which "fetches" it from the `datalad-remake` remote:
```bash
> datalad drop name-1.txt
@@ -107,28 +119,9 @@ Drop the content of `name-1.txt`, verify it is gone, recreate it via
### Prospective computation
The `datalad make` command can also be used to perform a *prospective
-computation*. To use this feature, the following configuration value
-has to be set ():
-
-```bash
-> git config remote.datalad-remake.annex-security-allow-unverified-downloads ACKTHPPT
-```
+computation*. In a prospective computation, the instructions for creating the
+output files are recorded, but the outputs are only computed later, when their
+content is requested, for example via `datalad get`.
-
- Why does the configuration variable have to be set?
-
-This setting allows git-annex to download files from the special remote `datalad-remake`
-although git-annex cannot check a hash to verify that the content is correct.
-Because the computation was never performed, there is no hash available for content
-verification of an output file yet.
-
-For more information see the description of
-`remote..annex-security-allow-unverified-downloads` and of
-`annex.security.allow-unverified-downloads` at
-https://git-annex.branchable.com/git-annex/.
-
-
-Afterwards, a prospective computation can be initiated by using the
+A prospective computation can be initiated with the
`--prospective-execution` option:
```bash
@@ -165,6 +158,25 @@ time!) based on the specified instructions:
content: john
```
+Please note: to support this feature, the configuration variable
+`remote.datalad-remake-auto.annex-security-allow-unverified-downloads` is set
+to `ACKTHPPT` for each automatically created git-annex special remote.
+
+**Why does the configuration variable have to be set?**
+
+This setting allows git-annex to download files from the special remote
+`datalad-remake` even though git-annex cannot check a hash to verify that the
+content is correct. Because the computation was never performed, there is no
+hash available yet for content verification of an output file.
+
+For more information see the description of
+`remote.<name>.annex-security-allow-unverified-downloads` and of
+`annex.security.allow-unverified-downloads` at
+https://git-annex.branchable.com/git-annex/.
+
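+Since the variable is written to the local git configuration of the dataset,
+you can inspect the value that was set (an optional check; the remote name
+assumes the automatically created `datalad-remake-auto` remote):
+
+```bash
+> git config remote.datalad-remake-auto.annex-security-allow-unverified-downloads
+ACKTHPPT
+```
+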
Additional examples can be found in the [examples](https://github.com/datalad/datalad-remake/tree/main/examples) directory.
diff --git a/datalad_remake/__init__.py b/datalad_remake/__init__.py
index 451e6c9..01bf4b0 100644
--- a/datalad_remake/__init__.py
+++ b/datalad_remake/__init__.py
@@ -6,6 +6,7 @@
__all__ = [
'__version__',
+ 'auto_remote_name',
'command_suite',
'priority_config_key',
'specification_dir',
@@ -52,3 +53,4 @@
specification_dir = '.datalad/make/specifications'
trusted_keys_config_key = 'datalad.make.trusted-keys'
priority_config_key = 'datalad.make.priority'
+auto_remote_name = 'datalad-remake-auto'
diff --git a/datalad_remake/commands/make_cmd.py b/datalad_remake/commands/make_cmd.py
index eeedf40..bdd04ce 100644
--- a/datalad_remake/commands/make_cmd.py
+++ b/datalad_remake/commands/make_cmd.py
@@ -44,6 +44,7 @@
from datalad_remake.utils.compute import compute
from datalad_remake.utils.getkeys import get_trusted_keys
from datalad_remake.utils.glob import resolve_patterns
+from datalad_remake.utils.remake_remote import add_remake_remote
from datalad_remake.utils.verify import verify_file
if TYPE_CHECKING:
@@ -259,6 +260,8 @@ def __call__(
else:
resolved_output = set(output_pattern)
+ initialize_remotes(ds, resolved_output)
+
for out in resolved_output:
url = add_url(ds, out, url_base, url_only=prospective_execution)
yield get_status_dict(
@@ -466,6 +469,10 @@ def collect(
) -> set[str]:
output = resolve_patterns(root_dir=worktree, patterns=output_pattern)
+ # Ensure that all subdatasets that are touched by paths in `output` are
+ # installed.
+ install_containing_subdatasets(dataset, output)
+
# Unlock output files in the dataset-directory and copy the result
unlock_files(dataset, output)
for o in output:
@@ -479,6 +486,57 @@ def collect(
return output
+def install_containing_subdatasets(dataset: Dataset, files: Iterable[str]) -> None:
+    """Install all subdatasets that contain a file from `files`."""
+
+    # Map the path of every known subdataset, relative to the root of
+    # `dataset`, onto its installation state. Then compare each prefix of a
+    # required file path with the subdataset paths, from shortest to longest,
+    # and install every matching subdataset that is not yet installed.
+
+    def get_subdataset_infos() -> dict[Path, bool]:
+        # Keys are subdataset paths relative to the root dataset, values
+        # indicate whether the subdataset is installed.
+        return {
+            Path(result['path']).relative_to(dataset.pathobj): result['state']
+            == 'present'
+            for result in dataset.subdatasets(recursive=True)
+        }
+
+    subdataset_infos = get_subdataset_infos()
+
+    # Get the prefixes of all required paths, sorted by length, to ensure
+    # that superdatasets are installed before their subdatasets.
+    required_paths = sorted(
+        {
+            prefix
+            for file in files
+            for prefix in Path(file).parents
+            if prefix != Path('.')
+        },
+        key=lambda p: len(p.parts),
+    )
+
+    for path in required_paths:
+        if path in subdataset_infos and not subdataset_infos[path]:
+            dataset.install(path=str(path), result_renderer='disabled')
+            # Refresh the infos to pick up subdatasets that became visible
+            # through the installation.
+            subdataset_infos = get_subdataset_infos()
+
+
+def initialize_remotes(dataset: Dataset, files: Iterable[str]) -> None:
+    """Add a remake remote to all datasets that are touched by `files`."""
+
+ # Get the subdatasets that contain generated files
+ touched_dataset_dirs = {
+ get_file_dataset(dataset.pathobj / file)[0] for file in files
+ }
+
+ for dataset_dir in touched_dataset_dirs:
+ add_remake_remote(str(dataset_dir), allow_untrusted_execution=False)
+
+
def unlock_files(dataset: Dataset, files: Iterable[str]) -> None:
"""Use datalad to resolve subdatasets and unlock files in the dataset."""
# TODO: for some reason `dataset unlock` does not operate in the
diff --git a/datalad_remake/commands/provision_cmd.py b/datalad_remake/commands/provision_cmd.py
index 8845d95..a6cf125 100644
--- a/datalad_remake/commands/provision_cmd.py
+++ b/datalad_remake/commands/provision_cmd.py
@@ -232,7 +232,7 @@ def resolve_patterns(
This method will resolve relative path-patterns in the dataset. It will
install all subdatasets that are matched by the patterns. Pattern are
- described as outline in `glob.glob`. The method support recursive globbing
+    described as outlined in `glob.glob`. The method supports recursive globbing
of zero or more directories with the pattern: `**`.
Parameters
@@ -328,7 +328,8 @@ def glob_pattern(
# Match all elements at the current position with the first part of the
# pattern.
for rec_match in glob(
- '*' if pattern[0] == '**' else pattern[0], root_dir=root.pathobj / position
+ '*' if pattern[0] == '**' else pattern[0],
+ root_dir=root.pathobj / position,
):
match = position / rec_match
diff --git a/datalad_remake/commands/tests/create_datasets.py b/datalad_remake/commands/tests/create_datasets.py
index 9f70d80..36ae6be 100644
--- a/datalad_remake/commands/tests/create_datasets.py
+++ b/datalad_remake/commands/tests/create_datasets.py
@@ -3,39 +3,9 @@
from pathlib import Path
from datalad_next.datasets import Dataset
-from datalad_next.runners import call_git_success
from datalad_remake import template_dir
-
-
-def update_config_for_remake(dataset: Dataset):
- # set annex security related variables to allow remake-URLs
- dataset.configuration(
- action='set',
- scope='local',
- recursive=True,
- spec=[('remote.remake.annex-security-allow-unverified-downloads', 'ACKTHPPT')],
- result_renderer='disabled',
- )
-
-
-def add_remake_remote(dataset: Dataset, signing_key: str | None = None):
- aue = 'false' if signing_key else 'true'
- call_git_success(
- [
- '-C',
- dataset.path,
- 'annex',
- 'initremote',
- 'remake',
- 'type=external',
- 'externaltype=datalad-remake',
- 'encryption=none',
- f'allow-untrusted-execution={aue}',
- ],
- capture_output=True,
- )
- update_config_for_remake(dataset)
+from datalad_remake.utils.remake_remote import add_remake_remote
def create_ds_hierarchy(
@@ -77,13 +47,13 @@ def create_ds_hierarchy(
root_dataset.get(recursive=True, result_renderer='disabled')
# Add datalad-remake remotes to the root dataset and all subdatasets
- add_remake_remote(root_dataset, signing_key)
+ add_remake_remote(root_dataset.path, allow_untrusted_execution=signing_key is None)
subdataset_path = Path()
for index in range(subdataset_levels):
subdataset_path /= f'{name}_subds{index}'
add_remake_remote(
- Dataset(root_dataset.pathobj / subdataset_path),
- signing_key,
+ str(root_dataset.pathobj / subdataset_path),
+ allow_untrusted_execution=signing_key is None,
)
return datasets
diff --git a/datalad_remake/utils/remake_remote.py b/datalad_remake/utils/remake_remote.py
new file mode 100644
index 0000000..8e0d255
--- /dev/null
+++ b/datalad_remake/utils/remake_remote.py
@@ -0,0 +1,70 @@
+from __future__ import annotations
+
+import logging
+
+from datalad_next.datasets import Dataset
+
+from datalad_remake import auto_remote_name
+
+logger = logging.getLogger('datalad.remake.utils.remake_remote')
+
+
+def add_remake_remote(
+ dataset_root: str,
+ *,
+ allow_untrusted_execution: bool = False,
+):
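+    """Add a `datalad-remake` special remote to the dataset at `dataset_root`.
+
+    If the dataset already has a `datalad-remake` special remote, it is left
+    unmodified. In either case, the local git configuration is updated to
+    allow unverified downloads from the remote, which is required for
+    prospective computation.
+    """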
+ aue = 'true' if allow_untrusted_execution else 'false'
+ options = [
+ 'type=external',
+ 'externaltype=datalad-remake',
+ 'encryption=none',
+ 'autoenable=true',
+ f'allow-untrusted-execution={aue}',
+ ]
+
+    # Create a `Dataset` instance to use the `AnnexRepo` methods for special
+    # remote handling.
+ dataset = Dataset(dataset_root)
+
+ # If no `datalad-remake` remote exists, create a new one. Do not touch
+ # existing `datalad-remake` remotes.
+ if not get_remake_auto_remote(dataset):
+ dataset.repo.init_remote(auto_remote_name, options)
+ else:
+ logger.info(
+ 'Found already existing `%s` remote in %s. '
+ 'Leaving it unmodified, please check its configuration.',
+ auto_remote_name,
+ dataset_root,
+ )
+
+ # Update the configuration to allow unverified downloads from the remake
+ # remote. This is necessary for prospective computation.
+ update_config_for_remake(dataset_root, auto_remote_name)
+
+
+def get_remake_auto_remote(dataset: Dataset) -> list:
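+    """Return infos of all `datalad-remake` special remotes in `dataset`."""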
+ return [
+ remote_info
+ for remote_info in dataset.repo.get_special_remotes().values()
+ if remote_info['type'] == 'external'
+ and remote_info['externaltype'] == 'datalad-remake'
+ ]
+
+
+def update_config_for_remake(dataset_root: str, remote_name: str) -> None:
+    # Set annex-security-related variables to allow remake-URLs in
+    # prospective computation.
+ dataset = Dataset(dataset_root)
+ dataset.configuration(
+ action='set',
+ scope='local',
+ spec=[
+ (
+ f'remote.{remote_name}.annex-security-allow-unverified-downloads',
+ 'ACKTHPPT',
+ ),
+ ],
+ result_renderer='disabled',
+ )