feat: remove allow_untrusted_execution parameter

Remove the `allow_untrusted_execution` parameter from the annex special remote.
datalad · Dec 17, 2024 · 0332201 · 0332201
2 parents f585d4b + 7cbbe0b
commit 0332201
Show file tree

Hide file tree

Showing 17 changed files with 168 additions and 106 deletions.
diff --git a/README.md b/README.md
@@ -26,7 +26,11 @@ data curators, and infrastructure administrators.
 
 ## Requirements
 
-This extension requires Python >= `3.9`.
+This extension requires Python >= `3.9`. It also requires GPG to be installed
+as well as a GPG key-pair to sign and verify commits. In addition,
+git has to be configured to sign commits. For more information on how to sign
+commits, refer to the
+[Git documentation](https://git-scm.com/book/en/v2/Git-Tools-Signing-Your-Work).
 
 
 ## Installation
@@ -50,7 +54,7 @@ To check your installation, run:
 
 ## Example usage
 
-Create a dataset:
+Ensure that your commits are signed (see the [Git documentation](https://git-scm.com/book/en/v2/Git-Tools-Signing-Your-Work)), and create a dataset:
 
 
 ```bash
@@ -74,10 +78,14 @@ EOF
 > datalad save -m "add 'one-to-many' remake method"
 ```
 
+Before the computation can be executed, `datalad-make` has to be told to trust
+the public key of the signer. How this is done is described in the section
+[Trusted Keys](#trusted-keys).
+
 Execute a computation and save the result:
 ```bash
 > datalad make -p first=bob -p second=alice -p output=name -o name-1.txt \
--o name-2.txt --allow-untrusted-execution one-to-many
+-o name-2.txt one-to-many
 ```
 The method `one-to-many` will create two files with the names `<output>-1.txt`
 and `<output>-2.txt`. Thus, the two files `name-1.txt` and `name-2.txt` need to
@@ -93,22 +101,11 @@ content: alice
 ### Recomputation
 
 DataLad REMAKE can recompute dropped content. To demonstrate this, we will
-drop a file and then recreate it via `datalad get`. Before we can do that in
-this example we have to make a small adjustement. This is due to the fact that
-we use "untrusted" execution in this example. It makes the example easier
-because no signing keys are required. However, the git annex special remote that
-was created by the `datalad make` command does not allow untrusted execution by
-default (for security reasons we never automatically create a datalad-remake
-remote that supports untrusted execution). To instruct the special remote to
-allow untrusted execution, we have to reconfigure it. This can be done via the
-following command:
-
-```bash
-> git annex enableremote datalad-remake-auto allow-untrusted-execution=true
-``` 
+drop a file and then recreate it via `datalad get`.
 
-Now we drop the content of `name-1.txt`, verify it is gone, and recreate it via
-`datalad get`, which "fetches" it from the `datalad-remake` remote.
+Drop the content of `name-1.txt`, verify it is gone, and recreate it via
+`datalad get`, which "fetches" it from the `datalad-remake` remote. Note: the
+`datalad-remake` remote was automatically created by the command `datalad make`.
 
 ```bash
 > datalad drop name-1.txt
@@ -126,17 +123,17 @@ The prospective computation can be initiated by using the
 
 ```bash
 > datalad make -p first=john -p second=susan -p output=person \
--o person-1.txt -o person-2.txt --prospective-execution --allow-untrusted-execution one-to-many
+-o person-1.txt -o person-2.txt --prospective-execution one-to-many
 ```
 
 The following command will fail, because no computation has been performed,
-and the file content is unavailable:
+and the file content is not yet available:
 
 ```bash
 > cat person-1.txt    # this will fail, because the computation has not yet been performed
 ```
 
-We can further inspect this with `git annex info`:
+We can further inspect `person-1.txt` with `git annex info`:
 
 ```bash
 > git annex info person-1.txt
@@ -160,7 +157,7 @@ content: john
 
 Please note, to use this feature, the following configuration variable
 `remote.datalad-remake-auto.annex-security-allow-unverified-downloads` is set
-to `ACKTHPPT` for each automatically created git-annex special remote
+to `ACKTHPPT` for each automatically created git-annex special remote.
 
 <details>
     <summary>Why does the configuration variable have to be set?</summary>

diff --git a/conftest.py b/conftest.py
@@ -3,6 +3,6 @@
 # its tooling
 from datalad_next.conftest import setup_package
 
-pytest_plugins = 'datalad_next.tests.fixtures'
+pytest_plugins = ('datalad_core.tests.fixtures', 'datalad_next.tests.fixtures')
 
 __all__ = ['setup_package']
diff --git a/datalad_remake/__init__.py b/datalad_remake/__init__.py
@@ -6,6 +6,7 @@
 
 __all__ = [
     '__version__',
+    'allow_untrusted_execution_key',
     'auto_remote_name',
     'command_suite',
     'priority_config_key',
@@ -52,5 +53,6 @@
 template_dir = '.datalad/make/methods'
 specification_dir = '.datalad/make/specifications'
 trusted_keys_config_key = 'datalad.make.trusted-keys'
+allow_untrusted_execution_key = 'datalad.make.allow-untrusted-execution.d-'
 priority_config_key = 'datalad.make.priority'
 auto_remote_name = 'datalad-remake-auto'
diff --git a/datalad_remake/annexremotes/remake_remote.py b/datalad_remake/annexremotes/remake_remote.py
@@ -37,7 +37,10 @@
     get_file_dataset,
     provide_context,
 )
-from datalad_remake.utils.getkeys import get_trusted_keys
+from datalad_remake.utils.getconfig import (
+    get_allow_untrusted_execution,
+    get_trusted_keys,
+)
 from datalad_remake.utils.glob import resolve_patterns
 from datalad_remake.utils.verify import verify_file
 
@@ -53,11 +56,6 @@
 class RemakeRemote(SpecialRemote):
     def __init__(self, annex: Master):
         super().__init__(annex)
-        self.configs = {
-            'allow-untrusted-execution': 'Allow execution of untrusted code '
-            'with untrusted parameters. set to "true" to enable. THIS IS '
-            'DANGEROUS and might lead to remote code execution.',
-        }
         self._config_manager: ConfigManager | None = None
 
     @property
@@ -161,7 +159,13 @@ def get_assigned_value(assignment: str) -> str:
     def transfer_retrieve(self, key: str, file_name: str) -> None:
         self.annex.debug(f'TRANSFER RETRIEVE key: {key!r}, file_name: {file_name!r}')
 
-        if self.annex.getconfig('allow-untrusted-execution') == 'true':
+        dataset_id = self.config_manager.get('datalad.dataset.id').value
+        self.annex.debug(f'TRANSFER RETRIEVE dataset_id: {dataset_id!r}')
+        self.annex.debug(
+            'TRANSFER RETRIEVE get_allow_untrusted_execution: '
+            f'{get_allow_untrusted_execution(dataset_id)}'
+        )
+        if get_allow_untrusted_execution(dataset_id):
             trusted_key_ids = None
         else:
             trusted_key_ids = get_trusted_keys()

diff --git a/datalad_remake/annexremotes/tests/test_hierarchies.py b/datalad_remake/annexremotes/tests/test_hierarchies.py
@@ -3,9 +3,11 @@
 
 import pytest
 from datalad.distribution.get import Get as datalad_Get
+from datalad_core.config import ConfigItem
 from datalad_next.datasets import Dataset
 from datalad_next.tests import skip_if_on_windows
 
+from datalad_remake import allow_untrusted_execution_key
 from datalad_remake.commands.tests.create_datasets import (
     create_simple_computation_dataset,
 )
@@ -78,7 +80,7 @@ def _check_content(dataset, file_content: Iterable[tuple[str, str]]):
 
 @skip_if_on_windows
 @pytest.mark.parametrize('output_pattern', [output_pattern_static, output_pattern_glob])
-def test_end_to_end(tmp_path, monkeypatch, output_pattern):
+def test_end_to_end(tmp_path, cfgman, monkeypatch, output_pattern):
     root_dataset = create_simple_computation_dataset(tmp_path, 'd2', 3, test_method)
 
     # run `make` command
@@ -103,21 +105,28 @@ def test_end_to_end(tmp_path, monkeypatch, output_pattern):
     # check computation success
     _check_content(root_dataset, test_file_content)
 
-    # Drop all computed content
-    _drop_files(root_dataset, collected_output)
-
     # Go to the subdataset `d2_subds0/d2_subds1` and fetch the content of `a1.txt`
     # from a datalad-remake remote.
-    monkeypatch.chdir(root_dataset.pathobj / 'd2_subds0' / 'd2_subds1')
-    datalad_Get()('a1.txt')
-
-    # check that all known files that were computed are added to the annex
-    _check_content(root_dataset, test_file_content)
-
-    _drop_files(root_dataset, collected_output)
-
-    # check get in subdatasets
-    monkeypatch.chdir(root_dataset.pathobj)
-    datalad_Get()('d2_subds0/d2_subds1/a1.txt')
-
-    _check_content(root_dataset, test_file_content)
+    with cfgman.overrides(
+        {
+            # Allow the special remote to execute untrusted operations on the
+            # dataset `root_dataset/d2_subds0/d2_subds1`
+            allow_untrusted_execution_key
+            + Dataset(root_dataset.pathobj / 'd2_subds0' / 'd2_subds1').id: ConfigItem(
+                'true'
+            ),
+        }
+    ):
+        # Drop all computed content
+        _drop_files(root_dataset, collected_output)
+
+        monkeypatch.chdir(root_dataset.pathobj / 'd2_subds0' / 'd2_subds1')
+        datalad_Get()('a1.txt')
+        # check that all known files that were computed are added to the annex
+        _check_content(root_dataset, test_file_content)
+
+        _drop_files(root_dataset, collected_output)
+
+        monkeypatch.chdir(root_dataset.pathobj)
+        datalad_Get()('d2_subds0/d2_subds1/a1.txt')
+        _check_content(root_dataset, test_file_content)
diff --git a/datalad_remake/annexremotes/tests/test_priority.py b/datalad_remake/annexremotes/tests/test_priority.py
@@ -1,10 +1,10 @@
 import pytest
 from annexremote import Master
 from datalad_core.config import ConfigItem
-from datalad_core.tests.fixtures import cfgman  # noqa: F401
 from datalad_next.tests import skip_if_on_windows
 
 from datalad_remake import (
+    allow_untrusted_execution_key,
     priority_config_key,
     specification_dir,
     template_dir,
@@ -29,7 +29,7 @@
 
 @skip_if_on_windows
 @pytest.mark.parametrize('priority', [['alpha', 'beta'], ['beta', 'alpha']])
-def test_compute_remote_priority(tmp_path, cfgman, monkeypatch, priority):  # noqa: F811
+def test_compute_remote_priority(tmp_path, cfgman, monkeypatch, priority):
     dataset = create_ds_hierarchy(
         tmp_path=tmp_path,
         name='ds1',
@@ -74,9 +74,15 @@ def test_compute_remote_priority(tmp_path, cfgman, monkeypatch, priority):  # no
         for label in ['alpha', 'beta']
     ]
 
-    # Run the special remote with the given priority configuration.
-    with cfgman.overrides({priority_config_key: ConfigItem(','.join(priority))}):
-        run_remake_remote(tmp_path, urls, False)
+    with cfgman.overrides(
+        {
+            # Run the special remote with the given priority configuration.
+            priority_config_key: ConfigItem(','.join(priority)),
+            # Allow the special remote to execute untrusted operations on this dataset
+            allow_untrusted_execution_key + dataset.id: ConfigItem('true'),
+        }
+    ):
+        run_remake_remote(tmp_path, urls)
 
     # At this point the datalad-remake remote should have executed the
     # prioritized template and written the result.
@@ -85,7 +91,7 @@ def test_compute_remote_priority(tmp_path, cfgman, monkeypatch, priority):  # no
     ).read_text().strip() == f'from {priority[0]}: {priority[0]}_parameter'
 
 
-def test_config_precedence(existing_dataset, tmp_path, cfgman, monkeypatch):  # noqa: F811
+def test_config_precedence(existing_dataset, tmp_path, cfgman, monkeypatch):
     existing_dataset.config.add('datalad.make.priority', '1', scope='branch')
 
     monkeypatch.setattr(

diff --git a/datalad_remake/annexremotes/tests/test_remake_remote.py b/datalad_remake/annexremotes/tests/test_remake_remote.py
@@ -1,8 +1,8 @@
 import pytest
 from datalad_core.config import ConfigItem
-from datalad_core.tests.fixtures import cfgman  # noqa: F401
 from datalad_next.tests import skip_if_on_windows
 
+from datalad_remake import allow_untrusted_execution_key
 from datalad_remake.commands.tests.create_datasets import create_ds_hierarchy
 
 from ... import (
@@ -24,7 +24,7 @@
 
 @skip_if_on_windows
 @pytest.mark.parametrize('trusted', [True, False])
-def test_compute_remote_main(tmp_path, cfgman, monkeypatch, trusted):  # noqa: F811
+def test_compute_remote_main(tmp_path, cfgman, monkeypatch, trusted):
     if trusted:
         gpg_homedir = tmp_path / 'tmp_gpg_dir'
         tmp_home = tmp_path / 'tmp_home'
@@ -69,9 +69,10 @@ def test_compute_remote_main(tmp_path, cfgman, monkeypatch, trusted):  # noqa: F
     with cfgman.overrides(
         {
             trusted_keys_config_key: ConfigItem(signing_key),
+            allow_untrusted_execution_key + dataset.id: ConfigItem('true'),
         }
     ):
-        run_remake_remote(tmp_path, [url], trusted)
+        run_remake_remote(tmp_path, [url])
 
     # At this point the datalad-remake remote should have executed the
     # computation and written the result.

diff --git a/datalad_remake/annexremotes/tests/utils.py b/datalad_remake/annexremotes/tests/utils.py
@@ -130,7 +130,7 @@ def create_keypair(gpg_dir: Path, name: bytes = b'Test User'):
     )[0]
 
 
-def run_remake_remote(dest_path, urls, trusted):
+def run_remake_remote(dest_path, urls):
     input_ = MockedInput()
 
     annex_key = 'some-fake-annex-key'
@@ -139,8 +139,7 @@ def run_remake_remote(dest_path, urls, trusted):
     # below.
     input_.send('PREPARE\n')
     input_.send(f'TRANSFER RETRIEVE {annex_key} {dest_path / "remade.txt"!s}\n')
-    # The next line is the answer to `GETCONFIG allow-untrusted-execution`
-    input_.send(f'VALUE {"false" if trusted else "true"}\n')
+    input_.send('VALUE .git\n')
     # The next two lines assemble the answer to
     # `GETURLS <annex-key> datalad-remake:`
     for url in urls:

diff --git a/datalad_remake/commands/make_cmd.py b/datalad_remake/commands/make_cmd.py
@@ -42,7 +42,7 @@
 )
 from datalad_remake.utils.chdir import chdir
 from datalad_remake.utils.compute import compute
-from datalad_remake.utils.getkeys import get_trusted_keys
+from datalad_remake.utils.getconfig import get_trusted_keys
 from datalad_remake.utils.glob import resolve_patterns
 from datalad_remake.utils.remake_remote import add_remake_remote
 from datalad_remake.utils.verify import verify_file
@@ -534,7 +534,7 @@ def initialize_remotes(dataset: Dataset, files: Iterable[str]) -> None:
     }
 
     for dataset_dir in touched_dataset_dirs:
-        add_remake_remote(str(dataset_dir), allow_untrusted_execution=False)
+        add_remake_remote(str(dataset_dir))
 
 
 def unlock_files(dataset: Dataset, files: Iterable[str]) -> None:

diff --git a/datalad_remake/commands/tests/create_datasets.py b/datalad_remake/commands/tests/create_datasets.py
@@ -4,7 +4,10 @@
 
 from datalad_next.datasets import Dataset
 
-from datalad_remake import template_dir
+from datalad_remake import (
+    template_dir,
+    trusted_keys_config_key,
+)
 from datalad_remake.utils.remake_remote import add_remake_remote
 
 
@@ -19,7 +22,7 @@ def create_ds_hierarchy(
     root_dataset.create(force=True, result_renderer='disabled')
     (root_dataset.pathobj / 'a.txt').write_text('a\n')
     (root_dataset.pathobj / 'b.txt').write_text('b\n')
-    _enable_signing(root_dataset, signing_key)
+    _enable_verification(root_dataset, signing_key)
     root_dataset.save(result_renderer='disabled')
     datasets = [(name, tmp_path / name, root_dataset)]
 
@@ -31,7 +34,7 @@ def create_ds_hierarchy(
         (subdataset.pathobj / f'a{level}.txt').write_text(f'a{level}\n')
         (subdataset.pathobj / f'b{level}.txt').write_text(f'b{level}\n')
         subdataset.save(result_renderer='disabled')
-        _enable_signing(subdataset, signing_key)
+        _enable_verification(subdataset, signing_key)
         datasets.append((f'{name}_subds{level}', subdataset_path, subdataset))
 
     # Link the datasets
@@ -47,22 +50,20 @@ def create_ds_hierarchy(
     root_dataset.get(recursive=True, result_renderer='disabled')
 
     # Add datalad-remake remotes to the root dataset and all subdatasets
-    add_remake_remote(root_dataset.path, allow_untrusted_execution=signing_key is None)
+    add_remake_remote(root_dataset.path)
     subdataset_path = Path()
     for index in range(subdataset_levels):
         subdataset_path /= f'{name}_subds{index}'
-        add_remake_remote(
-            str(root_dataset.pathobj / subdataset_path),
-            allow_untrusted_execution=signing_key is None,
-        )
+        add_remake_remote(str(root_dataset.pathobj / subdataset_path))
 
     return datasets
 
 
-def _enable_signing(dataset: Dataset, key: str | None):
+def _enable_verification(dataset: Dataset, key: str | None):
     if key is not None:
         dataset.config.set('commit.gpgsign', 'true', scope='local')
         dataset.config.set('user.signingkey', key, scope='local')
+        dataset.config.set(trusted_keys_config_key, key, scope='global')
 
 
 def create_simple_computation_dataset(