feat: add trusted execution

This commit adds trusted execution. Trusted execution requires that the latest commit to the method (in `.datalad/make/methods`) and the latest commit to the specification file (in `.datalad/make/specifications`) are signed with a trusted key, i.e. that `git verify-commit` succeeds for each of those commits.
datalad · Oct 25, 2024 · 979c0ef · 979c0ef
1 parent 76e4d8b
commit 979c0ef
Show file tree

Hide file tree

Showing 9 changed files with 93 additions and 5 deletions.
diff --git a/datalad_remake/annexremotes/remake_remote.py b/datalad_remake/annexremotes/remake_remote.py
@@ -29,6 +29,7 @@
     provide_context,
 )
 from datalad_remake.utils.glob import resolve_patterns
+from datalad_remake.utils.verify import verify_file
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
@@ -41,6 +42,12 @@
 class RemakeRemote(SpecialRemote):
     def __init__(self, annex: Master):
         super().__init__(annex)
+        # Maps each configuration option name to its help text.
+        # `allow_untrusted_execution` is queried in `transfer_retrieve` via
+        # `self.annex.getconfig(...)`; signature verification is skipped only
+        # when the returned value is exactly the string 'true'.
+        # NOTE(review): presumably the `SpecialRemote` base class uses
+        # `self.configs` to advertise the supported options -- confirm.
+        self.configs = {
+            'allow_untrusted_execution':
+                'Allow execution of untrusted code with untrusted parameters. '
+                'set to "true" to enable. THIS IS DANGEROUS and might lead to '
+                'remote code execution.',
+        }
 
     def __del__(self):
         self.close()
@@ -85,7 +92,12 @@ def get_url_for_key(self, key: str) -> str:
         self.annex.debug(f'get_url_for_key: key: {key!r}, urls: {urls!r}')
         return urls[0]
 
-    def get_compute_info(self, key: str) -> tuple[dict[str, Any], Dataset]:
+    def get_compute_info(self,
+                         key: str,
+                         *,
+                         allow_untrusted_specs: bool = False
+                         ) -> tuple[dict[str, Any], Dataset]:
+
         def get_assigned_value(assignment: str) -> str:
             return assignment.split('=', 1)[1]
 
@@ -96,6 +108,8 @@ def get_assigned_value(assignment: str) -> str:
 
         dataset = self._find_dataset(root_version)
         spec_path = dataset.pathobj / specification_dir / spec_name
+        if not allow_untrusted_specs:
+            verify_file(dataset.pathobj, spec_path)
         with open(spec_path, 'rb') as f:
             spec = json.load(f)
 
@@ -108,7 +122,14 @@ def get_assigned_value(assignment: str) -> str:
     def transfer_retrieve(self, key: str, file_name: str) -> None:
         self.annex.debug(f'TRANSFER RETRIEVE key: {key!r}, file_name: {file_name!r}')
 
-        compute_info, dataset = self.get_compute_info(key)
+        allow_untrusted_execution = self.annex.getconfig(
+            'allow_untrusted_execution'
+        ) == 'true'
+
+        compute_info, dataset = self.get_compute_info(
+            key,
+            allow_untrusted_specs=allow_untrusted_execution
+        )
         self.annex.debug(f'TRANSFER RETRIEVE compute_info: {compute_info!r}')
 
         # Perform the computation, and collect the results
@@ -124,6 +145,7 @@ def transfer_retrieve(self, key: str, file_name: str) -> None:
                 compute_info['method'],
                 compute_info['parameter'],
                 compute_info['output'],
+                allow_untrusted_code=allow_untrusted_execution,
             )
             lgr.debug('Starting collection')
             self.annex.debug('Starting collection')

diff --git a/datalad_remake/annexremotes/tests/test_hierarchies.py b/datalad_remake/annexremotes/tests/test_hierarchies.py
@@ -90,6 +90,7 @@ def test_end_to_end(tmp_path, monkeypatch, output_pattern):
         ],
         output=output_pattern,
         result_renderer='disabled',
+        allow_untrusted_code=True,
     )
 
     collected_output = [

diff --git a/datalad_remake/annexremotes/tests/test_remake_remote.py b/datalad_remake/annexremotes/tests/test_remake_remote.py
@@ -96,12 +96,16 @@ def test_compute_remote_main(tmp_path, monkeypatch):
     # below.
     input_.send('PREPARE\n')
     input_.send(f'TRANSFER RETRIEVE {key.decode()} {tmp_path / "remade.txt"!s}\n')
+    # The next line is the answer to `GETCONFIG allow_untrusted_execution`
+    input_.send('VALUE true\n')
     url = (
         'datalad-make:///?'
         f'root_version={dataset.repo.get_hexsha()}'
         '&specification=000001111122222'
         '&this=a.txt'
     )
+    # The next line is the answer to
+    # `GETURLS MD5E-s2--60b725f10c9c85c70d97880dfe8191b3.txt datalad-remake:`
     input_.send(f'VALUE {url}\n')
     input_.send('VALUE\n')
     input_.send('VALUE .git\n')

diff --git a/datalad_remake/commands/make_cmd.py b/datalad_remake/commands/make_cmd.py
@@ -34,6 +34,7 @@
     call_git_oneline,
     call_git_success,
 )
+from hypothesis.reporting import default
 
 from datalad_remake import (
     specification_dir,
@@ -42,6 +43,7 @@
 )
 from datalad_remake.utils.compute import compute
 from datalad_remake.utils.glob import resolve_patterns
+from datalad_remake.utils.verify import verify_file
 
 if TYPE_CHECKING:
     from collections.abc import (
@@ -165,6 +167,21 @@ class Make(ValidatedInterface):
             'before used. This is useful if a large number of parameters '
             'should be provided.',
         ),
+        'allow_untrusted_code': Parameter(
+            args=(
+                '--allow-untrusted-code',
+            ),
+            action='store_true',
+            default=False,
+            doc='Skip commit signature verification before executing code. This '
+                'should only be used in a strictly controlled environment with '
+                'fully trusted datasets. Trusted dataset means: every commit '
+                'stems from a trusted entity.  '
+                'DO NOT USE THIS OPTION, unless you are sure to understand the '
+                'consequences. One of which is that arbitrary parties can '
+                'execute arbitrary code under your account on your '
+                'infrastructure.',
+        ),
     }
 
     @staticmethod
@@ -182,14 +199,15 @@ def __call__(
         output_list: Path | None = None,
         parameter: list[str] | None = None,
         parameter_list: Path | None = None,
+        allow_untrusted_code: bool = False,
     ) -> Generator:
         ds: Dataset = dataset.ds if dataset else Dataset('.')
 
         input_pattern = (input or []) + read_list(input_list)
         output_pattern = (output or []) + read_list(output_list)
         parameter = (parameter or []) + read_list(parameter_list)
 
-        parameter_dict = {p.split('=', 1)[0]: p.split('=', 1)[1] for p in parameter}
+        parameter_dict = dict([p.split('=', 1) for p in parameter])
 
         # We have to get the URL first, because saving the specification to
         # the dataset will change the version.
@@ -203,7 +221,13 @@ def __call__(
                 branch,
                 input_pattern,
             ) as worktree:
-                execute(worktree, template, parameter_dict, output_pattern)
+                execute(
+                    worktree,
+                    template,
+                    parameter_dict,
+                    output_pattern,
+                    allow_untrusted_code=allow_untrusted_code,
+                )
                 resolved_output = collect(worktree, ds, output_pattern)
         else:
             resolved_output = set(output_pattern)
@@ -370,6 +394,8 @@ def execute(
     template_name: str,
     parameter: dict[str, str],
     output_pattern: list[str],
+    *,
+    allow_untrusted_code: bool = False,
 ) -> None:
     lgr.debug(
         'execute: %s %s %s %s',
@@ -392,6 +418,9 @@ def execute(
 
     # Run the computation in the worktree-directory
     template_path = Path(template_dir) / template_name
+    if not allow_untrusted_code:
+        verify_file(worktree_ds.pathobj, template_path)
+
     worktree_ds.get(template_path, result_renderer='disabled')
     compute(worktree, worktree / template_path, parameter)
 

diff --git a/datalad_remake/commands/tests/create_datasets.py b/datalad_remake/commands/tests/create_datasets.py
@@ -30,6 +30,7 @@ def add_remake_remote(dataset: Dataset):
             'type=external',
             'externaltype=datalad-remake',
             'encryption=none',
+            'allow_untrusted_execution=true',
         ],
         capture_output=True,
     )

diff --git a/datalad_remake/commands/tests/test_compute.py b/datalad_remake/commands/tests/test_compute.py
@@ -52,6 +52,7 @@ def _run_simple_computation(root_dataset: Dataset):
         parameter=['name=Robert', 'file=a.txt'],
         output=['a.txt'],
         result_renderer='disabled',
+        allow_untrusted_code=True,
     )
 
     # check that the output is correct

diff --git a/datalad_remake/utils/compute.py b/datalad_remake/utils/compute.py
@@ -4,7 +4,10 @@
 import logging
 import subprocess
 import tomllib
-from typing import TYPE_CHECKING, Any
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
 
 if TYPE_CHECKING:
     from pathlib import Path

diff --git a/datalad_remake/utils/verify.py b/datalad_remake/utils/verify.py
@@ -0,0 +1,26 @@
+from pathlib import Path
+
+from datalad_next.runners import (
+    call_git_oneline,
+    call_git_success,
+)
+
+
+def verify_file(root_directory: Path, file: Path) -> None:
+    """Verify the signature of the latest commit that touched ``file``.
+
+    The commit is looked up with ``git log -1 --follow`` executed in
+    ``root_directory`` and is then handed to ``git verify-commit``.
+
+    :param root_directory: directory in which the git commands are run
+        (passed to git via ``-C``).
+    :param file: path of the file whose most recent commit is verified.
+    :raises ValueError: if no commit is reported for ``file`` or if
+        ``git verify-commit`` does not succeed for that commit.
+    """
+    # Get the latest commit of `file`. The `--` separator keeps git from
+    # interpreting the path as a revision or as an option.
+    commit = call_git_oneline([
+        '-C', str(root_directory),
+        'log', '-1', '--follow',
+        '--pretty=%H',
+        '--',
+        str(file),
+    ])
+    if not commit:
+        # Without a commit there is no signature that could be checked, so
+        # the file must not be treated as trusted.
+        msg = f'Signature validation of {file} failed: no commit found'
+        raise ValueError(msg)
+
+    # Let git do the verification of the commit
+    verified = call_git_success([
+        '-C', str(root_directory),
+        'verify-commit',
+        commit,
+    ])
+    if not verified:
+        msg = f'Signature validation of {file} failed'
+        raise ValueError(msg)
diff --git a/pyproject.toml b/pyproject.toml
@@ -96,6 +96,7 @@ extra-dependencies = [
   "hypothesis",
   "mypy>=1.0.0",
   "pytest",
+  "pytest-gnupg-fixtures",
 ]
 
 [tool.hatch.envs.types.scripts]