diff --git a/datalad_remake/annexremotes/remake_remote.py b/datalad_remake/annexremotes/remake_remote.py index 5b0417f..11d8428 100644 --- a/datalad_remake/annexremotes/remake_remote.py +++ b/datalad_remake/annexremotes/remake_remote.py @@ -29,6 +29,7 @@ provide_context, ) from datalad_remake.utils.glob import resolve_patterns +from datalad_remake.utils.verify import verify_file if TYPE_CHECKING: from collections.abc import Iterable @@ -41,6 +42,12 @@ class RemakeRemote(SpecialRemote): def __init__(self, annex: Master): super().__init__(annex) + self.configs = { + 'allow_untrusted_execution': + 'Allow execution of untrusted code with untrusted parameters. ' + 'set to "true" to enable. THIS IS DANGEROUS and might lead to ' + 'remote code execution.', + } def __del__(self): self.close() @@ -85,7 +92,12 @@ def get_url_for_key(self, key: str) -> str: self.annex.debug(f'get_url_for_key: key: {key!r}, urls: {urls!r}') return urls[0] - def get_compute_info(self, key: str) -> tuple[dict[str, Any], Dataset]: + def get_compute_info(self, + key: str, + *, + allow_untrusted_specs: bool = False + ) -> tuple[dict[str, Any], Dataset]: + def get_assigned_value(assignment: str) -> str: return assignment.split('=', 1)[1] @@ -96,6 +108,8 @@ def get_assigned_value(assignment: str) -> str: dataset = self._find_dataset(root_version) spec_path = dataset.pathobj / specification_dir / spec_name + if not allow_untrusted_specs: + verify_file(dataset.pathobj, spec_path) with open(spec_path, 'rb') as f: spec = json.load(f) @@ -108,7 +122,14 @@ def get_assigned_value(assignment: str) -> str: def transfer_retrieve(self, key: str, file_name: str) -> None: self.annex.debug(f'TRANSFER RETRIEVE key: {key!r}, file_name: {file_name!r}') - compute_info, dataset = self.get_compute_info(key) + allow_untrusted_execution = self.annex.getconfig( + 'allow_untrusted_execution' + ) == 'true' + + compute_info, dataset = self.get_compute_info( + key, + allow_untrusted_specs=allow_untrusted_execution + ) self.annex.debug(f'TRANSFER RETRIEVE compute_info: {compute_info!r}') # Perform the computation, and collect the results @@ -124,6 +145,7 @@ def transfer_retrieve(self, key: str, file_name: str) -> None: compute_info['method'], compute_info['parameter'], compute_info['output'], + allow_untrusted_code=allow_untrusted_execution, ) lgr.debug('Starting collection') self.annex.debug('Starting collection') diff --git a/datalad_remake/annexremotes/tests/test_hierarchies.py b/datalad_remake/annexremotes/tests/test_hierarchies.py index 3cba4b5..addd23f 100644 --- a/datalad_remake/annexremotes/tests/test_hierarchies.py +++ b/datalad_remake/annexremotes/tests/test_hierarchies.py @@ -90,6 +90,7 @@ def test_end_to_end(tmp_path, monkeypatch, output_pattern): ], output=output_pattern, result_renderer='disabled', + allow_untrusted_code=True, ) collected_output = [ diff --git a/datalad_remake/annexremotes/tests/test_remake_remote.py b/datalad_remake/annexremotes/tests/test_remake_remote.py index 9833754..7fd5f3d 100644 --- a/datalad_remake/annexremotes/tests/test_remake_remote.py +++ b/datalad_remake/annexremotes/tests/test_remake_remote.py @@ -96,12 +96,16 @@ def test_compute_remote_main(tmp_path, monkeypatch): # below. input_.send('PREPARE\n') input_.send(f'TRANSFER RETRIEVE {key.decode()} {tmp_path / "remade.txt"!s}\n') + # The next line is the answer to `GETCONFIG allow_untrusted_execution` + input_.send('VALUE true\n') url = ( 'datalad-make:///?' f'root_version={dataset.repo.get_hexsha()}' '&specification=000001111122222' '&this=a.txt' ) + # The next line is the answer to + # `GETURLS MD5E-s2--60b725f10c9c85c70d97880dfe8191b3.txt datalad-remake:` input_.send(f'VALUE {url}\n') input_.send('VALUE\n') input_.send('VALUE .git\n') diff --git a/datalad_remake/commands/make_cmd.py b/datalad_remake/commands/make_cmd.py index c431e79..2347287 100644 --- a/datalad_remake/commands/make_cmd.py +++ b/datalad_remake/commands/make_cmd.py @@ -34,6 +34,7 @@ call_git_oneline, call_git_success, ) +from hypothesis.reporting import default from datalad_remake import ( specification_dir, @@ -42,6 +43,7 @@ ) from datalad_remake.utils.compute import compute from datalad_remake.utils.glob import resolve_patterns +from datalad_remake.utils.verify import verify_file if TYPE_CHECKING: from collections.abc import ( @@ -165,6 +167,21 @@ class Make(ValidatedInterface): 'before used. This is useful if a large number of parameters ' 'should be provided.', ), + 'allow_untrusted_code': Parameter( + args=( + '--allow-untrusted-code', + ), + action='store_true', + default=False, + doc='Skip commit signature verification before executing code. This ' + 'should only be used in a strictly controlled environment with ' + 'fully trusted datasets. Trusted dataset means: every commit ' + 'stems from a trusted entity. ' + 'DO NOT USE THIS OPTION, unless you are sure to understand the ' + 'consequences. One of which is that arbitrary parties can ' + 'execute arbitrary code under your account on your ' + 'infrastructure.', + ), } @staticmethod @@ -182,6 +199,7 @@ def __call__( output_list: Path | None = None, parameter: list[str] | None = None, parameter_list: Path | None = None, + allow_untrusted_code: bool = False, ) -> Generator: ds: Dataset = dataset.ds if dataset else Dataset('.') @@ -189,7 +207,7 @@ def __call__( output_pattern = (output or []) + read_list(output_list) parameter = (parameter or []) + read_list(parameter_list) - parameter_dict = {p.split('=', 1)[0]: p.split('=', 1)[1] for p in parameter} + parameter_dict = dict([p.split('=', 1) for p in parameter]) # We have to get the URL first, because saving the specification to # the dataset will change the version. @@ -203,7 +221,13 @@ def __call__( branch, input_pattern, ) as worktree: - execute(worktree, template, parameter_dict, output_pattern) + execute( + worktree, + template, + parameter_dict, + output_pattern, + allow_untrusted_code=allow_untrusted_code, + ) resolved_output = collect(worktree, ds, output_pattern) else: resolved_output = set(output_pattern) @@ -370,6 +394,8 @@ def execute( template_name: str, parameter: dict[str, str], output_pattern: list[str], + *, + allow_untrusted_code: bool = False, ) -> None: lgr.debug( 'execute: %s %s %s %s', @@ -392,6 +418,9 @@ def execute( # Run the computation in the worktree-directory template_path = Path(template_dir) / template_name + if not allow_untrusted_code: + verify_file(worktree_ds.pathobj, template_path) + worktree_ds.get(template_path, result_renderer='disabled') compute(worktree, worktree / template_path, parameter) diff --git a/datalad_remake/commands/tests/create_datasets.py b/datalad_remake/commands/tests/create_datasets.py index 9d4699a..bc28d12 100644 --- a/datalad_remake/commands/tests/create_datasets.py +++ b/datalad_remake/commands/tests/create_datasets.py @@ -30,6 +30,7 @@ def add_remake_remote(dataset: Dataset): 'type=external', 'externaltype=datalad-remake', 'encryption=none', + 'allow_untrusted_execution=true', ], capture_output=True, ) diff --git a/datalad_remake/commands/tests/test_compute.py b/datalad_remake/commands/tests/test_compute.py index 391b7e5..c725be4 100644 --- a/datalad_remake/commands/tests/test_compute.py +++ b/datalad_remake/commands/tests/test_compute.py @@ -52,6 +52,7 @@ def _run_simple_computation(root_dataset: Dataset): parameter=['name=Robert', 'file=a.txt'], output=['a.txt'], result_renderer='disabled', + allow_untrusted_code=True, ) # check that the output is correct diff --git a/datalad_remake/utils/compute.py b/datalad_remake/utils/compute.py index 4e32391..40025f0 100644 --- a/datalad_remake/utils/compute.py +++ b/datalad_remake/utils/compute.py @@ -4,7 +4,10 @@ import logging import subprocess import tomllib -from typing import TYPE_CHECKING, Any +from typing import ( + TYPE_CHECKING, + Any, +) if TYPE_CHECKING: from pathlib import Path diff --git a/datalad_remake/utils/verify.py b/datalad_remake/utils/verify.py new file mode 100644 index 0000000..874d634 --- /dev/null +++ b/datalad_remake/utils/verify.py @@ -0,0 +1,26 @@ +from pathlib import Path + +from datalad_next.runners import ( + call_git_oneline, + call_git_success, +) + + +def verify_file(root_directory: Path, file: Path): + # Get the latest commit of `file` + commit = call_git_oneline([ + '-C', str(root_directory), + 'log', '-1', '--follow', + '--pretty=%H', + str(file) + ]) + + # Let git do the verification of the commit + result = call_git_success([ + '-C', str(root_directory), + 'verify-commit', + commit + ]) + if not result: + msg = f'Signature validation of {file} failed' + raise ValueError(msg) diff --git a/pyproject.toml b/pyproject.toml index 4bbc41c..5ae71ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,6 +96,7 @@ extra-dependencies = [ "hypothesis", "mypy>=1.0.0", "pytest", + "pytest-gnupg-fixtures", ] [tool.hatch.envs.types.scripts]