Skip to content

Commit

Permalink
feat: add trusted execution
Browse files Browse the repository at this point in the history
This commit adds trusted execution. Trusted
execution requires that the latest commits to
the method (in `.datalad/make/methods`) and to
the specification file (in
`.datalad/make/specifications`) are signed
with a trusted key, i.e. that
`git verify-commit` succeeds for both commits.
  • Loading branch information
christian-monch committed Oct 25, 2024
1 parent 76e4d8b commit 979c0ef
Show file tree
Hide file tree
Showing 9 changed files with 93 additions and 5 deletions.
26 changes: 24 additions & 2 deletions datalad_remake/annexremotes/remake_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
provide_context,
)
from datalad_remake.utils.glob import resolve_patterns
from datalad_remake.utils.verify import verify_file

if TYPE_CHECKING:
from collections.abc import Iterable
Expand All @@ -41,6 +42,12 @@
class RemakeRemote(SpecialRemote):
def __init__(self, annex: Master):
    """Initialize the remote and declare its configuration options.

    The only option declared here is ``allow_untrusted_execution``; the
    text stored with it is the option's human-readable description.
    """
    super().__init__(annex)
    untrusted_execution_help = (
        'Allow execution of untrusted code with untrusted parameters. '
        'set to "true" to enable. THIS IS DANGEROUS and might lead to '
        'remote code execution.'
    )
    # Maps each configuration name to its description.
    self.configs = {'allow_untrusted_execution': untrusted_execution_help}

def __del__(self):
self.close()
Expand Down Expand Up @@ -85,7 +92,12 @@ def get_url_for_key(self, key: str) -> str:
self.annex.debug(f'get_url_for_key: key: {key!r}, urls: {urls!r}')
return urls[0]

def get_compute_info(self, key: str) -> tuple[dict[str, Any], Dataset]:
def get_compute_info(self,
key: str,
*,
allow_untrusted_specs: bool = False
) -> tuple[dict[str, Any], Dataset]:

def get_assigned_value(assignment: str) -> str:
    """Return the value part of a ``name=value`` assignment string.

    Only the first ``=`` separates name and value, so the returned value
    may itself contain ``=`` characters.
    """
    pieces = assignment.split('=', 1)
    return pieces[1]

Expand All @@ -96,6 +108,8 @@ def get_assigned_value(assignment: str) -> str:

dataset = self._find_dataset(root_version)
spec_path = dataset.pathobj / specification_dir / spec_name
if not allow_untrusted_specs:
verify_file(dataset.pathobj, spec_path)
with open(spec_path, 'rb') as f:
spec = json.load(f)

Expand All @@ -108,7 +122,14 @@ def get_assigned_value(assignment: str) -> str:
def transfer_retrieve(self, key: str, file_name: str) -> None:
self.annex.debug(f'TRANSFER RETRIEVE key: {key!r}, file_name: {file_name!r}')

compute_info, dataset = self.get_compute_info(key)
allow_untrusted_execution = self.annex.getconfig(
'allow_untrusted_execution'
) == 'true'

compute_info, dataset = self.get_compute_info(
key,
allow_untrusted_specs=allow_untrusted_execution
)
self.annex.debug(f'TRANSFER RETRIEVE compute_info: {compute_info!r}')

# Perform the computation, and collect the results
Expand All @@ -124,6 +145,7 @@ def transfer_retrieve(self, key: str, file_name: str) -> None:
compute_info['method'],
compute_info['parameter'],
compute_info['output'],
allow_untrusted_code=allow_untrusted_execution,
)
lgr.debug('Starting collection')
self.annex.debug('Starting collection')
Expand Down
1 change: 1 addition & 0 deletions datalad_remake/annexremotes/tests/test_hierarchies.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def test_end_to_end(tmp_path, monkeypatch, output_pattern):
],
output=output_pattern,
result_renderer='disabled',
allow_untrusted_code=True,
)

collected_output = [
Expand Down
4 changes: 4 additions & 0 deletions datalad_remake/annexremotes/tests/test_remake_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,12 +96,16 @@ def test_compute_remote_main(tmp_path, monkeypatch):
# below.
input_.send('PREPARE\n')
input_.send(f'TRANSFER RETRIEVE {key.decode()} {tmp_path / "remade.txt"!s}\n')
# The next line is the answer to `GETCONFIG allow_untrusted_execution`
input_.send('VALUE true\n')
url = (
'datalad-make:///?'
f'root_version={dataset.repo.get_hexsha()}'
'&specification=000001111122222'
'&this=a.txt'
)
# The next line is the answer to
# `GETURLS MD5E-s2--60b725f10c9c85c70d97880dfe8191b3.txt datalad-remake:`
input_.send(f'VALUE {url}\n')
input_.send('VALUE\n')
input_.send('VALUE .git\n')
Expand Down
33 changes: 31 additions & 2 deletions datalad_remake/commands/make_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
call_git_oneline,
call_git_success,
)
from hypothesis.reporting import default

from datalad_remake import (
specification_dir,
Expand All @@ -42,6 +43,7 @@
)
from datalad_remake.utils.compute import compute
from datalad_remake.utils.glob import resolve_patterns
from datalad_remake.utils.verify import verify_file

if TYPE_CHECKING:
from collections.abc import (
Expand Down Expand Up @@ -165,6 +167,21 @@ class Make(ValidatedInterface):
'before used. This is useful if a large number of parameters '
'should be provided.',
),
'allow_untrusted_code': Parameter(
args=(
'--allow-untrusted-code',
),
action='store_true',
default=False,
doc='Skip commit signature verification before executing code. This '
'should only be used in a strictly controlled environment with '
'fully trusted datasets. Trusted dataset means: every commit '
'stems from a trusted entity. '
'DO NOT USE THIS OPTION, unless you are sure to understand the '
'consequences. One of which is that arbitrary parties can '
'execute arbitrary code under your account on your '
'infrastructure.',
),
}

@staticmethod
Expand All @@ -182,14 +199,15 @@ def __call__(
output_list: Path | None = None,
parameter: list[str] | None = None,
parameter_list: Path | None = None,
allow_untrusted_code: bool = False,
) -> Generator:
ds: Dataset = dataset.ds if dataset else Dataset('.')

input_pattern = (input or []) + read_list(input_list)
output_pattern = (output or []) + read_list(output_list)
parameter = (parameter or []) + read_list(parameter_list)

parameter_dict = {p.split('=', 1)[0]: p.split('=', 1)[1] for p in parameter}
parameter_dict = dict([p.split('=', 1) for p in parameter])

# We have to get the URL first, because saving the specification to
# the dataset will change the version.
Expand All @@ -203,7 +221,13 @@ def __call__(
branch,
input_pattern,
) as worktree:
execute(worktree, template, parameter_dict, output_pattern)
execute(
worktree,
template,
parameter_dict,
output_pattern,
allow_untrusted_code=allow_untrusted_code,
)
resolved_output = collect(worktree, ds, output_pattern)
else:
resolved_output = set(output_pattern)
Expand Down Expand Up @@ -370,6 +394,8 @@ def execute(
template_name: str,
parameter: dict[str, str],
output_pattern: list[str],
*,
allow_untrusted_code: bool = False,
) -> None:
lgr.debug(
'execute: %s %s %s %s',
Expand All @@ -392,6 +418,9 @@ def execute(

# Run the computation in the worktree-directory
template_path = Path(template_dir) / template_name
if not allow_untrusted_code:
verify_file(worktree_ds.pathobj, template_path)

worktree_ds.get(template_path, result_renderer='disabled')
compute(worktree, worktree / template_path, parameter)

Expand Down
1 change: 1 addition & 0 deletions datalad_remake/commands/tests/create_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def add_remake_remote(dataset: Dataset):
'type=external',
'externaltype=datalad-remake',
'encryption=none',
'allow_untrusted_execution=true',
],
capture_output=True,
)
Expand Down
1 change: 1 addition & 0 deletions datalad_remake/commands/tests/test_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def _run_simple_computation(root_dataset: Dataset):
parameter=['name=Robert', 'file=a.txt'],
output=['a.txt'],
result_renderer='disabled',
allow_untrusted_code=True,
)

# check that the output is correct
Expand Down
5 changes: 4 additions & 1 deletion datalad_remake/utils/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
import logging
import subprocess
import tomllib
from typing import TYPE_CHECKING, Any
from typing import (
TYPE_CHECKING,
Any,
)

if TYPE_CHECKING:
from pathlib import Path
Expand Down
26 changes: 26 additions & 0 deletions datalad_remake/utils/verify.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from pathlib import Path

from datalad_next.runners import (
call_git_oneline,
call_git_success,
)


def verify_file(root_directory: Path, file: Path):
    """Check that the latest commit touching ``file`` has a valid signature.

    The most recent commit that modified ``file`` is looked up with
    ``git log`` in the repository at ``root_directory`` and is then handed
    to ``git verify-commit``.

    Args:
        root_directory: Directory of the git repository; passed to git
            via ``-C``.
        file: Path of the file whose latest commit is verified.

    Raises:
        ValueError: If no commit can be determined for ``file`` or if
            ``git verify-commit`` does not succeed for that commit.
    """
    # NOTE(review): only the signature of the latest commit is checked here;
    # this function does not compare the working-tree content of `file`
    # with the committed content -- confirm callers do not rely on that.

    # Get the latest commit of `file`. The `--` separator guarantees that
    # git interprets the path as a pathspec, never as a revision or option.
    try:
        commit = call_git_oneline([
            '-C', str(root_directory),
            'log', '-1', '--follow',
            '--pretty=%H',
            '--', str(file),
        ])
    except IndexError:
        # NOTE(review): presumably raised when git prints no line at all
        # (file has never been committed) -- confirm against datalad_next.
        commit = ''

    if not commit:
        msg = f'Signature validation of {file} failed: no commit found'
        raise ValueError(msg)

    # Let git do the verification of the commit
    verified = call_git_success([
        '-C', str(root_directory),
        'verify-commit',
        commit,
    ])
    if not verified:
        msg = f'Signature validation of {file} failed'
        raise ValueError(msg)
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ extra-dependencies = [
"hypothesis",
"mypy>=1.0.0",
"pytest",
"pytest-gnupg-fixtures",
]

[tool.hatch.envs.types.scripts]
Expand Down

0 comments on commit 979c0ef

Please sign in to comment.