diff --git a/.appveyor.yml b/.appveyor.yml index e000a1b..f819110 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -70,6 +70,7 @@ environment: APPVEYOR_BUILD_WORKER_IMAGE: Ubuntu2204 PY: 3.11 INSTALL_GITANNEX: git-annex -m snapshot + KEYWORDS: not no_such_test # Windows core tests - job_name: test-win @@ -80,6 +81,7 @@ environment: COVERAGE_ROOT: C:\DLTMP HATCH_DATA_DIR: C:\hatch-data-dir PIP_CACHE: C:\Users\appveyor\AppData\Local\pip\Cache + KEYWORDS: not test_whitelist # MacOS core tests - job_name: test-mac @@ -89,7 +91,9 @@ environment: COVERAGE_ROOT: /Users/appveyor/DLTMP HATCH_DATA_DIR: /Users/appveyor/hatch-data-dir PIP_CACHE: /Users/appveyor/.cache/pip - + KEYWORDS: > + not test_compute_remote_main[True] + and not test_whitelist # only run the CI if there are code or tooling changes only_commits: @@ -140,6 +144,8 @@ for: # verify that a PY variable is declared that identifies the desired Python version # for this run - "[ \"x$PY\" != x ]" + # create a dedicated socket directory to prevent too long socket names + - tools/appveyor/setup-gpg # Missing system software - tools/appveyor/install-syspkgs $INSTALL_SYSPKGS # activate Python env solely to get `python` to become available consistently @@ -150,7 +156,7 @@ for: - "[ -f ${HOME}/dlinstaller_env.sh ] && . 
${HOME}/dlinstaller_env.sh || true" test_script: - - 'hatch run tests.py${PY}:run-cov --doctest-modules --durations 10' + - 'hatch run tests.py${PY}:run-cov --doctest-modules --durations 10 -k "$KEYWORDS"' after_test: - 'hatch run tests.py${PY}:cov-combine' @@ -208,7 +214,7 @@ for: - cmd: IF DEFINED INSTALL_GITANNEX datalad-installer --sudo ok %INSTALL_GITANNEX% test_script: - - cmd: 'hatch run tests.py%PY%:run-cov --doctest-modules --durations 10' + - cmd: 'hatch run tests.py%PY%:run-cov --doctest-modules -k "%KEYWORDS%" --durations 10' after_test: - cmd: 'hatch run tests.py%PY%:cov-combine' diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 9dbc5a3..7dd00a5 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -31,3 +31,5 @@ python: install: - method: pip path: . + extra_requirements: + - docs diff --git a/README.md b/README.md index 98ff794..d731ac7 100644 --- a/README.md +++ b/README.md @@ -75,14 +75,15 @@ EOF Create a `datalad-remake` git-annex special remote: ```bash -> git annex initremote datalad-remake encryption=none type=external externaltype=datalad-remake +> git annex initremote datalad-remake encryption=none type=external externaltype=datalad-remake allow_untrusted_execution=true ``` Execute a computation and save the result: ```bash -> datalad make -p first=bob -p second=alice -p output=name \ --o name-1.txt -o name-2.txt one-to-many +> datalad make -p first=bob -p second=alice -p output=name -o name-1.txt \ +-o name-2.txt --allow-untrusted-execution one-to-many ``` + The method `one-to-many` will create two files with the names `-1.txt` and `-2.txt`. Thus, the two files `name-1.txt` and `name-2.txt` need to be specified as outputs in the command above. 
@@ -117,7 +118,8 @@ Afterwards, a prospective computation can be initiated by using the ```bash > datalad make -p first=john -p second=susan -p output=person \ --o person-1.txt -o person-2.txt -u one-to-many +-o person-1.txt -o person-2.txt --prospective-execution --allow-untrusted-execution one-to-many +> cat person-1.txt # this will fail, because the computation has not yet been performed ``` The following command will fail, because no computation has been performed, diff --git a/datalad_remake/__init__.py b/datalad_remake/__init__.py index f90d415..70c62ca 100644 --- a/datalad_remake/__init__.py +++ b/datalad_remake/__init__.py @@ -9,6 +9,7 @@ 'command_suite', 'specification_dir', 'template_dir', + 'trusted_keys_config_key', ] @@ -47,3 +48,4 @@ url_scheme = 'datalad-remake' template_dir = '.datalad/make/methods' specification_dir = '.datalad/make/specifications' +trusted_keys_config_key = 'datalad.trusted-keys' diff --git a/datalad_remake/annexremotes/__init__.py b/datalad_remake/annexremotes/__init__.py index e69de29..1e9a27e 100644 --- a/datalad_remake/annexremotes/__init__.py +++ b/datalad_remake/annexremotes/__init__.py @@ -0,0 +1 @@ +"""The DataLad remake special remote""" diff --git a/datalad_remake/annexremotes/remake_remote.py b/datalad_remake/annexremotes/remake_remote.py index 5b0417f..388c624 100644 --- a/datalad_remake/annexremotes/remake_remote.py +++ b/datalad_remake/annexremotes/remake_remote.py @@ -28,7 +28,9 @@ get_file_dataset, provide_context, ) +from datalad_remake.utils.getkeys import get_trusted_keys from datalad_remake.utils.glob import resolve_patterns +from datalad_remake.utils.verify import verify_file if TYPE_CHECKING: from collections.abc import Iterable @@ -41,6 +43,11 @@ class RemakeRemote(SpecialRemote): def __init__(self, annex: Master): super().__init__(annex) + self.configs = { + 'allow_untrusted_execution': 'Allow execution of untrusted code with untrusted parameters. ' + 'set to "true" to enable. 
THIS IS DANGEROUS and might lead to ' + 'remote code execution.', + } def __del__(self): self.close() @@ -85,7 +92,11 @@ def get_url_for_key(self, key: str) -> str: self.annex.debug(f'get_url_for_key: key: {key!r}, urls: {urls!r}') return urls[0] - def get_compute_info(self, key: str) -> tuple[dict[str, Any], Dataset]: + def get_compute_info( + self, + key: str, + trusted_key_ids: list[str] | None, + ) -> tuple[dict[str, Any], Dataset]: def get_assigned_value(assignment: str) -> str: return assignment.split('=', 1)[1] @@ -96,6 +107,8 @@ def get_assigned_value(assignment: str) -> str: dataset = self._find_dataset(root_version) spec_path = dataset.pathobj / specification_dir / spec_name + if trusted_key_ids is not None: + verify_file(dataset.pathobj, spec_path, trusted_key_ids) with open(spec_path, 'rb') as f: spec = json.load(f) @@ -108,7 +121,12 @@ def get_assigned_value(assignment: str) -> str: def transfer_retrieve(self, key: str, file_name: str) -> None: self.annex.debug(f'TRANSFER RETRIEVE key: {key!r}, file_name: {file_name!r}') - compute_info, dataset = self.get_compute_info(key) + if self.annex.getconfig('allow_untrusted_execution') == 'true': + trusted_key_ids = None + else: + trusted_key_ids = get_trusted_keys() + + compute_info, dataset = self.get_compute_info(key, trusted_key_ids) self.annex.debug(f'TRANSFER RETRIEVE compute_info: {compute_info!r}') # Perform the computation, and collect the results @@ -124,6 +142,7 @@ def transfer_retrieve(self, key: str, file_name: str) -> None: compute_info['method'], compute_info['parameter'], compute_info['output'], + trusted_key_ids, ) lgr.debug('Starting collection') self.annex.debug('Starting collection') diff --git a/datalad_remake/annexremotes/tests/test_hierarchies.py b/datalad_remake/annexremotes/tests/test_hierarchies.py index 3cba4b5..8259334 100644 --- a/datalad_remake/annexremotes/tests/test_hierarchies.py +++ b/datalad_remake/annexremotes/tests/test_hierarchies.py @@ -90,6 +90,7 @@ def 
test_end_to_end(tmp_path, monkeypatch, output_pattern): ], output=output_pattern, result_renderer='disabled', + allow_untrusted_execution=True, ) collected_output = [ diff --git a/datalad_remake/annexremotes/tests/test_remake_remote.py b/datalad_remake/annexremotes/tests/test_remake_remote.py index 9833754..f9e3aea 100644 --- a/datalad_remake/annexremotes/tests/test_remake_remote.py +++ b/datalad_remake/annexremotes/tests/test_remake_remote.py @@ -1,14 +1,20 @@ +import re import subprocess from io import TextIOBase +from pathlib import Path from queue import Queue from typing import cast +import pytest from annexremote import Master from datalad_next.tests import skip_if_on_windows from datalad_remake.commands.tests.create_datasets import create_ds_hierarchy -from ... import specification_dir +from ... import ( + specification_dir, + template_dir, +) from ...commands.make_cmd import build_json from ..remake_remote import RemakeRemote @@ -64,11 +70,30 @@ def send(self, value): @skip_if_on_windows -def test_compute_remote_main(tmp_path, monkeypatch): - dataset = create_ds_hierarchy(tmp_path, 'ds1', 0)[0][2] +@pytest.mark.parametrize('trusted', [True, False]) +def test_compute_remote_main(tmp_path, datalad_cfg, monkeypatch, trusted): + if trusted: + gpg_homedir = tmp_path / 'tmp_gpg_dir' + tmp_home = tmp_path / 'tmp_home' + + # make sure that the users keystore is not overwritten + monkeypatch.setenv('HOME', str(tmp_home)) + + # Generate a keypair + signing_key = create_keypair(gpg_dir=gpg_homedir) + + # Activate the new keys + monkeypatch.setenv('GNUPGHOME', str(gpg_homedir)) + + datalad_cfg.add('datalad.trusted-keys', signing_key, where='global') + + else: + signing_key = None + + dataset = create_ds_hierarchy(tmp_path, 'ds1', 0, signing_key)[0][2] monkeypatch.chdir(dataset.path) - template_path = dataset.pathobj / '.datalad' / 'make' / 'methods' + template_path = dataset.pathobj / template_dir template_path.mkdir(parents=True) (template_path / 
'echo').write_text(template) dataset.save() @@ -84,10 +109,13 @@ def test_compute_remote_main(tmp_path, monkeypatch): ) ).split(b': ')[1] - (dataset.pathobj / specification_dir).mkdir(parents=True, exist_ok=True) - (dataset.pathobj / specification_dir / '000001111122222').write_text( + specification_path = dataset.pathobj / specification_dir + spec_name = '000001111122222' + specification_path.mkdir(parents=True, exist_ok=True) + (specification_path / spec_name).write_text( build_json('echo', [], ['a.txt'], {'content': 'some_string'}) ) + dataset.save() input_ = MockedInput() @@ -96,12 +124,16 @@ def test_compute_remote_main(tmp_path, monkeypatch): # below. input_.send('PREPARE\n') input_.send(f'TRANSFER RETRIEVE {key.decode()} {tmp_path / "remade.txt"!s}\n') + # The next line is the answer to `GETCONFIG allow_untrusted_execution` + input_.send(f'VALUE {"false" if trusted else "true"}\n') url = ( 'datalad-make:///?' f'root_version={dataset.repo.get_hexsha()}' '&specification=000001111122222' '&this=a.txt' ) + # The next line is the answer to + # `GETURLS MD5E-s2--60b725f10c9c85c70d97880dfe8191b3.txt datalad-remake:` input_.send(f'VALUE {url}\n') input_.send('VALUE\n') input_.send('VALUE .git\n') @@ -117,3 +149,62 @@ def test_compute_remote_main(tmp_path, monkeypatch): # At this point the datalad-remake remote should have executed the # computation and written the result. 
assert (tmp_path / 'remade.txt').read_text().strip() == 'content: some_string' + + +def create_keypair(gpg_dir: Path, name: bytes = b'Test User'): + gpg_dir.mkdir(parents=True, exist_ok=True) + gpg_dir.chmod(0o700) + private_keys_dir = gpg_dir / 'private-keys-v1.d' + private_keys_dir.mkdir(exist_ok=True) + private_keys_dir.chmod(0o700) + template = b""" + Key-Type: RSA + Key-Length: 4096 + Subkey-Type: RSA + Subkey-Length: 4096 + Name-Real: $NAME + Name-Email: test@example.com + Expire-Date: 0 + %no-protection + #%transient-key + %commit + """ + script = template.replace(b'$NAME', name) + + # unset $HOME to prevent accidental changes to the user's keyring + environment = {'HOME': '/dev/null'} + + # use gpg to generate a keypair + subprocess.run( + [ # noqa: S607 + 'gpg', + '--batch', + '--homedir', + str(gpg_dir), + '--gen-key', + '--keyid-format', + 'long', + ], + input=script, + capture_output=True, + check=True, + env=environment, + ) + + result = subprocess.run( + [ # noqa: S607 + 'gpg', + '--homedir', + str(gpg_dir), + '--list-secret-keys', + '--keyid-format', + 'long', + ], + capture_output=True, + check=True, + env=environment, + ) + return re.findall( + r'(?m)sec.*rsa4096/([A-Z0-9]+).*\n.*\n.*' + name.decode(), + result.stdout.decode(), + )[0] diff --git a/datalad_remake/commands/__init__.py b/datalad_remake/commands/__init__.py index e69de29..528deef 100644 --- a/datalad_remake/commands/__init__.py +++ b/datalad_remake/commands/__init__.py @@ -0,0 +1 @@ +"""Commands provided by the datalad-remake extension.""" diff --git a/datalad_remake/commands/make_cmd.py b/datalad_remake/commands/make_cmd.py index c431e79..3d4a68e 100644 --- a/datalad_remake/commands/make_cmd.py +++ b/datalad_remake/commands/make_cmd.py @@ -41,7 +41,9 @@ url_scheme, ) from datalad_remake.utils.compute import compute +from datalad_remake.utils.getkeys import get_trusted_keys from datalad_remake.utils.glob import resolve_patterns +from datalad_remake.utils.verify import verify_file if 
TYPE_CHECKING: from collections.abc import ( @@ -82,11 +84,17 @@ class Make(ValidatedInterface): 'reading configuration items, this command does not interact with ' 'the dataset.', ), - 'url_only': Parameter( - args=('-u', '--url-only'), + 'prospective_execution': Parameter( + args=('--prospective-execution',), action='store_true', - doc="Don't perform the computation, register an URL-key " - 'instead. A `git annex get ` will trigger the computation', + doc="Don't perform the computation now, only register compute " + 'instructions, `datalad get ` or `git annex get ` ' + 'will trigger the computation. \n' + 'Note: if this option is provided, input- and output-patterns will ' + 'be stored verbatim. Input globbing will be performed ' + 'when the computation is triggered. But the name of the output ' + 'files that are created will be the verbatim output pattern ' + 'strings.', ), 'template': Parameter( args=('template',), @@ -108,8 +116,8 @@ class Make(ValidatedInterface): ), action='append', doc='An input file pattern (repeat for multiple inputs, ' - 'file pattern support python globbing, globbing is expanded ' - 'in the source dataset)', + 'file pattern support python globbing, globbing is performed in ' + 'the source dataset).', ), 'input_list': Parameter( args=( @@ -130,8 +138,8 @@ class Make(ValidatedInterface): ), action='append', doc='An output file pattern (repeat for multiple outputs)' - 'file pattern support python globbing, globbing is expanded ' - 'in the worktree)', + 'file pattern support python globbing, globbing is performed in ' + 'the worktree).', ), 'output_list': Parameter( args=( @@ -165,6 +173,19 @@ class Make(ValidatedInterface): 'before used. This is useful if a large number of parameters ' 'should be provided.', ), + 'allow_untrusted_execution': Parameter( + args=('--allow-untrusted-execution',), + action='store_true', + default=False, + doc='Skip commit signature verification before executing code. 
This ' + 'should only be used in a strictly controlled environment with ' + 'fully trusted datasets. Trusted dataset means: every commit ' + 'stems from a trusted entity. ' + 'DO NOT USE THIS OPTION, unless you are sure to understand the ' + 'consequences. One of which is that arbitrary parties can ' + 'execute arbitrary code under your account on your ' + 'infrastructure.', + ), } @staticmethod @@ -174,7 +195,7 @@ def __call__( dataset: DatasetParameter | None = None, *, template: str = '', - url_only: bool = False, + prospective_execution: bool = False, branch: str | None = None, input: list[str] | None = None, # noqa: A002 input_list: Path | None = None, @@ -182,6 +203,7 @@ def __call__( output_list: Path | None = None, parameter: list[str] | None = None, parameter_list: Path | None = None, + allow_untrusted_execution: bool = False, ) -> Generator: ds: Dataset = dataset.ds if dataset else Dataset('.') @@ -189,7 +211,7 @@ def __call__( output_pattern = (output or []) + read_list(output_list) parameter = (parameter or []) + read_list(parameter_list) - parameter_dict = {p.split('=', 1)[0]: p.split('=', 1)[1] for p in parameter} + parameter_dict = dict([p.split('=', 1) for p in parameter]) # We have to get the URL first, because saving the specification to # the dataset will change the version. 
@@ -197,19 +219,25 @@ def __call__( ds, branch, template, parameter_dict, input_pattern, output_pattern ) - if not url_only: + if not prospective_execution: with provide_context( ds, branch, input_pattern, ) as worktree: - execute(worktree, template, parameter_dict, output_pattern) + execute( + worktree, + template, + parameter_dict, + output_pattern, + None if allow_untrusted_execution else get_trusted_keys(), + ) resolved_output = collect(worktree, ds, output_pattern) else: resolved_output = set(output_pattern) for out in resolved_output: - url = add_url(ds, out, url_base, url_only=url_only) + url = add_url(ds, out, url_base, url_only=prospective_execution) yield get_status_dict( action='make', path=str(ds.pathobj / out), @@ -240,11 +268,11 @@ def get_url( input_pattern: list[str], output_pattern: list[str], ) -> tuple[str, str]: - # If something goes wrong after the make specification was saved, + # If something goes wrong after the compute specification was saved, # the dataset state should be reset to `branch` reset_branch = branch or dataset.repo.get_hexsha() - # Write the specification to a file in the dataset + # Write the compute specification to a file in the dataset digest = write_spec( dataset, template_name, input_pattern, output_pattern, parameters ) @@ -370,6 +398,7 @@ def execute( template_name: str, parameter: dict[str, str], output_pattern: list[str], + trusted_key_ids: list[str] | None, ) -> None: lgr.debug( 'execute: %s %s %s %s', @@ -392,6 +421,9 @@ def execute( # Run the computation in the worktree-directory template_path = Path(template_dir) / template_name + if trusted_key_ids is not None: + verify_file(worktree_ds.pathobj, template_path, trusted_key_ids) + worktree_ds.get(template_path, result_renderer='disabled') compute(worktree, worktree / template_path, parameter) diff --git a/datalad_remake/commands/tests/create_datasets.py b/datalad_remake/commands/tests/create_datasets.py index 9d4699a..5c375d1 100644 --- 
a/datalad_remake/commands/tests/create_datasets.py +++ b/datalad_remake/commands/tests/create_datasets.py @@ -19,7 +19,8 @@ def update_config_for_remake(dataset: Dataset): ) -def add_remake_remote(dataset: Dataset): +def add_remake_remote(dataset: Dataset, signing_key: str | None = None): + aue = 'false' if signing_key else 'true' call_git_success( [ '-C', @@ -30,19 +31,24 @@ def add_remake_remote(dataset: Dataset): 'type=external', 'externaltype=datalad-remake', 'encryption=none', + f'allow_untrusted_execution={aue}', ], capture_output=True, ) def create_ds_hierarchy( - tmp_path: Path, name: str, subdataset_levels: int = 2 + tmp_path: Path, + name: str, + subdataset_levels: int = 2, + signing_key: str | None = None, ) -> list[tuple[str, Path, Dataset]]: # Create root dataset root_dataset = Dataset(tmp_path / name) root_dataset.create(force=True, result_renderer='disabled') (root_dataset.pathobj / 'a.txt').write_text('a\n') (root_dataset.pathobj / 'b.txt').write_text('b\n') + _enable_signing(root_dataset, signing_key) root_dataset.save(result_renderer='disabled') datasets = [(name, tmp_path / name, root_dataset)] @@ -54,6 +60,7 @@ def create_ds_hierarchy( (subdataset.pathobj / f'a{level}.txt').write_text(f'a{level}\n') (subdataset.pathobj / f'b{level}.txt').write_text(f'b{level}\n') subdataset.save(result_renderer='disabled') + _enable_signing(subdataset, signing_key) datasets.append((f'{name}_subds{level}', subdataset_path, subdataset)) # Link the datasets @@ -70,15 +77,24 @@ def create_ds_hierarchy( update_config_for_remake(root_dataset) # Add datalad-remake remotes to the root dataset and all subdatasets - add_remake_remote(root_dataset) + add_remake_remote(root_dataset, signing_key) subdataset_path = Path() for index in range(subdataset_levels): subdataset_path /= f'{name}_subds{index}' - add_remake_remote(Dataset(root_dataset.pathobj / subdataset_path)) + add_remake_remote( + Dataset(root_dataset.pathobj / subdataset_path), + signing_key, + ) return datasets 
+def _enable_signing(dataset: Dataset, key: str | None): + if key is not None: + dataset.config.set('commit.gpgsign', 'true', scope='local') + dataset.config.set('user.signingkey', key, scope='local') + + def create_simple_computation_dataset( tmp_path: Path, dataset_name: str, diff --git a/datalad_remake/commands/tests/test_compute.py b/datalad_remake/commands/tests/test_make.py similarity index 95% rename from datalad_remake/commands/tests/test_compute.py rename to datalad_remake/commands/tests/test_make.py index 391b7e5..26644eb 100644 --- a/datalad_remake/commands/tests/test_compute.py +++ b/datalad_remake/commands/tests/test_make.py @@ -31,7 +31,7 @@ def test_speculative_computation(tmp_path, datalad_cfg): template='test_method', parameter=['name=Robert', 'file=spec.txt'], output=['spec.txt'], - url_only=True, + prospective_execution=True, result_renderer='disabled', ) @@ -52,6 +52,7 @@ def _run_simple_computation(root_dataset: Dataset): parameter=['name=Robert', 'file=a.txt'], output=['a.txt'], result_renderer='disabled', + allow_untrusted_execution=True, ) # check that the output is correct diff --git a/datalad_remake/utils/compute.py b/datalad_remake/utils/compute.py index 4e32391..40025f0 100644 --- a/datalad_remake/utils/compute.py +++ b/datalad_remake/utils/compute.py @@ -4,7 +4,10 @@ import logging import subprocess import tomllib -from typing import TYPE_CHECKING, Any +from typing import ( + TYPE_CHECKING, + Any, +) if TYPE_CHECKING: from pathlib import Path diff --git a/datalad_remake/utils/getkeys.py b/datalad_remake/utils/getkeys.py new file mode 100644 index 0000000..93b9590 --- /dev/null +++ b/datalad_remake/utils/getkeys.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +from datalad_core.config import ( + ConfigManager, + get_manager, +) + +from datalad_remake import trusted_keys_config_key + + +def get_trusted_keys(config_manager: ConfigManager | None = None) -> list[str]: + if config_manager is None: + config_manager = get_manager() + 
+ trusted_key_items = config_manager.get_from_protected_sources( + trusted_keys_config_key + ) + if trusted_key_items.value is None: + return [] + return [key.strip() for key in trusted_key_items.value.split(',')] diff --git a/datalad_remake/utils/tests/test_verification.py b/datalad_remake/utils/tests/test_verification.py new file mode 100644 index 0000000..80cc86a --- /dev/null +++ b/datalad_remake/utils/tests/test_verification.py @@ -0,0 +1,38 @@ +from pathlib import Path + +import pytest + +from datalad_remake.annexremotes.tests.test_remake_remote import create_keypair +from datalad_remake.commands.tests.create_datasets import create_ds_hierarchy +from datalad_remake.utils.verify import verify_file + + +def test_whitelist(tmp_path, monkeypatch): + gpg_dir = tmp_path / 'gpg' + tmp_home = tmp_path / 'tmp_home' + + # make sure that the users keystore is not overwritten + monkeypatch.setenv('HOME', str(tmp_home)) + + # Create two key-pairs, one is used for signing, the other is used to + # validate the whitelist functionality. + signing_key = create_keypair(gpg_dir=gpg_dir, name=b'Signing User') + other_key = create_keypair(gpg_dir=gpg_dir, name=b'Other User') + + # Activate the new keys to allow `create_ds_hierarchy` to sign the commits + monkeypatch.setenv('GNUPGHOME', str(gpg_dir)) + + # Generate a simple dataset + dataset = create_ds_hierarchy(tmp_path, 'ds1', 0, signing_key)[0][2] + + verify_file(dataset.pathobj, Path('a.txt'), [signing_key]) + verify_file(dataset.pathobj, Path('a.txt'), [signing_key, other_key]) + + # Expect verification to fail if only `other_key` is white-listed because + # the commits were signed with `signing_key`. + with pytest.raises(ValueError, match='Signature validation of a.txt failed'): + verify_file(dataset.pathobj, Path('a.txt'), [other_key]) + + # Expect verification to fail if no key is white-listed. 
+ with pytest.raises(ValueError, match='No trusted keys provided'): + verify_file(dataset.pathobj, Path('a.txt'), []) diff --git a/datalad_remake/utils/verify.py b/datalad_remake/utils/verify.py new file mode 100644 index 0000000..c6c57ab --- /dev/null +++ b/datalad_remake/utils/verify.py @@ -0,0 +1,72 @@ +import contextlib +import logging +import os +import subprocess +import tempfile +from pathlib import Path + +from datalad_next.runners import ( + call_git_oneline, + call_git_success, +) + +lgr = logging.getLogger('datalad.remake.utils.verify') + + +def verify_file(root_directory: Path, file: Path, trusted_key_ids: list[str]): + if not trusted_key_ids: + msg = 'No trusted keys provided' + raise ValueError(msg) + + # Get the latest commit of `file` + commit = call_git_oneline( + ['-C', str(root_directory), 'log', '-1', '--follow', '--pretty=%H', str(file)] + ) + + with tempfile.TemporaryDirectory() as temp_gpg_dir: + # Create a temporary PGP keyring that contains the trusted keys + _copy_keys_to(trusted_key_ids, temp_gpg_dir) + + # Let git do the verification of the commit with the trusted keys + with _gpg_dir(temp_gpg_dir): + result = call_git_success( + ['-C', str(root_directory), 'verify-commit', commit] + ) + + if not result: + msg = f'Signature validation of {file} failed' + raise ValueError(msg) + + +def _copy_keys_to(trusted_key_ids: list[str], keyring_dir: str) -> None: + for key_id in trusted_key_ids: + # Export the requested key into `result.stdout` + result = subprocess.run( + ['gpg', '-a', '--export', key_id], # noqa: S607 + stdout=subprocess.PIPE, + check=False, + ) + + if result.returncode != 0: + lgr.warning(f'Could not locate trusted key with id: {key_id}') + continue + + # Import key from `result.stdout` into a keyring in `keyring_dir` + subprocess.run( + ['gpg', '--homedir', str(keyring_dir), '--import'], # noqa: S607 + input=result.stdout, + check=True, + ) + + +@contextlib.contextmanager +def _gpg_dir(directory: str): + _original_value = 
os.environ.get('GNUPGHOME') + try: + os.environ['GNUPGHOME'] = directory + yield + finally: + if _original_value is None: + del os.environ['GNUPGHOME'] + else: + os.environ['GNUPGHOME'] = _original_value diff --git a/docs/conf.py b/docs/conf.py index bba215a..0f4cab3 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -6,6 +6,8 @@ # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information +import datalad_remake + project = 'datalad-remake' copyright = '2024, DataLad team' author = 'DataLad team' @@ -16,6 +18,9 @@ extensions = [ 'sphinx.ext.autosummary', + 'sphinx.ext.autodoc', + 'sphinx.ext.autodoc.typehints', + 'sphinx.ext.viewcode', ] templates_path = ['_templates'] diff --git a/docs/index.rst b/docs/index.rst index b5501bc..be283d1 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -11,8 +11,7 @@ Also see the :ref:`modindex`. .. autosummary:: :toctree: generated - ... - + commands Indices and tables ================== diff --git a/docs/source/cli_reference.rst b/docs/source/cli_reference.rst deleted file mode 100644 index 2fe6ba9..0000000 --- a/docs/source/cli_reference.rst +++ /dev/null @@ -1,7 +0,0 @@ -Command line reference -====================== - -.. toctree:: - :maxdepth: 1 - - generated/man/datalad-remake diff --git a/docs/source/python_reference.rst b/docs/source/python_reference.rst deleted file mode 100644 index 7655dfe..0000000 --- a/docs/source/python_reference.rst +++ /dev/null @@ -1,8 +0,0 @@ -High-level API commands -======================= - -.. currentmodule:: datalad.api -.. 
autosummary:: - :toctree: generated - - compute diff --git a/examples/fmriprep-docker/README.md b/examples/fmriprep-docker/README.md index e27fd6e..d962695 100644 --- a/examples/fmriprep-docker/README.md +++ b/examples/fmriprep-docker/README.md @@ -63,7 +63,7 @@ Configure the dataset in which you want to collect the results of the (re)comput Add a `datalad-remake` special remote: ```bash -> git annex initremote datalad-remake type=external externaltype=datalad-remake encryption=none +> git annex initremote datalad-remake type=external externaltype=datalad-remake encryption=none allow_untrusted_execution=true ``` ### Add template @@ -89,7 +89,7 @@ To test the example, run: ```bash > cd $HOME/my-project -> datalad make -I input.txt -O output.txt -P parameter.txt fmriprep-docker +> datalad make -I input.txt -O output.txt -P parameter.txt --allow-untrusted-execution fmriprep-docker ``` You can also do that in `debug` mode: diff --git a/examples/fmriprep-singularity/README.md b/examples/fmriprep-singularity/README.md index f57b76a..8dd1d6d 100644 --- a/examples/fmriprep-singularity/README.md +++ b/examples/fmriprep-singularity/README.md @@ -66,7 +66,7 @@ Configure the dataset in which you want to collect the results of the (re)comput Add a `datalad-remake` special remote: ```bash -> git annex initremote datalad-remake type=external externaltype=datalad-remake encryption=none +> git annex initremote datalad-remake type=external externaltype=datalad-remake encryption=none allow_untrusted_execution=true ``` ### Add template @@ -92,7 +92,7 @@ To test the example, run: ```bash > cd $HOME/my-project -> datalad make -I input.txt -O output.txt -P parameter.txt fmriprep-singularity +> datalad make -I input.txt -O output.txt -P parameter.txt --allow-untrusted-execution fmriprep-singularity ``` You can also do that in `debug` mode: diff --git a/pyproject.toml b/pyproject.toml index 4bbc41c..5a4a52e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,9 +40,18 @@ classifiers = [ 
"Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] + +# This section is required to allow the direct references to the `datalad_core` +# in the dependencies section below and in +# `tool.hatch.envs.hatch-test.extra-dependencies`. Remove the section +# `tool.hatch.metadata` when `datalad_core` is available on PyPI. +[tool.hatch.metadata] +allow-direct-references = true + dependencies = [ "annexremote", "datalad", + "datalad_core @ git+https://github.com/datalad/datalad-core", "datalad_next", "datasalad", ] @@ -69,6 +78,8 @@ version-file = "datalad_remake/_version.py" [tool.hatch.envs.hatch-test] default-args = ["datalad_remake"] extra-dependencies = [ + "datalad_next", + "datalad_core @ git+https://github.com/datalad/datalad-core", "hypothesis", "pytest", # if you come here, because coverage combination crashed for you diff --git a/requirements-devel.txt b/requirements-devel.txt index dfab594..eee10d7 100644 --- a/requirements-devel.txt +++ b/requirements-devel.txt @@ -2,6 +2,7 @@ annexremote coverage datalad +datalad-core @ git+https://github.com/datalad/datalad-core datalad-next datasalad hatch diff --git a/resources/type_stubs/datalad_core/config/__init__.pyi b/resources/type_stubs/datalad_core/config/__init__.pyi new file mode 100644 index 0000000..48fe433 --- /dev/null +++ b/resources/type_stubs/datalad_core/config/__init__.pyi @@ -0,0 +1,8 @@ +from .defaults import ImplementationDefaults as ImplementationDefaults, get_defaults as get_defaults +from .git import DataladBranchConfig as DataladBranchConfig, GitConfig as GitConfig, GlobalGitConfig as GlobalGitConfig, LocalGitConfig as LocalGitConfig, SystemGitConfig as SystemGitConfig, WorktreeGitConfig as WorktreeGitConfig +from .gitenv import GitEnvironment as GitEnvironment +from .item import ConfigItem as ConfigItem +from .manager import ConfigManager as ConfigManager, get_manager as get_manager +from datasalad.settings import UnsetValue 
as UnsetValue + +__all__ = ['ConfigItem', 'ConfigManager', 'GitConfig', 'SystemGitConfig', 'GlobalGitConfig', 'LocalGitConfig', 'DataladBranchConfig', 'WorktreeGitConfig', 'GitEnvironment', 'ImplementationDefaults', 'UnsetValue', 'get_defaults', 'get_manager'] diff --git a/resources/type_stubs/datalad_core/config/defaults.pyi b/resources/type_stubs/datalad_core/config/defaults.pyi new file mode 100644 index 0000000..ae434c1 --- /dev/null +++ b/resources/type_stubs/datalad_core/config/defaults.pyi @@ -0,0 +1,7 @@ +from datasalad.settings import Defaults + +class ImplementationDefaults(Defaults): ... + +def get_defaults() -> ImplementationDefaults: ... +def register_defaults_gitcfg(defaults: ImplementationDefaults) -> None: ... +def anything2bool(val): ... diff --git a/resources/type_stubs/datalad_core/config/git.pyi b/resources/type_stubs/datalad_core/config/git.pyi new file mode 100644 index 0000000..73a23b0 --- /dev/null +++ b/resources/type_stubs/datalad_core/config/git.pyi @@ -0,0 +1,31 @@ +import abc +from _typeshed import Incomplete +from collections.abc import Hashable +from datalad_core.config.item import ConfigItem as ConfigItem +from datalad_core.consts import DATALAD_BRANCH_CONFIG_RELPATH as DATALAD_BRANCH_CONFIG_RELPATH +from datalad_core.runners import CommandError as CommandError, call_git as call_git, call_git_oneline as call_git_oneline, iter_git_subproc as iter_git_subproc +from datasalad.settings import CachingSource, Setting as Setting +from os import PathLike + +lgr: Incomplete + +class GitConfig(CachingSource, metaclass=abc.ABCMeta): + def __init__(self) -> None: ... + def __contains__(self, key: Hashable) -> bool: ... + +class SystemGitConfig(GitConfig): ... +class GlobalGitConfig(GitConfig): ... + +class LocalGitConfig(GitConfig): + def __init__(self, path: PathLike) -> None: ... + +class WorktreeGitConfig(GitConfig): + def __init__(self, path: PathLike) -> None: ... 
+ +class DataladBranchConfig(LocalGitConfig): + def __init__(self, path: PathLike) -> None: ... + @property + def is_writable(self): ... + +cfg_k_regex: Incomplete +cfg_kv_regex: Incomplete diff --git a/resources/type_stubs/datalad_core/config/gitenv.pyi b/resources/type_stubs/datalad_core/config/gitenv.pyi new file mode 100644 index 0000000..82c6cbb --- /dev/null +++ b/resources/type_stubs/datalad_core/config/gitenv.pyi @@ -0,0 +1,8 @@ +from collections.abc import Generator, Hashable +from datalad_core.config.item import ConfigItem as ConfigItem +from datalad_core.config.utils import get_gitconfig_items_from_env as get_gitconfig_items_from_env, set_gitconfig_items_in_env as set_gitconfig_items_in_env +from datasalad.settings import Setting as Setting, WritableMultivalueSource + +class GitEnvironment(WritableMultivalueSource): + item_type = ConfigItem + def overrides(self, overrides: dict[Hashable, Setting | tuple[Setting, ...]]) -> Generator[None]: ... diff --git a/resources/type_stubs/datalad_core/config/item.pyi b/resources/type_stubs/datalad_core/config/item.pyi new file mode 100644 index 0000000..839a353 --- /dev/null +++ b/resources/type_stubs/datalad_core/config/item.pyi @@ -0,0 +1,3 @@ +from datasalad.settings import Setting + +class ConfigItem(Setting): ... 
diff --git a/resources/type_stubs/datalad_core/config/manager.pyi b/resources/type_stubs/datalad_core/config/manager.pyi new file mode 100644 index 0000000..3c7fb53 --- /dev/null +++ b/resources/type_stubs/datalad_core/config/manager.pyi @@ -0,0 +1,16 @@ +from collections.abc import Generator, Hashable +from datalad_core.config.defaults import ImplementationDefaults as ImplementationDefaults, get_defaults as get_defaults +from datalad_core.config.git import GlobalGitConfig as GlobalGitConfig, SystemGitConfig as SystemGitConfig +from datalad_core.config.gitenv import GitEnvironment as GitEnvironment +from datalad_core.config.item import ConfigItem as ConfigItem +from datasalad.settings import Setting as Setting, Settings, Source as Source +from typing import Any + +class ConfigManager(Settings): + def __init__(self, defaults: ImplementationDefaults, sources: dict[str, Source] | None = None) -> None: ... + def overrides(self, overrides: dict[Hashable, Setting | tuple[Setting, ...]]) -> Generator[ConfigManager]: ... + def get(self, key: Hashable, default: Any = None) -> Setting: ... + def get_from_protected_sources(self, key: Hashable, default: Any = None) -> Setting: ... + def declare_source_protected(self, key: str): ... + +def get_manager() -> ConfigManager: ... diff --git a/resources/type_stubs/datalad_core/config/tests/__init__.pyi b/resources/type_stubs/datalad_core/config/tests/__init__.pyi new file mode 100644 index 0000000..e69de29 diff --git a/resources/type_stubs/datalad_core/config/tests/test_defaults.pyi b/resources/type_stubs/datalad_core/config/tests/test_defaults.pyi new file mode 100644 index 0000000..b6b8d89 --- /dev/null +++ b/resources/type_stubs/datalad_core/config/tests/test_defaults.pyi @@ -0,0 +1,4 @@ +from ..defaults import anything2bool as anything2bool, get_defaults as get_defaults + +def test_implementationdefaults() -> None: ... +def test_anything2bool() -> None: ... 
diff --git a/resources/type_stubs/datalad_core/config/tests/test_git.pyi b/resources/type_stubs/datalad_core/config/tests/test_git.pyi new file mode 100644 index 0000000..79de075 --- /dev/null +++ b/resources/type_stubs/datalad_core/config/tests/test_git.pyi @@ -0,0 +1,12 @@ +from ..git import DataladBranchConfig as DataladBranchConfig, GlobalGitConfig as GlobalGitConfig, LocalGitConfig as LocalGitConfig +from ..item import ConfigItem as ConfigItem +from datalad_core.consts import DATALAD_BRANCH_CONFIG_RELPATH as DATALAD_BRANCH_CONFIG_RELPATH +from datalad_core.runners import call_git_oneline as call_git_oneline + +def test_global_git_config() -> None: ... +def test_global_git_config_pure(cfgman) -> None: ... +def test_local_git_config_norepo(tmp_path) -> None: ... +def test_local_git_config(gitrepo) -> None: ... +def test_datalad_branch_config(gitrepo) -> None: ... +def test_datalad_branch_shorthand(gitrepo) -> None: ... +def test_gitcfg_rec_to_keyvalue() -> None: ... diff --git a/resources/type_stubs/datalad_core/config/tests/test_gitenv.pyi b/resources/type_stubs/datalad_core/config/tests/test_gitenv.pyi new file mode 100644 index 0000000..4811cb6 --- /dev/null +++ b/resources/type_stubs/datalad_core/config/tests/test_gitenv.pyi @@ -0,0 +1,5 @@ +from ..gitenv import GitEnvironment as GitEnvironment +from datalad_core.runners import call_git_lines as call_git_lines, call_git_oneline as call_git_oneline + +def test_gitenv_singlevalue() -> None: ... +def test_gitenv_multivalue() -> None: ... 
diff --git a/resources/type_stubs/datalad_core/config/tests/test_manager.pyi b/resources/type_stubs/datalad_core/config/tests/test_manager.pyi new file mode 100644 index 0000000..568feb6 --- /dev/null +++ b/resources/type_stubs/datalad_core/config/tests/test_manager.pyi @@ -0,0 +1,9 @@ +from datalad_core.config import ConfigItem as ConfigItem, UnsetValue as UnsetValue, get_manager as get_manager +from datalad_core.repo import Repo as Repo, Worktree as Worktree +from datalad_core.tests.fixtures import magic_marker as magic_marker + +def test_manager_setup() -> None: ... +def test_manager_overrides() -> None: ... +def test_manager_fordataset(gitrepo) -> None: ... +def test_manager_forbaredataset(baregitrepo) -> None: ... +def test_manager_protected_query(gitrepo) -> None: ... diff --git a/resources/type_stubs/datalad_core/config/tests/test_utils.pyi b/resources/type_stubs/datalad_core/config/tests/test_utils.pyi new file mode 100644 index 0000000..c8daa99 --- /dev/null +++ b/resources/type_stubs/datalad_core/config/tests/test_utils.pyi @@ -0,0 +1,6 @@ +from .. import utils as utils +from ..utils import get_gitconfig_items_from_env as get_gitconfig_items_from_env, set_gitconfig_items_in_env as set_gitconfig_items_in_env + +def test_get_gitconfig_items_from_env(monkeypatch) -> None: ... +def test_set_gitconfig_items_in_env(monkeypatch) -> None: ... +def test_get_set_gitconfig_env_roundtrip(monkeypatch) -> None: ... diff --git a/resources/type_stubs/datalad_core/config/utils.pyi b/resources/type_stubs/datalad_core/config/utils.pyi new file mode 100644 index 0000000..da9f440 --- /dev/null +++ b/resources/type_stubs/datalad_core/config/utils.pyi @@ -0,0 +1,4 @@ +from collections.abc import Mapping + +def get_gitconfig_items_from_env() -> dict[str, str | tuple[str, ...]]: ... +def set_gitconfig_items_in_env(items: Mapping[str, str | tuple[str, ...]]): ... 
diff --git a/tools/appveyor/setup-gpg b/tools/appveyor/setup-gpg new file mode 100755 index 0000000..fa5da38 --- /dev/null +++ b/tools/appveyor/setup-gpg @@ -0,0 +1,12 @@ +#!/bin/bash +# +# Set up gpg to use a short socket directory name on Linux VMs +# +set -e -u + +if (which apt-get > /dev/null ); then + sudo mkdir -p /run/user/$(id -u) + sudo chmod 700 /run/user/$(id -u) + sudo chown $(id -u) /run/user/$(id -u) + gpgconf -v --create-socketdir +fi