Merge branch 'main' into main

datalad · Nov 11, 2024 · a2214de · a2214de
2 parents a8e0eb9 + d492add
commit a2214de
Show file tree

Hide file tree

Showing 38 changed files with 490 additions and 57 deletions.
diff --git a/.appveyor.yml b/.appveyor.yml
@@ -70,6 +70,7 @@ environment:
       APPVEYOR_BUILD_WORKER_IMAGE: Ubuntu2204
       PY: 3.11
       INSTALL_GITANNEX: git-annex -m snapshot
+      KEYWORDS: not no_such_test
 
     # Windows core tests
     - job_name: test-win
@@ -80,6 +81,7 @@ environment:
       COVERAGE_ROOT: C:\DLTMP
       HATCH_DATA_DIR: C:\hatch-data-dir
       PIP_CACHE: C:\Users\appveyor\AppData\Local\pip\Cache
+      KEYWORDS: not test_whitelist
 
     # MacOS core tests
     - job_name: test-mac
@@ -89,7 +91,9 @@ environment:
       COVERAGE_ROOT: /Users/appveyor/DLTMP
       HATCH_DATA_DIR: /Users/appveyor/hatch-data-dir
       PIP_CACHE: /Users/appveyor/.cache/pip
-
+      KEYWORDS: >
+        not test_compute_remote_main[True]
+        and not test_whitelist
 
 # only run the CI if there are code or tooling changes
 only_commits:
@@ -140,6 +144,8 @@ for:
       # verify that a PY variable is declared that identifies the desired Python version
       # for this run
       - "[ \"x$PY\" != x ]"
+      # create a dedicated socket directory to prevent overly long socket names
+      - tools/appveyor/setup-gpg
       # Missing system software
       - tools/appveyor/install-syspkgs $INSTALL_SYSPKGS
       # activate Python env solely to get `python` to become available consistently
@@ -150,7 +156,7 @@ for:
       - "[ -f ${HOME}/dlinstaller_env.sh ] && . ${HOME}/dlinstaller_env.sh || true"
 
     test_script:
-      - 'hatch run tests.py${PY}:run-cov --doctest-modules --durations 10'
+      - 'hatch run tests.py${PY}:run-cov --doctest-modules --durations 10 -k "$KEYWORDS"'
 
     after_test:
       - 'hatch run tests.py${PY}:cov-combine'
@@ -208,7 +214,7 @@ for:
       - cmd: IF DEFINED INSTALL_GITANNEX datalad-installer --sudo ok %INSTALL_GITANNEX%
 
     test_script:
-      - cmd: 'hatch run tests.py%PY%:run-cov --doctest-modules --durations 10'
+      - cmd: 'hatch run tests.py%PY%:run-cov --doctest-modules -k "%KEYWORDS%" --durations 10'
 
     after_test:
       - cmd: 'hatch run tests.py%PY%:cov-combine'

diff --git a/.readthedocs.yaml b/.readthedocs.yaml
@@ -31,3 +31,5 @@ python:
    install:
    - method: pip
      path: .
+     extra_requirements:
+       - docs
diff --git a/README.md b/README.md
@@ -75,14 +75,15 @@ EOF
 
 Create a `datalad-remake` git-annex special remote:
 ```bash
-> git annex initremote datalad-remake encryption=none type=external externaltype=datalad-remake
+> git annex initremote datalad-remake encryption=none type=external externaltype=datalad-remake allow_untrusted_execution=true
 ```
 
 Execute a computation and save the result:
 ```bash
-> datalad make -p first=bob -p second=alice -p output=name \
--o name-1.txt -o name-2.txt one-to-many
+> datalad make -p first=bob -p second=alice -p output=name -o name-1.txt \
+-o name-2.txt --allow-untrusted-execution one-to-many
 ```
+
 The method `one-to-many` will create two files with the names `<output>-1.txt`
 and `<output>-2.txt`. Thus, the two files `name-1.txt` and `name-2.txt` need to
 be specified as outputs in the command above.
@@ -117,7 +118,8 @@ Afterwards, a prospective computation can be initiated by using the
 
 ```bash
 > datalad make -p first=john -p second=susan -p output=person \
--o person-1.txt -o person-2.txt -u one-to-many
+-o person-1.txt -o person-2.txt -u --allow-untrusted-execution one-to-many
+> cat person-1.txt    # this will fail, because the computation has not yet been performed
 ```
 
 The following command will fail, because no computation has been performed,

diff --git a/datalad_remake/__init__.py b/datalad_remake/__init__.py
@@ -9,6 +9,7 @@
     'command_suite',
     'specification_dir',
     'template_dir',
+    'trusted_keys_config_key',
 ]
 
 
@@ -47,3 +48,4 @@
 url_scheme = 'datalad-remake'
 template_dir = '.datalad/make/methods'
 specification_dir = '.datalad/make/specifications'
+trusted_keys_config_key = 'datalad.trusted-keys'
diff --git a/datalad_remake/annexremotes/__init__.py b/datalad_remake/annexremotes/__init__.py
@@ -0,0 +1 @@
+"""The DataLad remake special remote"""
diff --git a/datalad_remake/annexremotes/remake_remote.py b/datalad_remake/annexremotes/remake_remote.py
@@ -28,7 +28,9 @@
     get_file_dataset,
     provide_context,
 )
+from datalad_remake.utils.getkeys import get_trusted_keys
 from datalad_remake.utils.glob import resolve_patterns
+from datalad_remake.utils.verify import verify_file
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
@@ -41,6 +43,11 @@
 class RemakeRemote(SpecialRemote):
     def __init__(self, annex: Master):
         super().__init__(annex)
+        self.configs = {
+            'allow_untrusted_execution': 'Allow execution of untrusted code with untrusted parameters. '
+            'set to "true" to enable. THIS IS DANGEROUS and might lead to '
+            'remote code execution.',
+        }
 
     def __del__(self):
         self.close()
@@ -85,7 +92,11 @@ def get_url_for_key(self, key: str) -> str:
         self.annex.debug(f'get_url_for_key: key: {key!r}, urls: {urls!r}')
         return urls[0]
 
-    def get_compute_info(self, key: str) -> tuple[dict[str, Any], Dataset]:
+    def get_compute_info(
+        self,
+        key: str,
+        trusted_key_ids: list[str] | None,
+    ) -> tuple[dict[str, Any], Dataset]:
         def get_assigned_value(assignment: str) -> str:
             return assignment.split('=', 1)[1]
 
@@ -96,6 +107,8 @@ def get_assigned_value(assignment: str) -> str:
 
         dataset = self._find_dataset(root_version)
         spec_path = dataset.pathobj / specification_dir / spec_name
+        if trusted_key_ids is not None:
+            verify_file(dataset.pathobj, spec_path, trusted_key_ids)
         with open(spec_path, 'rb') as f:
             spec = json.load(f)
 
@@ -108,7 +121,12 @@ def get_assigned_value(assignment: str) -> str:
     def transfer_retrieve(self, key: str, file_name: str) -> None:
         self.annex.debug(f'TRANSFER RETRIEVE key: {key!r}, file_name: {file_name!r}')
 
-        compute_info, dataset = self.get_compute_info(key)
+        if self.annex.getconfig('allow_untrusted_execution') == 'true':
+            trusted_key_ids = None
+        else:
+            trusted_key_ids = get_trusted_keys()
+
+        compute_info, dataset = self.get_compute_info(key, trusted_key_ids)
         self.annex.debug(f'TRANSFER RETRIEVE compute_info: {compute_info!r}')
 
         # Perform the computation, and collect the results
@@ -124,6 +142,7 @@ def transfer_retrieve(self, key: str, file_name: str) -> None:
                 compute_info['method'],
                 compute_info['parameter'],
                 compute_info['output'],
+                trusted_key_ids,
             )
             lgr.debug('Starting collection')
             self.annex.debug('Starting collection')

diff --git a/datalad_remake/annexremotes/tests/test_hierarchies.py b/datalad_remake/annexremotes/tests/test_hierarchies.py
@@ -90,6 +90,7 @@ def test_end_to_end(tmp_path, monkeypatch, output_pattern):
         ],
         output=output_pattern,
         result_renderer='disabled',
+        allow_untrusted_execution=True,
     )
 
     collected_output = [

diff --git a/datalad_remake/annexremotes/tests/test_remake_remote.py b/datalad_remake/annexremotes/tests/test_remake_remote.py
@@ -1,14 +1,20 @@
+import re
 import subprocess
 from io import TextIOBase
+from pathlib import Path
 from queue import Queue
 from typing import cast
 
+import pytest
 from annexremote import Master
 from datalad_next.tests import skip_if_on_windows
 
 from datalad_remake.commands.tests.create_datasets import create_ds_hierarchy
 
-from ... import specification_dir
+from ... import (
+    specification_dir,
+    template_dir,
+)
 from ...commands.make_cmd import build_json
 from ..remake_remote import RemakeRemote
 
@@ -64,11 +70,30 @@ def send(self, value):
 
 
 @skip_if_on_windows
-def test_compute_remote_main(tmp_path, monkeypatch):
-    dataset = create_ds_hierarchy(tmp_path, 'ds1', 0)[0][2]
+@pytest.mark.parametrize('trusted', [True, False])
+def test_compute_remote_main(tmp_path, datalad_cfg, monkeypatch, trusted):
+    if trusted:
+        gpg_homedir = tmp_path / 'tmp_gpg_dir'
+        tmp_home = tmp_path / 'tmp_home'
+
+        # make sure that the user's keystore is not overwritten
+        monkeypatch.setenv('HOME', str(tmp_home))
+
+        # Generate a keypair
+        signing_key = create_keypair(gpg_dir=gpg_homedir)
+
+        # Activate the new keys
+        monkeypatch.setenv('GNUPGHOME', str(gpg_homedir))
+
+        datalad_cfg.add('datalad.trusted-keys', signing_key, where='global')
+
+    else:
+        signing_key = None
+
+    dataset = create_ds_hierarchy(tmp_path, 'ds1', 0, signing_key)[0][2]
     monkeypatch.chdir(dataset.path)
 
-    template_path = dataset.pathobj / '.datalad' / 'make' / 'methods'
+    template_path = dataset.pathobj / template_dir
     template_path.mkdir(parents=True)
     (template_path / 'echo').write_text(template)
     dataset.save()
@@ -84,10 +109,13 @@ def test_compute_remote_main(tmp_path, monkeypatch):
         )
     ).split(b': ')[1]
 
-    (dataset.pathobj / specification_dir).mkdir(parents=True, exist_ok=True)
-    (dataset.pathobj / specification_dir / '000001111122222').write_text(
+    specification_path = dataset.pathobj / specification_dir
+    spec_name = '000001111122222'
+    specification_path.mkdir(parents=True, exist_ok=True)
+    (specification_path / spec_name).write_text(
         build_json('echo', [], ['a.txt'], {'content': 'some_string'})
     )
+    dataset.save()
 
     input_ = MockedInput()
 
@@ -96,12 +124,16 @@ def test_compute_remote_main(tmp_path, monkeypatch):
     # below.
     input_.send('PREPARE\n')
     input_.send(f'TRANSFER RETRIEVE {key.decode()} {tmp_path / "remade.txt"!s}\n')
+    # The next line is the answer to `GETCONFIG allow_untrusted_execution`
+    input_.send(f'VALUE {"false" if trusted else "true"}\n')
     url = (
         'datalad-make:///?'
         f'root_version={dataset.repo.get_hexsha()}'
         '&specification=000001111122222'
         '&this=a.txt'
     )
+    # The next line is the answer to
+    # `GETURLS MD5E-s2--60b725f10c9c85c70d97880dfe8191b3.txt datalad-remake:`
     input_.send(f'VALUE {url}\n')
     input_.send('VALUE\n')
     input_.send('VALUE .git\n')
@@ -117,3 +149,62 @@ def test_compute_remote_main(tmp_path, monkeypatch):
     # At this point the datalad-remake remote should have executed the
     # computation and written the result.
     assert (tmp_path / 'remade.txt').read_text().strip() == 'content: some_string'
+
+
+def create_keypair(gpg_dir: Path, name: bytes = b'Test User'):
+    gpg_dir.mkdir(parents=True, exist_ok=True)
+    gpg_dir.chmod(0o700)
+    private_keys_dir = gpg_dir / 'private-keys-v1.d'
+    private_keys_dir.mkdir(exist_ok=True)
+    private_keys_dir.chmod(0o700)
+    template = b"""
+        Key-Type: RSA
+        Key-Length: 4096
+        Subkey-Type: RSA
+        Subkey-Length: 4096
+        Name-Real: $NAME
+        Name-Email: test@example.com
+        Expire-Date: 0
+        %no-protection
+        #%transient-key
+        %commit
+    """
+    script = template.replace(b'$NAME', name)
+
+    # point $HOME at /dev/null to prevent accidental changes to the user's keyring
+    environment = {'HOME': '/dev/null'}
+
+    # use gpg to generate a keypair
+    subprocess.run(
+        [  # noqa: S607
+            'gpg',
+            '--batch',
+            '--homedir',
+            str(gpg_dir),
+            '--gen-key',
+            '--keyid-format',
+            'long',
+        ],
+        input=script,
+        capture_output=True,
+        check=True,
+        env=environment,
+    )
+
+    result = subprocess.run(
+        [  # noqa: S607
+            'gpg',
+            '--homedir',
+            str(gpg_dir),
+            '--list-secret-keys',
+            '--keyid-format',
+            'long',
+        ],
+        capture_output=True,
+        check=True,
+        env=environment,
+    )
+    return re.findall(
+        r'(?m)sec.*rsa4096/([A-Z0-9]+).*\n.*\n.*' + name.decode(),
+        result.stdout.decode(),
+    )[0]
diff --git a/datalad_remake/commands/__init__.py b/datalad_remake/commands/__init__.py
@@ -0,0 +1 @@
+"""Commands provided by the datalad-remake extension."""
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"""Commands provided by the datalad-remake extension."""