def get_defaults() -> ImplementationDefaults:
    """Return the process-unique ``ImplementationDefaults`` instance

    The instance is created on first use and stored at module level;
    every later call returns that very same object. Use it for setting
    and/or getting defaults for settings.
    """
    global __the_defaults  # noqa: PLW0603
    if __the_defaults is not None:
        # already created by an earlier call
        return __the_defaults
    __the_defaults = ImplementationDefaults()
    return __the_defaults
+ _nul = 'b:\\nul' if os_name == 'nt' else '/dev/null' + + @abstractmethod + def _get_git_config_cmd(self) -> list[str]: + """Return the git-config command suitable for a particular config""" + + @abstractmethod + def _get_git_config_cwd(self) -> Path | None: + """Return path the git-config command should run in""" + + def _reinit(self) -> None: + super()._reinit() + self._sources: set[str | Path] = set() + + def _load(self) -> None: + cwd = self._get_git_config_cwd() or Path.cwd() + dct: dict[str, str | tuple[str, ...]] = {} + fileset: set[str] = set() + + try: + with iter_git_subproc( + [*self._get_git_config_cmd(), '--show-origin', '--list', '-z'], + inputs=None, + cwd=cwd, + ) as gitcfg: + for line in itemize( + decode_bytes(gitcfg), + sep='\0', + keep_ends=False, + ): + _proc_dump_line(line, fileset, dct) + except CommandError: + # TODO: only pass for the case where no corresponding + # source is found. E.g., it fails with --system whenever + # there is no /etc/gitconfig + pass + + # take blobs with verbatim markup + origin_blobs = {f for f in fileset if f.startswith('blob:')} + # convert file specifications to Path objects with absolute paths + origin_paths = {Path(f[5:]) for f in fileset if f.startswith('file:')} + origin_paths = {f if f.is_absolute() else cwd / f for f in origin_paths} + # TODO: add "version" tracking. The legacy config manager used mtimes + # and we will too. 
class DataladBranchConfig(LocalGitConfig):
    """Configuration source for the branch-specific DataLad config file

    When ``path`` is inside a work tree, the file at
    ``DATALAD_BRANCH_CONFIG_RELPATH`` underneath ``path`` is accessed via
    ``git config --file``. Otherwise the file content is read from
    ``HEAD`` via ``git config --blob``.

    Parameters
    ----------
    path: PathLike
      Location of the repository (passed on to ``LocalGitConfig``).
    """

    # TODO: reimplement is_writable to say False for bare repos
    def __init__(self, path: PathLike):
        super().__init__(path)
        self._path = Path(path)

    def _get_git_config_cmd(self) -> list[str]:
        """Return the git-config command targeting the branch config"""
        return [
            '--git-dir',
            str(self._gitdir),
            'config',
            *(
                ('--file', str(self._path / DATALAD_BRANCH_CONFIG_RELPATH))
                if self._in_worktree
                else ('--blob', f'HEAD:{DATALAD_BRANCH_CONFIG_RELPATH}')
            ),
        ]

    def _ensure_target_dir(self):
        """Create the directory of the config file, if one is targeted

        Nothing is done when the command reads from a blob (no ``--file``).
        """
        cmd = self._get_git_config_cmd()
        if '--file' in cmd:
            custom_file = Path(cmd[cmd.index('--file') + 1])
            custom_file.parent.mkdir(exist_ok=True)

    def _set_item(self, key: Hashable, value: Setting) -> None:
        self._ensure_target_dir()
        # delegate to the parent *hook*, like `GitConfig._set_item` does.
        # Calling the public `__setitem__` here is expected to dispatch
        # right back to this method, i.e. recurse without end.
        super()._set_item(key, value)

    def _add(self, key: Hashable, value: Setting) -> None:
        self._ensure_target_dir()
        # same reasoning as in `_set_item`: use the hook, not public `add()`
        super()._add(key, value)
class GitEnvironment(WritableMultivalueSource):
    """Configuration source backed by the ``GIT_CONFIG_*`` process environment

    Every accessor parses the environment anew via
    ``get_gitconfig_items_from_env()``, and every modification rewrites the
    complete declaration via ``set_gitconfig_items_in_env()``.
    """

    # this implementation is intentionally stateless to ease providing a
    # context manager for temporary manipulations
    item_type = ConfigurationItem

    def _reinit(self):
        """Does nothing"""

    def _load(self) -> None:
        """Does nothing

        All accessors inspect the process environment directly.
        """

    def _get_item(self, key: Hashable) -> Setting:
        val = get_gitconfig_items_from_env()[str(key)]
        if isinstance(val, tuple):
            # multi-value key: the last declared value is reported
            return self.item_type(val[-1])
        return self.item_type(val)

    def _set_item(self, key: Hashable, value: Setting) -> None:
        env = get_gitconfig_items_from_env()
        env[str(key)] = str(value.value)
        set_gitconfig_items_in_env(env)

    def _del_item(self, key: Hashable) -> None:
        env = get_gitconfig_items_from_env()
        del env[str(key)]
        set_gitconfig_items_in_env(env)

    def _get_keys(self) -> Collection:
        return get_gitconfig_items_from_env().keys()

    def _getall(
        self,
        key: Hashable,
    ) -> tuple[Setting, ...]:
        val = get_gitconfig_items_from_env()[str(key)]
        vals = val if isinstance(val, tuple) else (val,)
        return tuple(self.item_type(v) for v in vals)

    def _setall(self, key: Hashable, values: tuple[Setting, ...]) -> None:
        key_str = str(key)
        prepped_values = tuple(str(v.value) for v in values)
        env = get_gitconfig_items_from_env()
        env[key_str] = prepped_values
        set_gitconfig_items_in_env(env)

    @contextmanager
    def overrides(
        self,
        overrides: dict[Hashable, Setting | tuple[Setting, ...]],
    ) -> Generator[None]:
        """Context manager to temporarily set overrides

        For each key in ``overrides`` the present value(s) are recorded,
        then the override is applied (a tuple sets multiple values). On
        exit, recorded values are put back; keys that had no value before
        are removed again.
        """
        restore: dict[Hashable, tuple[Setting, ...]] = {}

        for k, v in overrides.items():
            # an `UnsetValue` item marks keys that did not exist before
            restore[k] = self.getall(k, self.item_type(UnsetValue))
            if isinstance(v, tuple):
                self.setall(k, v)
            else:
                self[k] = v
        try:
            yield
        finally:
            # NOTE: coverage is falsely reported as absent for PY3.9
            # https://github.com/nedbat/coveragepy/issues/1292
            for k, vals in restore.items():
                if len(vals) == 1 and vals[0].pristine_value is UnsetValue:
                    del self[k]
                    # move on to the next key; leaving the loop here would
                    # keep all remaining overrides in the environment
                    continue
                self.setall(k, vals)
class ConfigManager(Settings):
    """Settings manager combining Git configuration scopes and defaults

    Sources are registered in this order: ``git-command``, ``git-global``,
    ``git-system``, ``defaults``.
    """

    def __init__(self, defaults: ImplementationDefaults):
        sources = {
            # call this one 'command', because that is what Git calls the scope
            # of items pulled from the process environment
            'git-command': GitEnvironment(),
            'git-global': GlobalGitConfig(),
            'git-system': SystemGitConfig(),
            'defaults': defaults,
        }
        super().__init__(sources)
        # TODO: set .item_type for all sources

    @contextmanager
    def overrides(
        self,
        overrides: dict[Hashable, Setting | tuple[Setting, ...]],
    ) -> Generator[ConfigManager]:
        """Context manager to temporarily set configuration overrides

        The overrides are handed to the ``overrides()`` context manager of
        the 'git-command' source, hence they affect the process environment
        and newly spawned subprocesses. The manager itself is yielded.
        """
        with self.sources['git-command'].overrides(overrides):
            yield self
def test_gitenv_multivalue():
    """Multiple values added for one key are all reported, in order"""
    env = GitEnvironment()
    target_key = 'absurd.key'
    target_values = ('absurd_value1', 'absurd_value2', 'absurd_value3')
    assert target_key not in env
    try:
        for tv in target_values:
            env.add(target_key, env.item_type(tv))
        assert target_key in env
        assert env[target_key].value == target_values[-1]
        assert env.getall(target_key) == tuple(
            env.item_type(tv) for tv in target_values
        )
        # git sees all values
        assert call_git_lines(['config', '--get-all', target_key]) == list(
            target_values
        )
        assert env.getall('notakey', 'mike') == (env.item_type('mike'),)
    finally:
        # take the key out of the process environment even when an
        # assertion above failed, so that other tests are not affected
        if target_key in env:
            del env[target_key]
def test_manager_overrides():
    """Overrides change the environment and are undone on context exit"""
    key_envvar = 'GIT_CONFIG_KEY_0'
    value_envvar = 'GIT_CONFIG_VALUE_0'
    probe_key = 'test_manager_overrides.test_key'
    cfg = get_manager()
    try:
        # we need at least one item posted to the environment for
        # this test. this is done next, and taken out again in
        # finally.
        cfg.sources['git-command'][probe_key] = cfg.item_type('irrelevant')
        # we don't care what is KEY_0 at this point, but there should be some
        # capture whatever it is
        key_before = environ[key_envvar]
        value_before = environ[value_envvar]
        assert key_before
        replacements = {
            key_before: cfg.item_type('alsoirrelevant'),
            'smoketest.for.tuple': (
                cfg.item_type('ping'),
                cfg.item_type('pong'),
            ),
        }
        with cfg.overrides(replacements):
            assert environ[value_envvar] != value_before
        # after context manager exit we find things restored
        assert environ[key_envvar] == key_before
        assert environ[value_envvar] == value_before
    finally:
        del cfg.sources['git-command'][probe_key]
'missing config key'), + ( + { + 'GIT_CONFIG_COUNT': 1, + 'GIT_CONFIG_KEY_0': '', + 'GIT_CONFIG_VALUE_0': 'value', + }, + 'empty config key', + ), + ( + { + 'GIT_CONFIG_COUNT': 1, + 'GIT_CONFIG_KEY_0': 'nosection', + 'GIT_CONFIG_VALUE_0': 'value', + }, + 'does not contain a section', + ), + ): + with monkeypatch.context() as m: + m.setattr(utils, 'environ', env) + with pytest.raises(ValueError, match=excstr): + get_gitconfig_items_from_env() + + # proper functioning + for env, target in ( + ( + { + 'GIT_CONFIG_COUNT': 1, + 'GIT_CONFIG_KEY_0': 'section.name', + 'GIT_CONFIG_VALUE_0': 'value', + }, + {'section.name': 'value'}, + ), + ( + { + 'GIT_CONFIG_COUNT': 2, + 'GIT_CONFIG_KEY_0': 'section.name1', + 'GIT_CONFIG_VALUE_0': 'value1', + 'GIT_CONFIG_KEY_1': 'section.name2', + 'GIT_CONFIG_VALUE_1': 'value2', + }, + {'section.name1': 'value1', 'section.name2': 'value2'}, + ), + # double-specification appends + # > GIT_CONFIG_COUNT=2 \ + # GIT_CONFIG_KEY_0=section.name \ + # GIT_CONFIG_VALUE_0=val1 \ + # GIT_CONFIG_KEY_1=section.name \ + # GIT_CONFIG_VALUE_1=val2 \ + # git config --list --show-origin | grep 'command line:' + # command line: section.name=val1 + # command line: section.name=val2 + ( + { + 'GIT_CONFIG_COUNT': 3, + 'GIT_CONFIG_KEY_0': 'section.name', + 'GIT_CONFIG_VALUE_0': 'value0', + 'GIT_CONFIG_KEY_1': 'section.name', + 'GIT_CONFIG_VALUE_1': 'value1', + 'GIT_CONFIG_KEY_2': 'section.name', + 'GIT_CONFIG_VALUE_2': 'value2', + }, + {'section.name': ('value0', 'value1', 'value2')}, + ), + ): + with monkeypatch.context() as m: + m.setattr(utils, 'environ', env) + assert get_gitconfig_items_from_env() == target + + +def test_set_gitconfig_items_in_env(monkeypatch): + for start, items, target in ( + # giving nothing preserves statusquo + ({}, {}, {}), + ({'DUMMY': 'value'}, {}, {'DUMMY': 'value'}), + # fixable specification is cleaned up + ({'GIT_CONFIG_COUNT': '526'}, {}, {}), + # but it has limits + ({'GIT_CONFIG_COUNT': 'nochance'}, {}, {'GIT_CONFIG_COUNT': 
def test_get_set_gitconfig_env_roundtrip(monkeypatch):
    """Items posted to the (patched) environment are read back unchanged"""
    expected = {
        'section.name': ('c', 'a', 'b'),
        'space section.na me.so me': 'v al',
    }
    fake_environ = {}
    with monkeypatch.context() as patcher:
        patcher.setattr(utils, 'environ', fake_environ)
        # feed in copy to ensure validity of the test
        set_gitconfig_items_in_env(expected.copy())
        roundtripped = get_gitconfig_items_from_env()
        assert roundtripped == expected
def get_gitconfig_items_from_env() -> dict[str, str | tuple[str, ...]]:
    """Parse git-config ENV (``GIT_CONFIG_COUNT|KEY|VALUE``) and return as dict

    This implementation does not call ``git-config``, but aims to mimic
    its behavior with respect to parsing the environment as much as
    possible.

    Raises
    ------
    ValueError
      Whenever ``git-config`` would also error out. The exception carries
      a message that resembles the one from ``git-config`` for that
      specific case.

    Returns
    -------
    dict
      Configuration key-value mappings. When a key is declared multiple
      times, the respective values are aggregated and reported as a tuple
      for that specific key, in order of declaration.
    """
    collected: dict[str, str | tuple[str, ...]] = {}
    for idx in range(_get_gitconfig_itemcount()):
        key = _get_gitconfig_var_from_env(idx, 'key')
        value = _get_gitconfig_var_from_env(idx, 'value')
        if key not in collected:
            # first declaration of this key, report as a plain value
            collected[key] = value
            continue
        known = collected[key]
        if isinstance(known, tuple):
            collected[key] = (*known, value)
        else:
            # second declaration, switch to tuple reporting
            collected[key] = (known, value)
    return collected
Any ENV variable of a *valid* existing declaration is removed, + before the set configuration items are posted in the ENV. + + Multi-value configuration keys are supported (values provided as a tuple). + + Any item with a value of ``None`` will be posted into the ENV with an + empty string as value, i.e. the corresponding ``GIT_CONFIG_VALUE_{count}`` + variable will be an empty string. ``None`` item values indicate that the + configuration key was unset on the command line, via the global option + ``-c``. + + No verification (e.g., of syntax compliance) is performed. + """ + _clean_env_from_gitconfig_items() + + count = 0 + for key, value in items.items(): + # homogeneous processing of multiple value items, and single values + values = value if isinstance(value, tuple) else (value,) + for v in values: + environ[f'GIT_CONFIG_KEY_{count}'] = key + # we support None even though not an allowed input type, because + # of https://github.com/datalad/datalad/issues/7589 + # this can be removed, when that issue is resolved. 
def _clean_env_from_gitconfig_items():
    """Remove a git-config declaration from the environment

    Drops ``GIT_CONFIG_KEY_{i}`` and ``GIT_CONFIG_VALUE_{i}`` for every
    index below the declared count, and finally ``GIT_CONFIG_COUNT``.
    """
    # we only care about intact specifications here, if there was cruft
    # to start with, we have no responsibilities
    try:
        declared = _get_gitconfig_itemcount()
    except ValueError:
        return

    stale = [
        f'GIT_CONFIG_{kind}_{idx}'
        for idx in range(declared)
        for kind in ('KEY', 'VALUE')
    ]
    stale.append('GIT_CONFIG_COUNT')
    for name in stale:
        # absent variables are fine, there is no exhaustive search
        environ.pop(name, None)
def _call_git(
    args: list[str],
    *,
    capture_output: bool = False,
    cwd: Path | None = None,
    check: bool = False,
    text: bool | None = None,
    inputs: str | bytes | None = None,
    force_c_locale: bool = False,
) -> subprocess.CompletedProcess:
    """Wrapper around ``subprocess.run`` for calling a Git command

    ``args`` is a list of arguments for the Git command. This list must not
    contain the Git executable itself. It is prepended (unconditionally)
    to the arguments before passing them on.

    If ``force_c_locale`` is ``True``, the Git process runs with a copy of
    the current environment in which ``LC_ALL`` is set to ``C``. This is
    useful when output has to be processed in a locale-invariant fashion.
    Otherwise no environment is passed explicitly.

    ``inputs`` is passed to ``subprocess.run()`` as ``input``. All other
    arguments are passed on verbatim.

    Raises
    ------
    CommandError
      In place of a ``subprocess.CalledProcessError`` (only raised with
      ``check=True``). It reports the full command, return code, captured
      outputs, and ``cwd``.
    """
    # make configurable
    git_executable = 'git'
    cmd = [git_executable, *args]
    env = dict(os.environ, LC_ALL='C') if force_c_locale else None

    run_kwargs = {
        'capture_output': capture_output,
        'cwd': cwd,
        'check': check,
        'text': text,
        'input': inputs,
        'env': env,
    }
    try:
        return subprocess.run(cmd, **run_kwargs)
    except subprocess.CalledProcessError as e:
        # TODO: we could support post-error forensics, but some client
        # might call this knowing that it could fail, and may not
        # appreciate the slow-down. Add option `expect_fail=False`?
        #
        # normalize exception to datalad-wide standard
        # TODO: CommandError.from_callprocesserror
        raise CommandError(
            cmd=cmd,
            returncode=e.returncode,
            stdout=e.stdout,
            stderr=e.stderr,
            cwd=cwd,
        ) from e
+ + If ``cwd`` is not None, the function changes the working directory to + ``cwd`` before executing the command. + + If ``capture_output`` is ``True``, process output is captured, but not + returned. By default process output is not captured. + """ + try: + _call_git( + args, + capture_output=capture_output, + cwd=cwd, + check=True, + ) + except CommandError: + # exc_info=True replaces CapturedException from legacy datalad + lgr.debug('call_git_success() failed with exception', exc_info=True) + return False + return True + + +def call_git_lines( + args: list[str], + *, + cwd: Path | None = None, + inputs: str | None = None, + force_c_locale: bool = False, +) -> list[str]: + """Call Git for any (small) number of lines of output + + ``args`` is a list of arguments for the Git command. This list must not + contain the Git executable itself. It will be prepended (unconditionally) + to the arguments before passing them on. + + If ``cwd`` is not None, the function changes the working directory to + ``cwd`` before executing the command. + + If ``inputs`` is not None, the argument becomes the subprocess's stdin. + This is intended for small-scale inputs. For call that require processing + large inputs, ``iter_git_subproc()`` is to be preferred. + + If ``force_c_locale`` is ``True`` the environment of the Git process + is altered to ensure output according to the C locale. This is useful + when output has to be processed in a locale invariant fashion. + + Raises + ------ + CommandError if the call exits with a non-zero status. 
+ """ + res = _call_git( + args, + capture_output=True, + cwd=cwd, + check=True, + text=True, + inputs=inputs, + force_c_locale=force_c_locale, + ) + return res.stdout.splitlines() + + +def call_git_oneline( + args: list[str], + *, + cwd: Path | None = None, + inputs: str | None = None, + force_c_locale: bool = False, +) -> str: + """Call Git for a single line of output + + If ``cwd`` is not None, the function changes the working directory to + ``cwd`` before executing the command. + + If ``inputs`` is not None, the argument becomes the subprocess's stdin. + This is intended for small-scale inputs. For calls that require processing + large inputs, ``iter_git_subproc()`` is to be preferred. + + If ``force_c_locale`` is ``True`` the environment of the Git process + is altered to ensure output according to the C locale. This is useful + when output has to be processed in a locale invariant fashion. + + Raises + ------ + CommandError if the call exits with a non-zero status. + AssertionError if there is not exactly one line of output. + """ + lines = call_git_lines(args, cwd=cwd, inputs=inputs, force_c_locale=force_c_locale) + if len(lines) != 1: + msg = f'Expected Git {args} to return a single line, but got {lines}' + raise AssertionError(msg) + return lines[0] + + +def iter_git_subproc(args: list[str], **kwargs): + """``iter_subproc()`` wrapper for calling Git commands + + All argument semantics are identical to those of ``iter_subproc()``, + except that ``args`` must not contain the Git binary, but need to be + exclusively arguments to it. The respective `git` command/binary is + automatically added internally.
+ """ + cmd = ['git'] + cmd.extend(args) + + return iter_subproc(cmd, **kwargs) diff --git a/datalad_core/runners/imports.py b/datalad_core/runners/imports.py new file mode 100644 index 0000000..b240404 --- /dev/null +++ b/datalad_core/runners/imports.py @@ -0,0 +1,9 @@ +from datasalad.runners import ( + CommandError, + iter_subproc, +) + +__all__ = [ + 'CommandError', + 'iter_subproc', +] diff --git a/datalad_core/tests/fixtures.py b/datalad_core/tests/fixtures.py new file mode 100644 index 0000000..d9270fa --- /dev/null +++ b/datalad_core/tests/fixtures.py @@ -0,0 +1,77 @@ +"""Collection of fixtures for facilitation test implementations""" + +from __future__ import annotations + +from tempfile import NamedTemporaryFile + +import pytest + +from datalad_core.config import get_manager + +magic_marker = 'c4d0de12-8008-11ef-86ea-3776083add61' +standard_gitconfig = f"""\ +[datalad "magic"] + test-marker = {magic_marker} +[user] + name = DataLad Tester + email = test@example.com +""" + + +@pytest.fixture(autouse=False, scope='function') # noqa: PT003 +def cfgman(monkeypatch): + """Yield a configuration manager with a test-specific global scope + + Any test using this fixture will be skipped for Git versions earlier + than 2.32, because the `GIT_CONFIG_GLOBAL` environment variable used + here was only introduced with that version. 
+ """ + manager = get_manager() + ggc = manager.sources['git-global'] + with NamedTemporaryFile( + 'w', + prefix='datalad_gitcfg_global_', + delete=False, + ) as tf: + tf.write(standard_gitconfig) + # we must close, because windows does not like the file being open + # already when ConfigManager would open it for reading + tf.close() + with monkeypatch.context() as m: + m.setenv('GIT_CONFIG_GLOBAL', tf.name) + ggc = manager.sources['git-global'] + ggc.reinit() + ggc.load() + if ggc['datalad.magic.test-marker'].pristine_value != magic_marker: + pytest.skip( + 'Cannot establish isolated global Git config scope ' + '(possibly Git too old (needs v2.32)' + ) + yield manager + # reload to put the previous config in effect again + ggc.reinit() + ggc.load() + + +@pytest.fixture(autouse=True, scope='function') # noqa: PT003 +def verify_pristine_gitconfig_global(): + """No test must modify a user's global Git config. + + If such modifications are needed, a custom configuration setup + limited to the scope of the test requiring it must be arranged. + """ + from datalad_core.config import GlobalGitConfig + + def get_ggc_state(): + ggc = GlobalGitConfig() + return {k: ggc[k].pristine_value for k in ggc} + + pre = get_ggc_state() + yield + if pre != get_ggc_state(): + msg = ( + 'Global Git config modification detected. ' + 'Test must be modified to use a temporary configuration target. ' + 'Hint: use the `cfgman` fixture.' + ) + raise AssertionError(msg) diff --git a/docs/index.rst b/docs/index.rst index d3ce74b..52f10ae 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,11 +7,12 @@ Package overview Also see the :ref:`modindex`. .. currentmodule:: datalad_core -.. - .. autosummary:: +.. autosummary:: :toctree: generated - ...
+ config + consts + runners Indices and tables diff --git a/pyproject.toml b/pyproject.toml index c3d73f1..9c6129e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,15 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] -dependencies = [] +dependencies = [ + # co-development with datasalad for now and here + #"datasalad >= 0.2.1", + "datasalad@git+https://github.com/datalad/datasalad@settings#egg=datasalad", + "typing_extensions", +] + +[tool.hatch.metadata] +allow-direct-references = true [project.urls] Homepage = "https://github.com/datalad/datalad-core"