Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Docker adaptor can retrieve an image from the local docker service #246

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions datalad_container/adapters/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import json
import os
import os.path as op
from pathlib import Path
import subprocess as sp
import sys
import tarfile
Expand Down Expand Up @@ -88,6 +89,14 @@ def _list_images():
return out.decode().splitlines()


def _get_repotag_from_image_sha256(sha):
    """Return one repo tag the local Docker daemon knows for an image.

    Queries ``docker image inspect`` for the ``RepoTags`` of the image
    identified by `sha` (an image ID/sha256).

    Parameters
    ----------
    sha : str
      Image identifier accepted by ``docker image inspect`` (e.g. the
      sha256 image ID).

    Returns
    -------
    str
      The first repo tag of the image, or an empty string when the
      image has no tags.

    Raises
    ------
    subprocess.CalledProcessError
      If ``docker`` exits non-zero (e.g. unknown image, or no docker
      daemon is reachable).
    """
    out = sp.check_output(
        ['docker', 'image', 'inspect', '--format',
         # emits all tags space-separated on a single line
         '{{range $v := .RepoTags}}{{$v}} {{end}}',
         sha])
    # an image can carry any number of tags. Return only the first one:
    # a space-joined list of several tags is not a valid image reference
    # for downstream consumers (e.g. `docker save`). Tags cannot contain
    # whitespace, so splitting on whitespace is safe.
    tags = out.decode().split()
    return tags[0] if tags else ''


def get_image(path, repo_tag=None, config=None):
"""Return the image ID of the image extracted at `path`.
"""
Expand Down Expand Up @@ -153,6 +162,87 @@ def load(path, repo_tag, config):
return image_id


def repopulate_from_daemon(contds, imgpath: Path) -> None:
    """Try to satisfy missing image file content from a local Docker daemon.

    Checks whether any annexed file underneath `imgpath` has no local
    content, and if so, exports the matching image from a local Docker
    service into a temporary directory inside the dataset, and lets
    git-annex ``reinject`` any exported file whose key matches a known
    annex key.

    Exceptions raised by the ``docker`` export or the git-annex calls
    propagate to the caller (only the repo-tag lookup is best-effort and
    swallows failures).

    Parameters
    ----------
    contds:
      Dataset containing the container image; must provide ``.repo``
      (an annex repo with ``call_annex``/``call_annex_oneline``) and
      ``.pathobj`` -- presumably a DataLad ``Dataset``.
    imgpath: Path
      Path to the directory with the extracted docker image inside
      `contds`.
    """
    # crude check whether anything at the image location is not
    # locally present
    contrepo = contds.repo
    if not contrepo.call_annex(
        ['find', '--not', '--in', 'here'],
        files=str(imgpath),
    ):
        # nothing is missing, we have nothing to do here
        return

    # a docker image is a collection of files in a directory
    assert imgpath.is_dir()
    # we could look into `manifest.json`, but it might also be
    # annexed and not around. instead look for the image config file,
    # which is named after the image's sha256
    imgcfg = [
        p.name for p in imgpath.iterdir()
        # a sha256 is 64 chars plus '.json' (5 chars) == 69
        if len(p.name) == 69 and p.name.endswith('.json')
    ]
    # there is only one config file per image
    assert len(imgcfg) == 1

    # look for the employed annex backend, we need it for key reinject below
    backends = set(contrepo.call_annex_oneline([
        'find',
        # query the committed state, so this works even when the
        # worktree content is missing
        f'--branch=HEAD:{imgpath.relative_to(contds.pathobj)}',
        # this needs git-annex 10.20230126 or later
        '--anything',
        # the trailing space is not a mistake! it separates the
        # backend labels for the split() below
        '--format=${backend} ',
    ]).split())
    # we can only deal with a single homogeneous backend here,
    # because `reinject` takes exactly one --backend value
    assert len(backends) == 1

    # image ID is the config filename, minus the '.json' extension
    img_id = imgcfg[0][:-5]

    # make an effort to get the repotags matching the image sha256
    # from docker. This is needed, because the query tag will end up
    # in manifest.json, and the original addition was likely via a tag
    # and not a sha256
    repo_tag = None
    try:
        repo_tag = _get_repotag_from_image_sha256(img_id)
    except Exception:
        # however, we will go on without a tag. In the worst case, this
        # would trigger a download of manifest.json (a tiny file), but
        # the large `layer.tar` will still be successfully extracted
        # and reinjected via a query by ID/sha256
        pass

    # let docker dump into a TMPDIR inside the dataset;
    # this place is likely to have sufficient space
    with tempfile.TemporaryDirectory(dir=imgpath) as tmpdir:
        # try to export the image from a local docker instance
        save(
            # prefer the tag, but fall back on the ID (see above)
            repo_tag or f'sha256:{img_id}',
            tmpdir,
        )
        # the line above will raise an exception when
        # - this docker does not have the image,
        # - or there is no docker running at all.
        # this is fine, we will just not proceed (the caller treats
        # this whole operation as best-effort).

        # now let git-annex reinject any file that matches a known
        # key (given the backend determined above). This will populate
        # as much as we can. This approach has built-in content
        # verification -- reinject checksums against the key. This
        # means that even if this docker instance has different
        # metadata, we will be able to harvest any image piece that
        # fits, and ignore anything else
        contrepo.call_annex(
            ['reinject', '--known', '--backend', backends.pop()],
            files=[
                str(p) for p in Path(tmpdir).glob('**/*')
                if p.is_file()
            ],
        )


# Command-line


Expand Down
29 changes: 29 additions & 0 deletions datalad_container/containers_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@

import logging
import os.path as op
from pathlib import Path
import sys

from datalad.interface.base import Interface
from datalad.interface.base import build_doc
from datalad.support.exceptions import CapturedException
from datalad.support.param import Parameter
from datalad.distribution.dataset import datasetmethod
from datalad.distribution.dataset import require_dataset
Expand Down Expand Up @@ -163,6 +165,33 @@ def __call__(cmd, container_name=None, dataset=None,

lgr.debug("extra_inputs = %r", extra_inputs)

if '-m datalad_container.adapters.docker run' in cmd:
# this will use the docker adapter to execute the container.
# below we let the adaptor have a first look at the image
# it will run. The adaptor might query a local docker service,
# and try to populate missing image parts -- possibly avoiding
# a download (via the `get()` that `run()` would perform), whenever
# the local service already has the respective images.
# this is a scenario that would occur frequently in short-lived
# clones that are repeatedly generated on the same machine.
from datalad_container.adapters.docker import repopulate_from_daemon
contds = require_dataset(
container['parentds'], check_installed=True,
purpose='check for docker images')
try:
repopulate_from_daemon(
contds,
# we use the container report here too, and not any of the
# processed variants from above to stay internally
# consistent
imgpath=Path(container['path']),
)
except Exception as e:
# get basic logging of a failure, but overall consider this
# a "best effort". if anything fails, we will silently fall
# back on a standard "get" via the `extra_inputs` below
CapturedException(e)

with patch.dict('os.environ',
{CONTAINER_NAME_ENVVAR: container['name']}):
# fire!
Expand Down