From a4b3c6d872c06ef1259bb15bd0d69eaa4ccae7ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20Rowlands=20=28=EB=B3=80=EA=B8=B0=ED=98=B8=29?= Date: Wed, 22 Jul 2020 09:59:08 +0900 Subject: [PATCH] erepo: --rev BRANCH: shallow clone (#4246) * erepo: use shallow clones when rev is branch or tag name * tests: test erepo shallow clone behavior * erepo: fix detached head after shallow cloning a tag * move unshallow logic into its own function * erepo: default rev should be origin HEAD not local HEAD * use functools.partial * review fixes * comment shallow clone behavior in erepo * don't duplicate unnecessary fetch/pull call --- dvc/external_repo.py | 67 +++++++++++++++++++++++++------- dvc/scm/git.py | 18 +++++++-- tests/func/test_external_repo.py | 56 +++++++++++++++++++++++++- 3 files changed, 122 insertions(+), 19 deletions(-) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 42df57c691..15332e7de1 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -19,6 +19,7 @@ from dvc.path_info import PathInfo from dvc.repo import Repo from dvc.repo.tree import RepoTree +from dvc.scm.base import CloneError from dvc.scm.git import Git from dvc.tree.local import LocalTree from dvc.utils.fs import remove @@ -31,7 +32,10 @@ def external_repo(url, rev=None, for_write=False): logger.debug("Creating external repo %s@%s", url, rev) path = _cached_clone(url, rev, for_write=for_write) if not rev: - rev = "HEAD" + # Local HEAD points to the tip of whatever branch we first cloned from + # (which may not be the default branch), use origin/HEAD here to get + # the tip of the default branch + rev = "refs/remotes/origin/HEAD" try: repo = ExternalRepo(path, url, rev, for_write=for_write) except NotDvcRepoError: @@ -59,7 +63,7 @@ def external_repo(url, rev=None, for_write=False): def clean_repos(): # Outside code should not see cache while we are removing - paths = list(CLONES.values()) + list(CACHE_DIRS.values()) + paths = [path for path, _ in CLONES.values()] + list(CACHE_DIRS.values()) CLONES.clear() CACHE_DIRS.clear() @@ -251,10 +255,10 @@ def _cached_clone(url, rev, for_write=False): """ # even if we have already cloned this repo, we may need to # fetch/fast-forward to get specified rev - clone_path = _clone_default_branch(url, rev) + clone_path, shallow = _clone_default_branch(url, rev, for_write=for_write) if not for_write and (url) in CLONES: - return CLONES[url] + return CLONES[url][0] # Copy to a new dir to keep the clone clean repo_path = tempfile.mkdtemp("dvc-erepo") @@ -265,17 +269,17 @@ def _cached_clone(url, rev, for_write=False): if for_write: _git_checkout(repo_path, rev) else: - CLONES[url] = repo_path + CLONES[url] = (repo_path, shallow) return repo_path @wrap_with(threading.Lock()) -def _clone_default_branch(url, rev): +def _clone_default_branch(url, rev, for_write=False): """Get or create a clean clone of the url. The cloned is reactualized with git pull unless rev is a known sha. """ - clone_path = CLONES.get(url) + clone_path, shallow = CLONES.get(url, (None, False)) git = None try: @@ -283,18 +287,55 @@ def _clone_default_branch(url, rev): git = Git(clone_path) # Do not pull for known shas, branches and tags might move if not Git.is_sha(rev) or not git.has_rev(rev): - logger.debug("erepo: git pull %s", url) - git.pull() + if shallow: + # If we are missing a rev in a shallow clone, fallback to + # a full (unshallowed) clone. Since fetching specific rev + # SHAs is only available in certain git versions, if we + # have need to reference multiple specific revs for a + # given repo URL it is easier/safer for us to work with + # full clones in this case. + logger.debug("erepo: unshallowing clone for '%s'", url) + _unshallow(git) + shallow = False + CLONES[url] = (clone_path, shallow) + else: + logger.debug("erepo: git pull '%s'", url) + git.pull() else: - logger.debug("erepo: git clone %s to a temporary dir", url) + logger.debug("erepo: git clone '%s' to a temporary dir", url) clone_path = tempfile.mkdtemp("dvc-clone") - git = Git.clone(url, clone_path) - CLONES[url] = clone_path + if not for_write and rev and not Git.is_sha(rev): + # If rev is a tag or branch name try shallow clone first + try: + git = Git.clone(url, clone_path, shallow_branch=rev) + shallow = True + logger.debug( + "erepo: using shallow clone for branch '%s'", rev + ) + except CloneError: + pass + if not git: + git = Git.clone(url, clone_path) + shallow = False + CLONES[url] = (clone_path, shallow) finally: if git: git.close() - return clone_path + return clone_path, shallow + + +def _unshallow(git): + if git.repo.head.is_detached: + # If this is a detached head (i.e. we shallow cloned a tag) switch to + # the default branch + origin_refs = git.repo.remotes["origin"].refs + ref = origin_refs["HEAD"].reference + branch_name = ref.name.split("/")[-1] + branch = git.repo.create_head(branch_name, ref) + branch.set_tracking_branch(ref) + branch.checkout() + git.pull(unshallow=True) def _git_checkout(repo_path, rev): diff --git a/dvc/scm/git.py b/dvc/scm/git.py index 30b5a4b495..12b0b198cf 100644 --- a/dvc/scm/git.py +++ b/dvc/scm/git.py @@ -3,6 +3,7 @@ import logging import os import shlex +from functools import partial import yaml from funcy import cached_property @@ -92,7 +93,7 @@ def root_dir(self): return self.repo.working_tree_dir @staticmethod - def clone(url, to_path, rev=None): + def clone(url, to_path, rev=None, shallow_branch=None): import git ld_key = "LD_LIBRARY_PATH" @@ -109,14 +110,23 @@ def clone(url, to_path, rev=None): env[ld_key] = "" try: + if shallow_branch is not None and os.path.exists(url): + # git disables --depth for local clones unless file:// url + # scheme is used + url = f"file://{url}" with TqdmGit(desc="Cloning", unit="obj") as pbar: - tmp_repo = git.Repo.clone_from( + clone_from = partial( + git.Repo.clone_from, url, to_path, env=env, # needed before we can fix it in __init__ no_single_branch=True, progress=pbar.update_git, ) + if shallow_branch is None: + tmp_repo = clone_from() + else: + tmp_repo = clone_from(branch=shallow_branch, depth=1) tmp_repo.close() except git.exc.GitCommandError as exc: # pylint: disable=no-member raise CloneError(url, to_path) from exc @@ -250,8 +260,8 @@ def checkout(self, branch, create_new=False): else: self.repo.git.checkout(branch) - def pull(self): - infos = self.repo.remote().pull() + def pull(self, **kwargs): + infos = self.repo.remote().pull(**kwargs) for info in infos: if info.flags & info.ERROR: raise SCMError(f"pull failed: {info.note}") diff --git a/tests/func/test_external_repo.py b/tests/func/test_external_repo.py index d8c96fbcd8..50b142dd48 100644 --- a/tests/func/test_external_repo.py +++ b/tests/func/test_external_repo.py @@ -1,8 +1,8 @@ import os -from mock import patch +from mock import ANY, patch -from dvc.external_repo import external_repo +from dvc.external_repo import CLONES, external_repo from dvc.path_info import PathInfo from dvc.scm.git import Git from dvc.tree.local import LocalTree @@ -121,3 +121,55 @@ def test_relative_remote(erepo_dir, tmp_dir): assert os.path.isdir(repo.config["remote"]["upstream"]["url"]) with repo.open_by_relpath("file") as fd: assert fd.read() == "contents" + + +def test_shallow_clone_branch(erepo_dir): + with erepo_dir.chdir(): + with erepo_dir.branch("branch", new=True): + erepo_dir.dvc_gen("file", "branch", commit="create file on branch") + erepo_dir.dvc_gen("file", "master", commit="create file on master") + + url = os.fspath(erepo_dir) + + with patch.object(Git, "clone", wraps=Git.clone) as mock_clone: + with external_repo(url, rev="branch") as repo: + with repo.open_by_relpath("file") as fd: + assert fd.read() == "branch" + + mock_clone.assert_called_with(url, ANY, shallow_branch="branch") + _, shallow = CLONES[url] + assert shallow + + with external_repo(url) as repo: + with repo.open_by_relpath("file") as fd: + assert fd.read() == "master" + + assert mock_clone.call_count == 1 + _, shallow = CLONES[url] + assert not shallow + + +def test_shallow_clone_tag(erepo_dir): + with erepo_dir.chdir(): + erepo_dir.dvc_gen("file", "foo", commit="init") + erepo_dir.scm.tag("v1") + erepo_dir.dvc_gen("file", "bar", commit="update file") + + url = os.fspath(erepo_dir) + + with patch.object(Git, "clone", wraps=Git.clone) as mock_clone: + with external_repo(url, rev="v1") as repo: + with repo.open_by_relpath("file") as fd: + assert fd.read() == "foo" + + mock_clone.assert_called_with(url, ANY, shallow_branch="v1") + _, shallow = CLONES[url] + assert shallow + + with external_repo(url, rev="master") as repo: + with repo.open_by_relpath("file") as fd: + assert fd.read() == "bar" + + assert mock_clone.call_count == 1 + _, shallow = CLONES[url] + assert not shallow