Skip to content

Commit

Permalink
erepo: --rev BRANCH: shallow clone (#4246)
Browse files Browse the repository at this point in the history
* erepo: use shallow clones when rev is branch or tag name

* tests: test erepo shallow clone behavior

* erepo: fix detached head after shallow cloning a tag

* move unshallow logic into its own function

* erepo: default rev should be origin HEAD not local HEAD

* use functools.partial

* review fixes

* comment shallow clone behavior in erepo
* don't duplicate unnecessary fetch/pull call
  • Loading branch information
pmrowla authored Jul 22, 2020
1 parent b77ce02 commit a4b3c6d
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 19 deletions.
67 changes: 54 additions & 13 deletions dvc/external_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from dvc.path_info import PathInfo
from dvc.repo import Repo
from dvc.repo.tree import RepoTree
from dvc.scm.base import CloneError
from dvc.scm.git import Git
from dvc.tree.local import LocalTree
from dvc.utils.fs import remove
Expand All @@ -31,7 +32,10 @@ def external_repo(url, rev=None, for_write=False):
logger.debug("Creating external repo %s@%s", url, rev)
path = _cached_clone(url, rev, for_write=for_write)
if not rev:
rev = "HEAD"
# Local HEAD points to the tip of whatever branch we first cloned from
# (which may not be the default branch), use origin/HEAD here to get
# the tip of the default branch
rev = "refs/remotes/origin/HEAD"
try:
repo = ExternalRepo(path, url, rev, for_write=for_write)
except NotDvcRepoError:
Expand Down Expand Up @@ -59,7 +63,7 @@ def external_repo(url, rev=None, for_write=False):

def clean_repos():
# Outside code should not see cache while we are removing
paths = list(CLONES.values()) + list(CACHE_DIRS.values())
paths = [path for path, _ in CLONES.values()] + list(CACHE_DIRS.values())
CLONES.clear()
CACHE_DIRS.clear()

Expand Down Expand Up @@ -251,10 +255,10 @@ def _cached_clone(url, rev, for_write=False):
"""
# even if we have already cloned this repo, we may need to
# fetch/fast-forward to get specified rev
clone_path = _clone_default_branch(url, rev)
clone_path, shallow = _clone_default_branch(url, rev, for_write=for_write)

if not for_write and (url) in CLONES:
return CLONES[url]
return CLONES[url][0]

# Copy to a new dir to keep the clone clean
repo_path = tempfile.mkdtemp("dvc-erepo")
Expand All @@ -265,36 +269,73 @@ def _cached_clone(url, rev, for_write=False):
if for_write:
_git_checkout(repo_path, rev)
else:
CLONES[url] = repo_path
CLONES[url] = (repo_path, shallow)
return repo_path


@wrap_with(threading.Lock())
def _clone_default_branch(url, rev):
def _clone_default_branch(url, rev, for_write=False):
"""Get or create a clean clone of the url.
The cloned is reactualized with git pull unless rev is a known sha.
"""
clone_path = CLONES.get(url)
clone_path, shallow = CLONES.get(url, (None, False))

git = None
try:
if clone_path:
git = Git(clone_path)
# Do not pull for known shas, branches and tags might move
if not Git.is_sha(rev) or not git.has_rev(rev):
logger.debug("erepo: git pull %s", url)
git.pull()
if shallow:
# If we are missing a rev in a shallow clone, fallback to
# a full (unshallowed) clone. Since fetching specific rev
# SHAs is only available in certain git versions, if we
# have need to reference multiple specific revs for a
# given repo URL it is easier/safer for us to work with
# full clones in this case.
logger.debug("erepo: unshallowing clone for '%s'", url)
_unshallow(git)
shallow = False
CLONES[url] = (clone_path, shallow)
else:
logger.debug("erepo: git pull '%s'", url)
git.pull()
else:
logger.debug("erepo: git clone %s to a temporary dir", url)
logger.debug("erepo: git clone '%s' to a temporary dir", url)
clone_path = tempfile.mkdtemp("dvc-clone")
git = Git.clone(url, clone_path)
CLONES[url] = clone_path
if not for_write and rev and not Git.is_sha(rev):
# If rev is a tag or branch name try shallow clone first
try:
git = Git.clone(url, clone_path, shallow_branch=rev)
shallow = True
logger.debug(
"erepo: using shallow clone for branch '%s'", rev
)
except CloneError:
pass
if not git:
git = Git.clone(url, clone_path)
shallow = False
CLONES[url] = (clone_path, shallow)
finally:
if git:
git.close()

return clone_path
return clone_path, shallow


def _unshallow(git):
if git.repo.head.is_detached:
# If this is a detached head (i.e. we shallow cloned a tag) switch to
# the default branch
origin_refs = git.repo.remotes["origin"].refs
ref = origin_refs["HEAD"].reference
branch_name = ref.name.split("/")[-1]
branch = git.repo.create_head(branch_name, ref)
branch.set_tracking_branch(ref)
branch.checkout()
git.pull(unshallow=True)


def _git_checkout(repo_path, rev):
Expand Down
18 changes: 14 additions & 4 deletions dvc/scm/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging
import os
import shlex
from functools import partial

import yaml
from funcy import cached_property
Expand Down Expand Up @@ -92,7 +93,7 @@ def root_dir(self):
return self.repo.working_tree_dir

@staticmethod
def clone(url, to_path, rev=None):
def clone(url, to_path, rev=None, shallow_branch=None):
import git

ld_key = "LD_LIBRARY_PATH"
Expand All @@ -109,14 +110,23 @@ def clone(url, to_path, rev=None):
env[ld_key] = ""

try:
if shallow_branch is not None and os.path.exists(url):
# git disables --depth for local clones unless file:// url
# scheme is used
url = f"file://{url}"
with TqdmGit(desc="Cloning", unit="obj") as pbar:
tmp_repo = git.Repo.clone_from(
clone_from = partial(
git.Repo.clone_from,
url,
to_path,
env=env, # needed before we can fix it in __init__
no_single_branch=True,
progress=pbar.update_git,
)
if shallow_branch is None:
tmp_repo = clone_from()
else:
tmp_repo = clone_from(branch=shallow_branch, depth=1)
tmp_repo.close()
except git.exc.GitCommandError as exc: # pylint: disable=no-member
raise CloneError(url, to_path) from exc
Expand Down Expand Up @@ -250,8 +260,8 @@ def checkout(self, branch, create_new=False):
else:
self.repo.git.checkout(branch)

def pull(self):
infos = self.repo.remote().pull()
def pull(self, **kwargs):
infos = self.repo.remote().pull(**kwargs)
for info in infos:
if info.flags & info.ERROR:
raise SCMError(f"pull failed: {info.note}")
Expand Down
56 changes: 54 additions & 2 deletions tests/func/test_external_repo.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import os

from mock import patch
from mock import ANY, patch

from dvc.external_repo import external_repo
from dvc.external_repo import CLONES, external_repo
from dvc.path_info import PathInfo
from dvc.scm.git import Git
from dvc.tree.local import LocalTree
Expand Down Expand Up @@ -121,3 +121,55 @@ def test_relative_remote(erepo_dir, tmp_dir):
assert os.path.isdir(repo.config["remote"]["upstream"]["url"])
with repo.open_by_relpath("file") as fd:
assert fd.read() == "contents"


def test_shallow_clone_branch(erepo_dir):
with erepo_dir.chdir():
with erepo_dir.branch("branch", new=True):
erepo_dir.dvc_gen("file", "branch", commit="create file on branch")
erepo_dir.dvc_gen("file", "master", commit="create file on master")

url = os.fspath(erepo_dir)

with patch.object(Git, "clone", wraps=Git.clone) as mock_clone:
with external_repo(url, rev="branch") as repo:
with repo.open_by_relpath("file") as fd:
assert fd.read() == "branch"

mock_clone.assert_called_with(url, ANY, shallow_branch="branch")
_, shallow = CLONES[url]
assert shallow

with external_repo(url) as repo:
with repo.open_by_relpath("file") as fd:
assert fd.read() == "master"

assert mock_clone.call_count == 1
_, shallow = CLONES[url]
assert not shallow


def test_shallow_clone_tag(erepo_dir):
with erepo_dir.chdir():
erepo_dir.dvc_gen("file", "foo", commit="init")
erepo_dir.scm.tag("v1")
erepo_dir.dvc_gen("file", "bar", commit="update file")

url = os.fspath(erepo_dir)

with patch.object(Git, "clone", wraps=Git.clone) as mock_clone:
with external_repo(url, rev="v1") as repo:
with repo.open_by_relpath("file") as fd:
assert fd.read() == "foo"

mock_clone.assert_called_with(url, ANY, shallow_branch="v1")
_, shallow = CLONES[url]
assert shallow

with external_repo(url, rev="master") as repo:
with repo.open_by_relpath("file") as fd:
assert fd.read() == "bar"

assert mock_clone.call_count == 1
_, shallow = CLONES[url]
assert not shallow

0 comments on commit a4b3c6d

Please sign in to comment.