From e46d7652fe0a041aaa4f6d1df9e386d098b055e0 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Fri, 18 Oct 2024 12:06:20 -0700 Subject: [PATCH 1/6] t9390: add a new test demonstrating problems with pruning of merges If a merge contains changes other than what a straightforward merging would entail (i.e. an "evil merge", though these could just be semantic conflict resolutions), our logic to prune degenerate merges may overlook those important other changes. Add a testcase demonstrating this. Signed-off-by: Elijah Newren --- t/t9390-filter-repo.sh | 16 +++++++ t/t9390/degenerate-evil-merge | 81 +++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 t/t9390/degenerate-evil-merge diff --git a/t/t9390-filter-repo.sh b/t/t9390-filter-repo.sh index 418ac73b..613c7bc1 100755 --- a/t/t9390-filter-repo.sh +++ b/t/t9390-filter-repo.sh @@ -1875,6 +1875,22 @@ test_expect_success 'degenerate merge with typechange' ' ) ' +test_expect_failure 'degenerate evil merge' ' + test_create_repo degenerate_evil_merge && + ( + cd degenerate_evil_merge && + + cat $DATA/degenerate-evil-merge | git fast-import --quiet && + git filter-repo --force --subdirectory-filter module-of-interest && + test_path_is_missing module-of-interest && + test_path_is_missing other-module && + test_path_is_missing irrelevant && + test_path_is_file file1 && + test_path_is_file file2 && + test_path_is_file file3 + ) +' + test_expect_success 'Filtering a blob to make it match previous version' ' test_create_repo remove_unique_bits_of_blob && ( diff --git a/t/t9390/degenerate-evil-merge b/t/t9390/degenerate-evil-merge new file mode 100644 index 00000000..a59ef270 --- /dev/null +++ b/t/t9390/degenerate-evil-merge @@ -0,0 +1,81 @@ +feature done +blob +mark :1 +data 0 + +reset refs/heads/master +commit refs/heads/master +mark :2 +author Full Name 2000000000 +0100 +committer Full Name 2000000000 +0100 +data 2 +A +M 100644 :1 irrelevant +M 100644 :1 module-of-interest/file1 +M 100644 :1 module-of-interest/file2 +M 100644 :1 module-of-interest/file3 +M 100644 :1 other-module/files +M 100644 :1 other-module/are +M 100644 :1 other-module/fun + +commit refs/heads/master +mark :3 +author Full Name 2000030000 +0100 +committer Full Name 2000030000 +0100 +data 2 +B +from :2 +D irrelevant +D module-of-interest/file1 +D module-of-interest/file2 +D module-of-interest/file3 + +blob +mark :4 +data 8 +content + +commit refs/heads/master +mark :5 +author Full Name 2000040000 +0100 +committer Full Name 2000040000 +0100 +data 2 +D +from :3 +M 100644 :4 other-module/fun + +commit refs/heads/master +mark :6 +author Full Name 2000020000 +0100 +committer Full Name 2000020000 +0100 +data 2 +C +from :2 +M 100644 :4 irrelevant + +commit refs/heads/master +mark :7 +author Full Name 2000050000 +0100 +committer Full Name 2000050000 +0100 +data 31 +Merge and ignore the deletions +from :6 +merge :5 +M 100644 :4 irrelevant +M 100644 :4 other-module/fun + +blob +mark :8 +data 6 +final + +commit refs/heads/master +mark :7 +author Full Name 2000060000 +0100 +committer Full Name 2000060000 +0100 +data 13 +Final change +from :7 +M 100644 :8 module-of-interest/file2 + +done From 43b1298399d9fbea58149380577902f4d2ec7db9 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Tue, 8 Oct 2024 10:37:50 -0700 Subject: [PATCH 2/6] filter-repo: make purpose of trim_extra_parents clearer Signed-off-by: Elijah Newren --- git-filter-repo | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/git-filter-repo b/git-filter-repo index efdef23a..ba403567 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -3288,11 +3288,11 @@ class RepoFilter(object): return old_hash return new_hash - def _trim_extra_parents(self, orig_parents, parents): + def _maybe_trim_extra_parents(self, orig_parents, parents): '''Due to pruning of empty commits, some parents could be non-existent (None) or otherwise redundant. Remove the non-existent parents, and - remove redundant parents so long as that doesn't transform a merge - commit into a non-merge commit. + remove redundant parents ***SO LONG AS*** that doesn't transform a + merge commit into a non-merge commit. Returns a tuple: (parents, new_first_parent_if_would_become_non_merge)''' @@ -3675,7 +3675,8 @@ class RepoFilter(object): # Prune parents (due to pruning of empty commits) if relevant old_1st_parent = parents[0] if parents else None - parents, new_1st_parent = self._trim_extra_parents(orig_parents, parents) + parents, new_1st_parent = self._maybe_trim_extra_parents(orig_parents, + parents) commit.parents = parents # If parents were pruned, then we need our file changes to be relative From 3382fc27655f3d678b051a36dd71d1b84a3a1ee7 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Fri, 18 Oct 2024 12:03:15 -0700 Subject: [PATCH 3/6] filter-repo: rename old_1st_parent -> prev_1st_parent In _tweak_commit() we had a local variable named old_1st_parent. This name was misleading because we often use "old" to refer to objects from the original fast export stream, and "new" to refer to the new identifiers used with the new fast import stream (due to insertions of new blobs or commits, the identifier for a given blob or commit has different "old" and "new" values). old_1st_parent, though, was clearly referring to a "new" identifier for a commit since it was part of parents rather than orig_parents. Update the variable name. Signed-off-by: Elijah Newren --- git-filter-repo | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/git-filter-repo b/git-filter-repo index ba403567..ee782ac3 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -3674,14 +3674,14 @@ class RepoFilter(object): commit.original_id) # Prune parents (due to pruning of empty commits) if relevant - old_1st_parent = parents[0] if parents else None + prev_1st_parent = parents[0] if parents else None parents, new_1st_parent = self._maybe_trim_extra_parents(orig_parents, parents) commit.parents = parents # If parents were pruned, then we need our file changes to be relative # to the new first parent - if parents and old_1st_parent != parents[0]: + if parents and prev_1st_parent != parents[0]: # Get the id from the original fast export stream corresponding to the # new 1st parent. Since pruning of commits can cause multiple old # commits to map to the same new commit, we have a uniqueness problem, From 1866fa3e24ce4c7e222cda1f9f1bb4c745414bc2 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Fri, 11 Oct 2024 16:04:46 -0700 Subject: [PATCH 4/6] filter-repo: add explanatory note for complicated condition When the first parent has been pruned away, or this is a merge commit that is up for pruning if it has no relevant file changes, then we need to know what the file_changes are relative to the new first parent rather than the old one (especially if the original merge was an "evil" merge). The conditions that are relevant can get more than a little confusing so add a very detailed comment trying to explain the conditions. A later commit will actually address these conditions and make the fixes. Signed-off-by: Elijah Newren --- git-filter-repo | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/git-filter-repo b/git-filter-repo index ee782ac3..60c642bd 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -3673,7 +3673,9 @@ class RepoFilter(object): self._orig_graph.add_commit_and_parents(commit.old_id, orig_parents, commit.original_id) - # Prune parents (due to pruning of empty commits) if relevant + # Prune parents (due to pruning of empty commits) if relevant, note that + # new_1st_parent is None unless this was a merge commit that is becoming + # a non-merge prev_1st_parent = parents[0] if parents else None parents, new_1st_parent = self._maybe_trim_extra_parents(orig_parents, parents) @@ -3681,6 +3683,28 @@ class RepoFilter(object): # If parents were pruned, then we need our file changes to be relative # to the new first parent + # + # FIXME: Note that there are two cases we should be handling here: + # * new_1st_parent and new_1st_parent != parents[0] uniquely happens for example when: + # working on merge, selecting subset of files and merge base still + # valid while first parent history doesn't touch any of those paths, + # but second parent history does. prev_1st_parent had already been + # rewritten to the non-None first ancestor and it remains valid. + # self._maybe_trim_extra_parents() avoids removing this first parent + # because it'd make the commit a non-merge. However, if there are + # no file_changes of note, we'll drop this commit and mark + # new_1st_parent as the new replacement. To correctly determine if + # there are no file_changes of note, we need to have the list of + # file_changes relative to new_1st_parent. + # (See t9390#3, "basic -> basic-ten using '--path ten'") + # * prev_1st_parent != parents[0] happens for example when: + # similar to above, but the merge base is no longer valid and was + # pruned away as well. Then parents started as e.g. [None, $num], + # and both prev_1st_parent and new_1st_parent are None, while parents + # after self._maybe_trim_extra_parents() becomes just [$num]. + # (See t9390#67, "degenerate merge with non-matching filename".) + # Since $num was originally a second parent, we need to rewrite + # file changes to be relative to parents[0]. if parents and prev_1st_parent != parents[0]: # Get the id from the original fast export stream corresponding to the # new 1st parent. Since pruning of commits can cause multiple old From 912dbca53e91392db65ca557529393e8e97b846d Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Fri, 18 Oct 2024 15:01:56 -0700 Subject: [PATCH 5/6] filter-repo: fix corner case involving bad pruning of evil degenerate merge When a former merge commit either has its entire first parent history removed, or a former merge commit has enough of its first parent history removed that its new first parent is an ancestory of its second parent, we consider whether the merge itself should be pruned. The basic logic is a merge can be removed in either of these cases if the merge introduces no file changes relative to its remaining parent. However, our logic for determining if the merge introduced no file changes relative to its remaining parent was faulty in the case of some merges that had changes beyond what a simple merge would imply (i.e. merges that had semantic conflict fixes or "evil" changes snuck in). To properly address those, we need to instead work with the list of changes relative to the new proposed parent. Add code to handle this, and mark the relevant testcase as passing. Signed-off-by: Elijah Newren --- git-filter-repo | 63 ++++++++++++++++++++++++++++++------------ t/t9390-filter-repo.sh | 2 +- 2 files changed, 47 insertions(+), 18 deletions(-) diff --git a/git-filter-repo b/git-filter-repo index 60c642bd..573f5f33 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -514,14 +514,6 @@ class _IDs(object): else: return old_id - def reverse_translate(self, new_id): - """ - If new_id is an alternate id mapping, return the list of original ids. - (Since commits get pruned, multiple old commit ids map map to the same - new commit id.) - """ - return self._reverse_translation.get(new_id, [new_id]) - def __str__(self): """ Convert IDs to string; used for debugging @@ -3667,6 +3659,11 @@ class RepoFilter(object): external_parents = parents else: external_parents = [p for p in parents if not isinstance(p, int)] + # The use of 'reversed' is intentional here; there is a risk that we have + # duplicates in parents, and we want to map from parents to the first + # entry we find in orig_parents in such cases. + parent_reverse_dict = dict(zip(reversed(parents), reversed(orig_parents))) + self._graph.record_external_commits(external_parents) self._orig_graph.record_external_commits(external_parents) self._graph.add_commit_and_parents(commit.id, parents) # new githash unknown @@ -3684,7 +3681,7 @@ class RepoFilter(object): # If parents were pruned, then we need our file changes to be relative # to the new first parent # - # FIXME: Note that there are two cases we should be handling here: + # Notes: # * new_1st_parent and new_1st_parent != parents[0] uniquely happens for example when: # working on merge, selecting subset of files and merge base still # valid while first parent history doesn't touch any of those paths, @@ -3705,18 +3702,50 @@ class RepoFilter(object): # (See t9390#67, "degenerate merge with non-matching filename".) # Since $num was originally a second parent, we need to rewrite # file changes to be relative to parents[0]. - if parents and prev_1st_parent != parents[0]: + # * TODO: We should be getting the changes relative to the new first + # parent even if self._fep is None, BUT we can't. Our method of + # getting the changes right now is an external git diff invocation, + # which we can't do if we just have a fast export stream. We can't + # really work around it by querying the fast-import stream either, + # because the 'ls' directive only allows us to list info about + # specific paths, but we need to find out which paths exist in two + # commits and then query them. We could maybe force checkpointing in + # fast-import, then doing a diff from what'll be the new first parent + # back to prev_1st_parent (which may be None, i.e. empty tree), using + # the fact that in A->{B,C}->D, where D is merge of B & C, the diff + # from C->D == C->A + A->B + B->D, and in these cases A==B, so it + # simplifies to C->D == C->A + B->D, and C is our new 1st parent + # commit, A is prev_1st_commit, and B->D is commit.file_changes that + # we already have. However, checkpointing the fast-import process + # and figuring out how long to wait before we can run our diff just + # seems excessive. For now, just punt and assume the merge wasn't + # "evil" (i.e. that it's remerge-diff is empty, as is true for most + # merges). If the merge isn't evil, no further steps are necessary. + if parents and self._fep and ( + prev_1st_parent != parents[0] or + new_1st_parent and new_1st_parent != parents[0]): # Get the id from the original fast export stream corresponding to the - # new 1st parent. Since pruning of commits can cause multiple old - # commits to map to the same new commit, we have a uniqueness problem, - # but pruning means there are no relevant file changes between the - # commit so we can just take the first old commit id. - new_1st_parent_old_id = _IDS.reverse_translate(parents[0])[0] + # new 1st parent. As noted above, that new 1st parent might be + # new_1st_parent, or if that is None, it'll be parents[0]. + will_be_1st = new_1st_parent or parents[0] + old_id = parent_reverse_dict[will_be_1st] # Now, translate that to a hash - new_1st_parent_old_hash = self._orig_graph.map_to_hash(new_1st_parent_old_id) + will_be_1st_commit_hash = self._orig_graph.map_to_hash(old_id) + # Get the changes from what is going to be the new 1st parent to this + # merge commit. Note that since we are going from the new 1st parent + # to the merge commit, we can just replace the existing + # commit.file_changes rather than getting something we need to combine + # with the existing commit.file_changes. Also, we can just replace + # because prev_1st_parent is an ancestor of will_be_1st_commit_hash + # (or prev_1st_parent is None and first parent history is gone), so + # even if we retain prev_1st_parent and do not prune it, the changes + # will still work given the snapshot-based way fast-export/fast-import + # work. commit.file_changes = GitUtils.get_file_changes(self._repo_working_dir, - new_1st_parent_old_hash, + will_be_1st_commit_hash, commit.original_id) + + # Save these and filter them orig_file_changes = set(commit.file_changes) self._filter_files(commit) diff --git a/t/t9390-filter-repo.sh b/t/t9390-filter-repo.sh index 613c7bc1..4a54067f 100755 --- a/t/t9390-filter-repo.sh +++ b/t/t9390-filter-repo.sh @@ -1875,7 +1875,7 @@ test_expect_success 'degenerate merge with typechange' ' ) ' -test_expect_failure 'degenerate evil merge' ' +test_expect_success 'degenerate evil merge' ' test_create_repo degenerate_evil_merge && ( cd degenerate_evil_merge && From 704e25878ce10de201629bc83f81975a4e808b9c Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Mon, 21 Oct 2024 09:25:24 -0700 Subject: [PATCH 6/6] filter-repo: limit searches for ref values to actual refs The FreeBSD repository has refs that look like ref expressions. For example: refs/tags/vendor/openzfs/2.0-rc3-gfc5966 This is a tag. However, if you rewrite the FreeBSD history to only include the libexec/ftpd directory, then this tag pre-dated any file within that directory and thus should be deleted. Once that tag is deleted, trying to pass it to `git cat-file --batch-check` will result in the value of refs/tags/vendor/openzfs/2.0-rc3-gfc5966 being printed as the value of some blob. refs/tags/vendor/openzfs/2.0-rc3-gfc5966 but notices the `-gfc5966` extension and goes looking for any object whose name begins with `fc5966`. Since it turns out there is one object that starts with that name and there is only one, and it happens to be a blob, `git cat-file` will say, oh, you must have been asking for blob fc5966c9c467e0a7d460498b7581e845d33d89d7 This matters to code that runs after the new history has been writtten and refs have been updated, but before the old history has been pruned. Since we are specifically just wanting to know the values of refs, use show-refs instead. Signed-off-by: Elijah Newren --- git-filter-repo | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/git-filter-repo b/git-filter-repo index 573f5f33..e905f6ef 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -4194,8 +4194,8 @@ class RepoFilter(object): original_hash = old_commit_unrenames.get(old_hash, old_hash) old_ref_map[refname] = (original_hash, deleted_hash) - batch_check_process = None - batch_check_output_re = re.compile(b'^([0-9a-f]{40}) ([a-z]+) ([0-9]+)$') + new_refs = {} + new_refs_initialized = False ref_maps = {} self._orig_graph._ensure_reverse_maps_populated() for refname, pair in old_ref_map.items(): @@ -4209,36 +4209,22 @@ class RepoFilter(object): else: new_hash = intermediate else: # Must be either an annotated tag, or a ref whose tip was pruned - if not batch_check_process: - cmd = 'git cat-file --batch-check'.split() + if not new_refs_initialized: target_working_dir = self._args.target or b'.' - batch_check_process = subproc.Popen(cmd, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - cwd=target_working_dir) - batch_check_process.stdin.write(refname+b"\n") - batch_check_process.stdin.flush() - line = batch_check_process.stdout.readline() - m = batch_check_output_re.match(line) - if m and m.group(2) in (b'tag', b'commit'): - new_hash = m.group(1) - elif line.endswith(b' missing\n'): - new_hash = deleted_hash + new_refs = GitUtils.get_refs(target_working_dir) + if refname in new_refs: + new_hash = new_refs[refname] else: - raise SystemExit(_("Failed to find new id for %(refname)s " - "(old id was %(old_hash)s)") - % ({'refname': refname, 'old_hash': old_hash}) - ) # pragma: no cover + new_hash = deleted_hash ref_maps[refname] = (old_hash, new_hash) if self._args.source or self._args.target: - new_refs = GitUtils.get_refs(self._args.target or b'.') + if not new_refs_initialized: + target_working_dir = self._args.target or b'.' + new_refs = GitUtils.get_refs(target_working_dir) for ref, new_hash in new_refs.items(): if ref not in orig_refs and not ref.startswith(b'refs/replace/'): old_hash = b'0'*len(new_hash) ref_maps[ref] = (old_hash, new_hash) - if batch_check_process: - batch_check_process.stdin.close() - batch_check_process.wait() # # Third, handle first_changes