From b97cdfbe10e82c16058a581d2fc6ee6412f6b311 Mon Sep 17 00:00:00 2001 From: Elijah Newren <newren@gmail.com> Date: Fri, 4 Oct 2024 12:25:05 -0700 Subject: [PATCH 1/2] filter-repo: add a new get_parent_hashes() convenience function We want to add the ability to show the first changed commit(s) in a rewrite, i.e. those commits that were rewritten but which did not have any parents that were rewritten. And we want to be able to update these on subsequent runs of filter-repo when users rewrite their repository in multiple steps. To facilitate this, add a get_parent_hashes() function to the AncestryGraph class. Signed-off-by: Elijah Newren <newren@gmail.com> --- git-filter-repo | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/git-filter-repo b/git-filter-repo index 1aa0eb3a..a2f16fbe 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -248,8 +248,9 @@ class AncestryGraph(object): # elsewhere self.git_hash = {} - # Reverse map; only populated if needed. Callers of functions using - # this reverse map are responsible to ensure it is populated + # Reverse maps; only populated if needed. Caller responsible to check + # and ensure they are populated + self._reverse_value = {} self._hash_to_id = {} # Cached results from previous calls to is_ancestor(). @@ -302,7 +303,29 @@ class AncestryGraph(object): def _ensure_reverse_maps_populated(self): if not self._hash_to_id: + assert not self._reverse_value self._hash_to_id = {v: k for k, v in self.git_hash.items()} + self._reverse_value = {v: k for k, v in self.value.items()} + + def get_parent_hashes(self, commit_hash): + ''' + Given a commit_hash, return its parents hashes + ''' + # + # We have to map: + # commit hash -> fast export stream id -> graph id + # then lookup + # parent graph ids for given graph id + # then we need to map + # parent graph ids -> parent fast export ids -> parent commit hashes + # + self._ensure_reverse_maps_populated() + commit_fast_export_id = self._hash_to_id[commit_hash] + commit_graph_id = self.value[commit_fast_export_id] + parent_graph_ids = self.graph[commit_graph_id][1] + parent_fast_export_ids = [self._reverse_value[x] for x in parent_graph_ids] + parent_hashes = [self.git_hash[x] for x in parent_fast_export_ids] + return parent_hashes def map_to_hash(self, commit_id): ''' From d41f9d91ead77c1e5e476f04d6a75d00a32a8696 Mon Sep 17 00:00:00 2001 From: Elijah Newren <newren@gmail.com> Date: Fri, 4 Oct 2024 12:38:58 -0700 Subject: [PATCH 2/2] filter-repo: add tracking and recording of first changed commit(s) Compute the set of first changed commits in a rewrite, that is, those commits that were rewritten which had no parents that were rewritten. There will typically be just one of these in a rewrite, making it a good candidate for a "ban commit" hook to prevent the old history from being repushed to a repository. Store these in $GIT_DIR/filter-repo/first-changed-commits. Signed-off-by: Elijah Newren <newren@gmail.com> --- Documentation/git-filter-repo.txt | 37 ++++++++++- git-filter-repo | 102 +++++++++++++++++++++++++++++- t/t9390-filter-repo.sh | 1 + t/t9393-rerun.sh | 93 ++++++++++++++++++++++++++- 4 files changed, 228 insertions(+), 5 deletions(-) diff --git a/Documentation/git-filter-repo.txt b/Documentation/git-filter-repo.txt index 86a01ee1..47c0ad8c 100644 --- a/Documentation/git-filter-repo.txt +++ b/Documentation/git-filter-repo.txt @@ -405,6 +405,26 @@ references were changed. * An all-zeros hash, or null SHA, represents a non-existent object. When in the "new" column, this means the ref was removed entirely. +First Changed Commits +~~~~~~~~~~~~~~~~~~~~~ + +The `$GIT_DIR/filter-repo/first-changed-commits` contains a list of the +first commit(s) changed by the filtering operation. These are the commits +that got rewritten and which had no parents that were also rewritten. + +So, for example if you had commits + A1-B1-C1-D1-E1 +before running git-filter-repo, and afterward you had commits + A1-B2-C2-D2-E2 +then the First Changed Commits file would contain just one line, which +would be the hash of B2. + +In most cases, there will only be one commit listed, but if you had +multiple root commits or a non-linear history where the commits on +those diverging histories were the first ones modified, then there +could be multiple first changed commits and they will each be listed +on separate lines. + Already Ran ~~~~~~~~~~~ @@ -429,10 +449,25 @@ Concretely, this means: commit B to commit C, then the second run would have an "A C" entry rather than a "B C" entry for the changed commit. + * The first changed commit(s) (reported When using the + --sensitive-data-removal option) will be the first original commit + modified, not the first intermediate commit modified. + + In more detail, if the repository original had the following commits: + A1-B1-C1-D1-E1 + and the first invocation of filter-repo changed this to + A1-B1-C2-D2-E2 + then the first run would report "C1" as the first changed commit. If + a second filter-repo run further changed this to + A1-B1-C2-D3-E3 + then it would report "C1" as the first changed commit, not "D2", + because it is comparing to the original commits rather than the + intermediate ones. + However, if the already_ran file exists but is older than 1 day when they invoke git-filter-repo, the user will be prompted for whether the new run should be considered a continuation of the previous run. If they do not -answer in the affirmative, then the above two bullets will not apply. +answer in the affirmative, then the above three bullets will not apply. This prompt exists because users might do a history rewrite in a repository, forget about it and leave the $GIT_DIR/filter-repo directory around, and then some months or years later need to do another rewrite. If commits diff --git a/git-filter-repo b/git-filter-repo index a2f16fbe..efdef23a 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -4186,11 +4186,105 @@ class RepoFilter(object): batch_check_process.stdin.close() batch_check_process.wait() - return commit_renames, ref_maps + # + # Third, handle first_changes + # + + old_first_changes = dict() + if already_ran: + # Read first_changes into old_first_changes + with open(os.path.join(metadata_dir, b'first-changed-commits'), 'br') as f: + for line in f: + changed_commit, undeleted_self_or_ancestor = line.strip().split() + old_first_changes[changed_commit] = undeleted_self_or_ancestor + # We need to find the commits that were modified whose parents were not. + # To be able to find parents, we need the commit names as of the beginning + # of this run, and then when we are done, we need to map them back to the + # name of the commits from before any git-filter-repo runs. + # + # We are excluding here any commits deleted in previous git-filter-repo + # runs + undo_old_commit_renames = dict((v,k) for (k,v) in old_commit_renames.items() + if v != deleted_hash) + # Get a list of all commits that were changed, as of the beginning of + # this latest run. + changed_commits = {new + for (old,new) in old_commit_renames.items() + if old != new and new != deleted_hash} | \ + {old + for (old,new) in self._commit_renames.items() + if old != new} + special_changed_commits = {old + for (old,new) in old_commit_renames.items() + if new == deleted_hash} + first_changes = dict() + for (old,new) in self._commit_renames.items(): + if old == new: + # old wasn't modified, can't be first change if not even a change + continue + if old_commit_unrenames.get(old,old) != old: + # old was already modified in previous run; while it might represent + # something that is still a first change, we'll handle that as we + # loop over old_first_changes below + continue + if any(parent in changed_commits + for parent in self._orig_graph.get_parent_hashes(old)): + # a parent of old was modified, so old is not a first change + continue + # At this point, old IS a first change. We need to find out what new + # commit it maps to, or if it doesn't map to one, what new commit was + # its most recent ancestor that wasn't pruned. + if new is None: + new = self._remap_to(old) + first_changes[old] = (new if new is not None else deleted_hash) + for (old,undeleted_self_or_ancestor) in old_first_changes.items(): + if undeleted_self_or_ancestor == deleted_hash: + # old represents a commit that was pruned and whose entire ancestry + # was pruned. So, old is still a first change + first_changes[old] = undeleted_self_or_ancestor + continue + intermediate = old_commit_renames.get(old, old) + usoa = undeleted_self_or_ancestor + new_ancestor = self._commit_renames.get(usoa, usoa) + if intermediate == deleted_hash: + # old was pruned in previous rewrite + if usoa != new_ancestor: + # old's ancestor got rewritten in this filtering run; we can drop + # this one from first_changes. + continue + # Getting here means old was a first change and old was pruned in a + # previous run, and its ancestors that survived were non rewritten in + # this run, so old remains a first change + first_changes[old] = new_ancestor # or usoa, since new_ancestor == usoa + continue + assert(usoa == intermediate) # old wasn't pruned => usoa == intermediate + + # Check whether parents of intermediate were rewritten. Note that + # intermediate in self._commit_renames only means that intermediate was + # processed by the latest filtering (not necessarily that it changed), + # but we need to know that before we can check for parent hashes having + # changed. + if intermediate not in self._commit_renames: + # This commit was not processed by this run, so it remains a first + # change + first_changes[old] = usoa + continue + if any(parent in changed_commits + for parent in self._orig_graph.get_parent_hashes(intermediate)): + # An ancestor was modified by this run, so it is no longer a first + # change; continue to the next one. + continue + # This change is a first_change; find the new commit its usoa maps to + new = self._remap_to(intermediate) + assert(new is not None) + first_changes[old] = new + + return commit_renames, ref_maps, first_changes def _record_metadata(self, metadata_dir, orig_refs): self._flush_renames() - commit_renames, ref_maps = self._compute_metadata(metadata_dir, orig_refs) + commit_renames, ref_maps, first_changes = \ + self._compute_metadata(metadata_dir, orig_refs) with open(os.path.join(metadata_dir, b'commit-map'), 'bw') as f: f.write(("%-40s %s\n" % (_("old"), _("new"))).encode()) @@ -4204,6 +4298,10 @@ class RepoFilter(object): (old_hash, new_hash) = hash_pair f.write(b'%s %s %s\n' % (old_hash, new_hash, refname)) + with open(os.path.join(metadata_dir, b'first-changed-commits'), 'bw') as f: + for commit, undeleted_self_or_ancestor in sorted(first_changes.items()): + f.write(b'%s %s\n' % (commit, undeleted_self_or_ancestor)) + with open(os.path.join(metadata_dir, b'suboptimal-issues'), 'bw') as f: issues_found = False if self._commits_no_longer_merges: diff --git a/t/t9390-filter-repo.sh b/t/t9390-filter-repo.sh index e54560a9..418ac73b 100755 --- a/t/t9390-filter-repo.sh +++ b/t/t9390-filter-repo.sh @@ -21,6 +21,7 @@ filter_testcase() { # Clean up from previous run git pack-refs --all && rm .git/packed-refs && + rm -rf .git/filter-repo/ && # Run the example cat $DATA/$INPUT | git filter-repo --stdin --quiet --force --replace-refs delete-no-add "${REST[@]}" && diff --git a/t/t9393-rerun.sh b/t/t9393-rerun.sh index 63af601c..60659acc 100755 --- a/t/t9393-rerun.sh +++ b/t/t9393-rerun.sh @@ -49,6 +49,11 @@ test_expect_success 'a re-run that is treated as a clean slate' ' cat sha-expect >>expect && test_cmp expect .git/filter-repo/ref-map && + cat <<-EOF | sort >expect && + ${FILE_B_CHANGE} ${FILE_A_CHANGE} + EOF + test_cmp expect .git/filter-repo/first-changed-commits && + touch -t 197001010000 .git/filter-repo/already_ran && echo no | git filter-repo --invert-paths --path fileC --force && FINAL_FILE_D_CHANGE=$(git rev-list -1 HEAD -- fileD) && @@ -72,7 +77,12 @@ test_expect_success 'a re-run that is treated as a clean slate' ' EOF printf "%-40s %-40s %s\n" old new ref >expect && cat sha-expect >>expect && - test_cmp expect .git/filter-repo/ref-map + test_cmp expect .git/filter-repo/ref-map && + + cat <<-EOF | sort >expect && + ${NEW_FILE_C_CHANGE} ${FILE_A_CHANGE} + EOF + test_cmp expect .git/filter-repo/first-changed-commits ) ' @@ -115,7 +125,13 @@ test_expect_success 'remove two files, no re-run' ' EOF printf "%-40s %-40s %s\n" old new ref >expect && cat sha-expect >>expect && - test_cmp expect .git/filter-repo/ref-map + test_cmp expect .git/filter-repo/ref-map && + + cat <<-EOF | sort >expect && + ${FIRST_ORPHAN} ${DELETED_SHA} + ${FILE_C_CHANGE} ${FILE_B_CHANGE} + EOF + test_cmp expect .git/filter-repo/first-changed-commits ) ' @@ -139,6 +155,12 @@ test_expect_success 'remove two files, then remove a later file' ' NEW_FILE_D_CHANGE=$(git rev-list -1 HEAD -- fileD) && NEW_TAG=$(git rev-parse v1.0) && + cat <<-EOF | sort >expect && + ${FIRST_ORPHAN} ${DELETED_SHA} + ${FILE_C_CHANGE} ${FILE_B_CHANGE} + EOF + test_cmp expect .git/filter-repo/first-changed-commits && + cat <<-EOF | sort -k 3 >sha-expect && ${FILE_D_CHANGE} ${NEW_FILE_D_CHANGE} $(git symbolic-ref HEAD) ${FINAL_ORPHAN} ${DELETED_SHA} refs/heads/orphan-me @@ -152,6 +174,12 @@ test_expect_success 'remove two files, then remove a later file' ' FINAL_TAG=$(git rev-parse v1.0) && + cat <<-EOF | sort >expect && + ${FIRST_ORPHAN} ${DELETED_SHA} + ${FILE_C_CHANGE} ${FILE_B_CHANGE} + EOF + test_cmp expect .git/filter-repo/first-changed-commits && + cat <<-EOF | sort >sha-expect && ${FIRST_ORPHAN} ${DELETED_SHA} ${FINAL_ORPHAN} ${DELETED_SHA} @@ -194,9 +222,21 @@ test_expect_success 'remove two files, then remove a later file via --refs' ' NEW_FILE_C_CHANGE=$(git rev-list -1 HEAD -- fileC) && + cat <<-EOF | sort >expect && + ${FIRST_ORPHAN} ${DELETED_SHA} + ${FILE_B_CHANGE} ${FILE_A_CHANGE} + EOF + test_cmp expect .git/filter-repo/first-changed-commits && + git filter-repo --invert-paths --path fileD --refs HEAD~1..HEAD && FINAL_TAG=$(git rev-parse v1.0) && + cat <<-EOF | sort >expect && + ${FIRST_ORPHAN} ${DELETED_SHA} + ${FILE_B_CHANGE} ${FILE_A_CHANGE} + EOF + test_cmp expect .git/filter-repo/first-changed-commits && + cat <<-EOF | sort >sha-expect && ${FIRST_ORPHAN} ${DELETED_SHA} ${FINAL_ORPHAN} ${DELETED_SHA} @@ -242,6 +282,12 @@ test_expect_success 'remove two files, then remove an earlier file' ' NEW_FILE_D_CHANGE=$(git rev-list -1 HEAD -- fileD) && FINAL_TAG=$(git rev-parse v1.0) && + cat <<-EOF | sort >expect && + ${FIRST_ORPHAN} ${DELETED_SHA} + ${FILE_B_CHANGE} ${FILE_A_CHANGE} + EOF + test_cmp expect .git/filter-repo/first-changed-commits && + cat <<-EOF | sort >sha-expect && ${FIRST_ORPHAN} ${DELETED_SHA} ${FINAL_ORPHAN} ${DELETED_SHA} @@ -284,10 +330,20 @@ test_expect_success 'modify a file, then remove a later file' ' NEW_FILE_C_CHANGE=$(git rev-list -1 HEAD -- fileC) && + cat <<-EOF | sort >expect && + ${FILE_C_CHANGE} ${NEW_FILE_C_CHANGE} + EOF + test_cmp expect .git/filter-repo/first-changed-commits && + git filter-repo --invert-paths --path fileD && FINAL_TAG=$(git rev-parse v1.0) && + cat <<-EOF | sort >expect && + ${FILE_C_CHANGE} ${NEW_FILE_C_CHANGE} + EOF + test_cmp expect .git/filter-repo/first-changed-commits && + # Make sure the fileD commit was indeed removed echo $NEW_FILE_C_CHANGE >expect && git rev-parse HEAD >actual && @@ -337,10 +393,20 @@ test_expect_success 'modify a file, then remove a later file via --refs' ' NEW_FILE_B_CHANGE=$(git rev-list -1 HEAD -- fileB) && NEW_FILE_C_CHANGE=$(git rev-list -1 HEAD -- fileC) && + cat <<-EOF | sort >expect && + ${FILE_B_CHANGE} ${NEW_FILE_B_CHANGE} + EOF + test_cmp expect .git/filter-repo/first-changed-commits && + git filter-repo --invert-paths --path fileD \ --refs HEAD~1..HEAD && FINAL_TAG=$(git rev-parse v1.0) && + cat <<-EOF | sort >expect && + ${FILE_B_CHANGE} ${NEW_FILE_B_CHANGE} + EOF + test_cmp expect .git/filter-repo/first-changed-commits && + # Make sure the fileD commit was indeed removed git rev-parse HEAD^ >expect && echo ${NEW_FILE_B_CHANGE} >actual && @@ -386,12 +452,24 @@ test_expect_success 'modify a file, then remove an earlier file' ' git filter-repo --force \ --replace-text <(echo "file 3 contents==>Alternate C") && + NEW_FILE_C_CHANGE=$(git rev-list -1 HEAD -- fileC) && + + cat <<-EOF | sort >expect && + ${FILE_C_CHANGE} ${NEW_FILE_C_CHANGE} + EOF + test_cmp expect .git/filter-repo/first-changed-commits && + git filter-repo --invert-paths --path fileB && NEW_FILE_C_CHANGE=$(git rev-list -1 HEAD -- fileC) && NEW_FILE_D_CHANGE=$(git rev-list -1 HEAD -- fileD) && FINAL_TAG=$(git rev-parse v1.0) && + cat <<-EOF | sort >expect && + ${FILE_B_CHANGE} ${FILE_A_CHANGE} + EOF + test_cmp expect .git/filter-repo/first-changed-commits && + cat <<-EOF | sort >sha-expect && ${FIRST_ORPHAN} ${FIRST_ORPHAN} ${FINAL_ORPHAN} ${FINAL_ORPHAN} @@ -433,9 +511,20 @@ test_expect_success 'use --refs heavily with a rerun' ' NEW_FINAL_ORPHAN=$(git rev-list -1 orphan-me) && + cat <<-EOF | sort >expect && + ${FINAL_ORPHAN} ${NEW_FINAL_ORPHAN} + EOF + test_cmp expect .git/filter-repo/first-changed-commits && + git filter-repo --refs $(git symbolic-ref HEAD) \ --invert-paths --path fileD && + cat <<-EOF | sort >expect && + ${FINAL_ORPHAN} ${NEW_FINAL_ORPHAN} + ${FILE_D_CHANGE} ${FILE_C_CHANGE} + EOF + test_cmp expect .git/filter-repo/first-changed-commits && + cat <<-EOF | sort >sha-expect && ${FIRST_ORPHAN} ${FIRST_ORPHAN} ${FINAL_ORPHAN} ${NEW_FINAL_ORPHAN}