From b97cdfbe10e82c16058a581d2fc6ee6412f6b311 Mon Sep 17 00:00:00 2001
From: Elijah Newren <newren@gmail.com>
Date: Fri, 4 Oct 2024 12:25:05 -0700
Subject: [PATCH 1/2] filter-repo: add a new get_parent_hashes() convenience
 function

We want to add the ability to show the first changed commit(s) in a
rewrite, i.e. those commits that were rewritten but which did not have
any parents that were rewritten.  And we want to be able to update these
on subsequent runs of filter-repo when users rewrite their repository in
multiple steps.

To facilitate this, add a get_parent_hashes() function to the
AncestryGraph class.

Signed-off-by: Elijah Newren <newren@gmail.com>
---
 git-filter-repo | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/git-filter-repo b/git-filter-repo
index 1aa0eb3a..a2f16fbe 100755
--- a/git-filter-repo
+++ b/git-filter-repo
@@ -248,8 +248,9 @@ class AncestryGraph(object):
     # elsewhere
     self.git_hash = {}
 
-    # Reverse map; only populated if needed.  Callers of functions using
-    # this reverse map are responsible to ensure it is populated
+    # Reverse maps; only populated if needed.  Caller responsible to check
+    # and ensure they are populated
+    self._reverse_value = {}
     self._hash_to_id = {}
 
     # Cached results from previous calls to is_ancestor().
@@ -302,7 +303,29 @@ class AncestryGraph(object):
 
   def _ensure_reverse_maps_populated(self):
     if not self._hash_to_id:
+      assert not self._reverse_value
       self._hash_to_id = {v: k for k, v in self.git_hash.items()}
+      self._reverse_value = {v: k for k, v in self.value.items()}
+
+  def get_parent_hashes(self, commit_hash):
+    '''
+    Given a commit_hash, return its parents hashes
+    '''
+    #
+    # We have to map:
+    #    commit hash -> fast export stream id -> graph id
+    # then lookup
+    #    parent graph ids for given graph id
+    # then we need to map
+    #    parent graph ids -> parent fast export ids -> parent commit hashes
+    #
+    self._ensure_reverse_maps_populated()
+    commit_fast_export_id = self._hash_to_id[commit_hash]
+    commit_graph_id = self.value[commit_fast_export_id]
+    parent_graph_ids = self.graph[commit_graph_id][1]
+    parent_fast_export_ids = [self._reverse_value[x] for x in parent_graph_ids]
+    parent_hashes = [self.git_hash[x] for x in parent_fast_export_ids]
+    return parent_hashes
 
   def map_to_hash(self, commit_id):
     '''

From d41f9d91ead77c1e5e476f04d6a75d00a32a8696 Mon Sep 17 00:00:00 2001
From: Elijah Newren <newren@gmail.com>
Date: Fri, 4 Oct 2024 12:38:58 -0700
Subject: [PATCH 2/2] filter-repo: add tracking and recording of first changed
 commit(s)

Compute the set of first changed commits in a rewrite, that is, those
commits that were rewritten which had no parents that were rewritten.
There will typically be just one of these in a rewrite, making it a
good candidate for a "ban commit" hook to prevent the old history from
being repushed to a repository.  Store these in
$GIT_DIR/filter-repo/first-changed-commits.

Signed-off-by: Elijah Newren <newren@gmail.com>
---
 Documentation/git-filter-repo.txt |  37 ++++++++++-
 git-filter-repo                   | 102 +++++++++++++++++++++++++++++-
 t/t9390-filter-repo.sh            |   1 +
 t/t9393-rerun.sh                  |  93 ++++++++++++++++++++++++++-
 4 files changed, 228 insertions(+), 5 deletions(-)

diff --git a/Documentation/git-filter-repo.txt b/Documentation/git-filter-repo.txt
index 86a01ee1..47c0ad8c 100644
--- a/Documentation/git-filter-repo.txt
+++ b/Documentation/git-filter-repo.txt
@@ -405,6 +405,26 @@ references were changed.
   * An all-zeros hash, or null SHA, represents a non-existent object.
     When in the "new" column, this means the ref was removed entirely.
 
+First Changed Commits
+~~~~~~~~~~~~~~~~~~~~~
+
+The `$GIT_DIR/filter-repo/first-changed-commits` contains a list of the
+first commit(s) changed by the filtering operation.  These are the commits
+that got rewritten and which had no parents that were also rewritten.
+
+So, for example if you had commits
+  A1-B1-C1-D1-E1
+before running git-filter-repo, and afterward you had commits
+  A1-B2-C2-D2-E2
+then the First Changed Commits file would contain just one line, which
+would be the hash of B2.
+
+In most cases, there will only be one commit listed, but if you had
+multiple root commits or a non-linear history where the commits on
+those diverging histories were the first ones modified, then there
+could be multiple first changed commits and they will each be listed
+on separate lines.
+
 Already Ran
 ~~~~~~~~~~~
 
@@ -429,10 +449,25 @@ Concretely, this means:
     commit B to commit C, then the second run would have an "A C"
     entry rather than a "B C" entry for the changed commit.
 
+  * The first changed commit(s) (reported When using the
+    --sensitive-data-removal option) will be the first original commit
+    modified, not the first intermediate commit modified.
+
+    In more detail, if the repository original had the following commits:
+       A1-B1-C1-D1-E1
+    and the first invocation of filter-repo changed this to
+       A1-B1-C2-D2-E2
+    then the first run would report "C1" as the first changed commit.  If
+    a second filter-repo run further changed this to
+       A1-B1-C2-D3-E3
+    then it would report "C1" as the first changed commit, not "D2",
+    because it is comparing to the original commits rather than the
+    intermediate ones.
+
 However, if the already_ran file exists but is older than 1 day when they
 invoke git-filter-repo, the user will be prompted for whether the new run
 should be considered a continuation of the previous run.  If they do not
-answer in the affirmative, then the above two bullets will not apply.
+answer in the affirmative, then the above three bullets will not apply.
 This prompt exists because users might do a history rewrite in a repository,
 forget about it and leave the $GIT_DIR/filter-repo directory around, and
 then some months or years later need to do another rewrite.  If commits
diff --git a/git-filter-repo b/git-filter-repo
index a2f16fbe..efdef23a 100755
--- a/git-filter-repo
+++ b/git-filter-repo
@@ -4186,11 +4186,105 @@ class RepoFilter(object):
       batch_check_process.stdin.close()
       batch_check_process.wait()
 
-    return commit_renames, ref_maps
+    #
+    # Third, handle first_changes
+    #
+
+    old_first_changes = dict()
+    if already_ran:
+      # Read first_changes into old_first_changes
+      with open(os.path.join(metadata_dir, b'first-changed-commits'), 'br') as f:
+        for line in f:
+          changed_commit, undeleted_self_or_ancestor = line.strip().split()
+          old_first_changes[changed_commit] = undeleted_self_or_ancestor
+    # We need to find the commits that were modified whose parents were not.
+    # To be able to find parents, we need the commit names as of the beginning
+    # of this run, and then when we are done, we need to map them back to the
+    # name of the commits from before any git-filter-repo runs.
+    #
+    # We are excluding here any commits deleted in previous git-filter-repo
+    # runs
+    undo_old_commit_renames = dict((v,k) for (k,v) in old_commit_renames.items()
+                                   if v != deleted_hash)
+    # Get a list of all commits that were changed, as of the beginning of
+    # this latest run.
+    changed_commits = {new
+                       for (old,new) in old_commit_renames.items()
+                       if old != new and new != deleted_hash} | \
+                      {old
+                       for (old,new) in self._commit_renames.items()
+                       if old != new}
+    special_changed_commits = {old
+                               for (old,new) in old_commit_renames.items()
+                               if new == deleted_hash}
+    first_changes = dict()
+    for (old,new) in self._commit_renames.items():
+      if old == new:
+        # old wasn't modified, can't be first change if not even a change
+        continue
+      if old_commit_unrenames.get(old,old) != old:
+        # old was already modified in previous run; while it might represent
+        # something that is still a first change, we'll handle that as we
+        # loop over old_first_changes below
+        continue
+      if any(parent in changed_commits
+             for parent in self._orig_graph.get_parent_hashes(old)):
+        # a parent of old was modified, so old is not a first change
+        continue
+      # At this point, old IS a first change.  We need to find out what new
+      # commit it maps to, or if it doesn't map to one, what new commit was
+      # its most recent ancestor that wasn't pruned.
+      if new is None:
+        new = self._remap_to(old)
+      first_changes[old] = (new if new is not None else deleted_hash)
+    for (old,undeleted_self_or_ancestor) in old_first_changes.items():
+      if undeleted_self_or_ancestor == deleted_hash:
+        # old represents a commit that was pruned and whose entire ancestry
+        # was pruned.  So, old is still a first change
+        first_changes[old] = undeleted_self_or_ancestor
+        continue
+      intermediate = old_commit_renames.get(old, old)
+      usoa = undeleted_self_or_ancestor
+      new_ancestor = self._commit_renames.get(usoa, usoa)
+      if intermediate == deleted_hash:
+        # old was pruned in previous rewrite
+        if usoa != new_ancestor:
+          # old's ancestor got rewritten in this filtering run; we can drop
+          # this one from first_changes.
+          continue
+        # Getting here means old was a first change and old was pruned in a
+        # previous run, and its ancestors that survived were non rewritten in
+        # this run, so old remains a first change
+        first_changes[old] = new_ancestor # or usoa, since new_ancestor == usoa
+        continue
+      assert(usoa == intermediate) # old wasn't pruned => usoa == intermediate
+
+      # Check whether parents of intermediate were rewritten.  Note that
+      # intermediate in self._commit_renames only means that intermediate was
+      # processed by the latest filtering (not necessarily that it changed),
+      # but we need to know that before we can check for parent hashes having
+      # changed.
+      if intermediate not in self._commit_renames:
+        # This commit was not processed by this run, so it remains a first
+        # change
+        first_changes[old] = usoa
+        continue
+      if any(parent in changed_commits
+             for parent in self._orig_graph.get_parent_hashes(intermediate)):
+        # An ancestor was modified by this run, so it is no longer a first
+        # change; continue to the next one.
+        continue
+      # This change is a first_change; find the new commit its usoa maps to
+      new = self._remap_to(intermediate)
+      assert(new is not None)
+      first_changes[old] = new
+
+    return commit_renames, ref_maps, first_changes
 
   def _record_metadata(self, metadata_dir, orig_refs):
     self._flush_renames()
-    commit_renames, ref_maps = self._compute_metadata(metadata_dir, orig_refs)
+    commit_renames, ref_maps, first_changes = \
+      self._compute_metadata(metadata_dir, orig_refs)
 
     with open(os.path.join(metadata_dir, b'commit-map'), 'bw') as f:
       f.write(("%-40s %s\n" % (_("old"), _("new"))).encode())
@@ -4204,6 +4298,10 @@ class RepoFilter(object):
         (old_hash, new_hash) = hash_pair
         f.write(b'%s %s %s\n' % (old_hash, new_hash, refname))
 
+    with open(os.path.join(metadata_dir, b'first-changed-commits'), 'bw') as f:
+      for commit, undeleted_self_or_ancestor in sorted(first_changes.items()):
+        f.write(b'%s %s\n' % (commit, undeleted_self_or_ancestor))
+
     with open(os.path.join(metadata_dir, b'suboptimal-issues'), 'bw') as f:
       issues_found = False
       if self._commits_no_longer_merges:
diff --git a/t/t9390-filter-repo.sh b/t/t9390-filter-repo.sh
index e54560a9..418ac73b 100755
--- a/t/t9390-filter-repo.sh
+++ b/t/t9390-filter-repo.sh
@@ -21,6 +21,7 @@ filter_testcase() {
 		# Clean up from previous run
 		git pack-refs --all &&
 		rm .git/packed-refs &&
+		rm -rf .git/filter-repo/ &&
 
 		# Run the example
 		cat $DATA/$INPUT | git filter-repo --stdin --quiet --force --replace-refs delete-no-add "${REST[@]}" &&
diff --git a/t/t9393-rerun.sh b/t/t9393-rerun.sh
index 63af601c..60659acc 100755
--- a/t/t9393-rerun.sh
+++ b/t/t9393-rerun.sh
@@ -49,6 +49,11 @@ test_expect_success 'a re-run that is treated as a clean slate' '
 		cat sha-expect >>expect &&
 		test_cmp expect .git/filter-repo/ref-map &&
 
+		cat <<-EOF | sort >expect &&
+		${FILE_B_CHANGE} ${FILE_A_CHANGE}
+		EOF
+		test_cmp expect .git/filter-repo/first-changed-commits &&
+
 		touch -t 197001010000 .git/filter-repo/already_ran &&
 		echo no | git filter-repo --invert-paths --path fileC --force &&
 		FINAL_FILE_D_CHANGE=$(git rev-list -1 HEAD -- fileD) &&
@@ -72,7 +77,12 @@ test_expect_success 'a re-run that is treated as a clean slate' '
 		EOF
 		printf "%-40s %-40s %s\n" old new ref >expect &&
 		cat sha-expect >>expect &&
-		test_cmp expect .git/filter-repo/ref-map
+		test_cmp expect .git/filter-repo/ref-map &&
+
+		cat <<-EOF | sort >expect &&
+		${NEW_FILE_C_CHANGE} ${FILE_A_CHANGE}
+		EOF
+		test_cmp expect .git/filter-repo/first-changed-commits
 	)
 '
 
@@ -115,7 +125,13 @@ test_expect_success 'remove two files, no re-run' '
 		EOF
 		printf "%-40s %-40s %s\n" old new ref >expect &&
 		cat sha-expect >>expect &&
-		test_cmp expect .git/filter-repo/ref-map
+		test_cmp expect .git/filter-repo/ref-map &&
+
+		cat <<-EOF | sort >expect &&
+		${FIRST_ORPHAN} ${DELETED_SHA}
+		${FILE_C_CHANGE} ${FILE_B_CHANGE}
+		EOF
+		test_cmp expect .git/filter-repo/first-changed-commits
 	)
 '
 
@@ -139,6 +155,12 @@ test_expect_success 'remove two files, then remove a later file' '
 		NEW_FILE_D_CHANGE=$(git rev-list -1 HEAD -- fileD) &&
 		NEW_TAG=$(git rev-parse v1.0) &&
 
+		cat <<-EOF | sort >expect &&
+		${FIRST_ORPHAN} ${DELETED_SHA}
+		${FILE_C_CHANGE} ${FILE_B_CHANGE}
+		EOF
+		test_cmp expect .git/filter-repo/first-changed-commits &&
+
 		cat <<-EOF | sort -k 3 >sha-expect &&
 		${FILE_D_CHANGE} ${NEW_FILE_D_CHANGE} $(git symbolic-ref HEAD)
 		${FINAL_ORPHAN} ${DELETED_SHA} refs/heads/orphan-me
@@ -152,6 +174,12 @@ test_expect_success 'remove two files, then remove a later file' '
 
 		FINAL_TAG=$(git rev-parse v1.0) &&
 
+		cat <<-EOF | sort >expect &&
+		${FIRST_ORPHAN} ${DELETED_SHA}
+		${FILE_C_CHANGE} ${FILE_B_CHANGE}
+		EOF
+		test_cmp expect .git/filter-repo/first-changed-commits &&
+
 		cat <<-EOF | sort >sha-expect &&
 		${FIRST_ORPHAN} ${DELETED_SHA}
 		${FINAL_ORPHAN} ${DELETED_SHA}
@@ -194,9 +222,21 @@ test_expect_success 'remove two files, then remove a later file via --refs' '
 
 		NEW_FILE_C_CHANGE=$(git rev-list -1 HEAD -- fileC) &&
 
+		cat <<-EOF | sort >expect &&
+		${FIRST_ORPHAN} ${DELETED_SHA}
+		${FILE_B_CHANGE} ${FILE_A_CHANGE}
+		EOF
+		test_cmp expect .git/filter-repo/first-changed-commits &&
+
 		git filter-repo --invert-paths --path fileD --refs HEAD~1..HEAD &&
 		FINAL_TAG=$(git rev-parse v1.0) &&
 
+		cat <<-EOF | sort >expect &&
+		${FIRST_ORPHAN} ${DELETED_SHA}
+		${FILE_B_CHANGE} ${FILE_A_CHANGE}
+		EOF
+		test_cmp expect .git/filter-repo/first-changed-commits &&
+
 		cat <<-EOF | sort >sha-expect &&
 		${FIRST_ORPHAN} ${DELETED_SHA}
 		${FINAL_ORPHAN} ${DELETED_SHA}
@@ -242,6 +282,12 @@ test_expect_success 'remove two files, then remove an earlier file' '
 		NEW_FILE_D_CHANGE=$(git rev-list -1 HEAD -- fileD) &&
 		FINAL_TAG=$(git rev-parse v1.0) &&
 
+		cat <<-EOF | sort >expect &&
+		${FIRST_ORPHAN} ${DELETED_SHA}
+		${FILE_B_CHANGE} ${FILE_A_CHANGE}
+		EOF
+		test_cmp expect .git/filter-repo/first-changed-commits &&
+
 		cat <<-EOF | sort >sha-expect &&
 		${FIRST_ORPHAN} ${DELETED_SHA}
 		${FINAL_ORPHAN} ${DELETED_SHA}
@@ -284,10 +330,20 @@ test_expect_success 'modify a file, then remove a later file' '
 
 		NEW_FILE_C_CHANGE=$(git rev-list -1 HEAD -- fileC) &&
 
+		cat <<-EOF | sort >expect &&
+		${FILE_C_CHANGE} ${NEW_FILE_C_CHANGE}
+		EOF
+		test_cmp expect .git/filter-repo/first-changed-commits &&
+
 		git filter-repo --invert-paths --path fileD &&
 
 		FINAL_TAG=$(git rev-parse v1.0) &&
 
+		cat <<-EOF | sort >expect &&
+		${FILE_C_CHANGE} ${NEW_FILE_C_CHANGE}
+		EOF
+		test_cmp expect .git/filter-repo/first-changed-commits &&
+
 		# Make sure the fileD commit was indeed removed
 		echo $NEW_FILE_C_CHANGE >expect &&
 		git rev-parse HEAD >actual &&
@@ -337,10 +393,20 @@ test_expect_success 'modify a file, then remove a later file via --refs' '
 		NEW_FILE_B_CHANGE=$(git rev-list -1 HEAD -- fileB) &&
 		NEW_FILE_C_CHANGE=$(git rev-list -1 HEAD -- fileC) &&
 
+		cat <<-EOF | sort >expect &&
+		${FILE_B_CHANGE} ${NEW_FILE_B_CHANGE}
+		EOF
+		test_cmp expect .git/filter-repo/first-changed-commits &&
+
 		git filter-repo --invert-paths --path fileD \
 		                --refs HEAD~1..HEAD &&
 		FINAL_TAG=$(git rev-parse v1.0) &&
 
+		cat <<-EOF | sort >expect &&
+		${FILE_B_CHANGE} ${NEW_FILE_B_CHANGE}
+		EOF
+		test_cmp expect .git/filter-repo/first-changed-commits &&
+
 		# Make sure the fileD commit was indeed removed
 		git rev-parse HEAD^ >expect &&
 		echo ${NEW_FILE_B_CHANGE} >actual &&
@@ -386,12 +452,24 @@ test_expect_success 'modify a file, then remove an earlier file' '
 		git filter-repo --force \
 		    --replace-text <(echo "file 3 contents==>Alternate C") &&
 
+		NEW_FILE_C_CHANGE=$(git rev-list -1 HEAD -- fileC) &&
+
+		cat <<-EOF | sort >expect &&
+		${FILE_C_CHANGE} ${NEW_FILE_C_CHANGE}
+		EOF
+		test_cmp expect .git/filter-repo/first-changed-commits &&
+
 		git filter-repo --invert-paths --path fileB &&
 
 		NEW_FILE_C_CHANGE=$(git rev-list -1 HEAD -- fileC) &&
 		NEW_FILE_D_CHANGE=$(git rev-list -1 HEAD -- fileD) &&
 		FINAL_TAG=$(git rev-parse v1.0) &&
 
+		cat <<-EOF | sort >expect &&
+		${FILE_B_CHANGE} ${FILE_A_CHANGE}
+		EOF
+		test_cmp expect .git/filter-repo/first-changed-commits &&
+
 		cat <<-EOF | sort >sha-expect &&
 		${FIRST_ORPHAN} ${FIRST_ORPHAN}
 		${FINAL_ORPHAN} ${FINAL_ORPHAN}
@@ -433,9 +511,20 @@ test_expect_success 'use --refs heavily with a rerun' '
 
 		NEW_FINAL_ORPHAN=$(git rev-list -1 orphan-me) &&
 
+		cat <<-EOF | sort >expect &&
+		${FINAL_ORPHAN} ${NEW_FINAL_ORPHAN}
+		EOF
+		test_cmp expect .git/filter-repo/first-changed-commits &&
+
 		git filter-repo --refs $(git symbolic-ref HEAD) \
 		    --invert-paths --path fileD &&
 
+		cat <<-EOF | sort >expect &&
+		${FINAL_ORPHAN} ${NEW_FINAL_ORPHAN}
+		${FILE_D_CHANGE} ${FILE_C_CHANGE}
+		EOF
+		test_cmp expect .git/filter-repo/first-changed-commits &&
+
 		cat <<-EOF | sort >sha-expect &&
 		${FIRST_ORPHAN} ${FIRST_ORPHAN}
 		${FINAL_ORPHAN} ${NEW_FINAL_ORPHAN}