Skip to content

Commit

Permalink
Merge pull request #8637 from reasonerjt/rm-leaked-vs
Browse files Browse the repository at this point in the history
Clean up leaked CSI snapshot for incomplete backup
  • Loading branch information
reasonerjt authored Jan 23, 2025
2 parents 1e54f1c + 1c37289 commit bedea9c
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 4 deletions.
1 change: 1 addition & 0 deletions changelogs/unreleased/8637-raesonerjt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Clean up leaked CSI snapshot for incomplete backup
30 changes: 28 additions & 2 deletions pkg/controller/backup_deletion_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ import (
"fmt"
"time"

"github.com/vmware-tanzu/velero/pkg/util/csi"

jsonpatch "github.com/evanphx/json-patch/v5"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
Expand All @@ -35,6 +37,8 @@ import (
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v7/apis/volumesnapshot/v1"

"github.com/vmware-tanzu/velero/internal/credentials"
"github.com/vmware-tanzu/velero/internal/delete"
"github.com/vmware-tanzu/velero/internal/volume"
Expand Down Expand Up @@ -251,12 +255,18 @@ func (r *backupDeletionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
}
// don't defer CleanupClients here, since it was already called above.

var errs []string

if len(actions) > 0 {
// Download the tarball
backupFile, err := downloadToTempFile(backup.Name, backupStore, log)

if err != nil {
log.WithError(err).Errorf("Unable to download tarball for backup %s, skipping associated DeleteItemAction plugins", backup.Name)
log.Info("Cleaning up CSI volumesnapshots")
if err := r.deleteCSIVolumeSnapshots(ctx, backup, log); err != nil {
errs = append(errs, err.Error())
}
} else {
defer closeAndRemoveFile(backupFile, r.logger)
deleteCtx := &delete.Context{
Expand All @@ -279,8 +289,6 @@ func (r *backupDeletionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
}
}

var errs []string

if backupStore != nil {
log.Info("Removing PV snapshots")

Expand Down Expand Up @@ -495,6 +503,24 @@ func (r *backupDeletionReconciler) deleteExistingDeletionRequests(ctx context.Co
return errs
}

// deleteCSIVolumeSnapshots clean up the CSI snapshots created by the backup, this should be called when the backup is failed
// when it's running, e.g. due to velero pod restart, and the backup.tar is failed to be downloaded from storage.
func (r *backupDeletionReconciler) deleteCSIVolumeSnapshots(ctx context.Context, backup *velerov1api.Backup, log logrus.FieldLogger) error {
vsList := snapshotv1api.VolumeSnapshotList{}
if err := r.Client.List(ctx, &vsList, &client.ListOptions{
LabelSelector: labels.SelectorFromSet(map[string]string{
velerov1api.BackupNameLabel: label.GetValidName(backup.Name),
}),
}); err != nil {
return errors.Wrap(err, "error listing volume snapshots")
}
for _, item := range vsList.Items {
vs := item
csi.CleanupVolumeSnapshot(&vs, r.Client, log)
}
return nil
}

func (r *backupDeletionReconciler) deletePodVolumeSnapshots(ctx context.Context, backup *velerov1api.Backup) []error {
if r.repoMgr == nil {
return nil
Expand Down
17 changes: 16 additions & 1 deletion pkg/controller/backup_deletion_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ import (
"reflect"
"time"

snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v7/apis/volumesnapshot/v1"

"context"

"github.com/sirupsen/logrus"
Expand Down Expand Up @@ -609,7 +611,13 @@ func TestBackupDeletionControllerReconcile(t *testing.T) {
Provider: "provider-1",
},
}
td := setupBackupDeletionControllerTest(t, defaultTestDbr(), backup, location, snapshotLocation)

csiSnapshot := builder.ForVolumeSnapshot("user-ns", "vs-1").ObjectMeta(
builder.WithLabelsMap(map[string]string{
"velero.io/backup-name": "foo",
})).SourcePVC("some-pvc").Result()

td := setupBackupDeletionControllerTest(t, defaultTestDbr(), backup, location, snapshotLocation, csiSnapshot)
td.volumeSnapshotter.SnapshotsTaken.Insert("snap-1")

snapshots := []*volume.Snapshot{
Expand Down Expand Up @@ -654,6 +662,13 @@ func TestBackupDeletionControllerReconcile(t *testing.T) {
}, &velerov1api.Backup{})
assert.True(t, apierrors.IsNotFound(err), "Expected not found error, but actual value of error: %v", err)

// leaked CSI snapshot should be deleted
err = td.fakeClient.Get(context.TODO(), types.NamespacedName{
Namespace: "user-ns",
Name: "vs-1",
}, &snapshotv1api.VolumeSnapshot{})
assert.True(t, apierrors.IsNotFound(err), "Expected not found error for the leaked CSI snapshot, but actual value of error: %v", err)

// Make sure snapshot was deleted
assert.Equal(t, 0, td.volumeSnapshotter.SnapshotsTaken.Len())
})
Expand Down
5 changes: 4 additions & 1 deletion pkg/util/csi/volume_snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,8 @@ func SetVolumeSnapshotContentDeletionPolicy(
return crClient.Patch(context.TODO(), vsc, crclient.MergeFrom(originVSC))
}

// CleanupVolumeSnapshot deletes the VolumeSnapshot and the associated VolumeSnapshotContent. It will make sure the
// physical snapshot is also deleted.
func CleanupVolumeSnapshot(
volSnap *snapshotv1api.VolumeSnapshot,
crClient crclient.Client,
Expand Down Expand Up @@ -528,7 +530,8 @@ func CleanupVolumeSnapshot(
}
}

// DeleteVolumeSnapshot handles the VolumeSnapshot instance deletion.
// DeleteVolumeSnapshot handles the VolumeSnapshot instance deletion. It will make sure the VolumeSnapshotContent is
// recreated so that the physical snapshot is retained.
func DeleteVolumeSnapshot(
vs snapshotv1api.VolumeSnapshot,
vsc snapshotv1api.VolumeSnapshotContent,
Expand Down

0 comments on commit bedea9c

Please sign in to comment.