From b2c304d3d7f748caf9715382badd6af3a89d1319 Mon Sep 17 00:00:00 2001 From: Martin Necas Date: Tue, 3 Dec 2024 15:05:58 +0100 Subject: [PATCH] MTV-1717 | Wait for DV status Issue: When CDI is managing lot of DVs at once it can take some time before the CDI reconciles the DVs status and start the import. In the function `updateCopyProgress` we check the DV status and if it is paused we do continue and as next phase we have snapshot removal. So if the DV is still in paused state and we start the cutover and MTV skips over the phase thinking it already finished as the DV is in paused. Fix: Add WaitForDataVolumesStatus phase to wait until the DVs do not have the paused status. Ref: https://issues.redhat.com/browse/MTV-1717 https://github.com/kubev2v/forklift/blob/ea83264881b3dcd4a88dd627c1ca75da2483f408/pkg/controller/plan/migration.go#L1573-L1575 Signed-off-by: Martin Necas --- .../forkliftcontroller/defaults/main.yml | 1 + .../controller/deployment-controller.yml.j2 | 4 + pkg/controller/plan/migration.go | 131 +++++++++++++----- pkg/settings/migration.go | 6 + 4 files changed, 106 insertions(+), 36 deletions(-) diff --git a/operator/roles/forkliftcontroller/defaults/main.yml b/operator/roles/forkliftcontroller/defaults/main.yml index 80ba5b46e..92db25c0f 100644 --- a/operator/roles/forkliftcontroller/defaults/main.yml +++ b/operator/roles/forkliftcontroller/defaults/main.yml @@ -34,6 +34,7 @@ controller_precopy_interval: 60 controller_snapshot_removal_timeout_minuts: 120 controller_snapshot_status_check_rate_seconds: 10 controller_cleanup_retries: 10 +controller_dv_status_check_retries: 10 controller_vsphere_incremental_backup: true controller_ovirt_warm_migration: true controller_retain_precopy_importer_pods: false diff --git a/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 b/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 index 41963ee19..84cc6d480 100644 --- a/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 +++ b/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 @@ -81,6 +81,10 @@ spec: - name: CLEANUP_RETRIES value: "{{ controller_cleanup_retries }}" {% endif %} +{% if controller_dv_status_check_retries is number %} + - name: DV_STATUS_CHECK_RETRIES + value: "{{ controller_dv_status_check_retries }}" +{% endif %} {% if controller_max_vm_inflight is number %} - name: MAX_VM_INFLIGHT value: "{{ controller_max_vm_inflight }}" diff --git a/pkg/controller/plan/migration.go b/pkg/controller/plan/migration.go index 4ef55f4cb..0a7c7878b 100644 --- a/pkg/controller/plan/migration.go +++ b/pkg/controller/plan/migration.go @@ -55,36 +55,38 @@ var ( // Phases. const ( - Started = "Started" - PreHook = "PreHook" - StorePowerState = "StorePowerState" - PowerOffSource = "PowerOffSource" - WaitForPowerOff = "WaitForPowerOff" - CreateDataVolumes = "CreateDataVolumes" - CreateVM = "CreateVM" - CopyDisks = "CopyDisks" - AllocateDisks = "AllocateDisks" - CopyingPaused = "CopyingPaused" - AddCheckpoint = "AddCheckpoint" - AddFinalCheckpoint = "AddFinalCheckpoint" - CreateSnapshot = "CreateSnapshot" - CreateInitialSnapshot = "CreateInitialSnapshot" - CreateFinalSnapshot = "CreateFinalSnapshot" - Finalize = "Finalize" - CreateGuestConversionPod = "CreateGuestConversionPod" - ConvertGuest = "ConvertGuest" - CopyDisksVirtV2V = "CopyDisksVirtV2V" - PostHook = "PostHook" - Completed = "Completed" - WaitForSnapshot = "WaitForSnapshot" - WaitForInitialSnapshot = "WaitForInitialSnapshot" - WaitForFinalSnapshot = "WaitForFinalSnapshot" - ConvertOpenstackSnapshot = "ConvertOpenstackSnapshot" - StoreSnapshotDeltas = "StoreSnapshotDeltas" - StoreInitialSnapshotDeltas = "StoreInitialSnapshotDeltas" - RemovePreviousSnapshot = "RemovePreviousSnapshot" - RemovePenultimateSnapshot = "RemovePenultimateSnapshot" - RemoveFinalSnapshot = "RemoveFinalSnapshot" + Started = "Started" + PreHook = "PreHook" + StorePowerState = "StorePowerState" + PowerOffSource = "PowerOffSource" + WaitForPowerOff = "WaitForPowerOff" + CreateDataVolumes = "CreateDataVolumes" + WaitForDataVolumesStatus = "WaitForDataVolumesStatus" + WaitForFinalDataVolumesStatus = "WaitForFinalDataVolumesStatus" + CreateVM = "CreateVM" + CopyDisks = "CopyDisks" + AllocateDisks = "AllocateDisks" + CopyingPaused = "CopyingPaused" + AddCheckpoint = "AddCheckpoint" + AddFinalCheckpoint = "AddFinalCheckpoint" + CreateSnapshot = "CreateSnapshot" + CreateInitialSnapshot = "CreateInitialSnapshot" + CreateFinalSnapshot = "CreateFinalSnapshot" + Finalize = "Finalize" + CreateGuestConversionPod = "CreateGuestConversionPod" + ConvertGuest = "ConvertGuest" + CopyDisksVirtV2V = "CopyDisksVirtV2V" + PostHook = "PostHook" + Completed = "Completed" + WaitForSnapshot = "WaitForSnapshot" + WaitForInitialSnapshot = "WaitForInitialSnapshot" + WaitForFinalSnapshot = "WaitForFinalSnapshot" + ConvertOpenstackSnapshot = "ConvertOpenstackSnapshot" + StoreSnapshotDeltas = "StoreSnapshotDeltas" + StoreInitialSnapshotDeltas = "StoreInitialSnapshotDeltas" + RemovePreviousSnapshot = "RemovePreviousSnapshot" + RemovePenultimateSnapshot = "RemovePenultimateSnapshot" + RemoveFinalSnapshot = "RemoveFinalSnapshot" ) // Steps. @@ -100,8 +102,9 @@ const ( ) const ( - TransferCompleted = "Transfer completed." - PopulatorPodPrefix = "populate-" + TransferCompleted = "Transfer completed." + PopulatorPodPrefix = "populate-" + DvStatusCheckRetriesAnnotation = "dvStatusCheckRetries" ) var ( @@ -134,6 +137,7 @@ var ( {Name: WaitForInitialSnapshot}, {Name: StoreInitialSnapshotDeltas, All: VSphere}, {Name: CreateDataVolumes}, + {Name: WaitForDataVolumesStatus}, {Name: CopyDisks}, {Name: CopyingPaused}, {Name: RemovePreviousSnapshot, All: VSphere}, @@ -148,6 +152,7 @@ var ( {Name: CreateFinalSnapshot}, {Name: WaitForFinalSnapshot}, {Name: AddFinalCheckpoint}, + {Name: WaitForFinalDataVolumesStatus}, {Name: Finalize}, {Name: RemoveFinalSnapshot, All: VSphere}, {Name: CreateGuestConversionPod, All: RequiresConversion}, @@ -662,9 +667,9 @@ func (r *Migration) step(vm *plan.VMStatus) (step string) { step = Initialize case AllocateDisks: step = DiskAllocation - case CopyDisks, CopyingPaused, RemovePreviousSnapshot, CreateSnapshot, WaitForSnapshot, StoreSnapshotDeltas, AddCheckpoint, ConvertOpenstackSnapshot: + case CopyDisks, CopyingPaused, RemovePreviousSnapshot, CreateSnapshot, WaitForSnapshot, StoreSnapshotDeltas, AddCheckpoint, ConvertOpenstackSnapshot, WaitForDataVolumesStatus: step = DiskTransfer - case RemovePenultimateSnapshot, CreateFinalSnapshot, WaitForFinalSnapshot, AddFinalCheckpoint, Finalize, RemoveFinalSnapshot: + case RemovePenultimateSnapshot, CreateFinalSnapshot, WaitForFinalSnapshot, AddFinalCheckpoint, Finalize, RemoveFinalSnapshot, WaitForFinalDataVolumesStatus: step = Cutover case CreateGuestConversionPod, ConvertGuest: step = ImageConversion @@ -1039,6 +1044,51 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) { if ready { vm.Phase = r.next(vm.Phase) } + case WaitForDataVolumesStatus, WaitForFinalDataVolumesStatus: + step, found := vm.FindStep(r.step(vm)) + if !found { + vm.AddError(fmt.Sprintf("Step '%s' not found", r.step(vm))) + break + } + + dvs, err := r.kubevirt.getDVs(vm) + if err != nil { + step.AddError(err.Error()) + err = nil + break + } + if !r.hasPausedDv(dvs) { + vm.Phase = r.next(vm.Phase) + // Reset for next precopy + step.Annotations[DvStatusCheckRetriesAnnotation] = "1" + } else { + var retries int + retriesAnnotation := step.Annotations[DvStatusCheckRetriesAnnotation] + if retriesAnnotation == "" { + step.Annotations[DvStatusCheckRetriesAnnotation] = "1" + } else { + retries, err = strconv.Atoi(retriesAnnotation) + if err != nil { + step.AddError(err.Error()) + err = nil + break + } + if retries >= settings.Settings.DvStatusCheckRetries { + // Do not fail the step as this can happen when the user runs the warm migration but the VM is already shutdown + // In that case we don't create any delta and don't change the CDI DV status. + r.Log.Info( + "DataVolume status check exceeded the retry limit."+ + "If this causes the problems with the snapshot removal in the CDI please bump the controller_dv_status_check_retries.", + "vm", + vm.String()) + vm.Phase = r.next(vm.Phase) + // Reset for next precopy + step.Annotations[DvStatusCheckRetriesAnnotation] = "1" + } else { + step.Annotations[DvStatusCheckRetriesAnnotation] = strconv.Itoa(retries + 1) + } + } + } case StoreInitialSnapshotDeltas, StoreSnapshotDeltas: step, found := vm.FindStep(r.step(vm)) if !found { @@ -1073,9 +1123,9 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) { switch vm.Phase { case AddCheckpoint: - vm.Phase = CopyDisks + vm.Phase = WaitForDataVolumesStatus case AddFinalCheckpoint: - vm.Phase = Finalize + vm.Phase = WaitForFinalDataVolumesStatus } case StorePowerState: step, found := vm.FindStep(r.step(vm)) @@ -1259,6 +1309,15 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) { return } +func (r *Migration) hasPausedDv(dvs []ExtendedDataVolume) bool { + for _, dv := range dvs { + if dv.Status.Phase == Paused { + return true + } + } + return false +} + func (r *Migration) resetPrecopyTasks(vm *plan.VMStatus, step *plan.Step) { step.Completed = nil for _, task := range step.Tasks { diff --git a/pkg/settings/migration.go b/pkg/settings/migration.go index 70413e400..0b8096156 100644 --- a/pkg/settings/migration.go +++ b/pkg/settings/migration.go @@ -24,6 +24,7 @@ const ( FileSystemOverhead = "FILESYSTEM_OVERHEAD" BlockOverhead = "BLOCK_OVERHEAD" CleanupRetries = "CLEANUP_RETRIES" + DvStatusCheckRetries = "DV_STATUS_CHECK_RETRIES" OvirtOsConfigMap = "OVIRT_OS_MAP" VsphereOsConfigMap = "VSPHERE_OS_MAP" VirtCustomizeConfigMap = "VIRT_CUSTOMIZE_MAP" @@ -58,6 +59,8 @@ type Migration struct { BlockOverhead int64 // Cleanup retries CleanupRetries int + // DvStatusCheckRetries retries + DvStatusCheckRetries int // oVirt OS config map name OvirtOsConfigMap string // vSphere OS config map name @@ -100,6 +103,9 @@ func (r *Migration) Load() (err error) { if r.CleanupRetries, err = getPositiveEnvLimit(CleanupRetries, 10); err != nil { return liberr.Wrap(err) } + if r.DvStatusCheckRetries, err = getPositiveEnvLimit(DvStatusCheckRetries, 10); err != nil { + return liberr.Wrap(err) + } if virtV2vImage, ok := os.LookupEnv(VirtV2vImage); ok { r.VirtV2vImage = virtV2vImage } else if Settings.Role.Has(MainRole) {