diff --git a/operator/roles/forkliftcontroller/defaults/main.yml b/operator/roles/forkliftcontroller/defaults/main.yml index 80ba5b46e..92db25c0f 100644 --- a/operator/roles/forkliftcontroller/defaults/main.yml +++ b/operator/roles/forkliftcontroller/defaults/main.yml @@ -34,6 +34,7 @@ controller_precopy_interval: 60 controller_snapshot_removal_timeout_minuts: 120 controller_snapshot_status_check_rate_seconds: 10 controller_cleanup_retries: 10 +controller_dv_status_check_retries: 10 controller_vsphere_incremental_backup: true controller_ovirt_warm_migration: true controller_retain_precopy_importer_pods: false diff --git a/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 b/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 index 41963ee19..84cc6d480 100644 --- a/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 +++ b/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 @@ -81,6 +81,10 @@ spec: - name: CLEANUP_RETRIES value: "{{ controller_cleanup_retries }}" {% endif %} +{% if controller_dv_status_check_retries is number %} + - name: DV_STATUS_CHECK_RETRIES + value: "{{ controller_dv_status_check_retries }}" +{% endif %} {% if controller_max_vm_inflight is number %} - name: MAX_VM_INFLIGHT value: "{{ controller_max_vm_inflight }}" diff --git a/pkg/controller/plan/migration.go b/pkg/controller/plan/migration.go index 4ef55f4cb..0a7c7878b 100644 --- a/pkg/controller/plan/migration.go +++ b/pkg/controller/plan/migration.go @@ -55,36 +55,38 @@ var ( // Phases. const ( - Started = "Started" - PreHook = "PreHook" - StorePowerState = "StorePowerState" - PowerOffSource = "PowerOffSource" - WaitForPowerOff = "WaitForPowerOff" - CreateDataVolumes = "CreateDataVolumes" - CreateVM = "CreateVM" - CopyDisks = "CopyDisks" - AllocateDisks = "AllocateDisks" - CopyingPaused = "CopyingPaused" - AddCheckpoint = "AddCheckpoint" - AddFinalCheckpoint = "AddFinalCheckpoint" - CreateSnapshot = "CreateSnapshot" - CreateInitialSnapshot = "CreateInitialSnapshot" - CreateFinalSnapshot = "CreateFinalSnapshot" - Finalize = "Finalize" - CreateGuestConversionPod = "CreateGuestConversionPod" - ConvertGuest = "ConvertGuest" - CopyDisksVirtV2V = "CopyDisksVirtV2V" - PostHook = "PostHook" - Completed = "Completed" - WaitForSnapshot = "WaitForSnapshot" - WaitForInitialSnapshot = "WaitForInitialSnapshot" - WaitForFinalSnapshot = "WaitForFinalSnapshot" - ConvertOpenstackSnapshot = "ConvertOpenstackSnapshot" - StoreSnapshotDeltas = "StoreSnapshotDeltas" - StoreInitialSnapshotDeltas = "StoreInitialSnapshotDeltas" - RemovePreviousSnapshot = "RemovePreviousSnapshot" - RemovePenultimateSnapshot = "RemovePenultimateSnapshot" - RemoveFinalSnapshot = "RemoveFinalSnapshot" + Started = "Started" + PreHook = "PreHook" + StorePowerState = "StorePowerState" + PowerOffSource = "PowerOffSource" + WaitForPowerOff = "WaitForPowerOff" + CreateDataVolumes = "CreateDataVolumes" + WaitForDataVolumesStatus = "WaitForDataVolumesStatus" + WaitForFinalDataVolumesStatus = "WaitForFinalDataVolumesStatus" + CreateVM = "CreateVM" + CopyDisks = "CopyDisks" + AllocateDisks = "AllocateDisks" + CopyingPaused = "CopyingPaused" + AddCheckpoint = "AddCheckpoint" + AddFinalCheckpoint = "AddFinalCheckpoint" + CreateSnapshot = "CreateSnapshot" + CreateInitialSnapshot = "CreateInitialSnapshot" + CreateFinalSnapshot = "CreateFinalSnapshot" + Finalize = "Finalize" + CreateGuestConversionPod = "CreateGuestConversionPod" + ConvertGuest = "ConvertGuest" + CopyDisksVirtV2V = "CopyDisksVirtV2V" + PostHook = "PostHook" + Completed = "Completed" + WaitForSnapshot = "WaitForSnapshot" + WaitForInitialSnapshot = "WaitForInitialSnapshot" + WaitForFinalSnapshot = "WaitForFinalSnapshot" + ConvertOpenstackSnapshot = "ConvertOpenstackSnapshot" + StoreSnapshotDeltas = "StoreSnapshotDeltas" + StoreInitialSnapshotDeltas = "StoreInitialSnapshotDeltas" + RemovePreviousSnapshot = "RemovePreviousSnapshot" + RemovePenultimateSnapshot = "RemovePenultimateSnapshot" + RemoveFinalSnapshot = "RemoveFinalSnapshot" ) // Steps. @@ -100,8 +102,9 @@ const ( ) const ( - TransferCompleted = "Transfer completed." - PopulatorPodPrefix = "populate-" + TransferCompleted = "Transfer completed." + PopulatorPodPrefix = "populate-" + DvStatusCheckRetriesAnnotation = "dvStatusCheckRetries" ) var ( @@ -134,6 +137,7 @@ var ( {Name: WaitForInitialSnapshot}, {Name: StoreInitialSnapshotDeltas, All: VSphere}, {Name: CreateDataVolumes}, + {Name: WaitForDataVolumesStatus}, {Name: CopyDisks}, {Name: CopyingPaused}, {Name: RemovePreviousSnapshot, All: VSphere}, @@ -148,6 +152,7 @@ var ( {Name: CreateFinalSnapshot}, {Name: WaitForFinalSnapshot}, {Name: AddFinalCheckpoint}, + {Name: WaitForFinalDataVolumesStatus}, {Name: Finalize}, {Name: RemoveFinalSnapshot, All: VSphere}, {Name: CreateGuestConversionPod, All: RequiresConversion}, @@ -662,9 +667,9 @@ func (r *Migration) step(vm *plan.VMStatus) (step string) { step = Initialize case AllocateDisks: step = DiskAllocation - case CopyDisks, CopyingPaused, RemovePreviousSnapshot, CreateSnapshot, WaitForSnapshot, StoreSnapshotDeltas, AddCheckpoint, ConvertOpenstackSnapshot: + case CopyDisks, CopyingPaused, RemovePreviousSnapshot, CreateSnapshot, WaitForSnapshot, StoreSnapshotDeltas, AddCheckpoint, ConvertOpenstackSnapshot, WaitForDataVolumesStatus: step = DiskTransfer - case RemovePenultimateSnapshot, CreateFinalSnapshot, WaitForFinalSnapshot, AddFinalCheckpoint, Finalize, RemoveFinalSnapshot: + case RemovePenultimateSnapshot, CreateFinalSnapshot, WaitForFinalSnapshot, AddFinalCheckpoint, Finalize, RemoveFinalSnapshot, WaitForFinalDataVolumesStatus: step = Cutover case CreateGuestConversionPod, ConvertGuest: step = ImageConversion @@ -1039,6 +1044,51 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) { if ready { vm.Phase = r.next(vm.Phase) } + case WaitForDataVolumesStatus, WaitForFinalDataVolumesStatus: + step, found := vm.FindStep(r.step(vm)) + if !found { + vm.AddError(fmt.Sprintf("Step '%s' not found", r.step(vm))) + break + } + + dvs, err := r.kubevirt.getDVs(vm) + if err != nil { + step.AddError(err.Error()) + err = nil + break + } + if !r.hasPausedDv(dvs) { + vm.Phase = r.next(vm.Phase) + // Reset for next precopy + step.Annotations[DvStatusCheckRetriesAnnotation] = "1" + } else { + var retries int + retriesAnnotation := step.Annotations[DvStatusCheckRetriesAnnotation] + if retriesAnnotation == "" { + step.Annotations[DvStatusCheckRetriesAnnotation] = "1" + } else { + retries, err = strconv.Atoi(retriesAnnotation) + if err != nil { + step.AddError(err.Error()) + err = nil + break + } + if retries >= settings.Settings.DvStatusCheckRetries { + // Do not fail the step as this can happen when the user runs the warm migration but the VM is already shutdown + // In that case we don't create any delta and don't change the CDI DV status. + r.Log.Info( + "DataVolume status check exceeded the retry limit."+ + "If this causes the problems with the snapshot removal in the CDI please bump the controller_dv_status_check_retries.", + "vm", + vm.String()) + vm.Phase = r.next(vm.Phase) + // Reset for next precopy + step.Annotations[DvStatusCheckRetriesAnnotation] = "1" + } else { + step.Annotations[DvStatusCheckRetriesAnnotation] = strconv.Itoa(retries + 1) + } + } + } case StoreInitialSnapshotDeltas, StoreSnapshotDeltas: step, found := vm.FindStep(r.step(vm)) if !found { @@ -1073,9 +1123,9 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) { switch vm.Phase { case AddCheckpoint: - vm.Phase = CopyDisks + vm.Phase = WaitForDataVolumesStatus case AddFinalCheckpoint: - vm.Phase = Finalize + vm.Phase = WaitForFinalDataVolumesStatus } case StorePowerState: step, found := vm.FindStep(r.step(vm)) @@ -1259,6 +1309,15 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) { return } +func (r *Migration) hasPausedDv(dvs []ExtendedDataVolume) bool { + for _, dv := range dvs { + if dv.Status.Phase == Paused { + return true + } + } + return false +} + func (r *Migration) resetPrecopyTasks(vm *plan.VMStatus, step *plan.Step) { step.Completed = nil for _, task := range step.Tasks { diff --git a/pkg/settings/migration.go b/pkg/settings/migration.go index 70413e400..0b8096156 100644 --- a/pkg/settings/migration.go +++ b/pkg/settings/migration.go @@ -24,6 +24,7 @@ const ( FileSystemOverhead = "FILESYSTEM_OVERHEAD" BlockOverhead = "BLOCK_OVERHEAD" CleanupRetries = "CLEANUP_RETRIES" + DvStatusCheckRetries = "DV_STATUS_CHECK_RETRIES" OvirtOsConfigMap = "OVIRT_OS_MAP" VsphereOsConfigMap = "VSPHERE_OS_MAP" VirtCustomizeConfigMap = "VIRT_CUSTOMIZE_MAP" @@ -58,6 +59,8 @@ type Migration struct { BlockOverhead int64 // Cleanup retries CleanupRetries int + // DvStatusCheckRetries retries + DvStatusCheckRetries int // oVirt OS config map name OvirtOsConfigMap string // vSphere OS config map name @@ -100,6 +103,9 @@ func (r *Migration) Load() (err error) { if r.CleanupRetries, err = getPositiveEnvLimit(CleanupRetries, 10); err != nil { return liberr.Wrap(err) } + if r.DvStatusCheckRetries, err = getPositiveEnvLimit(DvStatusCheckRetries, 10); err != nil { + return liberr.Wrap(err) + } if virtV2vImage, ok := os.LookupEnv(VirtV2vImage); ok { r.VirtV2vImage = virtV2vImage } else if Settings.Role.Has(MainRole) {