Skip to content

Commit

Permalink
MTV-1753 | Add wait for snapshot removal timeout
Browse files Browse the repository at this point in the history
Issue: The migration fails with `Another task is already in progress`.
This is caused because of the snapshot creation is started right after
snapshot removal and the snapshot removal did not finish due to the
consolidation.

Fix: I have added a timeout which users can control using the `controller_snapshot_removal_check_retries`
variable. This will wait some time right after snapshot removal.
This is just a temporary solution to unblock the users. I'll fix it
properly using the VM events and tasks, but that will require a lot of
changes in the controller inventory and main.

Ref: https://issues.redhat.com/browse/MTV-1753

Signed-off-by: Martin Necas <[email protected]>
  • Loading branch information
mnecas committed Dec 9, 2024
1 parent 2ecf2a2 commit 1575b38
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 53 deletions.
1 change: 1 addition & 0 deletions operator/roles/forkliftcontroller/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ controller_snapshot_removal_timeout_minuts: 120
controller_snapshot_status_check_rate_seconds: 10
controller_cleanup_retries: 10
controller_dv_status_check_retries: 10
controller_snapshot_removal_check_retries: 20
controller_vsphere_incremental_backup: true
controller_ovirt_warm_migration: true
controller_retain_precopy_importer_pods: false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,10 @@ spec:
- name: DV_STATUS_CHECK_RETRIES
value: "{{ controller_dv_status_check_retries }}"
{% endif %}
{% if controller_snapshot_removal_check_retries is number %}
- name: SNAPSHOT_REMOVAL_CHECK_RETRIES
value: "{{ controller_snapshot_removal_check_retries }}"
{% endif %}
{% if controller_max_vm_inflight is number %}
- name: MAX_VM_INFLIGHT
value: "{{ controller_max_vm_inflight }}"
Expand Down
103 changes: 69 additions & 34 deletions pkg/controller/plan/migration.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,38 +55,41 @@ var (

// Phases.
const (
Started = "Started"
PreHook = "PreHook"
StorePowerState = "StorePowerState"
PowerOffSource = "PowerOffSource"
WaitForPowerOff = "WaitForPowerOff"
CreateDataVolumes = "CreateDataVolumes"
WaitForDataVolumesStatus = "WaitForDataVolumesStatus"
WaitForFinalDataVolumesStatus = "WaitForFinalDataVolumesStatus"
CreateVM = "CreateVM"
CopyDisks = "CopyDisks"
AllocateDisks = "AllocateDisks"
CopyingPaused = "CopyingPaused"
AddCheckpoint = "AddCheckpoint"
AddFinalCheckpoint = "AddFinalCheckpoint"
CreateSnapshot = "CreateSnapshot"
CreateInitialSnapshot = "CreateInitialSnapshot"
CreateFinalSnapshot = "CreateFinalSnapshot"
Finalize = "Finalize"
CreateGuestConversionPod = "CreateGuestConversionPod"
ConvertGuest = "ConvertGuest"
CopyDisksVirtV2V = "CopyDisksVirtV2V"
PostHook = "PostHook"
Completed = "Completed"
WaitForSnapshot = "WaitForSnapshot"
WaitForInitialSnapshot = "WaitForInitialSnapshot"
WaitForFinalSnapshot = "WaitForFinalSnapshot"
ConvertOpenstackSnapshot = "ConvertOpenstackSnapshot"
StoreSnapshotDeltas = "StoreSnapshotDeltas"
StoreInitialSnapshotDeltas = "StoreInitialSnapshotDeltas"
RemovePreviousSnapshot = "RemovePreviousSnapshot"
RemovePenultimateSnapshot = "RemovePenultimateSnapshot"
RemoveFinalSnapshot = "RemoveFinalSnapshot"
Started = "Started"
PreHook = "PreHook"
StorePowerState = "StorePowerState"
PowerOffSource = "PowerOffSource"
WaitForPowerOff = "WaitForPowerOff"
CreateDataVolumes = "CreateDataVolumes"
WaitForDataVolumesStatus = "WaitForDataVolumesStatus"
WaitForFinalDataVolumesStatus = "WaitForFinalDataVolumesStatus"
CreateVM = "CreateVM"
CopyDisks = "CopyDisks"
AllocateDisks = "AllocateDisks"
CopyingPaused = "CopyingPaused"
AddCheckpoint = "AddCheckpoint"
AddFinalCheckpoint = "AddFinalCheckpoint"
CreateSnapshot = "CreateSnapshot"
CreateInitialSnapshot = "CreateInitialSnapshot"
CreateFinalSnapshot = "CreateFinalSnapshot"
Finalize = "Finalize"
CreateGuestConversionPod = "CreateGuestConversionPod"
ConvertGuest = "ConvertGuest"
CopyDisksVirtV2V = "CopyDisksVirtV2V"
PostHook = "PostHook"
Completed = "Completed"
WaitForSnapshot = "WaitForSnapshot"
WaitForInitialSnapshot = "WaitForInitialSnapshot"
WaitForFinalSnapshot = "WaitForFinalSnapshot"
ConvertOpenstackSnapshot = "ConvertOpenstackSnapshot"
StoreSnapshotDeltas = "StoreSnapshotDeltas"
StoreInitialSnapshotDeltas = "StoreInitialSnapshotDeltas"
RemovePreviousSnapshot = "RemovePreviousSnapshot"
RemovePenultimateSnapshot = "RemovePenultimateSnapshot"
RemoveFinalSnapshot = "RemoveFinalSnapshot"
WaitForFinalSnapshotRemoval = "WaitForFinalSnapshotRemoval"
WaitForPreviousSnapshotRemoval = "WaitForPreviousSnapshotRemoval"
WaitForPenultimateSnapshotRemoval = "WaitForPenultimateSnapshotRemoval"
)

// Steps.
Expand All @@ -105,6 +108,7 @@ const (
TransferCompleted = "Transfer completed."
PopulatorPodPrefix = "populate-"
DvStatusCheckRetriesAnnotation = "dvStatusCheckRetries"
SnapshotRemovalCheckRetries = "snapshotRemovalCheckRetries"
)

var (
Expand Down Expand Up @@ -141,6 +145,7 @@ var (
{Name: CopyDisks},
{Name: CopyingPaused},
{Name: RemovePreviousSnapshot, All: VSphere},
{Name: WaitForPreviousSnapshotRemoval, All: VSphere},
{Name: CreateSnapshot},
{Name: WaitForSnapshot},
{Name: StoreSnapshotDeltas, All: VSphere},
Expand All @@ -149,12 +154,14 @@ var (
{Name: PowerOffSource},
{Name: WaitForPowerOff},
{Name: RemovePenultimateSnapshot, All: VSphere},
{Name: WaitForPenultimateSnapshotRemoval, All: VSphere},
{Name: CreateFinalSnapshot},
{Name: WaitForFinalSnapshot},
{Name: AddFinalCheckpoint},
{Name: WaitForFinalDataVolumesStatus},
{Name: Finalize},
{Name: RemoveFinalSnapshot, All: VSphere},
{Name: WaitForFinalSnapshotRemoval, All: VSphere},
{Name: CreateGuestConversionPod, All: RequiresConversion},
{Name: ConvertGuest, All: RequiresConversion},
{Name: CreateVM},
Expand Down Expand Up @@ -667,9 +674,9 @@ func (r *Migration) step(vm *plan.VMStatus) (step string) {
step = Initialize
case AllocateDisks:
step = DiskAllocation
case CopyDisks, CopyingPaused, RemovePreviousSnapshot, CreateSnapshot, WaitForSnapshot, StoreSnapshotDeltas, AddCheckpoint, ConvertOpenstackSnapshot, WaitForDataVolumesStatus:
case CopyDisks, CopyingPaused, RemovePreviousSnapshot, WaitForPreviousSnapshotRemoval, CreateSnapshot, WaitForSnapshot, StoreSnapshotDeltas, AddCheckpoint, ConvertOpenstackSnapshot, WaitForDataVolumesStatus:
step = DiskTransfer
case RemovePenultimateSnapshot, CreateFinalSnapshot, WaitForFinalSnapshot, AddFinalCheckpoint, Finalize, RemoveFinalSnapshot, WaitForFinalDataVolumesStatus:
case RemovePenultimateSnapshot, WaitForPenultimateSnapshotRemoval, CreateFinalSnapshot, WaitForFinalSnapshot, AddFinalCheckpoint, Finalize, RemoveFinalSnapshot, WaitForFinalSnapshotRemoval, WaitForFinalDataVolumesStatus:
step = Cutover
case CreateGuestConversionPod, ConvertGuest:
step = ImageConversion
Expand Down Expand Up @@ -1008,6 +1015,34 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) {
break
}
vm.Phase = r.next(vm.Phase)
case WaitForPreviousSnapshotRemoval, WaitForPenultimateSnapshotRemoval, WaitForFinalSnapshotRemoval:
step, found := vm.FindStep(r.step(vm))
if !found {
vm.AddError(fmt.Sprintf("Step '%s' not found", r.step(vm)))
break
}
// FIXME: This is just temporary timeout to unblock the migrations which get stuck on issue https://issues.redhat.com/browse/MTV-1753
// This should be fixed properly by adding the task manager inside the inventory and monitor the task status
// from the main controller.
var retries int
retriesAnnotation := step.Annotations[SnapshotRemovalCheckRetries]
if retriesAnnotation == "" {
step.Annotations[SnapshotRemovalCheckRetries] = "1"
} else {
retries, err = strconv.Atoi(retriesAnnotation)
if err != nil {
step.AddError(err.Error())
err = nil
break
}
if retries >= settings.Settings.SnapshotRemovalCheckRetries {
vm.Phase = r.next(vm.Phase)
// Reset for next precopy
step.Annotations[SnapshotRemovalCheckRetries] = "1"
} else {
step.Annotations[SnapshotRemovalCheckRetries] = strconv.Itoa(retries + 1)
}
}
case CreateInitialSnapshot, CreateSnapshot, CreateFinalSnapshot:
step, found := vm.FindStep(r.step(vm))
if !found {
Expand Down
44 changes: 25 additions & 19 deletions pkg/settings/migration.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,26 @@ import (

// Environment variables.
const (
MaxVmInFlight = "MAX_VM_INFLIGHT"
HookRetry = "HOOK_RETRY"
ImporterRetry = "IMPORTER_RETRY"
VirtV2vImage = "VIRT_V2V_IMAGE"
PrecopyInterval = "PRECOPY_INTERVAL"
VirtV2vDontRequestKVM = "VIRT_V2V_DONT_REQUEST_KVM"
SnapshotRemovalTimeout = "SNAPSHOT_REMOVAL_TIMEOUT"
SnapshotStatusCheckRate = "SNAPSHOT_STATUS_CHECK_RATE"
CDIExportTokenTTL = "CDI_EXPORT_TOKEN_TTL"
FileSystemOverhead = "FILESYSTEM_OVERHEAD"
BlockOverhead = "BLOCK_OVERHEAD"
CleanupRetries = "CLEANUP_RETRIES"
DvStatusCheckRetries = "DV_STATUS_CHECK_RETRIES"
OvirtOsConfigMap = "OVIRT_OS_MAP"
VsphereOsConfigMap = "VSPHERE_OS_MAP"
VirtCustomizeConfigMap = "VIRT_CUSTOMIZE_MAP"
VddkJobActiveDeadline = "VDDK_JOB_ACTIVE_DEADLINE"
VirtV2vExtraArgs = "VIRT_V2V_EXTRA_ARGS"
VirtV2vExtraConfConfigMap = "VIRT_V2V_EXTRA_CONF_CONFIG_MAP"
MaxVmInFlight = "MAX_VM_INFLIGHT"
HookRetry = "HOOK_RETRY"
ImporterRetry = "IMPORTER_RETRY"
VirtV2vImage = "VIRT_V2V_IMAGE"
PrecopyInterval = "PRECOPY_INTERVAL"
VirtV2vDontRequestKVM = "VIRT_V2V_DONT_REQUEST_KVM"
SnapshotRemovalTimeout = "SNAPSHOT_REMOVAL_TIMEOUT"
SnapshotStatusCheckRate = "SNAPSHOT_STATUS_CHECK_RATE"
CDIExportTokenTTL = "CDI_EXPORT_TOKEN_TTL"
FileSystemOverhead = "FILESYSTEM_OVERHEAD"
BlockOverhead = "BLOCK_OVERHEAD"
CleanupRetries = "CLEANUP_RETRIES"
DvStatusCheckRetries = "DV_STATUS_CHECK_RETRIES"
SnapshotRemovalCheckRetries = "SNAPSHOT_REMOVAL_CHECK_RETRIES"
OvirtOsConfigMap = "OVIRT_OS_MAP"
VsphereOsConfigMap = "VSPHERE_OS_MAP"
VirtCustomizeConfigMap = "VIRT_CUSTOMIZE_MAP"
VddkJobActiveDeadline = "VDDK_JOB_ACTIVE_DEADLINE"
VirtV2vExtraArgs = "VIRT_V2V_EXTRA_ARGS"
VirtV2vExtraConfConfigMap = "VIRT_V2V_EXTRA_CONF_CONFIG_MAP"
)

// Migration settings
Expand Down Expand Up @@ -61,6 +62,8 @@ type Migration struct {
CleanupRetries int
// DvStatusCheckRetries retries
DvStatusCheckRetries int
// SnapshotRemovalCheckRetries retries
SnapshotRemovalCheckRetries int
// oVirt OS config map name
OvirtOsConfigMap string
// vSphere OS config map name
Expand Down Expand Up @@ -106,6 +109,9 @@ func (r *Migration) Load() (err error) {
if r.DvStatusCheckRetries, err = getPositiveEnvLimit(DvStatusCheckRetries, 10); err != nil {
return liberr.Wrap(err)
}
if r.SnapshotRemovalCheckRetries, err = getPositiveEnvLimit(SnapshotRemovalCheckRetries, 20); err != nil {
return liberr.Wrap(err)
}
if virtV2vImage, ok := os.LookupEnv(VirtV2vImage); ok {
r.VirtV2vImage = virtV2vImage
} else if Settings.Role.Has(MainRole) {
Expand Down

0 comments on commit 1575b38

Please sign in to comment.