Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

server: investigate pending HA work when executing in new MS session #10167

Draft
wants to merge 3 commits into
base: 4.20
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,13 @@ public enum WorkType {
HA; // Restart a VM.
}

/**
 * Reason for an HA work item, passed to the {@code schedule*} methods that accept a reason type.
 */
enum ReasonType {
    Unknown, HostMaintenance, HostDown, HostDegraded
}

/**
 * Steps an HA work item can be in.
 */
enum Step {
    Scheduled,
    Investigating,
    Fencing,
    Stopping,
    Restarting,
    Migrating,
    Cancelled,
    Done,
    Error
}
Expand All @@ -92,7 +99,7 @@ enum Step {
* Investigate why a host has disconnected and migrate the VMs on it
* if necessary.
*
* @param host - the host that has disconnected.
* @param hostId - the id of the host that has disconnected.
*/
Status investigate(long hostId);

Expand All @@ -109,17 +116,19 @@ enum Step {
* @param investigate must be investigated before we do anything with this vm.
*/
void scheduleRestart(VMInstanceVO vm, boolean investigate);
void scheduleRestart(VMInstanceVO vm, boolean investigate, ReasonType reasonType);

void cancelDestroy(VMInstanceVO vm, Long hostId);

boolean scheduleDestroy(VMInstanceVO vm, long hostId);
boolean scheduleDestroy(VMInstanceVO vm, long hostId, ReasonType reasonType);

/**
* Schedule restarts for all vms running on the host.
* @param host host.
* @param investigate TODO
* @param investigate whether to investigate
* @param reasonType reason for HA work
*/
void scheduleRestartForVmsOnHost(HostVO host, boolean investigate);
void scheduleRestartForVmsOnHost(HostVO host, boolean investigate, ReasonType reasonType);

/**
* Schedule the vm for migration.
Expand All @@ -128,6 +137,7 @@ enum Step {
* @return true if schedule worked.
*/
boolean scheduleMigration(VMInstanceVO vm);
boolean scheduleMigration(VMInstanceVO vm, ReasonType reasonType);

List<VMInstanceVO> findTakenMigrationWork();

Expand All @@ -140,10 +150,11 @@ enum Step {
* 3. Check if a VM has been stopped: WorkType.CheckStop
*
* @param vm virtual machine to stop.
* @param host host the virtual machine is on.
* @param hostId the id of the host the virtual machine is on.
* @param type which type of stop is requested.
*/
boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type);
boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type, ReasonType reasonType);

void cancelScheduledMigrations(HostVO host);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -993,7 +993,7 @@
handleDisconnectWithoutInvestigation(attache, event, true, true);
host = _hostDao.findById(hostId); // Maybe the host magically reappeared?
if (host != null && host.getStatus() == Status.Down) {
_haMgr.scheduleRestartForVmsOnHost(host, true);
_haMgr.scheduleRestartForVmsOnHost(host, true, HighAvailabilityManager.ReasonType.HostDown);

Check warning on line 996 in engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java#L996

Added line #L996 was not covered by tests
}
return true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,6 @@ CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.volumes', 'last_id', 'bigint(20) uns

-- Add used_iops column to support IOPS data in storage stats
CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.storage_pool', 'used_iops', 'bigint unsigned DEFAULT NULL COMMENT "IOPS currently in use for this storage pool" ');

-- Add reason column for op_ha_work
CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.op_ha_work', 'reason', 'varchar(32) DEFAULT NULL COMMENT "Reason for the HA work"');
15 changes: 14 additions & 1 deletion server/src/main/java/com/cloud/ha/HaWorkVO.java
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,10 @@
@Column(name = "tried")
int timesTried;

@Column(name = "reason")
@Enumerated(value = EnumType.STRING)
private HighAvailabilityManager.ReasonType reasonType;

protected HaWorkVO() {
}

Expand Down Expand Up @@ -179,7 +183,7 @@
}

public HaWorkVO(final long instanceId, final VirtualMachine.Type type, final WorkType workType, final Step step, final long hostId, final State previousState,
final int timesTried, final long updated) {
final int timesTried, final long updated, HighAvailabilityManager.ReasonType reasonType) {
this.workType = workType;
this.type = type;
this.instanceId = instanceId;
Expand All @@ -191,6 +195,7 @@
this.step = step;
this.timeToTry = System.currentTimeMillis() >> 10;
this.updateTime = updated;
this.reasonType = reasonType;
}

@Override
Expand All @@ -207,4 +212,12 @@
.append("]")
.toString();
}

/**
 * @return the reason type recorded for this work item
 */
public HighAvailabilityManager.ReasonType getReasonType() {
    return this.reasonType;
}

Check warning on line 218 in server/src/main/java/com/cloud/ha/HaWorkVO.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HaWorkVO.java#L216-L218

Added lines #L216 - L218 were not covered by tests

/**
 * @param reasonType the reason type to record for this work item
 */
public void setReasonType(HighAvailabilityManager.ReasonType reasonType) {
    final HighAvailabilityManager.ReasonType updatedReason = reasonType;
    this.reasonType = updatedReason;
}

Check warning on line 222 in server/src/main/java/com/cloud/ha/HaWorkVO.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HaWorkVO.java#L220-L222

Added lines #L220 - L222 were not covered by tests
}
74 changes: 62 additions & 12 deletions server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
import org.apache.cloudstack.managed.context.ManagedContext;
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
import org.apache.cloudstack.management.ManagementServerHost;
import org.apache.logging.log4j.ThreadContext;

import com.cloud.agent.AgentManager;
import com.cloud.alert.AlertManager;
Expand Down Expand Up @@ -90,7 +91,6 @@
import com.cloud.vm.VirtualMachineManager;
import com.cloud.vm.VirtualMachineProfile;
import com.cloud.vm.dao.VMInstanceDao;
import org.apache.logging.log4j.ThreadContext;

/**
* HighAvailabilityManagerImpl coordinates the HA process. VMs are registered with the HA Manager for HA. The request is stored
Expand Down Expand Up @@ -133,6 +133,9 @@
protected static ConfigKey<Boolean> VmHaAlertsEnabled = new ConfigKey<>("Advanced", Boolean.class, "vm.ha.alerts.enabled", "true",
"Enable/Disable alerts for the VM HA operations, it is enabled by default.", true, Zone);

// Reason types for which checkAndCancelWorkIfNeeded() may cancel work that is still in the
// Investigating step. NOTE: List.of(...) lists are null-hostile; contains(null) throws
// NullPointerException, so a null reason type must be ruled out before the lookup.
protected static final List<ReasonType> CancellableWorkReasonTypes =
List.of(ReasonType.HostMaintenance, ReasonType.HostDown, ReasonType.HostDegraded);

WorkerThread[] _workers;
boolean _stopped;
long _timeToSleep;
Expand Down Expand Up @@ -269,8 +272,7 @@
}

@Override
public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate) {

public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate, ReasonType reasonType) {
if (host.getType() != Host.Type.Routing) {
return;
}
Expand Down Expand Up @@ -337,12 +339,12 @@
logger.debug("VM {} is not on down host {} it is on other host {} VM HA is done", vm, host, hostId);
continue;
}
scheduleRestart(vm, investigate);
scheduleRestart(vm, investigate, reasonType);
}
}

@Override
public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type, ReasonType reasonType) {
assert (type == WorkType.CheckStop || type == WorkType.ForceStop || type == WorkType.Stop);

if (_haDao.hasBeenScheduled(vm.getId(), type)) {
Expand All @@ -359,7 +361,7 @@
return false;
}

HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), type, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated());
HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), type, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated(), reasonType);
_haDao.persist(work);
if (logger.isDebugEnabled()) {
logger.debug("Scheduled " + work);
Expand All @@ -368,6 +370,11 @@
return true;
}

/**
 * Schedules a stop without a reason: delegates to the four-argument overload
 * with a {@code null} reason type.
 */
@Override
public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
    final ReasonType noReason = null;
    return scheduleStop(vm, hostId, type, noReason);
}

protected void wakeupWorkers() {
logger.debug("Wakeup workers HA");
for (WorkerThread worker : _workers) {
Expand All @@ -376,7 +383,7 @@
}

@Override
public boolean scheduleMigration(final VMInstanceVO vm) {
public boolean scheduleMigration(final VMInstanceVO vm, ReasonType reasonType) {
if (vm.getHostId() == null) {
return false;
}
Expand All @@ -390,15 +397,20 @@
return false;
}

final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated());
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated(), reasonType);
_haDao.persist(work);
logger.info("Scheduled migration work of VM {} from host {} with HAWork {}", vm, _hostDao.findById(vm.getHostId()), work);
wakeupWorkers();
return true;
}

@Override
public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
public boolean scheduleMigration(final VMInstanceVO vm) {
    // No reason supplied by this overload: delegate with a null reason type.
    final ReasonType noReason = null;
    return scheduleMigration(vm, noReason);
}

@Override
public void scheduleRestart(VMInstanceVO vm, boolean investigate, ReasonType reasonType) {
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
String message = String.format("Unable to schedule restart for the VM %s (%d), VM high availability manager is disabled.", vm.getName(), vm.getId());
if (logger.isDebugEnabled()) {
Expand Down Expand Up @@ -490,7 +502,7 @@
}

HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.HA, investigate ? Step.Investigating : Step.Scheduled,
hostId != null ? hostId : 0L, vm.getState(), timesTried, vm.getUpdated());
hostId != null ? hostId : 0L, vm.getState(), timesTried, vm.getUpdated(), reasonType);
_haDao.persist(work);

if (logger.isInfoEnabled()) {
Expand All @@ -500,6 +512,11 @@
wakeupWorkers();
}

/**
 * Schedules a restart without a reason: delegates to the three-argument overload
 * with a {@code null} reason type.
 */
@Override
public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
    final ReasonType noReason = null;
    scheduleRestart(vm, investigate, noReason);
}

private void startVm(VirtualMachine vm, Map<VirtualMachineProfile.Param, Object> params,
DeploymentPlanner planner) throws InsufficientCapacityException, ResourceUnavailableException,
ConcurrentOperationException, OperationTimedoutException {
Expand Down Expand Up @@ -561,6 +578,9 @@
logger.info("Unable to find vm: " + vmId);
return null;
}
if (checkAndCancelWorkIfNeeded(work)) {
return null;

Check warning on line 582 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L582

Added line #L582 was not covered by tests
}

logger.info("HA on " + vm);
if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) {
Expand Down Expand Up @@ -762,6 +782,23 @@
return (System.currentTimeMillis() >> 10) + _restartRetryInterval;
}

/**
 * Marks an HA work item as cancelled when it is still being investigated and its host
 * turns out to be up.
 * <p>
 * The work is only considered when its step is {@code Step.Investigating} and its reason type
 * is one of {@code CancellableWorkReasonTypes}. In that case the host is investigated and, if
 * the result is {@code Status.Up}, the step of the given work object is set to
 * {@code Step.Cancelled}.
 *
 * @param work the HA work item about to be processed
 * @return {@code true} if the work was marked as cancelled, {@code false} if it must still be processed
 */
protected boolean checkAndCancelWorkIfNeeded(final HaWorkVO work) {
    if (!Step.Investigating.equals(work.getStep())) {
        return false;
    }
    // Work scheduled through the overloads that take no reason carries a null reason type, and
    // List.of(...) lists throw NullPointerException on contains(null), so rule null out first.
    final ReasonType reasonType = work.getReasonType();
    if (reasonType == null || !CancellableWorkReasonTypes.contains(reasonType)) {
        return false;
    }

    Status hostStatus = investigate(work.getHostId());
    if (!Status.Up.equals(hostStatus)) {
        return false;
    }
    logger.debug("Cancelling {} as it is not needed anymore", () -> work);
    work.setStep(Step.Cancelled);
    return true;
}

public Long migrate(final HaWorkVO work) {
long vmId = work.getInstanceId();
long srcHostId = work.getHostId();
Expand All @@ -772,6 +809,9 @@
logger.info("Unable to find vm: " + vmId + ", skipping migrate.");
return null;
}
if (checkAndCancelWorkIfNeeded(work)) {
return null;

Check warning on line 813 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L813

Added line #L813 was not covered by tests
}
logger.info("Migration attempt: for VM {}from host {}. Starting attempt: {}/{} times.", vm, srcHost, 1 + work.getTimesTried(), _maxRetries);
try {
work.setStep(Step.Migrating);
Expand All @@ -791,7 +831,7 @@
}

@Override
public boolean scheduleDestroy(VMInstanceVO vm, long hostId) {
public boolean scheduleDestroy(VMInstanceVO vm, long hostId, ReasonType reasonType) {
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
String message = String.format("Unable to schedule destroy for the VM %s (%d) on host %d, VM high availability manager is disabled.", vm.getName(), vm.getId(), hostId);
if (logger.isDebugEnabled()) {
Expand All @@ -801,7 +841,7 @@
return false;
}

final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated());
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated(), reasonType);
_haDao.persist(work);
if (logger.isDebugEnabled()) {
logger.debug("Scheduled " + work.toString());
Expand Down Expand Up @@ -838,6 +878,9 @@
logger.info("No longer can find VM " + work.getInstanceId() + ". Throwing away " + work);
return null;
}
if (checkAndCancelWorkIfNeeded(work)) {
return null;

Check warning on line 882 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L882

Added line #L882 was not covered by tests
}
boolean expunge = VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())
|| VirtualMachine.Type.ConsoleProxy.equals(vm.getType());
if (!expunge && VirtualMachine.State.Destroyed.equals(work.getPreviousState())) {
Expand Down Expand Up @@ -872,6 +915,9 @@
work.setStep(Step.Done);
return null;
}
if (checkAndCancelWorkIfNeeded(work)) {
return null;

Check warning on line 919 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L919

Added line #L919 was not covered by tests
}
logger.info("Stopping " + vm);
try {
if (work.getWorkType() == WorkType.Stop) {
Expand Down Expand Up @@ -1057,6 +1103,8 @@
public boolean start() {
_stopped = false;

_haDao.markPendingWorksAsInvestigating();

Check warning on line 1106 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L1106

Added line #L1106 was not covered by tests

for (final WorkerThread thread : _workers) {
thread.start();
}
Expand All @@ -1074,6 +1122,8 @@

_executor.shutdown();

_haDao.markServerPendingWorksAsInvestigating(_msServer.getId());

Check warning on line 1125 in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java#L1125

Added line #L1125 was not covered by tests

return true;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,4 +86,6 @@ public interface HighAvailabilityDao extends GenericDao<HaWorkVO, Long> {

List<HaWorkVO> listPendingMigrationsForVm(long vmId);
int expungeByVmList(List<Long> vmIds, Long batchSize);
void markPendingWorksAsInvestigating();
void markServerPendingWorksAsInvestigating(long managementServerId);
}
Loading
Loading