Issue #307 fix: failure detection by maximum retry count does not use the exact value set in the `exchange.max-retry-count` config parameter #308

Closed · wants to merge 1 commit
hetu-docs/en/admin/properties.md (2 changes: 1 addition & 1 deletion)
@@ -389,7 +389,7 @@ Exchanges transfer data between openLooKeng nodes for different stages of a query
> - **Type:** `integer`
> - **Default value:** `10`
>
- > The maximum number of retry for failed task performed by the coordinator before considering it as a permanent failure. This property is used only when exchange.is-timeout-failure-detection-enabled is set to false.
+ > The maximum number of retries for a failed task performed by the coordinator before considering it a permanent failure. This property is used only when `exchange.is-timeout-failure-detection-enabled` is set to `false`. This value needs to be at least 3 (the minimum retry count) to take effect.

### `sink.max-buffer-size`

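To make the documented behavior concrete, here is a minimal sketch of the relevant coordinator settings. The property names come from the doc text above; the file location and the chosen values are illustrative assumptions, not part of this PR:

```properties
# etc/config.properties (assumed location) -- illustrative values only.

# Retry-count-based failure detection applies only while the
# timeout-based detector is disabled, per the doc text above.
exchange.is-timeout-failure-detection-enabled=false

# After this fix the configured value is honored exactly;
# values below 3 (the minimum retry count) do not take effect.
exchange.max-retry-count=10
```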
Backoff.java
@@ -75,14 +75,13 @@ public Backoff(Duration maxFailureInterval, Ticker ticker, int maxTries)
public Backoff(int minTries, Duration maxFailureInterval, Ticker ticker, List<Duration> backoffDelayIntervals, int maxTries)
{
checkArgument(minTries > 0, "minTries must be at least 1");
- checkArgument(maxTries > 0, "maxTries must be at least 1");
requireNonNull(maxFailureInterval, "maxFailureInterval is null");
requireNonNull(ticker, "ticker is null");
requireNonNull(backoffDelayIntervals, "backoffDelayIntervals is null");
checkArgument(!backoffDelayIntervals.isEmpty(), "backoffDelayIntervals must contain at least one entry");

this.minTries = minTries;
- this.maxTries = maxTries;
+ this.maxTries = (minTries < maxTries) ? maxTries : minTries;
this.maxFailureIntervalNanos = maxFailureInterval.roundTo(NANOSECONDS);
this.ticker = ticker;
this.backoffDelayIntervalsNanos = backoffDelayIntervals.stream()
@@ -100,7 +99,7 @@ public Backoff(int minTries, Duration maxFailureInterval, Ticker ticker, List<Duration> backoffDelayIntervals)
checkArgument(!backoffDelayIntervals.isEmpty(), "backoffDelayIntervals must contain at least one entry");

this.minTries = minTries;
- this.maxTries = minTries + MAX_RETRIES; // to guaranty higher max retry value when min retry is > MAX_RETRIES
+ this.maxTries = (minTries < MAX_RETRIES) ? MAX_RETRIES : minTries;
this.maxFailureIntervalNanos = maxFailureInterval.roundTo(NANOSECONDS);
this.ticker = ticker;
this.backoffDelayIntervalsNanos = backoffDelayIntervals.stream()
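Both constructor hunks above replace a raw assignment with a clamp, so the effective `maxTries` can never drop below the minimum retry count. A self-contained sketch of that derivation, assuming a minimum of 3 as stated in the doc change (the class and method names here are illustrative, not repo code):

```java
// Sketch of the clamping logic introduced in the two constructors above.
public class BackoffClampSketch
{
    // Assumed minimum retry count, per the doc change in this PR.
    private static final int MIN_TRIES = 3;

    // Equivalent to "(minTries < maxTries) ? maxTries : minTries" in the diff.
    static int effectiveMaxTries(int minTries, int configuredMaxTries)
    {
        return Math.max(minTries, configuredMaxTries);
    }

    public static void main(String[] args)
    {
        System.out.println(effectiveMaxTries(MIN_TRIES, 10)); // 10: config honored
        System.out.println(effectiveMaxTries(MIN_TRIES, 1));  // 3: clamped to the minimum
    }
}
```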
@@ -136,10 +135,19 @@ public synchronized void success()
{
lastRequestStart = 0;
firstFailureTime = 0;
- failureCount = 0;
+ setFailureCount(0, false);
lastFailureTime = 0;
}

+ private synchronized void setFailureCount(int n, boolean isInc)
+ {
+ if (isInc) {
+ failureCount = failureCount + n;
+ return;
+ }
+ failureCount = n;
+ }

/**
* @return true if max retry failed, now it is time to check node status from HeartbeatFailureDetector
*/
@@ -148,7 +156,7 @@ public synchronized boolean maxTried()
long now = ticker.read();

lastFailureTime = now;
- failureCount++;
+ setFailureCount(1, true);
if (lastRequestStart != 0) {
failureRequestTimeTotal += now - lastRequestStart;
lastRequestStart = 0;
@@ -160,11 +168,10 @@
return false;
}

- if (failureCount < minTries) {
+ if (getFailureCount() < minTries) {
return false;
}

- return failureCount > maxTries;
+ return getFailureCount() >= maxTries;
}

/**
@@ -186,7 +193,7 @@ public synchronized boolean failure()
long now = ticker.read();

lastFailureTime = now;
- failureCount++;
+ setFailureCount(1, true);
if (lastRequestStart != 0) {
failureRequestTimeTotal += now - lastRequestStart;
lastRequestStart = 0;
@@ -198,7 +205,7 @@
return false;
}

- if (failureCount < minTries) {
+ if (getFailureCount() < minTries) {
return false;
}

@@ -208,7 +215,7 @@

public synchronized long getBackoffDelayNanos()
{
- int tmpFailureCount = (int) min(backoffDelayIntervalsNanos.length, this.failureCount);
+ int tmpFailureCount = (int) min(backoffDelayIntervalsNanos.length, getFailureCount());
if (tmpFailureCount == 0) {
return 0;
}
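The other half of the fix is the threshold change in `maxTried()`: `failureCount > maxTries` becomes `getFailureCount() >= maxTries`, so with `exchange.max-retry-count` set to 10 a task is declared permanently failed on the 10th recorded failure rather than the 11th. A self-contained sketch contrasting the two predicates (illustrative class name, not repo code):

```java
// Shows the single failure count at which the old and new predicates disagree.
public class RetryThresholdSketch
{
    public static void main(String[] args)
    {
        int maxTries = 10; // exchange.max-retry-count
        for (int failureCount = 1; failureCount <= maxTries + 1; failureCount++) {
            boolean oldPredicate = failureCount > maxTries;  // fires on the 11th failure
            boolean newPredicate = failureCount >= maxTries; // fires on the 10th failure
            if (oldPredicate != newPredicate) {
                System.out.printf("failure %d: old=%b, new=%b%n", failureCount, oldPredicate, newPredicate);
            }
        }
        // Prints: failure 10: old=false, new=true
    }
}
```

This is also why the test expectations below now assert a "10 failures" message instead of "11 failures".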
@@ -459,9 +459,9 @@ public void testMaxRetryFailedRemoteHostGone()
assertEquals(callback.getPages().size(), 0);
assertEquals(callback.getCompletedRequests(), 11);
assertEquals(callback.getFinishedBuffers(), 0);
- assertEquals(callback.getFailedBuffers(), 1);
+ assertEquals(callback.getFailedBuffers(), 2);
assertInstanceOf(callback.getFailure(), PageTransportTimeoutException.class);
- assertContains(callback.getFailure().getMessage(), WORKER_NODE_ERROR + " (http://localhost:8080/0 - 11 failures,");
+ assertContains(callback.getFailure().getMessage(), WORKER_NODE_ERROR + " (http://localhost:8080/0 - 10 failures,");
assertStatus(client, location, "queued", 0, 11, 11, 11, "not scheduled");
}
