Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Configurable retry delay and jitter #676

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 47 additions & 15 deletions src/main/java/hudson/remoting/Engine.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.security.interfaces.RSAPublicKey;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
Expand Down Expand Up @@ -101,6 +102,7 @@
import org.jenkinsci.remoting.protocol.cert.PublicKeyMatchingX509ExtendedTrustManager;
import org.jenkinsci.remoting.protocol.impl.ConnectionRefusalException;
import org.jenkinsci.remoting.util.KeyUtils;
import org.jenkinsci.remoting.util.RetryUtils;
import org.jenkinsci.remoting.util.VersionNumber;
import org.jenkinsci.remoting.util.https.NoCheckHostnameVerifier;
import org.jenkinsci.remoting.util.https.NoCheckTrustManager;
Expand Down Expand Up @@ -193,6 +195,12 @@ public Thread newThread(@NonNull final Runnable r) {

private boolean noReconnect = false;

private int delay = 10;

private double jitterFactor = 0;

private int jitter = 0;

/**
* Determines whether the socket will have {@link Socket#setKeepAlive(boolean)} set or not.
*
Expand Down Expand Up @@ -412,6 +420,21 @@ public void setNoReconnect(boolean noReconnect) {
this.noReconnect = noReconnect;
}

@Restricted(NoExternalUse.class)
public void setDelay(int delay) {
this.delay = delay;
}

@Restricted(NoExternalUse.class)
public void setJitterFactor(double jitterFactor) {
this.jitterFactor = jitterFactor;
}

@Restricted(NoExternalUse.class)
public void setJitter(int jitter) {
this.jitter = jitter;
}

/**
* Determines if JNLPAgentEndpointResolver will not perform certificate validation in the HTTPs mode.
*
Expand Down Expand Up @@ -742,8 +765,8 @@ public void closeRead() throws IOException {
}
events.onDisconnect();
while (true) {
// TODO refactor various sleep statements into a common method
TimeUnit.SECONDS.sleep(10);
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
Thread.sleep(duration.toMillis());
// Unlike JnlpAgentEndpointResolver, we do not use $jenkins/tcpSlaveAgentListener/, as that will be a 404 if the TCP port is disabled.
URL ping = new URL(hudsonUrl, "login");
try {
Expand Down Expand Up @@ -809,9 +832,9 @@ private void innerRun(IOHub hub, SSLContext context, ExecutorService service) {
endpoint = resolver.resolve();
} catch (IOException e) {
if (!noReconnect) {
events.status("Could not locate server among " + candidateUrls + "; waiting 10 seconds before retry", e);
// TODO refactor various sleep statements into a common method
TimeUnit.SECONDS.sleep(10);
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
events.status("Could not locate server among " + candidateUrls + "; waiting " + RetryUtils.formatDuration(duration) + " seconds before retry", e);
Thread.sleep(duration.toMillis());
continue;
} else {
if (Boolean.getBoolean(Engine.class.getName() + ".nonFatalJnlpAgentEndpointResolutionExceptions")) {
Expand Down Expand Up @@ -929,26 +952,35 @@ private void innerRun(IOHub hub, SSLContext context, ExecutorService service) {
}

private JnlpEndpointResolver createEndpointResolver(List<String> jenkinsUrls, String agentName) {
JnlpEndpointResolver resolver;
if (directConnection == null) {
SSLSocketFactory sslSocketFactory = null;
try {
sslSocketFactory = getSSLSocketFactory(candidateCertificates, disableHttpsCertValidation);
} catch (Exception e) {
events.error(e);
}
resolver = new JnlpAgentEndpointResolver(jenkinsUrls, agentName, credentials, proxyCredentials, tunnel,
sslSocketFactory, disableHttpsCertValidation);
JnlpAgentEndpointResolver jnlpAgentEndpointResolver =
new JnlpAgentEndpointResolver(
jenkinsUrls,
agentName,
credentials,
proxyCredentials,
tunnel,
sslSocketFactory,
disableHttpsCertValidation);
jnlpAgentEndpointResolver.setDelay(delay);
jnlpAgentEndpointResolver.setJitterFactor(jitterFactor);
jnlpAgentEndpointResolver.setJitter(jitter);
return jnlpAgentEndpointResolver;
} else {
resolver = new JnlpAgentEndpointConfigurator(directConnection, instanceIdentity, protocols, proxyCredentials);
return new JnlpAgentEndpointConfigurator(directConnection, instanceIdentity, protocols, proxyCredentials);
}
return resolver;
}

private void onConnectionRejected(String greeting) throws InterruptedException {
events.status("reconnect rejected, sleeping 10s: ", new Exception("The server rejected the connection: " + greeting));
// TODO refactor various sleep statements into a common method
TimeUnit.SECONDS.sleep(10);
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
events.status("reconnect rejected, sleeping " + RetryUtils.formatDuration(duration) + "s: ", new Exception("The server rejected the connection: " + greeting));
Thread.sleep(duration.toMillis());
}

/**
Expand All @@ -970,8 +1002,8 @@ private Socket connectTcp(@NonNull JnlpAgentEndpoint endpoint) throws IOExceptio
if(retry++>10) {
throw e;
}
// TODO refactor various sleep statements into a common method
TimeUnit.SECONDS.sleep(10);
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
Thread.sleep(duration.toMillis());
events.status(msg+" (retrying:"+retry+")",e);
}
}
Expand Down
42 changes: 38 additions & 4 deletions src/main/java/hudson/remoting/Launcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import org.jenkinsci.remoting.engine.JnlpAgentEndpointResolver;
import org.jenkinsci.remoting.engine.WorkDirManager;
import org.jenkinsci.remoting.util.PathUtils;
import org.jenkinsci.remoting.util.RetryUtils;
import org.jenkinsci.remoting.util.https.NoCheckHostnameVerifier;
import org.kohsuke.args4j.Argument;
import org.kohsuke.args4j.CmdLineException;
Expand Down Expand Up @@ -73,6 +74,7 @@
import java.security.cert.CertificateException;
import java.security.cert.CertificateFactory;
import java.security.cert.X509Certificate;
import java.time.Duration;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
Expand All @@ -81,7 +83,6 @@
import java.util.Properties;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;

Expand Down Expand Up @@ -249,6 +250,25 @@
@Option(name="-noReconnect",aliases="-noreconnect",usage="Doesn't try to reconnect when a communication fail, and exit instead")
public boolean noReconnect = false;

@Option(name = "-retryDelay",
usage = "The fixed delay to occur between retries. Default is 10 seconds.")
public int delay = 10;

@Option(name = "-retryJitterFactor",
usage = "The jitter factor to randomly vary retry delays by. For each retry delay, a"
+ " random portion of the delay multiplied by the jitter factor will be"
+ " added or subtracted to the delay. For example: a retry delay of 10"
+ " seconds and a jitter factor of .25 will result in a random retry delay"
+ " between 7.5 and 12.5 seconds.")
public double jitterFactor = 0;

@Option(name = "-retryJitter",
usage = "The jitter to randomly vary retry delays by. For each retry delay, a random"
+ " portion of the jitter will be added or subtracted to the delay. For"
+ " example: a jitter of 5 seconds will randomly add between -5 and 5"
+ " seconds to each retry delay.")
public int jitter = 0;

@Option(name = "-noKeepAlive",
usage = "Disable TCP socket keep alive on connection to the controller.")
public boolean noKeepAlive = false;
Expand Down Expand Up @@ -450,8 +470,22 @@
if (initialized) {
throw new IllegalStateException("double initialization");
}

if (jitterFactor != 0 && jitter != 0) {
throw new IllegalArgumentException("Jitter factor and jitter are mutually exclusive");
}
if (jitterFactor < 0 || jitterFactor > 1) {
throw new IllegalArgumentException("Jitter factor must be >= 0 and <= 1");
}
if (jitter < 0) {
throw new IllegalArgumentException("Jitter must be >= 0");
}
if (jitter > 0 && jitter >= delay) {
throw new IllegalArgumentException("Jitter must be < delay");
}

// Create and verify working directory and logging
// TODO: The pass-through for the JNLP mode has been added in JENKINS-39817. But we still need to keep this parameter in

Check warning on line 488 in src/main/java/hudson/remoting/Launcher.java

View check run for this annotation

ci.jenkins.io / Open Tasks Scanner

TODO

NORMAL: The pass-through for the JNLP mode has been added in JENKINS-39817. But we still need to keep this parameter in
// consideration for other modes (TcpServer, TcpClient, etc.) to retain the legacy behavior.
// On the other hand, in such case there is no need to invoke WorkDirManager and handle the double initialization logic
final WorkDirManager workDirManager = WorkDirManager.getInstance();
Expand Down Expand Up @@ -745,9 +779,9 @@

System.err.println("Failed to obtain "+ agentJnlpURL);
e.printStackTrace(System.err);
System.err.println("Waiting 10 seconds before retry");
// TODO refactor various sleep statements into a common method
TimeUnit.SECONDS.sleep(10);
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
System.err.println("Waiting " + RetryUtils.formatDuration(duration) + " seconds before retry");
Thread.sleep(duration.toMillis());
// retry
} finally {
if (con instanceof HttpURLConnection) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import hudson.remoting.Engine;
import hudson.remoting.Launcher;
import hudson.remoting.NoProxyEvaluator;
import org.jenkinsci.remoting.util.RetryUtils;
import org.jenkinsci.remoting.util.VersionNumber;
import org.jenkinsci.remoting.util.https.NoCheckHostnameVerifier;
import org.kohsuke.accmod.Restricted;
Expand Down Expand Up @@ -57,6 +58,7 @@
import java.nio.charset.StandardCharsets;
import java.security.interfaces.RSAPublicKey;
import java.security.spec.InvalidKeySpecException;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Base64;
import java.util.Iterator;
Expand Down Expand Up @@ -100,6 +102,12 @@ public class JnlpAgentEndpointResolver extends JnlpEndpointResolver {

private HostnameVerifier hostnameVerifier;

private int delay = 10;

private double jitterFactor = 0;

private int jitter = 0;

/**
* If specified, only the protocols from the list will be tried during the connection.
* The option provides protocol names, but the order of the check is defined internally and cannot be changed.
Expand Down Expand Up @@ -185,6 +193,21 @@ public void setDisableHttpsCertValidation(boolean disableHttpsCertValidation) {
}
}

@Restricted(NoExternalUse.class)
public void setDelay(int delay) {
this.delay = delay;
}

@Restricted(NoExternalUse.class)
public void setJitterFactor(double jitterFactor) {
this.jitterFactor = jitterFactor;
}

@Restricted(NoExternalUse.class)
public void setJitter(int jitter) {
this.jitter = jitter;
}

@CheckForNull
@Override
public JnlpAgentEndpoint resolve() throws IOException {
Expand Down Expand Up @@ -402,8 +425,8 @@ public void waitForReady() throws InterruptedException {
try {
int retries = 0;
while (true) {
// TODO refactor various sleep statements into a common method
Thread.sleep(1000 * 10);
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
Thread.sleep(duration.toMillis());
// Jenkins top page might be read-protected. see http://www.nabble
// .com/more-lenient-retry-logic-in-Engine.waitForServerToBack-td24703172.html
if (jenkinsUrls.isEmpty()) {
Expand Down
67 changes: 67 additions & 0 deletions src/main/java/org/jenkinsci/remoting/util/RetryUtils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
* The MIT License
*
* Copyright (c) 2023, CloudBees, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

package org.jenkinsci.remoting.util;

import java.security.SecureRandom;
import java.text.NumberFormat;
import java.time.Duration;
import java.util.Random;
import org.kohsuke.accmod.Restricted;
import org.kohsuke.accmod.restrictions.NoExternalUse;

/**
* Retry-related utility methods. Used in place of a library like <a
* href="https://failsafe.dev/">Failsafe</a> to minimize external third-party dependencies.
*/
@Restricted(NoExternalUse.class)
public class RetryUtils {

private static final Random RANDOM = new SecureRandom();

// Suppress default constructor for noninstantiability
private RetryUtils() {
throw new AssertionError();
}

/**
* Get the retry duration based on the CLI arguments.
*/
public static Duration getDuration(int delay, double jitterFactor, int jitter) {
if (jitterFactor != 0) {
double randomFactor = 1 + (1 - RANDOM.nextDouble() * 2) * jitterFactor;
return Duration.ofMillis((long) (Duration.ofSeconds(delay).toMillis() * randomFactor));
} else if (jitter != 0) {
double randomAddend =
(1 - RANDOM.nextDouble() * 2) * Duration.ofSeconds(jitter).toMillis();
return Duration.ofMillis((long) (Duration.ofSeconds(delay).toMillis() + randomAddend));
} else {
return Duration.ofSeconds(delay);
}
}

public static String formatDuration(Duration duration) {
return NumberFormat.getNumberInstance().format(duration.toMillis() / 1000.0);
}
}
Loading