Skip to content

Commit

Permalink
Retry slack notifications in case of unexpected errors
Browse files Browse the repository at this point in the history
If the slack client errors in some unexpected way, it can
cause a scheduled job to error without returning any success/fail
status, which means the job gets stuck in the "running" state, even
though it's only been reserved and isn't actually running. Now we
retry a max of 3 times and log the error if we can't do the
notification. This could mean that jobs run and never report
back, if the slack client is more than transiently broken, but
then the chances are that simple commands like help are also
broken and at least the output of jobs should be accessible in
the job logs.
  • Loading branch information
rebkwok committed Jan 8, 2024
1 parent 1a25562 commit 941b1fa
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 10 deletions.
24 changes: 20 additions & 4 deletions ebmbot/slack.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


def notify_slack(
slack_client, channel, message_text, thread_ts=None, message_format=None
slack_client, channel, message_text, thread_ts=None, message_format=None, fail=False
):
"""Send message to Slack."""
msg_kwargs = {"text": str(message_text)}
Expand All @@ -12,7 +12,23 @@ def notify_slack(
logger.info(
"Sending message", channel=channel, message=message_text, thread_ts=thread_ts
)
resp = slack_client.chat_postMessage(
channel=channel, thread_ts=thread_ts, **msg_kwargs
# In case of any unexpected transient exception posting to slack, retry up to 3
# times and then log the error, to avoid errors in scheduled jobs.
attempts = 0
while attempts < 3:
try:
resp = slack_client.chat_postMessage(
channel=channel, thread_ts=thread_ts, **msg_kwargs
)
return resp.data
except Exception as err:
attempts += 1
error = err

logger.error(
"Could not notify slack",
channel=channel,
message=message_text,
thread_ts=thread_ts,
error=error,
)
return resp.data
27 changes: 21 additions & 6 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ def reset_db():
pass


class WebClientWithSlackException(WebClient):
def chat_postMessage(self, *args, **kwargs):
raise Exception("Error notifying slack")


@dataclass
class MockRecordingClient:
client: WebClient
Expand All @@ -38,19 +43,29 @@ class MockRecordingApp:
recorder: Mock


@pytest.fixture
def mock_client():
test_recorder = Mock()
def _get_mock_recording_client(client_class, test_recorder):
setup_mock_web_api_server(test_recorder)
mock_api_server_base_url = "http://localhost:8888"

yield MockRecordingClient(
client=WebClient(
return MockRecordingClient(
client=client_class(
token="xoxb-valid",
base_url=mock_api_server_base_url,
),
recorder=test_recorder,
)


@pytest.fixture
def mock_client():
test_recorder = Mock()
yield _get_mock_recording_client(WebClient, test_recorder)
cleanup_mock_web_api_server(test_recorder)


@pytest.fixture
def mock_client_with_slack_exception():
test_recorder = Mock()
yield _get_mock_recording_client(WebClientWithSlackException, test_recorder)
cleanup_mock_web_api_server(test_recorder)


Expand Down
20 changes: 20 additions & 0 deletions tests/test_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,26 @@ def test_job_success_with_no_report(mock_client):
assert f.read() == ""


def test_job_success_with_slack_exception(mock_client_with_slack_exception):
# Test that the job still succeeds even if notifying slack errors
log_dir = build_log_dir("test_good_job")

scheduler.schedule_job("test_good_job", {}, "channel", TS, 0)
job = scheduler.reserve_job()

do_job(mock_client_with_slack_exception.client, job)
assert_slack_client_sends_messages(
mock_client_with_slack_exception.recorder,
messages_kwargs=[],
)

with open(os.path.join(log_dir, "stdout")) as f:
assert f.read() == "the owl and the pussycat\n"

with open(os.path.join(log_dir, "stderr")) as f:
assert f.read() == ""


def test_job_failure(mock_client):
log_dir = build_log_dir("test_bad_job")

Expand Down

0 comments on commit 941b1fa

Please sign in to comment.