Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add timeout for starting tarantool server #302

Merged
merged 4 commits into from
Mar 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .luacheckrc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
globals = {"box", "_TARANTOOL", "tonumber64", "os"}
globals = {"box", "_TARANTOOL", "tonumber64", "os", "test_run"}
ignore = {
-- Accessing an undefined field of a global variable <debug>.
"143/debug",
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ luacheck:
luacheck --config .luacheckrc .

test_integration:
PYTHONPATH=$(PROJECT_DIR) $(PYTHON) test/test-run.py --force $(TEST_RUN_EXTRA_PARAMS)
PYTHONPATH=$(PROJECT_DIR) $(PYTHON) test/test-run.py --force --exclude unittest $(TEST_RUN_EXTRA_PARAMS)

test_unittest:
$(PYTHON) -m unittest discover test/unittest/
Expand Down
8 changes: 8 additions & 0 deletions lib/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,14 @@ def __init__(self):
Such files created by workers in the "var/reproduce" directory.
Note: The option works now only with parallel testing.""")

parser.add_argument(
Totktonada marked this conversation as resolved.
Show resolved Hide resolved
"--server-start-timeout",
dest="server_start_timeout",
default=env_int('SERVER_START_TIMEOUT', 90),
type=int,
help="""Stop the server process if the server starts longer than
this amount of seconds. Default: 90 [seconds].""")

parser.add_argument(
"--test-timeout",
dest="test_timeout",
Expand Down
73 changes: 42 additions & 31 deletions lib/tarantool_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,8 +414,15 @@ def execute(self, server):


class TarantoolStartError(OSError):
def __init__(self, name=None):
def __init__(self, name=None, timeout=None):
self.name = name
self.timeout = timeout

def __str__(self):
message = '[Instance "{}"] Failed to start'.format(self.name)
if self.timeout:
return "\n{} within {} seconds\n".format(message, self.timeout)
return "\n{}\n".format(message)


class TarantoolLog(object):
Expand Down Expand Up @@ -444,27 +451,30 @@ def seek_once(self, msg):
if pos != -1:
return pos

def seek_wait(self, msg, proc=None, name=None):
while True:
def seek_wait(self, msg, proc=None, name=None, deadline=None):
while not deadline or time.time() < deadline:
if os.path.exists(self.path):
break
time.sleep(0.001)
ligurio marked this conversation as resolved.
Show resolved Hide resolved
gevent.sleep(0.001)
VitaliyaIoffe marked this conversation as resolved.
Show resolved Hide resolved
Totktonada marked this conversation as resolved.
Show resolved Hide resolved
ylobankov marked this conversation as resolved.
Show resolved Hide resolved
Totktonada marked this conversation as resolved.
Show resolved Hide resolved
else:
return False

with open(self.path, 'r') as f:
f.seek(self.log_begin, os.SEEK_SET)
cur_pos = self.log_begin
while True:
while not deadline or time.time() < deadline:
if not (proc is None):
if not (proc.poll() is None):
raise TarantoolStartError(name)
log_str = f.readline()
if not log_str:
time.sleep(0.001)
gevent.sleep(0.001)
f.seek(cur_pos, os.SEEK_SET)
continue
if re.findall(msg, log_str):
return
return True
cur_pos = f.tell()
return False
VitaliyaIoffe marked this conversation as resolved.
Show resolved Hide resolved


class TarantoolServer(Server):
Expand Down Expand Up @@ -873,9 +883,10 @@ def start(self, silent=True, wait=True, wait_load=True, rais=True, args=[],
self.crash_detector.start()

if wait:
deadline = time.time() + Options().args.server_start_timeout
try:
self.wait_until_started(wait_load)
except TarantoolStartError:
self.wait_until_started(wait_load, deadline)
except TarantoolStartError as err:
# Python tests expect we raise an exception when non-default
# server fails
if self.crash_expected:
Expand All @@ -884,9 +895,7 @@ def start(self, silent=True, wait=True, wait_load=True, rais=True, args=[],
self.current_test.is_crash_reported):
if self.current_test:
self.current_test.is_crash_reported = True
color_stdout('\n[Instance "{0.name}"] Tarantool server '
'failed to start\n'.format(self),
schema='error')
color_stdout(err, schema='error')
self.print_log(15)
# Raise exception when caller ask for it (e.g. in case of
# non-default servers)
Expand Down Expand Up @@ -1007,16 +1016,6 @@ def stop(self, silent=True, signal=signal.SIGTERM):
color_log('Server [%s] start against running ...\n',
schema='test_var')
return
ligurio marked this conversation as resolved.
Show resolved Hide resolved
if self.status != 'started':
if not silent:
raise Exception('Server is not started')
else:
color_log(
'Server [{0.name}] is not started '
'(status:{0.status}) ...\n'.format(self),
schema='test_var'
)
return
Totktonada marked this conversation as resolved.
Show resolved Hide resolved
if not silent:
color_stdout('[Instance {}] Stopping the server...\n'.format(
self.name), schema='info')
Expand Down Expand Up @@ -1084,7 +1083,19 @@ def kill_old_server(self, silent=True):
self.wait_until_stopped(pid)
return True

def wait_until_started(self, wait_load=True):
def wait_load(self, deadline):
"""Wait until the server log file is matched the entry pattern

If the entry pattern couldn't be found in a log file until a timeout
is up, it will raise a TarantoolStartError exception.
"""
msg = 'entering the event loop|will retry binding|hot standby mode'
p = self.process if not self.gdb and not self.lldb else None
if not self.logfile_pos.seek_wait(msg, p, self.name, deadline):
raise TarantoolStartError(
self.name, Options().args.server_start_timeout)

def wait_until_started(self, wait_load=True, deadline=None):
""" Wait until server is started.

Server consists of two parts:
Expand All @@ -1095,12 +1106,9 @@ def wait_until_started(self, wait_load=True):
color_log('DEBUG: [Instance {}] Waiting until started '
'(wait_load={})\n'.format(self.name, str(wait_load)),
schema='info')

if wait_load:
msg = 'entering the event loop|will retry binding|hot standby mode'
p = self.process if not self.gdb and not self.lldb else None
self.logfile_pos.seek_wait(msg, p, self.name)
while True:
self.wait_load(deadline)
while not deadline or time.time() < deadline:
try:
temp = AdminConnection('localhost', self.admin.port)
if not wait_load:
Expand All @@ -1123,14 +1131,17 @@ def wait_until_started(self, wait_load=True):
if e.errno == errno.ECONNREFUSED:
color_log(' | Connection refused; will retry every 0.1 '
'seconds...')
time.sleep(0.1)
gevent.sleep(0.1)
continue
raise
else:
raise TarantoolStartError(
self.name, Options().args.server_start_timeout)

def wait_until_stopped(self, pid):
while True:
try:
time.sleep(0.01)
gevent.sleep(0.01)
os.kill(pid, 0)
continue
except OSError:
Expand Down Expand Up @@ -1221,7 +1232,7 @@ def get_lsn(self, node_id):
def wait_lsn(self, node_id, lsn):
while (self.get_lsn(node_id) < lsn):
# print("wait_lsn", node_id, lsn, self.get_param("vclock"))
time.sleep(0.01)
gevent.sleep(0.01)

def get_log(self):
return TarantoolLog(self.logfile).positioning()
2 changes: 2 additions & 0 deletions test-run.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ def main_loop_parallel():

color_stdout("Timeout options:\n", schema='tr_text')
color_stdout('-' * 19, "\n", schema='separator')
color_stdout("SERVER_START_TIMEOUT:" . ljust(26) + "{}\n" .
ligurio marked this conversation as resolved.
Show resolved Hide resolved
format(args.server_start_timeout), schema='tr_text')
color_stdout("REPLICATION_SYNC_TIMEOUT:" . ljust(26) + "{}\n" .
format(args.replication_sync_timeout), schema='tr_text')
color_stdout("TEST_TIMEOUT:" . ljust(26) + "{}\n" .
Expand Down
7 changes: 7 additions & 0 deletions test/unittest/box-cc0544b6afd1.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env tarantool
ligurio marked this conversation as resolved.
Show resolved Hide resolved

box.cfg {
listen = os.getenv('LISTEN'),
}

require('console').listen(os.getenv('ADMIN'))
ligurio marked this conversation as resolved.
Show resolved Hide resolved
Empty file added test/unittest/hang.result
Empty file.
6 changes: 6 additions & 0 deletions test/unittest/hang.test.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
test_run = require('test_run').new()

-- This test should hang: we are unable to bootstrap the replica, because it is
-- unable to join the master because of lack of granting user permissions.
test_run:cmd('create server replica with rpl_master=default, script="unittest/replica-7f4d4895ff58.lua"')
test_run:cmd('start server replica')
8 changes: 8 additions & 0 deletions test/unittest/replica-7f4d4895ff58.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/usr/bin/env tarantool

box.cfg {
listen = os.getenv('LISTEN'),
replication = os.getenv('MASTER'),
}

require('console').listen(os.getenv('ADMIN'))
6 changes: 6 additions & 0 deletions test/unittest/suite.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[default]
core = tarantool
description = unit tests
script = box-cc0544b6afd1.lua
use_unix_sockets = True
use_unix_sockets_iproto = True
47 changes: 47 additions & 0 deletions test/unittest/test_tarantool_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import os
ligurio marked this conversation as resolved.
Show resolved Hide resolved
import subprocess
import sys
import unittest


class TestTarantoolServer(unittest.TestCase):
def test_tarantool_server_not_hanging(self):
env = os.environ.copy()
env['SERVER_START_TIMEOUT'] = '5'

cmd = [sys.executable, 'test/test-run.py', 'unittest/hang.test.lua']

# File names intentionally have hashes to find exactly these processes
# using 'ps' command.
box_instance = 'box-cc0544b6afd1'
replica_instance = 'replica-7f4d4895ff58'

err_msg_1 = ('[Instance "%s"] Failed to start tarantool '
'instance "%s"' % (box_instance, replica_instance))
err_msg_2 = ('[Instance "%s"] Failed to start within %s seconds'
% (replica_instance, env['SERVER_START_TIMEOUT']))

try:
subprocess.check_output(cmd, env=env, universal_newlines=True)
self.fail("Command `%s` did not return non-zero exit code"
% ' '.join(cmd))
except subprocess.CalledProcessError as exc:
err_obj = exc

self.assertIn(err_msg_1, err_obj.output)
self.assertIn(err_msg_2, err_obj.output)

ps_lines = subprocess.check_output(
['ps', '-o', 'command'], universal_newlines=True
).splitlines()
proc_lines = [line.strip() for line in ps_lines
if 'tarantool %s.lua' % box_instance in line or
'tarantool %s.lua' % replica_instance in line]

self.assertFalse(
proc_lines, 'There are some hanging tarantool processes!'
)


if __name__ == '__main__':
unittest.main()