diff --git a/.luacheckrc b/.luacheckrc index bdb965a5..575bfce2 100644 --- a/.luacheckrc +++ b/.luacheckrc @@ -1,4 +1,4 @@ -globals = {"box", "_TARANTOOL", "tonumber64", "os"} +globals = {"box", "_TARANTOOL", "tonumber64", "os", "test_run"} ignore = { -- Accessing an undefined field of a global variable . "143/debug", diff --git a/Makefile b/Makefile index 39bc1df6..fcd6acf7 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ luacheck: luacheck --config .luacheckrc . test_integration: - PYTHONPATH=$(PROJECT_DIR) $(PYTHON) test/test-run.py --force $(TEST_RUN_EXTRA_PARAMS) + PYTHONPATH=$(PROJECT_DIR) $(PYTHON) test/test-run.py --force --exclude unittest $(TEST_RUN_EXTRA_PARAMS) test_unittest: $(PYTHON) -m unittest discover test/unittest/ diff --git a/lib/options.py b/lib/options.py index 371b527b..06526c9f 100644 --- a/lib/options.py +++ b/lib/options.py @@ -213,6 +213,14 @@ def __init__(self): Such files created by workers in the "var/reproduce" directory. Note: The option works now only with parallel testing.""") + parser.add_argument( + "--server-start-timeout", + dest="server_start_timeout", + default=env_int('SERVER_START_TIMEOUT', 90), + type=int, + help="""Stop the server process if the server starts longer than + this amount of seconds. Default: 90 [seconds].""") + parser.add_argument( "--test-timeout", dest="test_timeout", diff --git a/lib/tarantool_server.py b/lib/tarantool_server.py index 80897e25..8d319675 100644 --- a/lib/tarantool_server.py +++ b/lib/tarantool_server.py @@ -414,8 +414,15 @@ def execute(self, server): class TarantoolStartError(OSError): - def __init__(self, name=None): + def __init__(self, name=None, timeout=None): self.name = name + self.timeout = timeout + + def __str__(self): + message = '[Instance "{}"] Failed to start'.format(self.name) + if self.timeout: + return "\n{} within {} seconds\n".format(message, self.timeout) + return "\n{}\n".format(message) class TarantoolLog(object): @@ -444,27 +451,30 @@ def seek_once(self, msg): if pos != -1: return pos - def seek_wait(self, msg, proc=None, name=None): - while True: + def seek_wait(self, msg, proc=None, name=None, deadline=None): + while not deadline or time.time() < deadline: if os.path.exists(self.path): break - time.sleep(0.001) + gevent.sleep(0.001) + else: + return False with open(self.path, 'r') as f: f.seek(self.log_begin, os.SEEK_SET) cur_pos = self.log_begin - while True: + while not deadline or time.time() < deadline: if not (proc is None): if not (proc.poll() is None): raise TarantoolStartError(name) log_str = f.readline() if not log_str: - time.sleep(0.001) + gevent.sleep(0.001) f.seek(cur_pos, os.SEEK_SET) continue if re.findall(msg, log_str): - return + return True cur_pos = f.tell() + return False class TarantoolServer(Server): @@ -873,9 +883,10 @@ def start(self, silent=True, wait=True, wait_load=True, rais=True, args=[], self.crash_detector.start() if wait: + deadline = time.time() + Options().args.server_start_timeout try: - self.wait_until_started(wait_load) - except TarantoolStartError: + self.wait_until_started(wait_load, deadline) + except TarantoolStartError as err: # Python tests expect we raise an exception when non-default # server fails if self.crash_expected: @@ -884,9 +895,7 @@ def start(self, silent=True, wait=True, wait_load=True, rais=True, args=[], self.current_test.is_crash_reported): if self.current_test: self.current_test.is_crash_reported = True - color_stdout('\n[Instance "{0.name}"] Tarantool server ' - 'failed to start\n'.format(self), - schema='error') + color_stdout(err, schema='error') self.print_log(15) # Raise exception when caller ask for it (e.g. in case of # non-default servers) @@ -1007,16 +1016,6 @@ def stop(self, silent=True, signal=signal.SIGTERM): color_log('Server [%s] start against running ...\n', schema='test_var') return - if self.status != 'started': - if not silent: - raise Exception('Server is not started') - else: - color_log( - 'Server [{0.name}] is not started ' - '(status:{0.status}) ...\n'.format(self), - schema='test_var' - ) - return if not silent: color_stdout('[Instance {}] Stopping the server...\n'.format( self.name), schema='info') @@ -1084,7 +1083,19 @@ def kill_old_server(self, silent=True): self.wait_until_stopped(pid) return True - def wait_until_started(self, wait_load=True): + def wait_load(self, deadline): + """Wait until the server log file is matched the entry pattern + + If the entry pattern couldn't be found in a log file until a timeout + is up, it will raise a TarantoolStartError exception. + """ + msg = 'entering the event loop|will retry binding|hot standby mode' + p = self.process if not self.gdb and not self.lldb else None + if not self.logfile_pos.seek_wait(msg, p, self.name, deadline): + raise TarantoolStartError( + self.name, Options().args.server_start_timeout) + + def wait_until_started(self, wait_load=True, deadline=None): """ Wait until server is started. Server consists of two parts: @@ -1095,12 +1106,9 @@ def wait_until_started(self, wait_load=True): color_log('DEBUG: [Instance {}] Waiting until started ' '(wait_load={})\n'.format(self.name, str(wait_load)), schema='info') - if wait_load: - msg = 'entering the event loop|will retry binding|hot standby mode' - p = self.process if not self.gdb and not self.lldb else None - self.logfile_pos.seek_wait(msg, p, self.name) - while True: + self.wait_load(deadline) + while not deadline or time.time() < deadline: try: temp = AdminConnection('localhost', self.admin.port) if not wait_load: @@ -1123,14 +1131,17 @@ def wait_until_started(self, wait_load=True): if e.errno == errno.ECONNREFUSED: color_log(' | Connection refused; will retry every 0.1 ' 'seconds...') - time.sleep(0.1) + gevent.sleep(0.1) continue raise + else: + raise TarantoolStartError( + self.name, Options().args.server_start_timeout) def wait_until_stopped(self, pid): while True: try: - time.sleep(0.01) + gevent.sleep(0.01) os.kill(pid, 0) continue except OSError: @@ -1221,7 +1232,7 @@ def get_lsn(self, node_id): def wait_lsn(self, node_id, lsn): while (self.get_lsn(node_id) < lsn): # print("wait_lsn", node_id, lsn, self.get_param("vclock")) - time.sleep(0.01) + gevent.sleep(0.01) def get_log(self): return TarantoolLog(self.logfile).positioning() diff --git a/test-run.py b/test-run.py index f706d4b3..a6477f2e 100755 --- a/test-run.py +++ b/test-run.py @@ -91,6 +91,8 @@ def main_loop_parallel(): color_stdout("Timeout options:\n", schema='tr_text') color_stdout('-' * 19, "\n", schema='separator') + color_stdout("SERVER_START_TIMEOUT:" . ljust(26) + "{}\n" . + format(args.server_start_timeout), schema='tr_text') color_stdout("REPLICATION_SYNC_TIMEOUT:" . ljust(26) + "{}\n" . format(args.replication_sync_timeout), schema='tr_text') color_stdout("TEST_TIMEOUT:" . ljust(26) + "{}\n" . diff --git a/test/unittest/box-cc0544b6afd1.lua b/test/unittest/box-cc0544b6afd1.lua new file mode 100644 index 00000000..8a7c6f12 --- /dev/null +++ b/test/unittest/box-cc0544b6afd1.lua @@ -0,0 +1,7 @@ +#!/usr/bin/env tarantool + +box.cfg { + listen = os.getenv('LISTEN'), +} + +require('console').listen(os.getenv('ADMIN')) diff --git a/test/unittest/hang.result b/test/unittest/hang.result new file mode 100644 index 00000000..e69de29b diff --git a/test/unittest/hang.test.lua b/test/unittest/hang.test.lua new file mode 100644 index 00000000..d9e4047f --- /dev/null +++ b/test/unittest/hang.test.lua @@ -0,0 +1,6 @@ +test_run = require('test_run').new() + +-- This test should hang: we are unable to bootstrap the replica, because it is +-- unable to join the master because of lack of granting user permissions. +test_run:cmd('create server replica with rpl_master=default, script="unittest/replica-7f4d4895ff58.lua"') +test_run:cmd('start server replica') diff --git a/test/unittest/replica-7f4d4895ff58.lua b/test/unittest/replica-7f4d4895ff58.lua new file mode 100644 index 00000000..ef36ac3a --- /dev/null +++ b/test/unittest/replica-7f4d4895ff58.lua @@ -0,0 +1,8 @@ +#!/usr/bin/env tarantool + +box.cfg { + listen = os.getenv('LISTEN'), + replication = os.getenv('MASTER'), +} + +require('console').listen(os.getenv('ADMIN')) diff --git a/test/unittest/suite.ini b/test/unittest/suite.ini new file mode 100644 index 00000000..4c53f4f8 --- /dev/null +++ b/test/unittest/suite.ini @@ -0,0 +1,6 @@ +[default] +core = tarantool +description = unit tests +script = box-cc0544b6afd1.lua +use_unix_sockets = True +use_unix_sockets_iproto = True diff --git a/test/unittest/test_tarantool_server.py b/test/unittest/test_tarantool_server.py new file mode 100644 index 00000000..39c69601 --- /dev/null +++ b/test/unittest/test_tarantool_server.py @@ -0,0 +1,47 @@ +import os +import subprocess +import sys +import unittest + + +class TestTarantoolServer(unittest.TestCase): + def test_tarantool_server_not_hanging(self): + env = os.environ.copy() + env['SERVER_START_TIMEOUT'] = '5' + + cmd = [sys.executable, 'test/test-run.py', 'unittest/hang.test.lua'] + + # File names intentionally have hashes to find exactly these processes + # using 'ps' command. + box_instance = 'box-cc0544b6afd1' + replica_instance = 'replica-7f4d4895ff58' + + err_msg_1 = ('[Instance "%s"] Failed to start tarantool ' + 'instance "%s"' % (box_instance, replica_instance)) + err_msg_2 = ('[Instance "%s"] Failed to start within %s seconds' + % (replica_instance, env['SERVER_START_TIMEOUT'])) + + try: + subprocess.check_output(cmd, env=env, universal_newlines=True) + self.fail("Command `%s` did not return non-zero exit code" + % ' '.join(cmd)) + except subprocess.CalledProcessError as exc: + err_obj = exc + + self.assertIn(err_msg_1, err_obj.output) + self.assertIn(err_msg_2, err_obj.output) + + ps_lines = subprocess.check_output( + ['ps', '-o', 'command'], universal_newlines=True + ).splitlines() + proc_lines = [line.strip() for line in ps_lines + if 'tarantool %s.lua' % box_instance in line or + 'tarantool %s.lua' % replica_instance in line] + + self.assertFalse( + proc_lines, 'There are some hanging tarantool processes!' + ) + + +if __name__ == '__main__': + unittest.main()