From e6995c95c1c07983cae76c2695b1c59f9803d6ee Mon Sep 17 00:00:00 2001 From: "Dylan Kiss (dyki)" Date: Thu, 19 Sep 2024 16:46:19 +0200 Subject: [PATCH] Allow extracting deeply nested calls in Python Currently the Python extractor does not support deeply nested gettext calls (anything nested deeper than a direct argument of the top-level gettext call), e.g. ```py _("Hello %s", _("Person")) _("Hello %s", random_function(", ".join([_("Person 1"), _("Person 2")]))) ``` The extraction code was refactored quite a bit to simplify the flow and support this use case. Fixes https://github.com/python-babel/babel/issues/1125 (and, along the way, also fixes https://github.com/python-babel/babel/issues/1123) --- babel/messages/extract.py | 196 +++++++++++++++++++-------------- tests/messages/test_extract.py | 46 +++++--- 2 files changed, 148 insertions(+), 94 deletions(-) diff --git a/babel/messages/extract.py b/babel/messages/extract.py index 8d4bbeaf8..7650cfe1e 100644 --- a/babel/messages/extract.py +++ b/babel/messages/extract.py @@ -502,14 +502,6 @@ def extract_python( :param options: a dictionary of additional options (optional) :rtype: ``iterator`` """ - funcname = lineno = message_lineno = None - call_stack = -1 - buf = [] - messages = [] - translator_comments = [] - in_def = in_translator_comments = False - comment_tag = None - encoding = parse_encoding(fileobj) or options.get('encoding', 'UTF-8') future_flags = parse_future_flags(fileobj, encoding) next_line = lambda: fileobj.readline().decode(encoding) @@ -520,103 +512,145 @@ def extract_python( # currently parsing one. current_fstring_start = None - for tok, value, (lineno, _), _, _ in tokens: - if call_stack == -1 and tok == NAME and value in ('def', 'class'): + # Keep the stack of all function calls and their related contextual variables, + # so we can handle nested gettext calls. 
+ function_stack = [] + # Keep the last encountered function name for when we encounter + # an opening parenthesis + last_function_name = None + # Keep track of whether we're in a class or function definition + in_def = False + # Keep track of whether we're in a block of translator comments + in_translator_comments = False + # Keep track of the last encountered translator comments + translator_comments = [] + # Keep track of the (split) strings encountered + message_buffer = [] + + for token, value, (line_no, _), _, _ in tokens: + if not function_stack and token == NAME and value in ('def', 'class'): + # We're entering a class or function definition in_def = True - elif tok == OP and value == '(': - if in_def: - # Avoid false positives for declarations such as: - # def gettext(arg='message'): - in_def = False - continue - if funcname: - message_lineno = lineno - call_stack += 1 - elif in_def and tok == OP and value == ':': - # End of a class definition without parens + + elif in_def and token == OP and value in ('(', ':'): + # We're in a class or function definition and should not do anything in_def = False continue - elif call_stack == -1 and tok == COMMENT: + + elif token == OP and value == '(' and last_function_name: + # We're entering a function call + cur_translator_comments = translator_comments + if function_stack and function_stack[-1]['function_line_no'] == line_no: + # If our current function call is on the same line as the previous one, + # copy its translator comments, since they also apply to us. 
+ cur_translator_comments = function_stack[-1]['translator_comments'] + + # We add all information needed later for the current function call + function_stack.append({ + 'function_line_no': line_no, + 'function_name': last_function_name, + 'message_line_no': None, + 'messages': [], + 'translator_comments': cur_translator_comments, + }) + translator_comments = [] + + elif token == COMMENT: # Strip the comment token from the line value = value[1:].strip() - if in_translator_comments and \ - translator_comments[-1][0] == lineno - 1: + if in_translator_comments and translator_comments[-1][0] == line_no - 1: # We're already inside a translator comment, continue appending - translator_comments.append((lineno, value)) + translator_comments.append((line_no, value)) continue - # If execution reaches this point, let's see if comment line - # starts with one of the comment tags + for comment_tag in comment_tags: if value.startswith(comment_tag): + # Comment starts with one of the comment tags, + # so let's start capturing it in_translator_comments = True - translator_comments.append((lineno, value)) + translator_comments.append((line_no, value)) break - elif funcname and call_stack == 0: - nested = (tok == NAME and value in keywords) - if (tok == OP and value == ')') or nested: - if buf: - messages.append(''.join(buf)) - del buf[:] + + elif function_stack and function_stack[-1]['function_name'] in keywords: + # We're inside a translation function call + if token == OP and value == ')': + # The call has ended, so we yield the translatable term(s) + messages = function_stack[-1]['messages'] + line_no = ( + function_stack[-1]['message_line_no'] + or function_stack[-1]['function_line_no'] + ) + cur_translator_comments = function_stack[-1]['translator_comments'] + + if message_buffer: + messages.append(''.join(message_buffer)) + message_buffer.clear() else: messages.append(None) messages = tuple(messages) if len(messages) > 1 else messages[0] - # Comments don't apply unless they 
immediately - # precede the message - if translator_comments and \ - translator_comments[-1][0] < message_lineno - 1: - translator_comments = [] - - yield (message_lineno, funcname, messages, - [comment[1] for comment in translator_comments]) - - funcname = lineno = message_lineno = None - call_stack = -1 - messages = [] - translator_comments = [] - in_translator_comments = False - if nested: - funcname = value - elif tok == STRING: - val = _parse_python_string(value, encoding, future_flags) - if val is not None: - buf.append(val) + if ( + cur_translator_comments + and cur_translator_comments[-1][0] < line_no - 1 + ): + # The translator comments are not immediately preceding the current + # term, so we skip them. + cur_translator_comments = [] + + yield ( + line_no, + function_stack[-1]['function_name'], + messages, + [comment[1] for comment in cur_translator_comments], + ) + + function_stack.pop() + + elif token == STRING: + # We've encountered a string inside a translation function call + string_value = _parse_python_string(value, encoding, future_flags) + if not function_stack[-1]['message_line_no']: + function_stack[-1]['message_line_no'] = line_no + if string_value is not None: + message_buffer.append(string_value) # Python 3.12+, see https://peps.python.org/pep-0701/#new-tokens - elif tok == FSTRING_START: + elif token == FSTRING_START: current_fstring_start = value - elif tok == FSTRING_MIDDLE: + elif token == FSTRING_MIDDLE: if current_fstring_start is not None: current_fstring_start += value - elif tok == FSTRING_END: + elif token == FSTRING_END: if current_fstring_start is not None: fstring = current_fstring_start + value - val = _parse_python_string(fstring, encoding, future_flags) - if val is not None: - buf.append(val) - - elif tok == OP and value == ',': - if buf: - messages.append(''.join(buf)) - del buf[:] + string_value = _parse_python_string(fstring, encoding, future_flags) + if string_value is not None: + message_buffer.append(string_value) + + 
elif token == OP and value == ',': + # End of a function call argument + if message_buffer: + function_stack[-1]['messages'].append(''.join(message_buffer)) + message_buffer.clear() else: - messages.append(None) - if translator_comments: - # We have translator comments, and since we're on a - # comma(,) user is allowed to break into a new line - # Let's increase the last comment's lineno in order - # for the comment to still be a valid one - old_lineno, old_comment = translator_comments.pop() - translator_comments.append((old_lineno + 1, old_comment)) - elif call_stack > 0 and tok == OP and value == ')': - call_stack -= 1 - elif funcname and call_stack == -1: - funcname = None - elif tok == NAME and value in keywords: - funcname = value + function_stack[-1]['messages'].append(None) + + elif function_stack and token == OP and value == ')': + function_stack.pop() + + if in_translator_comments and translator_comments[-1][0] < line_no: + # We have a newline in between the comments, so they don't belong + # together anymore + in_translator_comments = False + + if token == NAME: + last_function_name = value + if function_stack and not function_stack[-1]['message_line_no']: + function_stack[-1]['message_line_no'] = line_no - if (current_fstring_start is not None - and tok not in {FSTRING_START, FSTRING_MIDDLE} + if ( + current_fstring_start is not None + and token not in {FSTRING_START, FSTRING_MIDDLE} ): # In Python 3.12, tokens other than FSTRING_* mean the # f-string is dynamic, so we don't wan't to extract it. 
diff --git a/tests/messages/test_extract.py b/tests/messages/test_extract.py index 7d3a05aa7..9b6348813 100644 --- a/tests/messages/test_extract.py +++ b/tests/messages/test_extract.py @@ -97,10 +97,10 @@ def test_comments_with_calls_that_spawn_multiple_lines(self): messages = list(extract.extract_python(buf, ('ngettext', '_'), ['NOTE:'], {'strip_comment_tags': False})) - assert messages[0] == (3, 'ngettext', ('Catalog deleted.', 'Catalogs deleted.', None), ['NOTE: This Comment SHOULD Be Extracted']) + assert messages[0] == (2, 'ngettext', ('Catalog deleted.', 'Catalogs deleted.', None), ['NOTE: This Comment SHOULD Be Extracted']) assert messages[1] == (6, '_', 'Locale deleted.', ['NOTE: This Comment SHOULD Be Extracted']) assert messages[2] == (10, 'ngettext', ('Foo deleted.', 'Foos deleted.', None), ['NOTE: This Comment SHOULD Be Extracted']) - assert messages[3] == (15, 'ngettext', ('Bar deleted.', 'Bars deleted.', None), ['NOTE: This Comment SHOULD Be Extracted', 'NOTE: And This One Too']) + assert messages[3] == (14, 'ngettext', ('Bar deleted.', 'Bars deleted.', None), ['NOTE: This Comment SHOULD Be Extracted', 'NOTE: And This One Too']) def test_declarations(self): buf = BytesIO(b"""\ @@ -422,24 +422,44 @@ def test_nested_messages(self): # NOTE: Third _(u'Hello, {0} and {1}!', _(u'Heungsub'), _(u'Armin')) + +# NOTE: Fourth +_("Hello %(person)", person=random_function(_("Person"))) + +# NOTE: Fifth +_("Hello %(people)", + person=random_function( + ", ".join([_("Person 1"), _("Person 2")]) + ) +) """) messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {})) - assert messages[0][2] == ('Hello, {name}!', None) + assert messages[0][2] == 'Foo Bar' assert messages[0][3] == ['NOTE: First'] - assert messages[1][2] == 'Foo Bar' - assert messages[1][3] == [] - assert messages[2][2] == ('Hello, {name1} and {name2}!', None) + assert messages[1][2] == ('Hello, {name}!', None) + assert messages[1][3] == ['NOTE: First'] + assert messages[2][2] == 'Heungsub' 
assert messages[2][3] == ['NOTE: Second'] - assert messages[3][2] == 'Heungsub' + assert messages[3][2] == 'Armin' assert messages[3][3] == [] - assert messages[4][2] == 'Armin' - assert messages[4][3] == [] - assert messages[5][2] == ('Hello, {0} and {1}!', None) + assert messages[4][2] == ('Hello, {name1} and {name2}!', None, None) + assert messages[4][3] == ['NOTE: Second'] + assert messages[5][2] == 'Heungsub' assert messages[5][3] == ['NOTE: Third'] - assert messages[6][2] == 'Heungsub' + assert messages[6][2] == 'Armin' assert messages[6][3] == [] - assert messages[7][2] == 'Armin' - assert messages[7][3] == [] + assert messages[7][2] == ('Hello, {0} and {1}!', None, None) + assert messages[7][3] == ['NOTE: Third'] + assert messages[8][2] == 'Person' + assert messages[8][3] == ['NOTE: Fourth'] + assert messages[9][2] == ('Hello %(person)', None) + assert messages[9][3] == ['NOTE: Fourth'] + assert messages[10][2] == 'Person 1' + assert messages[10][3] == [] + assert messages[11][2] == 'Person 2' + assert messages[11][3] == [] + assert messages[12][2] == ('Hello %(people)', None) + assert messages[12][3] == ['NOTE: Fifth'] class ExtractTestCase(unittest.TestCase):