diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index ab53a20cff5539..4ed94b649fc0f2 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -3188,6 +3188,13 @@ def get_tokens(string):
             with self.subTest(case=case):
                 self.assertRaises(tokenize.TokenError, get_tokens, case)
 
+    def test_tstring_multiline_bang_underflow(self):
+        # gh-149183: t-string with '!' across two lines used to raise
+        # MemoryError because last_expr_end > last_expr_size produced a
+        # negative length that was cast to a huge size_t.
+        readline = BytesIO(b't"{!\n!x').readline
+        self.assertRaises(tokenize.TokenError, list, tokenize.tokenize(readline))
+
     @support.skip_wasi_stack_overflow()
     def test_max_indent(self):
         MAXINDENT = 100
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-05-11-10-46-00.gh-issue-149183.aB3xQr.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-05-11-10-46-00.gh-issue-149183.aB3xQr.rst
new file mode 100644
index 00000000000000..14bb6ca4080bc3
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-05-11-10-46-00.gh-issue-149183.aB3xQr.rst
@@ -0,0 +1,2 @@
+Fix :exc:`MemoryError` in the t-string tokenizer when the opening ``{``
+and closing delimiter span different source lines.
diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c
index 7f25afec302c22..bf987cb85cf44e 100644
--- a/Parser/lexer/lexer.c
+++ b/Parser/lexer/lexer.c
@@ -119,6 +119,17 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
     if (!(tok_mode->in_debug || tok_mode->string_kind == TSTRING) || token->metadata) {
         return 0;
     }
+
+    Py_ssize_t expr_len = tok_mode->last_expr_size - tok_mode->last_expr_end;
+    if (expr_len < 0) {
+        /* last_expr_end > last_expr_size: happens when '{' and the closing
+           delimiter span different source lines, causing the strlen-based
+           size tracking to underflow. Treat as a tokenizer error rather
+           than passing a negative length (cast to huge size_t) to malloc or
+           PyUnicode_DecodeUTF8. */
+        return -1;
+    }
+
     PyObject *res = NULL;
 
     // Look for a # character outside of string literals
@@ -126,7 +137,7 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
     int in_string = 0;
     char quote_char = 0;
 
-    for (Py_ssize_t i = 0; i < tok_mode->last_expr_size - tok_mode->last_expr_end; i++) {
+    for (Py_ssize_t i = 0; i < expr_len; i++) {
         char ch = tok_mode->last_expr_buffer[i];
 
         // Skip escaped characters
@@ -163,7 +174,7 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
     // If we found a # character in the expression, we need to handle comments
     if (hash_detected) {
         // Allocate buffer for processed result
-        char *result = (char *)PyMem_Malloc((tok_mode->last_expr_size - tok_mode->last_expr_end + 1) * sizeof(char));
+        char *result = (char *)PyMem_Malloc((expr_len + 1) * sizeof(char));
         if (!result) {
             return -1;
         }
@@ -174,7 +185,7 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
         quote_char = 0;         // Current string quote char
 
         // Process each character
-        while (i < tok_mode->last_expr_size - tok_mode->last_expr_end) {
+        while (i < expr_len) {
             char ch = tok_mode->last_expr_buffer[i];
 
             // Handle string quotes
@@ -190,11 +201,10 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
             }
             // Skip comments
             else if (ch == '#' && !in_string) {
-                while (i < tok_mode->last_expr_size - tok_mode->last_expr_end &&
-                       tok_mode->last_expr_buffer[i] != '\n') {
+                while (i < expr_len && tok_mode->last_expr_buffer[i] != '\n') {
                     i++;
                 }
-                if (i < tok_mode->last_expr_size - tok_mode->last_expr_end) {
+                if (i < expr_len) {
                     result[j++] = '\n';
                 }
             }
@@ -211,7 +221,7 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
     } else {
         res = PyUnicode_DecodeUTF8(
             tok_mode->last_expr_buffer,
-            tok_mode->last_expr_size - tok_mode->last_expr_end,
+            expr_len,
             NULL
         );
     }