From 81e236cde56ff006d9c661140790b42f40ffc43b Mon Sep 17 00:00:00 2001
From: Alex Dowad
Date: Wed, 25 Oct 2023 22:27:50 +0200
Subject: [PATCH] Fix infinite loop when mb_detect_encoding is used on UTF-8 BOM

This bug was introduced in cb840799b4. Thanks to Ignace Nyamagana Butera
for discovering this bug and to Sebastian Bergmann for doing an initial
investigation and opening a bug ticket.
---
 ext/mbstring/mbstring.c                    |  6 ++++++
 ext/mbstring/tests/mb_detect_encoding.phpt | 13 +++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
index 7bf6ef02ee2ef..f3559e65b411b 100644
--- a/ext/mbstring/mbstring.c
+++ b/ext/mbstring/mbstring.c
@@ -3068,6 +3068,12 @@ static size_t count_demerits(struct candidate *array, size_t length, bool strict
 	uint32_t wchar_buf[128];
 	unsigned int finished = 0; /* For how many candidate encodings have we processed all the input? */
 
+	for (size_t i = 0; i < length; i++) {
+		if (array[i].in_len == 0) {
+			finished++;
+		}
+	}
+
 	while ((strict || length > 1) && finished < length) {
 		/* Iterate in reverse order to avoid moving candidates that can be eliminated. */
 		for (size_t i = length - 1; i != (size_t)-1; i--) {
diff --git a/ext/mbstring/tests/mb_detect_encoding.phpt b/ext/mbstring/tests/mb_detect_encoding.phpt
index b3c457738c1f8..2721ec3c83c6a 100644
--- a/ext/mbstring/tests/mb_detect_encoding.phpt
+++ b/ext/mbstring/tests/mb_detect_encoding.phpt
@@ -25,6 +25,13 @@ print("Bad ASCII (strict): " . mb_detect_encoding("\xDD\x92", ['ASCII', 'UTF-8']
 print("Bad ASCII/UTF-8, with more errors for ASCII (non-strict): " . mb_detect_encoding("\xD6\x8A\x8A", ['ASCII', 'UTF-8'], false) . "\n");
 print("Bad ASCII/UTF-8, with more errors for ASCII (strict): " . var_export(mb_detect_encoding("\xD6\x8A\x8A", ['ASCII', 'UTF-8'], true), true) . "\n");
 
+print("UTF-8 BOM (non-strict): " . mb_detect_encoding("\xEF\xBB\xBF", ["UTF-8", "ASCII"], false) . "\n");
+print("UTF-8 BOM (strict): " . mb_detect_encoding("\xEF\xBB\xBF", ["UTF-8", "ASCII"], true) . "\n");
+print("UTF-16BE BOM (non-strict): " . mb_detect_encoding("\xFE\xFF", ["UTF-8", "UTF-16BE", "UTF-16LE"], false) . "\n");
+print("UTF-16BE BOM (strict): " . mb_detect_encoding("\xFE\xFF", ["UTF-8", "UTF-16BE", "UTF-16LE"], true) . "\n");
+print("UTF-16LE BOM (non-strict): " . mb_detect_encoding("\xFF\xFE", ["UTF-8", "UTF-16BE", "UTF-16LE"], false) . "\n");
+print("UTF-16LE BOM (strict): " . mb_detect_encoding("\xFF\xFE", ["UTF-8", "UTF-16BE", "UTF-16LE"], true) . "\n");
+
 print("SJIS: " . mb_detect_encoding($sjis, 'SJIS', true) . "\n");
 print("JIS: " . mb_detect_encoding($jis, 'JIS', true) . "\n");
 print("EUC-JP (strict): " . mb_detect_encoding($euc_jp, 'UTF-8,EUC-JP,JIS', true) . "\n");
@@ -399,6 +406,12 @@ Bad ASCII (non-strict): UTF-8
 Bad ASCII (strict): UTF-8
 Bad ASCII/UTF-8, with more errors for ASCII (non-strict): UTF-8
 Bad ASCII/UTF-8, with more errors for ASCII (strict): false
+UTF-8 BOM (non-strict): UTF-8
+UTF-8 BOM (strict): UTF-8
+UTF-16BE BOM (non-strict): UTF-16BE
+UTF-16BE BOM (strict): UTF-16BE
+UTF-16LE BOM (non-strict): UTF-16LE
+UTF-16LE BOM (strict): UTF-16LE
 SJIS: SJIS
 JIS: JIS
 EUC-JP (strict): EUC-JP