diff --git a/UPGRADING b/UPGRADING index 51ef91a6736d5..d7c640c3e2b23 100644 --- a/UPGRADING +++ b/UPGRADING @@ -35,6 +35,13 @@ PHP 8.4 UPGRADE NOTES . mb_encode_numericentity() and mb_decode_numericentity() now check that the $map is only composed of integers, if not a ValueError is thrown. . mb_http_input() now always throws a ValueError if the $type is invalid. + . On invalid strings (those with encoding errors), mb_substr() now interprets + character indices in the same manner as most other mbstring functions. This + means that character indices returned by mb_strpos() can be passed to mb_substr(). + . For SJIS-Mac (MacJapanese) strings, character indices passed to mb_substr() now + refer to the indices of the Unicode codepoints which are produced when the string + is converted to Unicode. This is significant because around 40 SJIS-Mac characters + convert to a sequence of multiple Unicode codepoints. - PDO_DBLIB: . setAttribute, DBLIB_ATTR_STRINGIFY_UNIQUEIDENTIFIER and DBLIB_ATTR_DATETIME_CONVERT diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 3cc75abe37c12..ffedb08c9a0d0 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -38,6 +38,7 @@ #include "libmbfl/mbfl/mbfilter_wchar.h" #include "libmbfl/mbfl/eaw_table.h" #include "libmbfl/filters/mbfilter_base64.h" +#include "libmbfl/filters/mbfilter_cjk.h" #include "libmbfl/filters/mbfilter_qprint.h" #include "libmbfl/filters/mbfilter_htmlent.h" #include "libmbfl/filters/mbfilter_uuencode.h" @@ -2112,8 +2113,9 @@ static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, c unsigned char *in = (unsigned char*)ZSTR_VAL(input); size_t in_len = ZSTR_LEN(input); - if (from >= in_len || len == 0) { - /* No supported text encoding decodes to more than one codepoint per byte + if (len == 0 || (from >= in_len && enc != &mbfl_encoding_sjis_mac)) { + /* Other than MacJapanese, no supported text encoding decodes to + * more than one codepoint per byte * So if the number of codepoints to skip >= number of input bytes, * then definitely the output should be empty */ return zend_empty_string; @@ -2134,30 +2136,6 @@ static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, c len = in_len; } return zend_string_init_fast((const char*)in, len); - } else if (enc->mblen_table) { - /* The use of the `mblen_table` means that for encodings like MacJapanese, - * we treat each character in its native charset as "1 character", even if it - * maps to a sequence of several codepoints */ - const unsigned char *mbtab = enc->mblen_table; - unsigned char *limit = in + in_len; - while (from && in < limit) { - in += mbtab[*in]; - from--; - } - if (in >= limit) { - return zend_empty_string; - } else if (len == MBFL_SUBSTR_UNTIL_END) { - return zend_string_init_fast((const char*)in, limit - in); - } - unsigned char *end = in; - while (len && end < limit) { - end += mbtab[*end]; - len--; - } - if (end > limit) { - end = limit; - } - return zend_string_init_fast((const char*)in, end - in); } return mb_get_substr_slow(in, in_len, from, len, enc); @@ -2350,21 +2328,7 @@ PHP_FUNCTION(mb_substr) size_t mblen = 0; if (from < 0 || (!len_is_null && len < 0)) { - if (enc->mblen_table) { - /* Because we use the `mblen_table` when iterating over the string and - * extracting the requested part, we also need to use it here for counting - * the "length" of the string - * Otherwise, we can get wrong results for text encodings like MacJapanese, - * where one native 'character' can map to a sequence of several codepoints */ - const unsigned char *mbtab = enc->mblen_table; - unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str); - while (p < e) { - p += mbtab[*p]; - mblen++; - } - } else { - mblen = mb_get_strlen(str, enc); - } + mblen = mb_get_strlen(str, enc); } /* if "from" position is negative, count start position from the end diff --git a/ext/mbstring/tests/mb_strstr.phpt b/ext/mbstring/tests/mb_strstr.phpt index 4cdbb3df24839..85b0eeb19f404 100644 --- a/ext/mbstring/tests/mb_strstr.phpt +++ b/ext/mbstring/tests/mb_strstr.phpt @@ -26,6 +26,11 @@ var_dump(FROM_EUC_JP(mb_strstr(EUC_JP("あいうえおかきくけこ"), EUC_JP( var_dump(bin2hex(mb_strstr("\xdd\x00", "", false, 'UTF-8'))); var_dump(bin2hex(mb_strstr("M\xff\xff\xff\x00", "\x00", false, "SJIS"))); +// Test handling of invalid UTF-8 string +// Thanks to Stefan Schiller +var_dump(mb_strstr("\xf0start", "start", false, "UTF-8")); +var_dump(mb_strstr("\xf0start", "start", true, "UTF-8")); + ?> --EXPECT-- string(18) "おかきくけこ" @@ -36,5 +41,7 @@ string(12) "あいうえ" string(18) "おかきくけこ" string(18) "おかきくけこ" string(12) "あいうえ" -string(4) "dd00" +string(4) "3f00" string(2) "00" +string(5) "start" +string(1) "?" diff --git a/ext/mbstring/tests/mb_substr.phpt b/ext/mbstring/tests/mb_substr.phpt index ef37ca6dc192a..ad94c3b6069b1 100644 --- a/ext/mbstring/tests/mb_substr.phpt +++ b/ext/mbstring/tests/mb_substr.phpt @@ -118,6 +118,15 @@ print "3: " . mb_convert_encoding(mb_substr($utf7, -5, 3, 'UTF-7'), 'UTF-8', 'UT print "4: " . mb_convert_encoding(mb_substr($utf7, 1, null, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n"; print "5:" . mb_convert_encoding(mb_substr($utf7, 10, 0, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n"; +echo "Testing agreement with mb_strpos on invalid UTF-8 string:\n"; +/* Stefan Schiller pointed out that on invalid UTF-8 strings, character indices returned + * by mb_strpos would not extract the desired part of the string when passed to mb_substr. + * This is the test case which he provided: */ +$data = "\xF0AAA"; +$pos = mb_strpos($data, "<", 0, "UTF-8"); +$out = mb_substr($data, 0, $pos, "UTF-8"); +print $out . "\n"; + echo "Regression:\n"; /* During development, one >= comparison in mb_get_substr was wrongly written as > * This was caught by libFuzzer */ @@ -138,30 +147,30 @@ SJIS: 4: 967b8cea8365834c8358836782c582b781423031323334825482558256825782588142 5: -- Testing illegal SJIS byte 0x80 -- -6380 -806162 +633f +3f6162 SJIS-2004: -6380 -806162 +633f +3f6162 MacJapanese: 6380 806162 SJIS-Mobile#DOCOMO: -6380 -806162 +633f +3f6162 SJIS-Mobile#KDDI: -6380 -806162 +633f +3f6162 SJIS-Mobile#SoftBank: -6380 -806162 +633f +3f6162 -- Testing MacJapanese characters which map to 3-5 codepoints each -- 616263 -85ab85ac -85ac +3f3f +58 616263 -85bf85c0 -85c0 +3f3f +78 ISO-2022-JP: 1: 1b2442212121721b284241 2: 43 @@ -200,5 +209,7 @@ UTF-7: 3: йте 4: reek: Σὲ γνωρίζω ἀπὸ τὴν κόψη Russian: Зарегистрируйтесь 5: +Testing agreement with mb_strpos on invalid UTF-8 string: +?AAA Regression: 1b28493d3d3d3d3d3d3d3e3d3d3d1b28423f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f000000003f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f1b28493d3d3d3d3d3d3d3e1b2842013a4f1b28492a1b2842