Skip to content

Commit

Permalink
Merge branch 'PHP-8.3'
Browse files Browse the repository at this point in the history
* PHP-8.3:
  Character indices used by mb_strpos and mb_substr have same meaning, even on invalid strings
  • Loading branch information
alexdowad committed Dec 10, 2023
2 parents 3c3aba1 + ec348a1 commit fea895b
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 56 deletions.
7 changes: 7 additions & 0 deletions UPGRADING
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ PHP 8.4 UPGRADE NOTES
. mb_encode_numericentity() and mb_decode_numericentity() now check that
the $map is only composed of integers, if not a ValueError is thrown.
. mb_http_input() now always throws a ValueError if the $type is invalid.
. On invalid strings (those with encoding errors), mb_substr() now interprets
character indices in the same manner as most other mbstring functions. This
means that character indices returned by mb_strpos() can be passed to mb_substr().
. For SJIS-Mac (MacJapanese) strings, character indices passed to mb_substr() now
refer to the indices of the Unicode codepoints which are produced when the string
is converted to Unicode. This is significant because around 40 SJIS-Mac characters
convert to a sequence of multiple Unicode codepoints.

- PDO_DBLIB:
. setAttribute, DBLIB_ATTR_STRINGIFY_UNIQUEIDENTIFIER and DBLIB_ATTR_DATETIME_CONVERT
Expand Down
46 changes: 5 additions & 41 deletions ext/mbstring/mbstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include "libmbfl/mbfl/mbfilter_wchar.h"
#include "libmbfl/mbfl/eaw_table.h"
#include "libmbfl/filters/mbfilter_base64.h"
#include "libmbfl/filters/mbfilter_cjk.h"
#include "libmbfl/filters/mbfilter_qprint.h"
#include "libmbfl/filters/mbfilter_htmlent.h"
#include "libmbfl/filters/mbfilter_uuencode.h"
Expand Down Expand Up @@ -2112,8 +2113,9 @@ static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, c
unsigned char *in = (unsigned char*)ZSTR_VAL(input);
size_t in_len = ZSTR_LEN(input);

if (from >= in_len || len == 0) {
/* No supported text encoding decodes to more than one codepoint per byte
if (len == 0 || (from >= in_len && enc != &mbfl_encoding_sjis_mac)) {
/* Other than MacJapanese, no supported text encoding decodes to
* more than one codepoint per byte
* So if the number of codepoints to skip >= number of input bytes,
* then definitely the output should be empty */
return zend_empty_string;
Expand All @@ -2134,30 +2136,6 @@ static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, c
len = in_len;
}
return zend_string_init_fast((const char*)in, len);
} else if (enc->mblen_table) {
/* The use of the `mblen_table` means that for encodings like MacJapanese,
* we treat each character in its native charset as "1 character", even if it
* maps to a sequence of several codepoints */
const unsigned char *mbtab = enc->mblen_table;
unsigned char *limit = in + in_len;
while (from && in < limit) {
in += mbtab[*in];
from--;
}
if (in >= limit) {
return zend_empty_string;
} else if (len == MBFL_SUBSTR_UNTIL_END) {
return zend_string_init_fast((const char*)in, limit - in);
}
unsigned char *end = in;
while (len && end < limit) {
end += mbtab[*end];
len--;
}
if (end > limit) {
end = limit;
}
return zend_string_init_fast((const char*)in, end - in);
}

return mb_get_substr_slow(in, in_len, from, len, enc);
Expand Down Expand Up @@ -2350,21 +2328,7 @@ PHP_FUNCTION(mb_substr)

size_t mblen = 0;
if (from < 0 || (!len_is_null && len < 0)) {
if (enc->mblen_table) {
/* Because we use the `mblen_table` when iterating over the string and
* extracting the requested part, we also need to use it here for counting
* the "length" of the string
* Otherwise, we can get wrong results for text encodings like MacJapanese,
* where one native 'character' can map to a sequence of several codepoints */
const unsigned char *mbtab = enc->mblen_table;
unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
while (p < e) {
p += mbtab[*p];
mblen++;
}
} else {
mblen = mb_get_strlen(str, enc);
}
mblen = mb_get_strlen(str, enc);
}

/* if "from" position is negative, count start position from the end
Expand Down
9 changes: 8 additions & 1 deletion ext/mbstring/tests/mb_strstr.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ var_dump(FROM_EUC_JP(mb_strstr(EUC_JP("あいうえおかきくけこ"), EUC_JP(
var_dump(bin2hex(mb_strstr("\xdd\x00", "", false, 'UTF-8')));
var_dump(bin2hex(mb_strstr("M\xff\xff\xff\x00", "\x00", false, "SJIS")));

// Test handling of invalid UTF-8 string
// Thanks to Stefan Schiller
var_dump(mb_strstr("\xf0start", "start", false, "UTF-8"));
var_dump(mb_strstr("\xf0start", "start", true, "UTF-8"));

?>
--EXPECT--
string(18) "おかきくけこ"
Expand All @@ -36,5 +41,7 @@ string(12) "あいうえ"
string(18) "おかきくけこ"
string(18) "おかきくけこ"
string(12) "あいうえ"
string(4) "dd00"
string(4) "3f00"
string(2) "00"
string(5) "start"
string(1) "?"
39 changes: 25 additions & 14 deletions ext/mbstring/tests/mb_substr.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,15 @@ print "3: " . mb_convert_encoding(mb_substr($utf7, -5, 3, 'UTF-7'), 'UTF-8', 'UT
print "4: " . mb_convert_encoding(mb_substr($utf7, 1, null, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
print "5:" . mb_convert_encoding(mb_substr($utf7, 10, 0, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";

echo "Testing agreement with mb_strpos on invalid UTF-8 string:\n";
/* Stefan Schiller pointed out that on invalid UTF-8 strings, character indices returned
* by mb_strpos would not extract the desired part of the string when passed to mb_substr.
* This is the test case which he provided: */
$data = "\xF0AAA<b>";
$pos = mb_strpos($data, "<", 0, "UTF-8");
$out = mb_substr($data, 0, $pos, "UTF-8");
print $out . "\n";

echo "Regression:\n";
/* During development, one >= comparison in mb_get_substr was wrongly written as >
* This was caught by libFuzzer */
Expand All @@ -138,30 +147,30 @@ SJIS:
4: 967b8cea8365834c8358836782c582b781423031323334825482558256825782588142
5:
-- Testing illegal SJIS byte 0x80 --
6380
806162
633f
3f6162
SJIS-2004:
6380
806162
633f
3f6162
MacJapanese:
6380
806162
SJIS-Mobile#DOCOMO:
6380
806162
633f
3f6162
SJIS-Mobile#KDDI:
6380
806162
633f
3f6162
SJIS-Mobile#SoftBank:
6380
806162
633f
3f6162
-- Testing MacJapanese characters which map to 3-5 codepoints each --
616263
85ab85ac
85ac
3f3f
58
616263
85bf85c0
85c0
3f3f
78
ISO-2022-JP:
1: 1b2442212121721b284241
2: 43
Expand Down Expand Up @@ -200,5 +209,7 @@ UTF-7:
3: йте
4: reek: Σὲ γνωρίζω ἀπὸ τὴν κόψη Russian: Зарегистрируйтесь
5:
Testing agreement with mb_strpos on invalid UTF-8 string:
?AAA
Regression:
1b28493d3d3d3d3d3d3d3e3d3d3d1b28423f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f000000003f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f1b28493d3d3d3d3d3d3d3e1b2842013a4f1b28492a1b2842

0 comments on commit fea895b

Please sign in to comment.