Skip to content

Commit

Permalink
Add fast mb_strcut implementation for UTF-16
Browse files Browse the repository at this point in the history
Similar to the fast, specialized mb_strcut implementation for UTF-8
in 1f0cf13, this new implementation of mb_strcut for UTF-16 strings
just examines a few bytes before each cut point.

Even for short strings, the new implementation is around 2x faster.
For strings around 10,000 bytes in length, it comes out about 100-500x
faster in my microbenchmarks.

The new implementation behaves identically to the old one on valid
UTF-16 strings; a fuzzer was used to help verify this.
  • Loading branch information
alexdowad committed Oct 28, 2023
1 parent 00c567a commit d04854b
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 4 deletions.
6 changes: 6 additions & 0 deletions UPGRADING
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,10 @@ PHP 8.4 UPGRADE NOTES
5. Changed Functions
========================================

- MBString:
. The behavior of mb_strcut is more consistent now on invalid UTF-8 and UTF-16
strings. (For valid UTF-8 and UTF-16 strings, there is no change.)

- PGSQL:
. pg_select: the conditions argument now accepts an empty array and is optional.

Expand Down Expand Up @@ -177,3 +181,5 @@ PHP 8.4 UPGRADE NOTES

* The performance of strspn() is greatly improved. It now runs in linear time
instead of being bounded by quadratic time.

* mb_strcut() is much faster now for UTF-8 and UTF-16 strings.
95 changes: 92 additions & 3 deletions ext/mbstring/libmbfl/filters/mbfilter_utf16.c
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,9 @@ static void mb_wchar_to_utf16le_default(uint32_t *in, size_t len, mb_convert_buf

static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter);
static size_t mb_utf16_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static zend_string* mb_cut_utf16(unsigned char *str, size_t from, size_t len, unsigned char *end);
static zend_string* mb_cut_utf16be(unsigned char *str, size_t from, size_t len, unsigned char *end);
static zend_string* mb_cut_utf16le(unsigned char *str, size_t from, size_t len, unsigned char *end);

static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL};

Expand All @@ -190,7 +193,7 @@ const mbfl_encoding mbfl_encoding_utf16 = {
mb_utf16_to_wchar,
mb_wchar_to_utf16be,
NULL,
NULL,
mb_cut_utf16
};

const mbfl_encoding mbfl_encoding_utf16be = {
Expand All @@ -205,7 +208,7 @@ const mbfl_encoding mbfl_encoding_utf16be = {
mb_utf16be_to_wchar,
mb_wchar_to_utf16be,
NULL,
NULL,
mb_cut_utf16be
};

const mbfl_encoding mbfl_encoding_utf16le = {
Expand All @@ -220,7 +223,7 @@ const mbfl_encoding mbfl_encoding_utf16le = {
mb_utf16le_to_wchar,
mb_wchar_to_utf16le,
NULL,
NULL,
mb_cut_utf16le
};

const struct mbfl_convert_vtbl vtbl_utf16_wchar = {
Expand Down Expand Up @@ -1043,3 +1046,89 @@ static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *b
}

#endif /* defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) */

/* Cut a byte range out of a UTF-16BE string without splitting a surrogate pair.
 *
 * str  - first byte of the string
 * from - requested starting byte offset, relative to str
 * len  - requested maximum number of bytes
 * end  - one past the last byte of the string
 *
 * Both `from` and `len` are rounded down to a multiple of 2 (one UTF-16 code
 * unit). If the cut would begin on a low surrogate which directly follows a
 * high surrogate, the start is moved back by one code unit so the whole pair
 * is included. If the cut would end directly after a high surrogate, that
 * code unit is dropped. Only the code units adjacent to the two cut points
 * are examined.
 *
 * Returns zend_empty_string if fewer than 2 bytes can be taken; otherwise the
 * result of zend_string_init_fast() over the selected byte range. */
static zend_string* mb_cut_utf16be(unsigned char *str, size_t from, size_t len, unsigned char *end)
{
	/* Work with an unsigned byte count so that an out-of-range `from` is
	 * rejected up front instead of forming a pointer past the buffer.
	 * NOTE(review): assumes end >= str; confirm against callers. */
	size_t total = (size_t)(end - str);
	if (from >= total) {
		return zend_empty_string;
	}
	if (len > total - from) {
		len = total - from;
	}
	from &= ~(size_t)1;
	len &= ~(size_t)1;
	unsigned char *start = str + from;
	if (len < 2 || (end - start) < 2) {
		return zend_empty_string;
	}
	/* Check if 1st codepoint is 2nd part of surrogate pair */
	if (from > 0) {
		uint32_t start_cp = ((uint32_t)start[0] << 8) + start[1];
		if (start_cp >= 0xDC00 && start_cp <= 0xDFFF) {
			uint32_t preceding_cp = ((uint32_t)start[-2] << 8) + start[-1];
			if (preceding_cp >= 0xD800 && preceding_cp <= 0xDBFF) {
				/* Move the actual cut pointer; `from` is not consulted
				 * again below, so adjusting it alone would do nothing */
				start -= 2;
			}
		}
	}
	/* Same for ending cut point */
	unsigned char *cut_end = start + len;
	if (cut_end > end) {
		cut_end = end;
	}
	uint32_t ending_cp = ((uint32_t)cut_end[-2] << 8) + cut_end[-1];
	if (ending_cp >= 0xD800 && ending_cp <= 0xDBFF) {
		/* Last code unit is a high surrogate whose partner falls outside
		 * the cut; leave it out */
		cut_end -= 2;
	}
	return zend_string_init_fast((char*)start, cut_end - start);
}

/* Cut a byte range out of a UTF-16LE string without splitting a surrogate pair.
 *
 * str  - first byte of the string
 * from - requested starting byte offset, relative to str
 * len  - requested maximum number of bytes
 * end  - one past the last byte of the string
 *
 * Both `from` and `len` are rounded down to a multiple of 2 (one UTF-16 code
 * unit). If the cut would begin on a low surrogate which directly follows a
 * high surrogate, the start is moved back by one code unit so the whole pair
 * is included. If the cut would end directly after a high surrogate, that
 * code unit is dropped. Only the code units adjacent to the two cut points
 * are examined.
 *
 * Returns zend_empty_string if fewer than 2 bytes can be taken; otherwise the
 * result of zend_string_init_fast() over the selected byte range. */
static zend_string* mb_cut_utf16le(unsigned char *str, size_t from, size_t len, unsigned char *end)
{
	/* Work with an unsigned byte count so that an out-of-range `from` is
	 * rejected up front instead of forming a pointer past the buffer.
	 * NOTE(review): assumes end >= str; confirm against callers. */
	size_t total = (size_t)(end - str);
	if (from >= total) {
		return zend_empty_string;
	}
	if (len > total - from) {
		len = total - from;
	}
	from &= ~(size_t)1;
	len &= ~(size_t)1;
	unsigned char *start = str + from;
	if (len < 2 || (end - start) < 2) {
		return zend_empty_string;
	}
	/* Check if 1st codepoint is 2nd part of surrogate pair
	 * (little-endian: the high-order byte of each code unit comes second) */
	if (from > 0) {
		uint32_t start_cp = ((uint32_t)start[1] << 8) + start[0];
		if (start_cp >= 0xDC00 && start_cp <= 0xDFFF) {
			uint32_t preceding_cp = ((uint32_t)start[-1] << 8) + start[-2];
			if (preceding_cp >= 0xD800 && preceding_cp <= 0xDBFF) {
				/* Move the actual cut pointer; `from` is not consulted
				 * again below, so adjusting it alone would do nothing */
				start -= 2;
			}
		}
	}
	/* Same for ending cut point */
	unsigned char *cut_end = start + len;
	if (cut_end > end) {
		cut_end = end;
	}
	uint32_t ending_cp = ((uint32_t)cut_end[-1] << 8) + cut_end[-2];
	if (ending_cp >= 0xD800 && ending_cp <= 0xDBFF) {
		/* Last code unit is a high surrogate whose partner falls outside
		 * the cut; leave it out */
		cut_end -= 2;
	}
	return zend_string_init_fast((char*)start, cut_end - start);
}

/* mb_strcut for "UTF-16", where byte order is selected by an optional BOM.
 *
 * The first two bytes are read as a big-endian 16-bit value. 0xFFFE there
 * selects the little-endian cutter; anything else goes to the big-endian
 * cutter. When a BOM (0xFFFE or 0xFEFF) is present and `from` is less than 2,
 * `from` is raised to 2 before delegating.
 *
 * Returns zend_empty_string when `len` is below 2 or the string holds fewer
 * than 2 bytes. */
static zend_string* mb_cut_utf16(unsigned char *str, size_t from, size_t len, unsigned char *end)
{
	if ((end - str) < 2 || len < 2) {
		return zend_empty_string;
	}

	uint32_t bom = ((uint32_t)str[0] << 8) | str[1];
	int is_little_endian = (bom == 0xFFFE);
	int has_bom = is_little_endian || (bom == 0xFEFF);

	if (has_bom && from < 2) {
		from = 2;
	}

	return is_little_endian
		? mb_cut_utf16le(str, from, len, end)
		: mb_cut_utf16be(str, from, len, end);
}
2 changes: 1 addition & 1 deletion ext/mbstring/tests/mb_strcut.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ OK
Single byte: []
With from=1: []
Bad surrogate: []
Bad surrogate followed by other bytes: [003f1243]
Bad surrogate followed by other bytes: [d9001243]
BE byte order mark: []
LE byte order mark: []
Length=0: []
Expand Down

0 comments on commit d04854b

Please sign in to comment.