From 642e11140c3a8461ab9a78d6ce3d3a79b0151987 Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:43:26 +0000 Subject: [PATCH] Minor pcre optimizations (#12923) * Update signature of pcre API This changes the variables that are bools to actually be bools instead of ints, which allows some additional optimization by the compiler (e.g. removing some ternaries and move extensions). It also gets rid of the use_flags argument because that's just the same as flags == 0. This reduces the call frame. * Use zend_string_release_ex where possible * Remove duplicate symbols from strchr * Avoid useless value conversions * Use a raw HashTable* instead of a zval * Move condition * Make for loop cheaper by reusing a recently used value as start iteration index * Remove useless condition This can't be true if the second condition is true because it would require the string to occupy the entire address space. * Upgrading + remark --- UPGRADING.INTERNALS | 6 ++++ ext/fileinfo/libmagic/softmagic.c | 2 +- ext/imap/php_imap.c | 2 +- ext/pcre/php_pcre.c | 60 ++++++++++++++++--------------- ext/pcre/php_pcre.h | 4 +-- ext/spl/spl_iterators.c | 11 ++---- 6 files changed, 43 insertions(+), 42 deletions(-) diff --git a/UPGRADING.INTERNALS b/UPGRADING.INTERNALS index a7b6e7cf825c5..e63e440f5a178 100644 --- a/UPGRADING.INTERNALS +++ b/UPGRADING.INTERNALS @@ -59,6 +59,12 @@ PHP 8.4 INTERNALS UPGRADE NOTES e. ext/date - Added the php_format_date_ex() API to format instances of php_date_obj. + d. ext/pcre + - php_pcre_match_impl() now no longer has a use_flags argument. + When flags should be ignored, pass 0 to the flags argument. + - php_pcre_match_impl() and pcre_get_compiled_regex_cache_ex() now use + proper boolean argument types instead of integer types. + ======================== 4. OpCode changes ======================== diff --git a/ext/fileinfo/libmagic/softmagic.c b/ext/fileinfo/libmagic/softmagic.c index 32d24a37d3df1..9cfdfd85f1758 100644 --- a/ext/fileinfo/libmagic/softmagic.c +++ b/ext/fileinfo/libmagic/softmagic.c @@ -2266,7 +2266,7 @@ magiccheck(struct magic_set *ms, struct magic *m) haystack = zend_string_init(ms->search.s, ms->search.s_len, 0); /* match v = 0, no match v = 1 */ - php_pcre_match_impl(pce, haystack, &retval, &subpats, 0, 1, PREG_OFFSET_CAPTURE, 0); + php_pcre_match_impl(pce, haystack, &retval, &subpats, 0, PREG_OFFSET_CAPTURE, 0); /* Free haystack */ zend_string_release(haystack); diff --git a/ext/imap/php_imap.c b/ext/imap/php_imap.c index 22ad1b0898c0c..e94270c81bae7 100644 --- a/ext/imap/php_imap.c +++ b/ext/imap/php_imap.c @@ -806,7 +806,7 @@ PHP_FUNCTION(imap_append) zend_string_release(regex); php_pcre_match_impl(pce, internal_date, return_value, subpats, global, - 0, Z_L(0), Z_L(0)); + Z_L(0), Z_L(0)); if (!Z_LVAL_P(return_value)) { // TODO Promoto to error? diff --git a/ext/pcre/php_pcre.c b/ext/pcre/php_pcre.c index 1169e0693e6a6..075d9fc5263b5 100644 --- a/ext/pcre/php_pcre.c +++ b/ext/pcre/php_pcre.c @@ -527,7 +527,7 @@ static void free_subpats_table(zend_string **subpat_names, uint32_t num_subpats) uint32_t i; for (i = 0; i < num_subpats; i++) { if (subpat_names[i]) { - zend_string_release(subpat_names[i]); + zend_string_release_ex(subpat_names[i], false); } } efree(subpat_names); @@ -584,7 +584,7 @@ static zend_always_inline size_t calculate_unit_length(pcre_cache_entry *pce, co /* }}} */ /* {{{ pcre_get_compiled_regex_cache */ -PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, int locale_aware) +PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, bool locale_aware) { pcre2_code *re = NULL; #if 10 == PCRE2_MAJOR && 37 == PCRE2_MINOR && !HAVE_BUNDLED_PCRE @@ -655,7 +655,7 @@ PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, in } start_delimiter = delimiter; - if ((pp = strchr("([{< )]}> )]}>", delimiter))) + if ((pp = strchr("([{< )]}>", delimiter))) delimiter = pp[5]; end_delimiter = delimiter; @@ -879,7 +879,7 @@ PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, in /* {{{ pcre_get_compiled_regex_cache */ PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(zend_string *regex) { - return pcre_get_compiled_regex_cache_ex(regex, 1); + return pcre_get_compiled_regex_cache_ex(regex, true); } /* }}} */ @@ -979,7 +979,7 @@ static inline void add_named( /* {{{ add_offset_pair */ static inline void add_offset_pair( HashTable *const result, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset, - zend_string *name, uint32_t unmatched_as_null) + zend_string *name, zend_long unmatched_as_null) { zval match_pair; @@ -1013,8 +1013,8 @@ static inline void add_offset_pair( static void populate_subpat_array( zval *subpats, const char *subject, PCRE2_SIZE *offsets, zend_string **subpat_names, uint32_t num_subpats, int count, const PCRE2_SPTR mark, zend_long flags) { - bool offset_capture = (flags & PREG_OFFSET_CAPTURE) != 0; - bool unmatched_as_null = (flags & PREG_UNMATCHED_AS_NULL) != 0; + zend_long offset_capture = flags & PREG_OFFSET_CAPTURE; + zend_long unmatched_as_null = flags & PREG_UNMATCHED_AS_NULL; zval val; int i; HashTable *subpats_ht = Z_ARRVAL_P(subpats); @@ -1079,7 +1079,7 @@ static void populate_subpat_array( } } -static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ */ +static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, bool global) /* {{{ */ { /* parameters */ zend_string *regex; /* Regular expression */ @@ -1105,7 +1105,7 @@ static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ * pce->refcount++; php_pcre_match_impl(pce, subject, return_value, subpats, - global, ZEND_NUM_ARGS() >= 4, flags, start_offset); + global, flags, start_offset); pce->refcount--; } /* }}} */ @@ -1128,7 +1128,7 @@ static zend_always_inline bool is_known_valid_utf8( /* {{{ php_pcre_match_impl() */ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value, - zval *subpats, int global, int use_flags, zend_long flags, zend_off_t start_offset) + zval *subpats, bool global, zend_long flags, zend_off_t start_offset) { zval result_set; /* Holds a set of subpatterns after a global match */ @@ -1142,17 +1142,15 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, size_t i; uint32_t subpats_order; /* Order of subpattern matches */ uint32_t offset_capture; /* Capture match offsets: yes/no */ - uint32_t unmatched_as_null; /* Null non-matches: yes/no */ + zend_long unmatched_as_null; /* Null non-matches: yes/no */ PCRE2_SPTR mark = NULL; /* Target for MARK name */ - zval marks; /* Array of marks for PREG_PATTERN_ORDER */ + HashTable *marks = NULL; /* Array of marks for PREG_PATTERN_ORDER */ pcre2_match_data *match_data; PCRE2_SIZE start_offset2, orig_start_offset; char *subject = ZSTR_VAL(subject_str); size_t subject_len = ZSTR_LEN(subject_str); - ZVAL_UNDEF(&marks); - /* Overwrite the passed-in value for subpatterns with an empty array. */ if (subpats != NULL) { subpats = zend_try_array_init(subpats); @@ -1163,7 +1161,7 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, subpats_order = global ? PREG_PATTERN_ORDER : 0; - if (use_flags) { + if (flags) { offset_capture = flags & PREG_OFFSET_CAPTURE; unmatched_as_null = flags & PREG_UNMATCHED_AS_NULL; @@ -1173,11 +1171,11 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, */ if (flags & 0xff) { subpats_order = flags & 0xff; - } - if ((global && (subpats_order < PREG_PATTERN_ORDER || subpats_order > PREG_SET_ORDER)) || - (!global && subpats_order != 0)) { - zend_argument_value_error(4, "must be a PREG_* constant"); - RETURN_THROWS(); + if ((global && (subpats_order < PREG_PATTERN_ORDER || subpats_order > PREG_SET_ORDER)) || + (!global && subpats_order != 0)) { + zend_argument_value_error(4, "must be a PREG_* constant"); + RETURN_THROWS(); + } } } else { offset_capture = 0; @@ -1301,10 +1299,12 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, mark = pcre2_get_mark(match_data); /* Add MARK, if available */ if (mark) { - if (Z_TYPE(marks) == IS_UNDEF) { - array_init(&marks); + if (!marks) { + marks = zend_new_array(0); } - add_index_string(&marks, matched - 1, (char *) mark); + zval tmp; + ZVAL_STRING(&tmp, (char *) mark); + zend_hash_index_add_new(marks, matched - 1, &tmp); } /* * If the number of captured subpatterns on this run is @@ -1312,7 +1312,7 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, * arrays with NULLs or empty strings. */ if (count < num_subpats) { - for (; i < num_subpats; i++) { + for (int i = count; i < num_subpats; i++) { if (offset_capture) { add_offset_pair( match_sets[i], NULL, PCRE2_UNSET, PCRE2_UNSET, @@ -1394,7 +1394,7 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, /* Execute the regular expression. */ #ifdef HAVE_PCRE_JIT_SUPPORT if ((pce->preg_options & PREG_JIT)) { - if (PCRE2_UNSET == start_offset2 || start_offset2 > subject_len) { + if (start_offset2 > subject_len) { pcre_handle_exec_error(PCRE2_ERROR_BADOFFSET); break; } @@ -1430,8 +1430,10 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, } efree(match_sets); - if (Z_TYPE(marks) != IS_UNDEF) { - add_assoc_zval(subpats, "MARK", &marks); + if (marks) { + zval tmp; + ZVAL_ARR(&tmp, marks); + zend_hash_str_update(Z_ARRVAL_P(subpats), "MARK", sizeof("MARK") - 1, &tmp); } } @@ -1456,14 +1458,14 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, /* {{{ Perform a Perl-style regular expression match */ PHP_FUNCTION(preg_match) { - php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0); + php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, false); } /* }}} */ /* {{{ Perform a Perl-style global regular expression match */ PHP_FUNCTION(preg_match_all) { - php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1); + php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, true); } /* }}} */ diff --git a/ext/pcre/php_pcre.h b/ext/pcre/php_pcre.h index a7c4b9cb1c359..3e526ade551d5 100644 --- a/ext/pcre/php_pcre.h +++ b/ext/pcre/php_pcre.h @@ -47,10 +47,10 @@ typedef enum { } php_pcre_error_code; PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(zend_string *regex); -PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, int locale_aware); +PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, bool locale_aware); PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value, - zval *subpats, int global, int use_flags, zend_long flags, zend_off_t start_offset); + zval *subpats, bool global, zend_long flags, zend_off_t start_offset); PHPAPI zend_string *php_pcre_replace_impl(pcre_cache_entry *pce, zend_string *subject_str, const char *subject, size_t subject_len, zend_string *replace_str, size_t limit, size_t *replace_count); diff --git a/ext/spl/spl_iterators.c b/ext/spl/spl_iterators.c index 69ce2651075a7..482fb80acab8b 100644 --- a/ext/spl/spl_iterators.c +++ b/ext/spl/spl_iterators.c @@ -135,7 +135,6 @@ typedef struct _spl_dual_it_object { pcre_cache_entry *pce; zend_string *regex; regex_mode mode; - int use_flags; } regex; zend_fcall_info_cache callback_filter; } u; @@ -1407,7 +1406,6 @@ static spl_dual_it_object* spl_dual_it_construct(INTERNAL_FUNCTION_PARAMETERS, z zend_string *regex; zend_long mode = REGIT_MODE_MATCH; - intern->u.regex.use_flags = ZEND_NUM_ARGS() >= 5; intern->u.regex.flags = 0; intern->u.regex.preg_flags = 0; if (zend_parse_parameters(ZEND_NUM_ARGS(), "OS|lll", &zobject, ce_inner, ®ex, &mode, &intern->u.regex.flags, &intern->u.regex.preg_flags) == FAILURE) { @@ -1876,7 +1874,7 @@ PHP_METHOD(RegexIterator, accept) zval_ptr_dtor(&intern->current.data); ZVAL_UNDEF(&intern->current.data); php_pcre_match_impl(intern->u.regex.pce, subject, &zcount, - &intern->current.data, intern->u.regex.mode == REGIT_MODE_ALL_MATCHES, intern->u.regex.use_flags, intern->u.regex.preg_flags, 0); + &intern->current.data, intern->u.regex.mode == REGIT_MODE_ALL_MATCHES, intern->u.regex.preg_flags, 0); RETVAL_BOOL(Z_LVAL(zcount) > 0); break; @@ -2006,11 +2004,7 @@ PHP_METHOD(RegexIterator, getPregFlags) SPL_FETCH_AND_CHECK_DUAL_IT(intern, ZEND_THIS); - if (intern->u.regex.use_flags) { - RETURN_LONG(intern->u.regex.preg_flags); - } else { - RETURN_LONG(0); - } + RETURN_LONG(intern->u.regex.preg_flags); } /* }}} */ /* {{{ Set PREG flags */ @@ -2026,7 +2020,6 @@ PHP_METHOD(RegexIterator, setPregFlags) SPL_FETCH_AND_CHECK_DUAL_IT(intern, ZEND_THIS); intern->u.regex.preg_flags = preg_flags; - intern->u.regex.use_flags = 1; } /* }}} */ /* {{{ Create an RecursiveRegexIterator from another recursive iterator and a regular expression */