Skip to content

Commit

Permalink
Minor pcre optimizations (php#12923)
Browse files Browse the repository at this point in the history
* Update signature of pcre API

This changes the variables that are bools to actually be bools instead
of ints, which allows some additional optimization by the compiler (e.g.
removing some ternaries and move extensions).

It also gets rid of the use_flags argument because that's just the same
as flags == 0. This reduces the call frame.

* Use zend_string_release_ex where possible

* Remove duplicate symbols from strchr

* Avoid useless value conversions

* Use a raw HashTable* instead of a zval

* Move condition

* Make for loop cheaper by reusing a recently used value as start iteration index

* Remove useless condition

This can't be true if the second condition is true because it would
require the string to occupy the entire address space.

* Upgrading + remark
  • Loading branch information
nielsdos authored Dec 11, 2023
1 parent 185627f commit 642e111
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 42 deletions.
6 changes: 6 additions & 0 deletions UPGRADING.INTERNALS
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ PHP 8.4 INTERNALS UPGRADE NOTES
e. ext/date
- Added the php_format_date_ex() API to format instances of php_date_obj.

d. ext/pcre
- php_pcre_match_impl() now no longer has a use_flags argument.
When flags should be ignored, pass 0 to the flags argument.
- php_pcre_match_impl() and pcre_get_compiled_regex_cache_ex() now use
proper boolean argument types instead of integer types.

========================
4. OpCode changes
========================
Expand Down
2 changes: 1 addition & 1 deletion ext/fileinfo/libmagic/softmagic.c
Original file line number Diff line number Diff line change
Expand Up @@ -2266,7 +2266,7 @@ magiccheck(struct magic_set *ms, struct magic *m)
haystack = zend_string_init(ms->search.s, ms->search.s_len, 0);

/* match v = 0, no match v = 1 */
php_pcre_match_impl(pce, haystack, &retval, &subpats, 0, 1, PREG_OFFSET_CAPTURE, 0);
php_pcre_match_impl(pce, haystack, &retval, &subpats, 0, PREG_OFFSET_CAPTURE, 0);
/* Free haystack */
zend_string_release(haystack);

Expand Down
2 changes: 1 addition & 1 deletion ext/imap/php_imap.c
Original file line number Diff line number Diff line change
Expand Up @@ -806,7 +806,7 @@ PHP_FUNCTION(imap_append)

zend_string_release(regex);
php_pcre_match_impl(pce, internal_date, return_value, subpats, global,
0, Z_L(0), Z_L(0));
Z_L(0), Z_L(0));

if (!Z_LVAL_P(return_value)) {
// TODO Promoto to error?
Expand Down
60 changes: 31 additions & 29 deletions ext/pcre/php_pcre.c
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,7 @@ static void free_subpats_table(zend_string **subpat_names, uint32_t num_subpats)
uint32_t i;
for (i = 0; i < num_subpats; i++) {
if (subpat_names[i]) {
zend_string_release(subpat_names[i]);
zend_string_release_ex(subpat_names[i], false);
}
}
efree(subpat_names);
Expand Down Expand Up @@ -584,7 +584,7 @@ static zend_always_inline size_t calculate_unit_length(pcre_cache_entry *pce, co
/* }}} */

/* {{{ pcre_get_compiled_regex_cache */
PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, int locale_aware)
PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, bool locale_aware)
{
pcre2_code *re = NULL;
#if 10 == PCRE2_MAJOR && 37 == PCRE2_MINOR && !HAVE_BUNDLED_PCRE
Expand Down Expand Up @@ -655,7 +655,7 @@ PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, in
}

start_delimiter = delimiter;
if ((pp = strchr("([{< )]}> )]}>", delimiter)))
if ((pp = strchr("([{< )]}>", delimiter)))
delimiter = pp[5];
end_delimiter = delimiter;

Expand Down Expand Up @@ -879,7 +879,7 @@ PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, in
/* {{{ pcre_get_compiled_regex_cache */
PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(zend_string *regex)
{
return pcre_get_compiled_regex_cache_ex(regex, 1);
return pcre_get_compiled_regex_cache_ex(regex, true);
}
/* }}} */

Expand Down Expand Up @@ -979,7 +979,7 @@ static inline void add_named(
/* {{{ add_offset_pair */
static inline void add_offset_pair(
HashTable *const result, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset,
zend_string *name, uint32_t unmatched_as_null)
zend_string *name, zend_long unmatched_as_null)
{
zval match_pair;

Expand Down Expand Up @@ -1013,8 +1013,8 @@ static inline void add_offset_pair(
static void populate_subpat_array(
zval *subpats, const char *subject, PCRE2_SIZE *offsets, zend_string **subpat_names,
uint32_t num_subpats, int count, const PCRE2_SPTR mark, zend_long flags) {
bool offset_capture = (flags & PREG_OFFSET_CAPTURE) != 0;
bool unmatched_as_null = (flags & PREG_UNMATCHED_AS_NULL) != 0;
zend_long offset_capture = flags & PREG_OFFSET_CAPTURE;
zend_long unmatched_as_null = flags & PREG_UNMATCHED_AS_NULL;
zval val;
int i;
HashTable *subpats_ht = Z_ARRVAL_P(subpats);
Expand Down Expand Up @@ -1079,7 +1079,7 @@ static void populate_subpat_array(
}
}

static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ */
static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, bool global) /* {{{ */
{
/* parameters */
zend_string *regex; /* Regular expression */
Expand All @@ -1105,7 +1105,7 @@ static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ *

pce->refcount++;
php_pcre_match_impl(pce, subject, return_value, subpats,
global, ZEND_NUM_ARGS() >= 4, flags, start_offset);
global, flags, start_offset);
pce->refcount--;
}
/* }}} */
Expand All @@ -1128,7 +1128,7 @@ static zend_always_inline bool is_known_valid_utf8(

/* {{{ php_pcre_match_impl() */
PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value,
zval *subpats, int global, int use_flags, zend_long flags, zend_off_t start_offset)
zval *subpats, bool global, zend_long flags, zend_off_t start_offset)
{
zval result_set; /* Holds a set of subpatterns after
a global match */
Expand All @@ -1142,17 +1142,15 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
size_t i;
uint32_t subpats_order; /* Order of subpattern matches */
uint32_t offset_capture; /* Capture match offsets: yes/no */
uint32_t unmatched_as_null; /* Null non-matches: yes/no */
zend_long unmatched_as_null; /* Null non-matches: yes/no */
PCRE2_SPTR mark = NULL; /* Target for MARK name */
zval marks; /* Array of marks for PREG_PATTERN_ORDER */
HashTable *marks = NULL; /* Array of marks for PREG_PATTERN_ORDER */
pcre2_match_data *match_data;
PCRE2_SIZE start_offset2, orig_start_offset;

char *subject = ZSTR_VAL(subject_str);
size_t subject_len = ZSTR_LEN(subject_str);

ZVAL_UNDEF(&marks);

/* Overwrite the passed-in value for subpatterns with an empty array. */
if (subpats != NULL) {
subpats = zend_try_array_init(subpats);
Expand All @@ -1163,7 +1161,7 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,

subpats_order = global ? PREG_PATTERN_ORDER : 0;

if (use_flags) {
if (flags) {
offset_capture = flags & PREG_OFFSET_CAPTURE;
unmatched_as_null = flags & PREG_UNMATCHED_AS_NULL;

Expand All @@ -1173,11 +1171,11 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
*/
if (flags & 0xff) {
subpats_order = flags & 0xff;
}
if ((global && (subpats_order < PREG_PATTERN_ORDER || subpats_order > PREG_SET_ORDER)) ||
(!global && subpats_order != 0)) {
zend_argument_value_error(4, "must be a PREG_* constant");
RETURN_THROWS();
if ((global && (subpats_order < PREG_PATTERN_ORDER || subpats_order > PREG_SET_ORDER)) ||
(!global && subpats_order != 0)) {
zend_argument_value_error(4, "must be a PREG_* constant");
RETURN_THROWS();
}
}
} else {
offset_capture = 0;
Expand Down Expand Up @@ -1301,18 +1299,20 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
mark = pcre2_get_mark(match_data);
/* Add MARK, if available */
if (mark) {
if (Z_TYPE(marks) == IS_UNDEF) {
array_init(&marks);
if (!marks) {
marks = zend_new_array(0);
}
add_index_string(&marks, matched - 1, (char *) mark);
zval tmp;
ZVAL_STRING(&tmp, (char *) mark);
zend_hash_index_add_new(marks, matched - 1, &tmp);
}
/*
* If the number of captured subpatterns on this run is
* less than the total possible number, pad the result
* arrays with NULLs or empty strings.
*/
if (count < num_subpats) {
for (; i < num_subpats; i++) {
for (int i = count; i < num_subpats; i++) {
if (offset_capture) {
add_offset_pair(
match_sets[i], NULL, PCRE2_UNSET, PCRE2_UNSET,
Expand Down Expand Up @@ -1394,7 +1394,7 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
/* Execute the regular expression. */
#ifdef HAVE_PCRE_JIT_SUPPORT
if ((pce->preg_options & PREG_JIT)) {
if (PCRE2_UNSET == start_offset2 || start_offset2 > subject_len) {
if (start_offset2 > subject_len) {
pcre_handle_exec_error(PCRE2_ERROR_BADOFFSET);
break;
}
Expand Down Expand Up @@ -1430,8 +1430,10 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
}
efree(match_sets);

if (Z_TYPE(marks) != IS_UNDEF) {
add_assoc_zval(subpats, "MARK", &marks);
if (marks) {
zval tmp;
ZVAL_ARR(&tmp, marks);
zend_hash_str_update(Z_ARRVAL_P(subpats), "MARK", sizeof("MARK") - 1, &tmp);
}
}

Expand All @@ -1456,14 +1458,14 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
/* {{{ Perform a Perl-style regular expression match */
PHP_FUNCTION(preg_match)
{
php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, false);
}
/* }}} */

/* {{{ Perform a Perl-style global regular expression match */
PHP_FUNCTION(preg_match_all)
{
php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, true);
}
/* }}} */

Expand Down
4 changes: 2 additions & 2 deletions ext/pcre/php_pcre.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,10 @@ typedef enum {
} php_pcre_error_code;

PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(zend_string *regex);
PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, int locale_aware);
PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, bool locale_aware);

PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value,
zval *subpats, int global, int use_flags, zend_long flags, zend_off_t start_offset);
zval *subpats, bool global, zend_long flags, zend_off_t start_offset);

PHPAPI zend_string *php_pcre_replace_impl(pcre_cache_entry *pce, zend_string *subject_str, const char *subject, size_t subject_len, zend_string *replace_str,
size_t limit, size_t *replace_count);
Expand Down
11 changes: 2 additions & 9 deletions ext/spl/spl_iterators.c
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,6 @@ typedef struct _spl_dual_it_object {
pcre_cache_entry *pce;
zend_string *regex;
regex_mode mode;
int use_flags;
} regex;
zend_fcall_info_cache callback_filter;
} u;
Expand Down Expand Up @@ -1407,7 +1406,6 @@ static spl_dual_it_object* spl_dual_it_construct(INTERNAL_FUNCTION_PARAMETERS, z
zend_string *regex;
zend_long mode = REGIT_MODE_MATCH;

intern->u.regex.use_flags = ZEND_NUM_ARGS() >= 5;
intern->u.regex.flags = 0;
intern->u.regex.preg_flags = 0;
if (zend_parse_parameters(ZEND_NUM_ARGS(), "OS|lll", &zobject, ce_inner, &regex, &mode, &intern->u.regex.flags, &intern->u.regex.preg_flags) == FAILURE) {
Expand Down Expand Up @@ -1876,7 +1874,7 @@ PHP_METHOD(RegexIterator, accept)
zval_ptr_dtor(&intern->current.data);
ZVAL_UNDEF(&intern->current.data);
php_pcre_match_impl(intern->u.regex.pce, subject, &zcount,
&intern->current.data, intern->u.regex.mode == REGIT_MODE_ALL_MATCHES, intern->u.regex.use_flags, intern->u.regex.preg_flags, 0);
&intern->current.data, intern->u.regex.mode == REGIT_MODE_ALL_MATCHES, intern->u.regex.preg_flags, 0);
RETVAL_BOOL(Z_LVAL(zcount) > 0);
break;

Expand Down Expand Up @@ -2006,11 +2004,7 @@ PHP_METHOD(RegexIterator, getPregFlags)

SPL_FETCH_AND_CHECK_DUAL_IT(intern, ZEND_THIS);

if (intern->u.regex.use_flags) {
RETURN_LONG(intern->u.regex.preg_flags);
} else {
RETURN_LONG(0);
}
RETURN_LONG(intern->u.regex.preg_flags);
} /* }}} */

/* {{{ Set PREG flags */
Expand All @@ -2026,7 +2020,6 @@ PHP_METHOD(RegexIterator, setPregFlags)
SPL_FETCH_AND_CHECK_DUAL_IT(intern, ZEND_THIS);

intern->u.regex.preg_flags = preg_flags;
intern->u.regex.use_flags = 1;
} /* }}} */

/* {{{ Create an RecursiveRegexIterator from another recursive iterator and a regular expression */
Expand Down

0 comments on commit 642e111

Please sign in to comment.