diff --git a/RunTest b/RunTest index d426a59a7..e4ab7ab06 100755 --- a/RunTest +++ b/RunTest @@ -89,7 +89,8 @@ title23="Test 23: \C disabled test" title24="Test 24: Non-UTF pattern conversion tests" title25="Test 25: UTF pattern conversion tests" title26="Test 26: Auto-generated unicode property tests" -maxtest=26 +title27="Test 27: Pattern rewriter tests" +maxtest=27 titleheap="Test 'heap': Environment-specific heap tests" if [ $# -eq 1 -a "$1" = "list" ]; then @@ -120,6 +121,7 @@ if [ $# -eq 1 -a "$1" = "list" ]; then echo $title24 echo $title25 echo $title26 + echo $title27 echo "" echo $titleheap echo "" @@ -255,6 +257,7 @@ do23=no do24=no do25=no do26=no +do27=no doheap=no while [ $# -gt 0 ] ; do @@ -286,6 +289,7 @@ while [ $# -gt 0 ] ; do 24) do24=yes;; 25) do25=yes;; 26) do26=yes;; + 27) do27=yes;; heap) doheap=yes;; -8) arg8=yes;; -16) arg16=yes;; @@ -437,7 +441,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \ $do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \ $do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \ $do20 = no -a $do21 = no -a $do22 = no -a $do23 = no -a \ - $do24 = no -a $do25 = no -a $do26 = no -a $doheap = no \ + $do24 = no -a $do25 = no -a $do26 = no -a $do27 = no -a $doheap = no \ ]; then do0=yes do1=yes @@ -466,6 +470,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \ do24=yes do25=yes do26=yes + do27=yes fi # Handle any explicit skips at this stage, so that an argument list may consist @@ -898,6 +903,18 @@ for bmode in "$test8" "$test16" "$test32"; do fi fi + # Pattern rewriter tests + + if [ $do27 = yes ] ; then + echo $title27 + if [ $utf -eq 0 ] ; then + echo " Skipped because UTF-$bits support is not available" + else + $sim $valgrind ./pcre2test -q $setstack $bmode $testdata/testinput27 testtry + checkresult $? 27 "" + fi + fi + # Manually selected heap tests - output may vary in different environments, # which is why that are not automatically run. 
diff --git a/doc/pcre2_compile.3 b/doc/pcre2_compile.3 index 151a7038f..15986b630 100644 --- a/doc/pcre2_compile.3 +++ b/doc/pcre2_compile.3 @@ -65,6 +65,7 @@ The primary option bits are: theses (named ones available) PCRE2_NO_AUTO_POSSESS Disable auto-possessification PCRE2_NO_DOTSTAR_ANCHOR Disable automatic anchoring for .* + PCRE2_NO_PATTERN_REWRITE Disable pattern rewriting optimizations PCRE2_NO_START_OPTIMIZE Disable match-time start optimizations PCRE2_NO_UTF_CHECK Do not check the pattern for UTF validity (only relevant if PCRE2_UTF is set) diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 index a362982d8..5e7e53b09 100644 --- a/doc/pcre2api.3 +++ b/doc/pcre2api.3 @@ -1768,6 +1768,18 @@ automatically anchored if PCRE2_DOTALL is set for all the .* items and PCRE2_MULTILINE is not set for any ^ items. Otherwise, the fact that any match must start either at the start of the subject or following a newline is remembered. Like other optimizations, this can cause callouts to be skipped. +.sp + PCRE2_NO_PATTERN_REWRITE +.sp +This option disables all optimizations which occur during the pattern rewriting +phase (after parsing but before compilation). Pattern rewriting may remove +redundant items, coalesce items, adjust group structure, or replace some +constructs with an equivalent construct. Pattern rewriting will never affect +which strings are and are not matched, or what substrings are captured by +capture groups. However, since it may change the structure of a pattern, +if you are tracing the matching process, you might prefer PCRE2 to use the +original pattern without rewriting. This option is also useful for testing. +Pattern rewriting is also disabled if PCRE2_AUTO_CALLOUT is set. .sp PCRE2_NO_START_OPTIMIZE .sp diff --git a/doc/pcre2callout.3 b/doc/pcre2callout.3 index 86a1c54f6..8955b9afe 100644 --- a/doc/pcre2callout.3 +++ b/doc/pcre2callout.3 @@ -83,7 +83,9 @@ Callouts can be useful for tracking the progress of pattern matching. 
The program has a pattern qualifier (/auto_callout) that sets automatic callouts. When any callouts are present, the output from \fBpcre2test\fP indicates how the pattern is being matched. This is useful information when you are trying to -optimize the performance of a particular pattern. +optimize the performance of a particular pattern. However, note that some +optimizations which adjust the structure of the pattern are disabled when +automatic callouts are enabled. . . .SH "MISSING CALLOUTS" diff --git a/doc/pcre2syntax.3 b/doc/pcre2syntax.3 index b87179157..2f5ee3376 100644 --- a/doc/pcre2syntax.3 +++ b/doc/pcre2syntax.3 @@ -414,6 +414,7 @@ appear. For the first three, d is a decimal number. (*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS) (*NO_DOTSTAR_ANCHOR) no .* anchoring (PCRE2_NO_DOTSTAR_ANCHOR) (*NO_JIT) disable JIT optimization + (*NO_PATTERN_REWRITE) disable pattern rewriting optimizations (PCRE2_NO_PATTERN_REWRITE) (*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OPTIMIZE) (*UTF) set appropriate UTF mode for the library in use (*UCP) set PCRE2_UCP (use Unicode properties for \ed etc) diff --git a/doc/pcre2test.1 b/doc/pcre2test.1 index 9b7d37598..437c41d9b 100644 --- a/doc/pcre2test.1 +++ b/doc/pcre2test.1 @@ -623,6 +623,7 @@ for a description of the effects of these options. 
/n no_auto_capture set PCRE2_NO_AUTO_CAPTURE no_auto_possess set PCRE2_NO_AUTO_POSSESS no_dotstar_anchor set PCRE2_NO_DOTSTAR_ANCHOR + no_pattern_rewrite set PCRE2_NO_PATTERN_REWRITE no_start_optimize set PCRE2_NO_START_OPTIMIZE no_utf_check set PCRE2_NO_UTF_CHECK ucp set PCRE2_UCP diff --git a/doc/pcre2test.txt b/doc/pcre2test.txt index 30e16c8b5..2595e44eb 100644 --- a/doc/pcre2test.txt +++ b/doc/pcre2test.txt @@ -604,6 +604,7 @@ PATTERN MODIFIERS /n no_auto_capture set PCRE2_NO_AUTO_CAPTURE no_auto_possess set PCRE2_NO_AUTO_POSSESS no_dotstar_anchor set PCRE2_NO_DOTSTAR_ANCHOR + no_pattern_rewrite set PCRE2_NO_PATTERN_REWRITE no_start_optimize set PCRE2_NO_START_OPTIMIZE no_utf_check set PCRE2_NO_UTF_CHECK ucp set PCRE2_UCP diff --git a/src/pcre2.h.generic b/src/pcre2.h.generic index 80995fc0b..a8d37d762 100644 --- a/src/pcre2.h.generic +++ b/src/pcre2.h.generic @@ -143,6 +143,7 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_EXTENDED_MORE 0x01000000u /* C */ #define PCRE2_LITERAL 0x02000000u /* C */ #define PCRE2_MATCH_INVALID_UTF 0x04000000u /* J M D */ +#define PCRE2_NO_PATTERN_REWRITE 0x08000000u /* C */ /* An additional compile options word is available in the compile context. */ diff --git a/src/pcre2.h.in b/src/pcre2.h.in index 92f68842f..0a00635a3 100644 --- a/src/pcre2.h.in +++ b/src/pcre2.h.in @@ -143,6 +143,7 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_EXTENDED_MORE 0x01000000u /* C */ #define PCRE2_LITERAL 0x02000000u /* C */ #define PCRE2_MATCH_INVALID_UTF 0x04000000u /* J M D */ +#define PCRE2_NO_PATTERN_REWRITE 0x08000000u /* C */ /* An additional compile options word is available in the compile context. */ diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 410f220b3..aacdd2cd6 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -94,6 +94,7 @@ them will be able to (i.e. assume a 64-bit world). 
*/ #define GETPLUSOFFSET(s,p) s = *(++p) #define READPLUSOFFSET(s,p) s = p[1] #define SKIPOFFSET(p) p++ +#define READOFFSET(p) *(p) #define SIZEOFFSET 1 #else #define PUTOFFSET(s,p) \ @@ -105,6 +106,7 @@ them will be able to (i.e. assume a 64-bit world). */ #define READPLUSOFFSET(s,p) \ { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; } #define SKIPOFFSET(p) p += 2 +#define READOFFSET(p) (((PCRE2_SIZE)(p)[0] << 32) | (PCRE2_SIZE)(p)[1]) #define SIZEOFFSET 2 #endif @@ -790,7 +792,7 @@ are allowed. */ PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \ PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \ PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \ - PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY) + PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_PATTERN_REWRITE|PCRE2_UCP|PCRE2_UNGREEDY) #define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \ (PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_CASELESS_RESTRICT) @@ -847,27 +849,28 @@ typedef struct pso { /* NB: STRING_UTFn_RIGHTPAR contains the length as well */ static const pso pso_list[] = { - { STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF }, - { STRING_UTF_RIGHTPAR, 4, PSO_OPT, PCRE2_UTF }, - { STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP }, - { STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET }, - { STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET }, - { STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPT, PCRE2_NO_AUTO_POSSESS }, - { STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR }, - { STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT }, - { STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPT, PCRE2_NO_START_OPTIMIZE }, - { STRING_LIMIT_HEAP_EQ, 11, PSO_LIMH, 0 }, - { STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 }, - { STRING_LIMIT_DEPTH_EQ, 12, PSO_LIMD, 0 }, - { STRING_LIMIT_RECURSION_EQ, 16, PSO_LIMD, 0 }, - { STRING_CR_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_CR }, - { STRING_LF_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_LF }, - { STRING_CRLF_RIGHTPAR, 5, PSO_NL, 
PCRE2_NEWLINE_CRLF }, - { STRING_ANY_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_ANY }, - { STRING_NUL_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_NUL }, - { STRING_ANYCRLF_RIGHTPAR, 8, PSO_NL, PCRE2_NEWLINE_ANYCRLF }, - { STRING_BSR_ANYCRLF_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_ANYCRLF }, - { STRING_BSR_UNICODE_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_UNICODE } + { STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF }, + { STRING_UTF_RIGHTPAR, 4, PSO_OPT, PCRE2_UTF }, + { STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP }, + { STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET }, + { STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET }, + { STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPT, PCRE2_NO_AUTO_POSSESS }, + { STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR }, + { STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT }, + { STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPT, PCRE2_NO_START_OPTIMIZE }, + { STRING_NO_PATTERN_REWRITE_RIGHTPAR, 19, PSO_OPT, PCRE2_NO_PATTERN_REWRITE }, + { STRING_LIMIT_HEAP_EQ, 11, PSO_LIMH, 0 }, + { STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 }, + { STRING_LIMIT_DEPTH_EQ, 12, PSO_LIMD, 0 }, + { STRING_LIMIT_RECURSION_EQ, 16, PSO_LIMD, 0 }, + { STRING_CR_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_CR }, + { STRING_LF_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_LF }, + { STRING_CRLF_RIGHTPAR, 5, PSO_NL, PCRE2_NEWLINE_CRLF }, + { STRING_ANY_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_ANY }, + { STRING_NUL_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_NUL }, + { STRING_ANYCRLF_RIGHTPAR, 8, PSO_NL, PCRE2_NEWLINE_ANYCRLF }, + { STRING_BSR_ANYCRLF_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_ANYCRLF }, + { STRING_BSR_UNICODE_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_UNICODE } }; /* This table is used when converting repeating opcodes into possessified @@ -2853,7 +2856,7 @@ uint32_t *verblengthptr = NULL; /* Value avoids compiler warning */ uint32_t *verbstartptr = NULL; uint32_t *previous_callout = NULL; uint32_t *parsed_pattern = cb->parsed_pattern; -uint32_t *parsed_pattern_end = cb->parsed_pattern_end; +uint32_t 
*parsed_pattern_limit = cb->parsed_pattern_limit; uint32_t *this_parsed_item = NULL; uint32_t *prev_parsed_item = NULL; uint32_t meta_quantifier = 0; @@ -2879,6 +2882,10 @@ PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */ named_group *ng; nest_save *top_nest, *end_nests; +/* All patterns are wrapped in non-capturing parentheses; this avoids the need for + * special-casing the top level when recursing over groups in the parsed pattern */ +*parsed_pattern++ = META_NOCAPTURE; + /* Insert leading items for word and line matching (features provided for the benefit of pcre2grep). */ @@ -2900,7 +2907,7 @@ if ((options & PCRE2_LITERAL) != 0) { while (ptr < ptrend) { - if (parsed_pattern >= parsed_pattern_end) + if (parsed_pattern >= parsed_pattern_limit) { errorcode = ERR63; /* Internal error (parsed pattern overflow) */ goto FAILED; @@ -2945,7 +2952,7 @@ while (ptr < ptrend) PCRE2_SPTR tempptr; PCRE2_SIZE offset; - if (parsed_pattern >= parsed_pattern_end) + if (parsed_pattern >= parsed_pattern_limit) { errorcode = ERR63; /* Internal error (parsed pattern overflow) */ goto FAILED; @@ -5147,13 +5154,15 @@ else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0) /* Terminate the parsed pattern, then return success if all groups are closed. Otherwise we have unclosed parentheses. */ -if (parsed_pattern >= parsed_pattern_end) +if (parsed_pattern >= parsed_pattern_limit) { errorcode = ERR63; /* Internal error (parsed pattern overflow) */ goto FAILED; } -*parsed_pattern = META_END; +*parsed_pattern++ = META_KET; +*parsed_pattern++ = META_END; +cb->parsed_pattern_end = parsed_pattern; if (nest_depth == 0) return 0; UNCLOSED_PARENTHESIS: @@ -5178,7 +5187,715 @@ errorcode = ERR79; goto FAILED; } +/************************************************* +* Rewrite parsed pattern (to optimize) * +*************************************************/ + +/* First type of rewrite: Common prefixes in alternation branches + * are pulled out from the alternation. 
+ * + * For example: (ab|ac|ad) ⇒ (a(?:b|c|d)) + * + * Care is needed with this transformation, or we might change behavior. + * We cannot pull out any item which is quantified with * or +; for + * example, this transformation would be incorrect: + * + * (a*b|a*c) ⇒ (a*(?:b|c)) (✗ BAD!) + * + * Also, while it is usually safe to pull out non-quantified items from + * a non-capturing group, we cannot do this: + * + * (?:ab|ac)? ⇒ a(?:b|c)? (✗ BAD!) + * + * Further, certain constructs are never safe to pull out from alternation, + * notably: callouts and certain backtracking control verbs. Also, we can + * never pull out a common prefix from a 'conditional' group. + * + * Second type of rewrite: Alternations which only contain single, literal + * characters are rewritten to character classes. + * + * For example: (?:a|b|c) ⇒ [a-c] + */ + +static inline BOOL is_lookahead(uint32_t code) +{ + return code == META_LOOKAHEAD || code == META_LOOKAHEADNOT || code == META_LOOKAHEAD_NA; +} + +static inline BOOL is_lookbehind(uint32_t code) +{ + return code == META_LOOKBEHIND || code == META_LOOKBEHIND_NA || code == META_LOOKBEHINDNOT; +} + +static inline BOOL is_condition(uint32_t code) +{ + return code >= META_COND_ASSERT && code <= META_COND_VERSION; +} + +static inline BOOL is_substring_scan(uint32_t code) +{ + return code == META_SCS_NUMBER || code == META_SCS_NAME; +} + +static inline BOOL is_script_run(uint32_t code) +{ + return code == META_SCRIPT_RUN; +} + +/* Does this item (from within a parsed_pattern) start a grouping construct? 
*/ +static inline BOOL is_group_starter(uint32_t code) +{ + return code == META_ATOMIC || is_lookahead(code) || is_lookbehind(code) || code == META_NOCAPTURE || code == META_CAPTURE || is_condition(code) || is_substring_scan(code) || is_script_run(code); +} + +static inline BOOL is_group_ender(uint32_t code) +{ + return code == META_KET; +} + +static inline BOOL is_class_starter(uint32_t item) +{ + return item == META_CLASS || item == META_CLASS_NOT; +} + +static inline BOOL is_quantifier(uint32_t item) +{ + return item >= META_ASTERISK && item <= META_QUERY_QUERY; +} + +static inline BOOL is_possessive(uint32_t item) +{ + return item == META_ASTERISK_PLUS || item == META_PLUS_PLUS || item == META_QUERY_PLUS || item == META_MINMAX_PLUS; +} + +static inline BOOL is_minmax(uint32_t item) +{ + return item == META_MINMAX || item == META_MINMAX_PLUS || item == META_MINMAX_QUERY; +} + +static inline BOOL is_callout(uint32_t item) +{ + return item == META_CALLOUT_NUMBER || item == META_CALLOUT_STRING; +} + +static inline BOOL is_backtrack_control(uint32_t item) +{ + return item >= META_COMMIT && item <= META_THEN_ARG; +} + +static inline BOOL specific_repeat_count(uint32_t *item) +{ + /* `item` points to a quantifier; is it one with a specific, fixed count like {2}? + * Or is it something like *, +, or {1,2}? */ + if (!is_minmax(*item)) + return FALSE; + uint32_t count1 = item[1], count2 = item[2]; + return count1 == count2; +} + +static inline unsigned int number_of_dataitems(uint32_t item, uint32_t *p) +{ + PCRE2_ASSERT(item >= META_END); + + switch (META_CODE(item)) { + case META_ESCAPE: ; + uint32_t data = META_DATA(item); + if (data == ESC_P || data == ESC_p) + return 1; + else if (data == ESC_g || data == ESC_k) + return 2; + else + return 0; + + case META_BACKREF: + return (META_DATA(item) >= 10) ? 
SIZEOFFSET : 0; + + case META_MARK: + case META_COMMIT_ARG: + case META_PRUNE_ARG: + case META_SKIP_ARG: + case META_THEN_ARG: + /* Data for this item is variable-length; the next value is the number + * of data items */ + return *(p+1) + 1; + + default: + return meta_extra_lengths[(META_CODE(item) >> 16) & 0x7fff]; + } +} + +static uint32_t* find_group_end(uint32_t *start, uint32_t *limit) +{ + PCRE2_ASSERT(start < limit); + PCRE2_ASSERT(is_group_starter(META_CODE(*start))); + start++; + + unsigned int nest_level = 1; + while (start < limit) { + uint32_t item = *start; + if (item >= META_END) { + uint32_t code = META_CODE(item); + if (is_group_starter(code)) { + nest_level++; + } else if (is_group_ender(code)) { + if (--nest_level == 0) { + return start+1; + } + } + start += number_of_dataitems(item, start); + } + start++; + } + PCRE2_UNREACHABLE(); /* regexp improperly formed; should have been caught during parsing */ +} + +static uint32_t* find_class_end(uint32_t *start, uint32_t *limit) +{ + PCRE2_ASSERT(start < limit); + PCRE2_ASSERT(is_class_starter(*start)); + start++; + while (start < limit) { + if (*start++ == META_CLASS_END) { + return start; + } + } + PCRE2_UNREACHABLE(); /* regexp improperly formed; should have been caught during parsing */ +} + +static uint32_t* scan_item(uint32_t *start, uint32_t *limit, uint32_t **quantifier) +{ + PCRE2_ASSERT(start < limit); + uint32_t item = *start; + uint32_t code = META_CODE(item); + uint32_t *result = NULL; + if (is_group_starter(code)) { + result = find_group_end(start, limit); + } else if (is_class_starter(code)) { + result = find_class_end(start, limit); + } else if (code < META_END) { + result = start+1; + } else { + unsigned int n = number_of_dataitems(item, start); + result = start+(n+1); + } + /* Check for quantifier suffix */ + item = *result; + if (is_quantifier(item)) { + if (quantifier != NULL) { + *quantifier = result; + } + result++; + } else if (is_minmax(item)) { + if (quantifier != NULL) { + 
*quantifier = result; + } + result += 3; + } + return result; +} + +static BOOL group_has_no_backtrack_points(uint32_t *start, uint32_t *end) +{ + uint32_t meta = META_CODE(*start); + if (meta == META_ATOMIC || meta == META_LOOKAHEAD || meta == META_LOOKAHEADNOT || meta == META_LOOKBEHIND || meta == META_LOOKBEHINDNOT) + return TRUE; /* these group kinds are treated as having no backtrack points */ + + uint32_t *p = ++start; + if (is_lookbehind(meta)) + p += SIZEOFFSET; /* skip the pattern offset stored after a lookbehind opener (1 or 2 words, per SIZEOFFSET) */ + + while (p < end) { + uint32_t *item_quant = NULL; + uint32_t *item_end = scan_item(p, end, &item_quant); + uint32_t code = META_CODE(*p); + if (code == META_ALT) /* an alternation is a backtrack point */ + return FALSE; + if (item_quant != NULL && !specific_repeat_count(item_quant) && !is_possessive(*item_quant)) /* variable, non-possessive repeat */ + return FALSE; + if (is_group_starter(code) && !group_has_no_backtrack_points(p, item_end)) /* recurse into nested groups */ + return FALSE; + p = item_end; + } + + return TRUE; +} + +static BOOL group_has_no_callouts_or_backtrack_control(uint32_t *start, uint32_t *end) +{ + uint32_t meta = META_CODE(*start); + uint32_t *p = ++start; + if (is_lookbehind(meta)) + p += SIZEOFFSET; /* skip the pattern offset stored after a lookbehind opener (1 or 2 words, per SIZEOFFSET) */ + + while (p < end) { + uint32_t *item_end = scan_item(p, end, NULL); + uint32_t code = META_CODE(*p); + if (is_callout(code) || is_backtrack_control(code)) + return FALSE; + if (is_group_starter(code) && !group_has_no_callouts_or_backtrack_control(p, item_end)) /* recurse into nested groups */ + return FALSE; + p = item_end; + } + + return TRUE; +} + +/* This function assumes there is exactly ONE item in each alternation branch */ +static BOOL all_branches_are_literals(uint32_t *first, unsigned int alt_index, uint32_t **alt_positions) +{ + if (*first >= META_END) + return FALSE; + PCRE2_ASSERT(first+1 == alt_positions[0]); + for (unsigned int i = 0; i < alt_index; i++) { + if (*(alt_positions[i]+1) >= META_END) + return FALSE; + } + return TRUE; +} + +static BOOL all_branches_end_with_literal(unsigned int alt_index, uint32_t **alt_positions, uint32_t last_item) +{ + if (last_item >= META_END) + return FALSE; + for (unsigned int i = 0; i < alt_index; i++) { + if (*(alt_positions[i]-1) >= META_END) {
+ return FALSE; + } + } + return TRUE; +} + +typedef struct { + uint32_t *p; + uint32_t *start; + uint32_t *limit; +} rewrite_buf; + +#define REWRITE_BUF_BASE_SIZE 16 +#define rewrite_buf_size(buf) ((buf)->limit - (buf)->start) +#define rewrite_buf_offset(buf) ((buf)->p - (buf)->start) +#define rewrite_buf_space(buf) (size_t)((buf)->limit - (buf)->p) + +static inline void rewrite_buf_init(rewrite_buf *buf) +{ + buf->start = buf->p = buf->limit = NULL; +} + +/* To increase performance when required size of buffer is known ahead of time */ +static inline void rewrite_buf_prealloc(rewrite_buf *buf, size_t size, pcre2_memctl *memctl) +{ + PCRE2_ASSERT(!buf->start); + buf->start = buf->p = memctl->malloc(size * sizeof(uint32_t), memctl->memory_data); + buf->limit = buf->start + size; +} + +static inline void rewrite_buf_realloc(rewrite_buf *buf, size_t new_size, pcre2_memctl *memctl) +{ + PCRE2_ASSERT(buf->start); + uint32_t *expanded = memctl->malloc(new_size * sizeof(uint32_t), memctl->memory_data); + memcpy(expanded, buf->start, (buf->p - buf->start) * sizeof(uint32_t)); + size_t offset = buf->p - buf->start; + memctl->free(buf->start, memctl->memory_data); + buf->start = expanded; + buf->p = expanded + offset; + buf->limit = expanded + new_size; +} + +static inline void rewrite_buf_ensure(rewrite_buf *buf, size_t needed, pcre2_memctl *memctl) +{ + if (buf->start == NULL) { + if (REWRITE_BUF_BASE_SIZE > needed) needed = REWRITE_BUF_BASE_SIZE; + rewrite_buf_prealloc(buf, needed, memctl); + } else if (rewrite_buf_space(buf) < needed) { + needed += rewrite_buf_offset(buf); + size_t grow_size = rewrite_buf_size(buf) * 2; + if (grow_size > needed) needed = grow_size; + rewrite_buf_realloc(buf, needed, memctl); + } +} + +static inline void rewrite_buf_append(rewrite_buf *buf, uint32_t item, pcre2_memctl *memctl) +{ + rewrite_buf_ensure(buf, 1, memctl); + *(buf->p)++ = item; +} + +static inline void rewrite_buf_copy(rewrite_buf *buf, uint32_t *src, size_t count, 
pcre2_memctl *memctl) +{ + rewrite_buf_ensure(buf, count, memctl); + memcpy(buf->p, src, count * sizeof(uint32_t)); + buf->p += count; +} + +static inline void rewrite_finish(rewrite_buf *buf, compile_block *cb, pcre2_memctl *memctl) +{ + /* Was the regex actually modified? If so, update `cb` accordingly */ + if (buf->start != NULL) { + rewrite_buf_append(buf, META_END, memctl); + cb->parsed_pattern = buf->start; + cb->parsed_pattern_end = buf->p; + cb->parsed_pattern_limit = buf->limit; + rewrite_buf_init(buf); + } +} + +static void rewrite_alternation(uint32_t *start, uint32_t *end, uint32_t *quantifier, rewrite_buf *buf, uint32_t *pattern, PCRE2_SPTR patstring, pcre2_memctl *memctl) +{ + /* Skip over the opening paren */ + uint32_t *first = start + 1; + + /* We can't rewrite alternation in a conditional group, since it has a special meaning + * (the first branch is taken if the condition is true, second branch if false) + * + * We don't attempt to rewrite alternation in a lookbehind assertion, for a different reason: + * In many cases, doing so would convert fixed-length lookbehind to variable lookbehind, + * and PCRE2 handles fixed-length lookbehind far more efficiently + * Further, in some cases, rewriting alternation in lookbehind assertions could even cause + * compilation to fail with a "maximum variable lookbehind length exceeded" error + * + * However, in either case, the group might still contain subgroups which can be rewritten */ + if (is_condition(*start) || is_lookbehind(META_CODE(*start))) { + goto DONT_REWRITE; + } + + /* Sometimes this function can just pull out some common prefix from a group, like (?:abc|abd) ⇒ ab(?:c|d) + * In other cases, we need to copy over the opening paren when rewriting, like (abc|abd) ⇒ (ab(?:c|d)) */ + BOOL copy_opening_paren = (*start != META_NOCAPTURE) || quantifier != NULL; + if (*first == META_OPTIONS) { + copy_opening_paren = TRUE; + first += 3; + } + + /* First, pass over the group and find alternation branches */ 
+ uint32_t *alt_position_buf[16]; + uint32_t **alt_positions = alt_position_buf; + uint32_t alt_limit = (sizeof(alt_position_buf) / sizeof(uint32_t*)); + unsigned int alt_index = 0; + + uint32_t *p = first; + while (p < end) { + uint32_t *item_end = scan_item(p, end, NULL); + uint32_t code = META_CODE(*p); + if (code == META_ALT) { + if (alt_index == alt_limit) { + unsigned int new_limit = alt_limit * 3; + uint32_t **new_alt_positions = memctl->malloc(new_limit * sizeof(uint32_t*), memctl->memory_data); + memcpy(new_alt_positions, alt_positions, alt_limit * sizeof(uint32_t*)); + if (alt_positions != alt_position_buf) { + memctl->free(alt_positions, memctl->memory_data); + } + alt_positions = new_alt_positions; + alt_limit = new_limit; + } + alt_positions[alt_index++] = p; + } else if (code == META_KET) { + break; + } + p = item_end; + } + + /* See if we can pull out any common prefix */ + if (alt_index > 0) { + size_t smallest_branch = alt_positions[0] - first; + size_t largest_branch = smallest_branch; + for (unsigned int i = 1; i < alt_index; i++) { + size_t branch_size = (alt_positions[i] - alt_positions[i-1]) - 1; + if (branch_size < smallest_branch) smallest_branch = branch_size; + if (branch_size > largest_branch) largest_branch = branch_size; + } + size_t branch_size = ((quantifier != NULL ? quantifier : end) - alt_positions[alt_index-1]) - 2; + if (branch_size < smallest_branch) smallest_branch = branch_size; + if (branch_size > largest_branch) largest_branch = branch_size; + + /* We can't pull a common prefix out if there is an empty alternation branch */ + if (smallest_branch > 0) { + /* First check if we have an alternation like (a|b|c) which can be rewritten + * as a character class (i.e. 
[abc]) */ + if (largest_branch == 1 && all_branches_are_literals(first, alt_index, alt_positions)) { + /* First copy some prefix of overall pattern if needed */ + if (buf->start == NULL) { + rewrite_buf_prealloc(buf, end - pattern, memctl); + rewrite_buf_copy(buf, pattern, start - pattern, memctl); + } + if (copy_opening_paren) { + rewrite_buf_copy(buf, start, first - start, memctl); + } + /* Rewrite alternation into character class */ + rewrite_buf_append(buf, META_CLASS, memctl); + rewrite_buf_append(buf, *first, memctl); + for (unsigned int i = 0; i < alt_index; i++) { + rewrite_buf_append(buf, *(alt_positions[i]+1), memctl); + } + rewrite_buf_append(buf, META_CLASS_END, memctl); + goto FINISH_REWRITE; + } + + /* Find longest common prefix, if any, in all the alternation branches */ + uint32_t *first_branch_end = alt_positions[0]; + uint32_t *extract_up_to = first; + while (extract_up_to < first_branch_end) { + uint32_t *item = extract_up_to; + uint32_t *item_quant = NULL; + uint32_t *item_end = scan_item(item, first_branch_end, &item_quant); + if (item_quant != NULL && !specific_repeat_count(item_quant) && !is_possessive(*item_quant)) { + /* We can't pull out an item with a quantifier like * or + */ + goto FOUND_LONGEST_PREFIX; + } + if (is_backtrack_control(*item)) { + /* Pulling out (*PRUNE), (*COMMIT), (*SKIP), or (*THEN) would change behavior. + * These verbs take effect if there is a matching failure which causes + * backtracking to reach them, which might not happen if they are pulled + * out from an alternation */ + goto FOUND_LONGEST_PREFIX; + } + if (is_callout(*item)) { + /* Pulling out a callout from each alternation branch would change + * observable behavior; instead of being called at the beginning of + * each branch, the callout function would be called just once */ + goto FOUND_LONGEST_PREFIX; + } + PCRE2_ASSERT(item_end > item); /* Don't get stuck in an infinite loop! 
*/ + if ((size_t)(item_end - first) > smallest_branch) { + goto FOUND_LONGEST_PREFIX; + } + uint32_t meta = META_CODE(*item); + if (is_group_starter(meta)) { + if (!group_has_no_backtrack_points(item, item_end)) { + /* We can't pull out a group which the regex engine might backtrack into; + * doing so could change what the regex matches + * (It would never change a match failure into success, but if there is + * more than one substring in the target string which could possibly + * match the regex, it might change which one is actually returned) */ + goto FOUND_LONGEST_PREFIX; + } + if (!group_has_no_callouts_or_backtrack_control(item, item_end)) { + /* We can't pull out groups which contain callouts or certain + * backtracking control verbs, for the same reasons explained above */ + goto FOUND_LONGEST_PREFIX; + } + } + /* Check if the corresponding item in all subsequent alternation branches + * match the item in the first branch */ + size_t offset = item - first; + size_t compare_len = item_end - item; + if (!is_lookbehind(meta) && meta != META_RECURSE && meta != META_RECURSE_BYNAME) { + for (unsigned int i = 0; i < alt_index; i++) { + uint32_t *compare_with = alt_positions[i]+offset+1; + if (memcmp(item, compare_with, compare_len * sizeof(uint32_t)) != 0) { + goto FOUND_LONGEST_PREFIX; + } + uint32_t following = *(compare_with + compare_len); + if (is_quantifier(following) || is_minmax(following)) { + /* There is an 'identical' item in the first alternation branch and + * the one we are just checking... but the latter one is quantified + * while the first one was not, so they don't really match */ + goto FOUND_LONGEST_PREFIX; + } + } + } else { + /* For each lookbehind assertion and by-number subroutine call, we + * have an offset which points to the location where the construct + * occurred in the original pattern string. 
+ * Those are used only for error messages, and will obviously be different + * even if the constructs are otherwise the same, so don't compare them */ + size_t skip = SIZEOFFSET+1; + if (meta == META_RECURSE_BYNAME) + skip++; + compare_len -= skip; + for (unsigned int i = 0; i < alt_index; i++) { + uint32_t *compare_with = alt_positions[i]+offset+1; + if (*compare_with != *item) { + goto FOUND_LONGEST_PREFIX; + } + if (memcmp(item+skip, compare_with+skip, compare_len*sizeof(uint32_t)) != 0) { + goto FOUND_LONGEST_PREFIX; + } + uint32_t following = *(compare_with + compare_len + skip); + if (is_quantifier(following) || is_minmax(following)) { + goto FOUND_LONGEST_PREFIX; + } + } + if (meta == META_RECURSE_BYNAME) { + /* All alternation branches have a by-name subroutine call in the same place + * Confirm if they are calling the same named group */ + size_t name_len = item[1]; + size_t name_offset = READOFFSET(&item[2]); + PCRE2_SPTR name_ptr = &patstring[name_offset]; + for (unsigned int i = 0; i < alt_index; i++) { + uint32_t *compare_with = alt_positions[i]+offset+1; + size_t compare_len = compare_with[1]; + size_t compare_offset = READOFFSET(&compare_with[2]); + if (compare_offset == name_offset) + continue; + PCRE2_SPTR compare_ptr = &patstring[compare_offset]; + if (name_len != compare_len || memcmp((const char*)name_ptr, (const char*)compare_ptr, name_len * (PCRE2_CODE_UNIT_WIDTH >> 3)) != 0) { + goto FOUND_LONGEST_PREFIX; + } + } + } + } + extract_up_to = item_end; + } + + FOUND_LONGEST_PREFIX: + if (extract_up_to != first) { + /* Rewrite alternation + * Do we need to copy over some prefix of the pattern up until here? */ + if (buf->start == NULL) { + rewrite_buf_prealloc(buf, end - pattern, memctl); + rewrite_buf_copy(buf, pattern, start - pattern, memctl); + } + + /* Do we need an opening paren? 
*/ + if (copy_opening_paren) { + rewrite_buf_copy(buf, start, first - start, memctl); + } + + /* Copy the common prefix + * If it is necessary to grow the rewrite_buf, try to do it just once for + * performance. The estimate of buffer space needed for the entire rewritten + * group is a bit obscure; the '3' is for an added BRA if needed, plus 2× KET + * The '+ 1' is to make space for all instances of | (META_ALT) */ + rewrite_buf_ensure(buf, 3 + largest_branch + (alt_index * (largest_branch + 1 - (extract_up_to - first))), memctl); + uint32_t *item = first; + while (item < extract_up_to) { + uint32_t *item_quant = NULL; + uint32_t *item_end = scan_item(item, extract_up_to, &item_quant); + uint32_t meta = META_CODE(*item); + if (is_group_starter(meta)) { + /* We may need to rewrite groups within the common prefix */ + rewrite_alternation(item, item_end, item_quant, buf, pattern, patstring, memctl); + } else { + rewrite_buf_copy(buf, item, item_end - item, memctl); + } + item = item_end; + } + + if (extract_up_to != first_branch_end || smallest_branch != largest_branch) { + if (smallest_branch == largest_branch && (first_branch_end - extract_up_to) == 1) { + uint32_t last_item = (quantifier != NULL) ? 
*(quantifier-2) : *(end-2); + if (all_branches_end_with_literal(alt_index, alt_positions, last_item)) { + /* Create character class for the last literal in each branch */ + rewrite_buf_append(buf, META_CLASS, memctl); + for (unsigned int i = 0; i < alt_index; i++) { + rewrite_buf_append(buf, *(alt_positions[i]-1), memctl); + } + rewrite_buf_append(buf, last_item, memctl); + rewrite_buf_append(buf, META_CLASS_END, memctl); + goto FINISH_REWRITE; + } + } + + /* Add non-capturing paren */ + rewrite_buf_append(buf, META_NOCAPTURE, memctl); + + /* Copy the part AFTER the common prefix for each branch, separated by META_ALT + * When copying each part, allow subgroups to be rewritten */ + size_t prefix_size = extract_up_to - first; + rewrite_buf_copy(buf, extract_up_to, first_branch_end - extract_up_to + 1, memctl); + for (unsigned int i = 0; i < alt_index; i++) { + uint32_t *copy_from = alt_positions[i] + prefix_size + 1; + uint32_t *copy_to = (i+1 < alt_index) ? alt_positions[i+1]+1 : end; + while (copy_from < copy_to) { + uint32_t *item_quant = NULL; + uint32_t *item_end = scan_item(copy_from, copy_to, &item_quant); + uint32_t meta = META_CODE(*copy_from); + if (is_group_starter(meta)) { + rewrite_alternation(copy_from, item_end, item_quant, buf, pattern, patstring, memctl); + } else if (meta == META_KET) { + /* Finish last part of group */ + rewrite_buf_append(buf, META_KET, memctl); + goto FINISH_REWRITE; + } else { + rewrite_buf_copy(buf, copy_from, item_end - copy_from, memctl); + } + copy_from = item_end; + } + } + + PCRE2_UNREACHABLE(); + } + + FINISH_REWRITE: + if (quantifier != NULL) { + rewrite_buf_copy(buf, quantifier-1, end-quantifier+1, memctl); + } else if (copy_opening_paren) { + rewrite_buf_append(buf, META_KET, memctl); + } + + if (alt_positions != alt_position_buf) + memctl->free(alt_positions, memctl->memory_data); + return; + } + } + + if (alt_positions != alt_position_buf) + memctl->free(alt_positions, memctl->memory_data); + } + + /* We didn't 
rewrite this group + * Even so, a rewrite of some subgroups might be needed */ + DONT_REWRITE: ; + uint32_t *item = first; + if (buf->start != NULL) { + /* It has already been decided that some part of the overall pattern does need to be rewritten */ + rewrite_buf_copy(buf, start, first - start, memctl); + while (item < end) { + uint32_t *item_quant = NULL; + uint32_t *item_end = scan_item(item, end, &item_quant); + uint32_t meta = META_CODE(*item); + if (is_group_starter(meta)) { + rewrite_alternation(item, item_end, item_quant, buf, pattern, patstring, memctl); + } else { + rewrite_buf_copy(buf, item, item_end - item, memctl); + } + item = item_end; + REWRITE_UNDER_WAY: ; + } + } else { + /* It hasn't yet been decided that we need to rewrite some part of the overall pattern... + * But if we find a subgroup which needs rewrite, then we can still initiate that process */ + while (item < end) { + uint32_t *item_quant = NULL; + uint32_t *item_end = scan_item(item, end, &item_quant); + uint32_t meta = META_CODE(*item); + if (is_group_starter(meta)) { + rewrite_alternation(item, item_end, item_quant, buf, pattern, patstring, memctl); + if (buf->start != NULL) { + /* A rewrite has now started, so go to the other loop */ + item = item_end; + goto REWRITE_UNDER_WAY; + } + } + item = item_end; + } + } +} + +static void rewrite_regex(compile_block *cb, pcre2_compile_context *ccontext) +{ + rewrite_buf buf; + rewrite_buf_init(&buf); + + rewrite_alternation(cb->parsed_pattern, cb->parsed_pattern_end-1, NULL, &buf, cb->parsed_pattern, cb->start_pattern, &ccontext->memctl); + rewrite_finish(&buf, cb, &ccontext->memctl); +} +/* Strip non-capturing parentheses which are wrapping the entire regex. + * (During the parsing phase, an extra BRA/KET pair is inserted around the + * entire regex to make it easier to traverse recursively during the pattern + * rewrite phase.) 
*/ +static void strip_enclosing_nocaptures(compile_block *cb) +{ + while (cb->parsed_pattern_end > cb->parsed_pattern && META_CODE(cb->parsed_pattern[0]) == META_NOCAPTURE) { + uint32_t *group_end = find_group_end(cb->parsed_pattern, cb->parsed_pattern_end); + if (group_end != cb->parsed_pattern_end-1) { + break; + } + cb->parsed_pattern++; + cb->parsed_pattern_end--; + } +} /************************************************* * Find first significant opcode * @@ -10465,6 +11182,8 @@ cb.named_group_list_size = NAMED_GROUP_LIST_SIZE; cb.names_found = 0; cb.parens_depth = 0; cb.parsed_pattern = stack_parsed_pattern; +cb.parsed_pattern_end = NULL; +cb.parsed_pattern_limit = NULL; cb.req_varyopt = 0; cb.start_code = cworkspace; cb.start_pattern = pattern; @@ -10695,9 +11414,12 @@ if (!utf) is set we have to assume a numerical callout (4 elements) for each character plus one at the end. This is overkill, but memory is plentiful these days. For many smaller patterns the vector on the stack (which was set up above) can be -used. */ +used. Further, an extra pair of non-capturing parentheses is added around every +pattern to make it easier to traverse recursively during the pattern rewrite +phase. Therefore, add 2 to the required buffer size to make space for wrapping +the pattern in non-capturing parentheses. 
*/ -parsed_size_needed = patlen - skipatstart + big32count; +parsed_size_needed = patlen - skipatstart + big32count + 2; if ((ccontext->extra_options & (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0) @@ -10706,10 +11428,10 @@ if ((ccontext->extra_options & if ((options & PCRE2_AUTO_CALLOUT) != 0) parsed_size_needed = (parsed_size_needed + 1) * 5; +uint32_t *heap_parsed_pattern = NULL; if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE) { - uint32_t *heap_parsed_pattern = ccontext->memctl.malloc( - (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data); + heap_parsed_pattern = ccontext->memctl.malloc((parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data); if (heap_parsed_pattern == NULL) { *errorptr = ERR21; @@ -10717,13 +11439,18 @@ if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE) } cb.parsed_pattern = heap_parsed_pattern; } -cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1; +cb.parsed_pattern_limit = cb.parsed_pattern + parsed_size_needed + 1; /* Do the parsing scan. */ errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb); if (errorcode != 0) goto HAD_CB_ERROR; +if ((cb.external_options & (PCRE2_AUTO_CALLOUT|PCRE2_NO_PATTERN_REWRITE)) == 0) { + rewrite_regex(&cb, ccontext); +} +strip_enclosing_nocaptures(&cb); + /* If there are any lookbehinds, scan the parsed pattern to figure out their lengths. Workspace is needed to remember whether numbered groups are or are not of limited length, and if limited, what the minimum and maximum lengths are. @@ -11185,8 +11912,8 @@ vector. 
*/ #ifdef SUPPORT_VALGRIND if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1)); #endif -if (cb.parsed_pattern != stack_parsed_pattern) - ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data); +if (heap_parsed_pattern != NULL) + ccontext->memctl.free(heap_parsed_pattern, ccontext->memctl.memory_data); if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE) ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data); if (cb.groupinfo != stack_groupinfo) diff --git a/src/pcre2_fuzzsupport.c b/src/pcre2_fuzzsupport.c index 89135620a..c660ed73b 100644 --- a/src/pcre2_fuzzsupport.c +++ b/src/pcre2_fuzzsupport.c @@ -48,6 +48,7 @@ below that output them. */ PCRE2_MATCH_UNSET_BACKREF|PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C| \ PCRE2_NO_AUTO_CAPTURE| \ PCRE2_NO_AUTO_POSSESS|PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE| \ + PCRE2_NO_PATTERN_REWRITE| \ PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_USE_OFFSET_LIMIT| \ PCRE2_UTF) @@ -67,7 +68,7 @@ fprintf(stream, "Compile options %s%.8x =", (compile_options == PCRE2_NEVER_BACKSLASH_C)? "(base) " : "", compile_options); -fprintf(stream, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", +fprintf(stream, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", ((compile_options & PCRE2_ALT_BSUX) != 0)? " alt_bsux" : "", ((compile_options & PCRE2_ALT_CIRCUMFLEX) != 0)? " alt_circumflex" : "", ((compile_options & PCRE2_ALT_VERBNAMES) != 0)? " alt_verbnames" : "", @@ -90,6 +91,7 @@ fprintf(stream, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", ((compile_options & PCRE2_NO_AUTO_CAPTURE) != 0)? " no_auto_capture" : "", ((compile_options & PCRE2_NO_AUTO_POSSESS) != 0)? " no_auto_possess" : "", ((compile_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)? " no_dotstar_anchor" : "", + ((compile_options & PCRE2_NO_PATTERN_REWRITE) != 0)? " no_pattern_rewrite" : "", ((compile_options & PCRE2_NO_UTF_CHECK) != 0)? 
" no_utf_check" : "", ((compile_options & PCRE2_NO_START_OPTIMIZE) != 0)? " no_start_optimize" : "", ((compile_options & PCRE2_UCP) != 0)? " ucp" : "", diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index 88cce82d7..c742c729d 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -948,37 +948,38 @@ a positive value. */ #define STRING_WEIRD_STARTWORD "[:<:]]" #define STRING_WEIRD_ENDWORD "[:>:]]" -#define STRING_CR_RIGHTPAR "CR)" -#define STRING_LF_RIGHTPAR "LF)" -#define STRING_CRLF_RIGHTPAR "CRLF)" -#define STRING_ANY_RIGHTPAR "ANY)" -#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)" -#define STRING_NUL_RIGHTPAR "NUL)" -#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)" -#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)" -#define STRING_UTF8_RIGHTPAR "UTF8)" -#define STRING_UTF16_RIGHTPAR "UTF16)" -#define STRING_UTF32_RIGHTPAR "UTF32)" -#define STRING_UTF_RIGHTPAR "UTF)" -#define STRING_UCP_RIGHTPAR "UCP)" -#define STRING_NO_AUTO_POSSESS_RIGHTPAR "NO_AUTO_POSSESS)" -#define STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR "NO_DOTSTAR_ANCHOR)" -#define STRING_NO_JIT_RIGHTPAR "NO_JIT)" -#define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)" -#define STRING_NOTEMPTY_RIGHTPAR "NOTEMPTY)" -#define STRING_NOTEMPTY_ATSTART_RIGHTPAR "NOTEMPTY_ATSTART)" -#define STRING_LIMIT_HEAP_EQ "LIMIT_HEAP=" -#define STRING_LIMIT_MATCH_EQ "LIMIT_MATCH=" -#define STRING_LIMIT_DEPTH_EQ "LIMIT_DEPTH=" -#define STRING_LIMIT_RECURSION_EQ "LIMIT_RECURSION=" -#define STRING_MARK "MARK" - -#define STRING_bc "bc" -#define STRING_bidiclass "bidiclass" -#define STRING_sc "sc" -#define STRING_script "script" -#define STRING_scriptextensions "scriptextensions" -#define STRING_scx "scx" +#define STRING_CR_RIGHTPAR "CR)" +#define STRING_LF_RIGHTPAR "LF)" +#define STRING_CRLF_RIGHTPAR "CRLF)" +#define STRING_ANY_RIGHTPAR "ANY)" +#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)" +#define STRING_NUL_RIGHTPAR "NUL)" +#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)" +#define STRING_BSR_UNICODE_RIGHTPAR 
"BSR_UNICODE)" +#define STRING_UTF8_RIGHTPAR "UTF8)" +#define STRING_UTF16_RIGHTPAR "UTF16)" +#define STRING_UTF32_RIGHTPAR "UTF32)" +#define STRING_UTF_RIGHTPAR "UTF)" +#define STRING_UCP_RIGHTPAR "UCP)" +#define STRING_NO_AUTO_POSSESS_RIGHTPAR "NO_AUTO_POSSESS)" +#define STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR "NO_DOTSTAR_ANCHOR)" +#define STRING_NO_JIT_RIGHTPAR "NO_JIT)" +#define STRING_NO_PATTERN_REWRITE_RIGHTPAR "NO_PATTERN_REWRITE)" +#define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)" +#define STRING_NOTEMPTY_RIGHTPAR "NOTEMPTY)" +#define STRING_NOTEMPTY_ATSTART_RIGHTPAR "NOTEMPTY_ATSTART)" +#define STRING_LIMIT_HEAP_EQ "LIMIT_HEAP=" +#define STRING_LIMIT_MATCH_EQ "LIMIT_MATCH=" +#define STRING_LIMIT_DEPTH_EQ "LIMIT_DEPTH=" +#define STRING_LIMIT_RECURSION_EQ "LIMIT_RECURSION=" +#define STRING_MARK "MARK" + +#define STRING_bc "bc" +#define STRING_bidiclass "bidiclass" +#define STRING_sc "sc" +#define STRING_script "script" +#define STRING_scriptextensions "scriptextensions" +#define STRING_scx "scx" #else /* SUPPORT_UNICODE */ @@ -1267,6 +1268,7 @@ only. 
*/ #define STRING_NO_AUTO_POSSESS_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_A STR_U STR_T STR_O STR_UNDERSCORE STR_P STR_O STR_S STR_S STR_E STR_S STR_S STR_RIGHT_PARENTHESIS #define STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_D STR_O STR_T STR_S STR_T STR_A STR_R STR_UNDERSCORE STR_A STR_N STR_C STR_H STR_O STR_R STR_RIGHT_PARENTHESIS #define STRING_NO_JIT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_J STR_I STR_T STR_RIGHT_PARENTHESIS +#define STRING_NO_PATTERN_REWRITE_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_P STR_A STR_T STR_T STR_E STR_R STR_N STR_UNDERSCORE STR_R STR_E STR_W STR_R STR_I STR_T STR_E STR_RIGHT_PARENTHESIS #define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS #define STRING_NOTEMPTY_RIGHTPAR STR_N STR_O STR_T STR_E STR_M STR_P STR_T STR_Y STR_RIGHT_PARENTHESIS #define STRING_NOTEMPTY_ATSTART_RIGHTPAR STR_N STR_O STR_T STR_E STR_M STR_P STR_T STR_Y STR_UNDERSCORE STR_A STR_T STR_S STR_T STR_A STR_R STR_T STR_RIGHT_PARENTHESIS diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h index a798cdd4f..1f62e3488 100644 --- a/src/pcre2_intmodedep.h +++ b/src/pcre2_intmodedep.h @@ -749,7 +749,8 @@ typedef struct compile_block { uint32_t bracount; /* Count of capturing parentheses */ uint32_t lastcapture; /* Last capture encountered */ uint32_t *parsed_pattern; /* Parsed pattern buffer */ - uint32_t *parsed_pattern_end; /* Parsed pattern should not get here */ + uint32_t *parsed_pattern_end; /* Last unused position in parsed pattern buffer */ + uint32_t *parsed_pattern_limit; /* Parsed pattern should not get here */ uint32_t *groupinfo; /* Group info vector */ uint32_t top_backref; /* Maximum back reference */ uint32_t backref_map; /* Bitmap of low back refs */ diff --git a/src/pcre2test.c b/src/pcre2test.c index eaacf09e1..732bac9ce 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -732,6 +732,7 @@ static modstruct modlist[] = { { 
"no_auto_possess", MOD_PATP, MOD_OPT, PCRE2_NO_AUTO_POSSESS, PO(options) }, { "no_dotstar_anchor", MOD_PAT, MOD_OPT, PCRE2_NO_DOTSTAR_ANCHOR, PO(options) }, { "no_jit", MOD_DATP, MOD_OPT, PCRE2_NO_JIT, DO(options) }, + { "no_pattern_rewrite", MOD_PAT, MOD_OPT, PCRE2_NO_PATTERN_REWRITE, PO(options) }, { "no_start_optimize", MOD_PATP, MOD_OPT, PCRE2_NO_START_OPTIMIZE, PO(options) }, { "no_utf_check", MOD_PD, MOD_OPT, PCRE2_NO_UTF_CHECK, PD(options) }, { "notbol", MOD_DAT, MOD_OPT, PCRE2_NOTBOL, DO(options) }, @@ -4288,7 +4289,7 @@ static void show_compile_options(uint32_t options, const char *before, const char *after) { if (options == 0) fprintf(outfile, "%s %s", before, after); -else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", +else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", before, ((options & PCRE2_ALT_BSUX) != 0)? " alt_bsux" : "", ((options & PCRE2_ALT_CIRCUMFLEX) != 0)? " alt_circumflex" : "", @@ -4314,6 +4315,7 @@ else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s% ((options & PCRE2_NO_AUTO_CAPTURE) != 0)? " no_auto_capture" : "", ((options & PCRE2_NO_AUTO_POSSESS) != 0)? " no_auto_possess" : "", ((options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)? " no_dotstar_anchor" : "", + ((options & PCRE2_NO_PATTERN_REWRITE) != 0)? " no_pattern_rewrite" : "", ((options & PCRE2_NO_UTF_CHECK) != 0)? " no_utf_check" : "", ((options & PCRE2_NO_START_OPTIMIZE) != 0)? " no_start_optimize" : "", ((options & PCRE2_UCP) != 0)? 
" ucp" : "", diff --git a/testdata/testinput17 b/testdata/testinput17 index 7dd2d8ea9..d9e744067 100644 --- a/testdata/testinput17 +++ b/testdata/testinput17 @@ -282,7 +282,11 @@ /[axm]{7}/ -/(.|.)*?bx/ +/(.|.)*?bx/no_pattern_rewrite +\= Expect limit exceeded + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabax\=match_limit=10000000 + +/(a|.)*?bx/ \= Expect limit exceeded aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabax\=match_limit=10000000 diff --git a/testdata/testinput27 b/testdata/testinput27 new file mode 100644 index 000000000..0af1f73ee --- /dev/null +++ b/testdata/testinput27 @@ -0,0 +1,485 @@ +# Tests for the pattern rewriter which now precedes regex compilation + +# Pulling out common prefixes from alternation: + +# Simplest case: +# ⇒ a[b-d] +/ab|ac|ad/B + +# ⇒ a(?:b|c|(d)) +/ab|ac|a(d)/B + +# ⇒ ab[bc] +/abb|abc/B + +# Inside a group is also OK: +# ⇒ (a[b-d]) +/(ab|ac|ad)/B + +# Can recurse into deeply nested groups: +# ⇒ (((a[b-d]))) +/(((ab|ac|ad)))/B + +# Another variant +# ⇒ (a((a[b-d]))b) +/(a((ab|ac|ad))b)/B + +# Common prefix covers all of one alternation branch +# ⇒ a(?:|b) +/a|ab/B + +# Multiple identical alternation branches +# (We don't detect this and prune the redundant ones) +/a|ab|a/B + +# Can't be rewritten +/.+a|.+b/B + aaab + +# Can't be rewritten +/.*b|.*c/B + bbbc + +# Can't be rewritten +/.?b|.?c/B + bc + +# However, a quantifier which matches a specific number of times is OK: +# ⇒ .{2}[bc] +/.{2}b|.{2}c/B + +# Can't be rewritten +# NOTE: During regex compilation, a{1,2} is converted to aa{0,1} +# If this conversion was done earlier, we could pull out a common prefix +# 🤷 +/a{1,2}b|a{1,2}c/B + +# Possessive quantifiers are OK: +# ⇒ .++[ab] +/.++a|.++b/B + +# ⇒ .*+[ab] +/.*+a|.*+b/B + +# ⇒ .?+[ab] +/.?+a|.?+b/B + +# ⇒ .{2,4}+[ab] +/.{2,4}+a|.{2,4}+b/B + +# Non-capturing groups can be pulled out ONLY if they don't contain anything +# which the regex engine can backtrack into +# This includes: alternation, non-possessive quantifiers +# ⇒ (?:aa)[bc] 
+/(?:aa)b|(?:aa)c/B + +# Can't be rewritten +/(?:\s|\d)b|(?:\s|\d)c/B + +# Will be rewritten into character class +# ⇒ [ab]b|[ab]c +/(?:a|b)b|(?:a|b)c/B + +# Can't be rewritten +/(?:.*)b|(?:.*)c/B + bbbc + +# Capturing groups can't, because it would change the capture numbers +/(\d)b|(\d)c/B + 1b + 1c + +# Will be rewritten into character class +# ⇒ ([ab])b|([ab])c +/(a|b)b|(a|b)c/B + ab + ac + +# Non-capturing group quantified with *, +, ? +# Can't be pulled out from alternation... but (?:a|b) will be converted to (?:[ab]) +/(?:a|b)*b|(?:a|b)*c/B + +# Can't be pulled out, but can be converted to use [ab] +/(?:a|b)+b|(?:a|b)+c/B + +# Can't be pulled out, but can be converted to use [ab] +/(?:a|b)?b|(?:a|b)?c/B + +# Lookahead can be pulled out +# ⇒ (?=a)a[AB] +/(?=a)aA|(?=a)aB/B + +# Lookbehind can be pulled out +# ⇒ (?<=a)[bc] +/(?<=a)b|(?<=a)c/B + +# Non-matching lookbehind can't be pulled out (of course) +/(?<=a)b|(?<=b)c/B + +# Negative lookahead can be pulled out +# ⇒ (?!a)[bc] +/(?!a)b|(?!a)c/B + +# Negative lookbehind can be pulled out +# ⇒ (?aa)a[bc] +/(?>aa)ab|(?>aa)ac/B + +# Sometimes items are pulled out FROM an atomic group... 
+# ⇒ (?>a[bc]) +/(?>ab|ac)/B + +# Character classes can be pulled out +# ⇒ [a-z][bc] +/[a-z]b|[a-z]c/B + +# Semantically identical char classes, written in a different way +# NOTE: If char classes were canonicalized before pattern rewriting, +# we could pull out a common prefix here +# 🤷 Too bad +/[abc]b|[a-c]c/B + +# Character types +# ⇒ \d\s\v\w[ab] +/\d\s\v\wa|\d\s\v\wb/B + +# Start-of-string anchor +/^a|^b/B + +# Rewriting still works with options at beginning of regex +/(*NOTEMPTY)(*NO_AUTO_POSSESS)ab|ac/B + +/(*NO_START_OPT)ab|ac/B + +/(*NO_DOTSTAR_ANCHOR)ab|ac/B + +/(*LIMIT_HEAP=10000)ab|ac/B + +/(*LIMIT_MATCH=10)ab|ac/B + +/(*LIMIT_DEPTH=10)ab|ac/B + +/(*CR)ab|ac/B + +# Dot +/.a|.b/B + +# Extended regex (with embedded whitespace) +# The whitespace doesn't interfere with rewriting +/(?x) a b | a a b/B + +# \Q..\E literal sequences +/\Q$\Ea|\Q$\Eb/B + +# Hex escapes +# 'A' and '\x41' are recognized as equivalent and rewritten +/Aa|\x41b/B + +# Backreferences can be pulled out +/(\w)(?:\1a|\1b)/B + +# Relative backreferences can be pulled out +/(\w)(?:\g{-1}a|\g{-1}b)/B + +# Unicode property property escape sequences can be pulled out +/\p{Greek}a|\p{Greek}b/B + +# Match reset escape can be pulled out +/a\Kb|a\Kc/B + +# Group with duplicate capture numbers +# ⇒ (a)[bc] +/(?|(a)b|(a)c)/B + ab + ac + +# Although capture numbers are the same, capturing groups can't be pulled out +# if they contain anything which the regex engine can backtrack into +/(?|(a*)b|(a*)c)/B + +/(?|(\d|\s)b|(\d|\s)c)/B + +# Non-capturing group with option letters +/(?i:ab|ac)/B + +# Named capture groups +# This can't be rewritten, because although the capture group +# names are the same, their numbers are different +/(?J)(?:(?a)b|(?a)c)\k/B + +# Named capture groups with identical capture group numbers +# These can be rewritten +/(?|(?a)b|(?a)c)\k/B + +# Script runs can be pulled out +# ⇒ (*sr:..)(?:a|b) +/(*sr:..)a|(*sr:..)b/B + +# Capture group condition +# Can't be rewritten 
+/(a)(?(1)bc|bd)/B + +# Capture group condition (by group name) +# Can't be rewritten +/(?a)(?()bc|bd)/B + +# Version number condition +# Can't be rewritten +/(?(VERSION>=10.4)ab|ac)/B + +# Lookahead assertion condition +# Can't be rewritten +/(?(?=a)ab|ac)/B + +# However, subgroups of a conditional group can be rewritten +/(?(?=a)a(?:bb|bc)|ac)/B + +# Lookbehind assertion condition +# Can't be rewritten +/(?(?<=a)ab|ac)/B + +# Recursion condition +# Can't be rewritten +/(?(R)ab|ac)/B + +# Recursion condition with explicit number +# Can't be rewritten +/(a)(?(R1)ab|ac)/B + +# Recursion condition by name +# Can't be rewritten +/(?a)(?(R&n)ab|ac)/B + +# Define +/(?(DEFINE) a)b|(?(DEFINE) a)\w/B + +# Subroutine call by number +# ⇒ (?|(a)|(b))(?1)[bc] +/(?|(a)|(b))(?:(?1)b|(?1)c)/B + +# Subroutine call by number, but with non-matching number +/(?:(a)|(b))(?:(?1)b|(?2)c)/B + +# Subroutine call by name +# ⇒ (?a)(?&n)[ab] +/(?a)(?:(?&n)a|(?&n)b)/B + +# Subroutine call by name, but with non-matching name +/(?a)(?b)(?:(?&n)a|(?&m)b)/B + +/(?a)(?b)(?:(?&abce)a|(?&abcd)b)/B + +# Callouts are never pulled out +/(?C0)a|(?C0)b/B + +/(?C{ab})a|(?C{ab})b/B + +# Callouts are still not pulled out if they are inside a sub-group +/(?:(?C0))a|(?:(?C0))b/B + +/(?:(?C{ab}))a|(?:(?C{ab}))b/B + +/(?>(?C0))a|(?>(?C0))b/B + +/(?=(?C0))a|(?=(?C0))b/B + +/(?<=(?C0))a|(?<=(?C0))b/B + +# (*ACCEPT) can be pulled out +/(*ACCEPT)ab|(*ACCEPT)ac/B + +# (*ACCEPT:name) can be pulled out +/(*ACCEPT:hello)ab|(*ACCEPT:hello)ac/B + +# (*ACCEPT:name) but with non-matching names +/(*ACCEPT:hello)ab|(*ACCEPT:goodbye)ac/B + +/(*ACCEPT:a)ab|(*ACCEPT:b)ac/B + +# (*FAIL) can be pulled out +/(*FAIL)ab|(*FAIL)ac/B + +# (*FAIL:name) can be pulled out +/(*FAIL:hello)ab|(*FAIL:hello)ac/B + +# (*FAIL:name) but with non-matching names +/(*FAIL:hello)ab|(*FAIL:goodbye)ac/B + +# (*MARK:name) can be pulled out +/(*MARK:hello)ab|(*MARK:hello)ac/B + +# (*MARK:name) but with non-matching names 
+/(*MARK:hello)ab|(*MARK:goodbye)ac/B + +# (*PRUNE) is never pulled out +/(*PRUNE)a|(*PRUNE)b/B + +/(?:(*PRUNE))a|(?:(*PRUNE))b/B + +# (*PRUNE:name) is never pulled out +/(*PRUNE:abc)a|(*PRUNE:abc)b/B + +/(?:(*PRUNE:abc))a|(?:(*PRUNE:abc))b/B + +# (*COMMIT) is never pulled out +/(*COMMIT)a|(*COMMIT)b/B + +/(?:(*COMMIT))a|(?:(*COMMIT))b/B + +# (*COMMIT:name) is never pulled out +/(*COMMIT:abc)a|(*COMMIT:abc)b/B + +/(?:(*COMMIT:abc))a|(?:(*COMMIT:abc))b/B + +# (*SKIP) is never pulled out +/(*SKIP)a|(*SKIP)b/B + +/(?:(*SKIP))a|(?:(*SKIP))b/B + +# (*SKIP:name) is never pulled out +/(*SKIP:abc)a|(*SKIP:abc)b/B + +/(?:(*SKIP:abc))a|(?:(*SKIP:abc))b/B + +# (*THEN) is never pulled out +/(*THEN)a|(*THEN)b/B + +/(?:(*THEN))a|(?:(*THEN))b/B + +# (*THEN:name) is never pulled out +/(*THEN:abc)a|(*THEN:abc)b/B + +/(?:(*THEN:abc))a|(?:(*THEN:abc))b/B + +# \C escape (single code unit) can be pulled out +# ⇒ \C[ab] +/\Ca|\Cb/B + +# Common prefix which itself has a common prefix +# ⇒ (?>a[bc])[de] +/(?:(?>ab|ac)d|(?>ab|ac)e)/B + +# Rewriting common prefix causes parent group to have a common prefix +# (We don't detect this case) +# ⇒ (?:a[bc]d|a[bc]e) +/(?:a(?:b|c)d|(?:ab|ac)e)/B + +# Another case: +# ⇒ (?>a[bc]d|a[bc]e) +/(?>(?:ab|ac)d|(?:ab|ac)e)/B + +# When rewriting groups which are themselves quantified, +# the extracted items have to stay inside the quantified group +# ⇒ (?:a[bc])? 
+/(?:ab|ac)?/B + +/(?:ab|ac)*/B + +/(?:ab|ac)+/B + +/(?:ab|ac)?+/B + +/(?:ab|ac)*+/B + +/(?:ab|ac)++/B + +/(?:ab|ac){2}/B + +/(?:ab|ac){2,4}/B + +/(?:ab|ac){2,4}+/B + +# Handling of script runs: +# Common prefix can be pulled out from alternation in a script run: +# ⇒ (*sr:ab[cd]) +/(*sr:abc|abd)/B + +# Script runs themselves can also be part of a common prefix +# ⇒ (*sr:abc)[cd] +/(*sr:abc)c|(*sr:abc)d/B + +# But if there are backtracking control verbs like (*COMMIT) in a script run +# it won't be pulled out +/(*sr:ab(*COMMIT))c|(*sr:ab(*COMMIT))d/B + +# Same deal if there is a callout inside a script run +/(*sr:ab(?C1))c|(*sr:ab(?C1))d/B + +# All the above applies to ATOMIC script runs +/(*asr:abc|abd)/B + +/(*asr:abc)c|(*asr:abc)d/B + +/(*asr:ab(*COMMIT))c|(*asr:ab(*COMMIT))d/B + +/(*asr:ab(?C1))c|(*asr:ab(?C1))d/B + +# Regression test: +# Pattern rewriter must properly handle assert conditions which contain alternation +# This should not be rewritten: +/b(?(?!)|b)/B + +# Regression test: +# For a conditional group which uses an assertion condition, that assertion condition +# cannot be pulled out +# This should not be rewritten (and the subject string should not match): +/(?(?abc< @@ -15068,7 +15061,6 @@ Subject length lower bound = 65535 /(?|()+|(a)+)/BI ------------------------------------------------------------------ - Bra Bra SCBra 1 KetRmax @@ -15077,7 +15069,6 @@ Subject length lower bound = 65535 a KetRmax Ket - Ket End ------------------------------------------------------------------ Capture group count = 1 @@ -15086,7 +15077,6 @@ Subject length lower bound = 0 /(?|(a)+|()+)/BI ------------------------------------------------------------------ - Bra Bra CBra 1 a @@ -15095,7 +15085,6 @@ Subject length lower bound = 0 SCBra 1 KetRmax Ket - Ket End ------------------------------------------------------------------ Capture group count = 1 @@ -15104,7 +15093,6 @@ Subject length lower bound = 0 /(?|()|(a))/BI 
------------------------------------------------------------------ - Bra Bra CBra 1 Ket @@ -15113,7 +15101,6 @@ Subject length lower bound = 0 a Ket Ket - Ket End ------------------------------------------------------------------ Capture group count = 1 @@ -15122,7 +15109,6 @@ Subject length lower bound = 0 /(?|(a)|())/BI ------------------------------------------------------------------ - Bra Bra CBra 1 a @@ -15131,7 +15117,6 @@ Subject length lower bound = 0 CBra 1 Ket Ket - Ket End ------------------------------------------------------------------ Capture group count = 1 @@ -17282,7 +17267,8 @@ Subject length lower bound = 1 Capture group count = 0 May match empty string First code unit = 'a' -Subject length lower bound = 1 +Last code unit = 'b' +Subject length lower bound = 2 /(*napla:a|(.)(*ACCEPT)zz)\1../ abcd @@ -17526,7 +17512,7 @@ Subject length lower bound = 5 /(?:c|C)abcd/I Capture group count = 0 -First code unit = 'C' (caseless) +First code unit = 'c' (caseless) Last code unit = 'd' Subject length lower bound = 5 diff --git a/testdata/testoutput27 b/testdata/testoutput27 new file mode 100644 index 000000000..a57950799 --- /dev/null +++ b/testdata/testoutput27 @@ -0,0 +1,2132 @@ +# Tests for the pattern rewriter which now precedes regex compilation + +# Pulling out common prefixes from alternation: + +# Simplest case: +# ⇒ a[b-d] +/ab|ac|ad/B +------------------------------------------------------------------ + Bra + a + [b-d] + Ket + End +------------------------------------------------------------------ + +# ⇒ a(?:b|c|(d)) +/ab|ac|a(d)/B +------------------------------------------------------------------ + Bra + a + Bra + b + Alt + c + Alt + CBra 1 + d + Ket + Ket + Ket + End +------------------------------------------------------------------ + +# ⇒ ab[bc] +/abb|abc/B +------------------------------------------------------------------ + Bra + ab + [bc] + Ket + End +------------------------------------------------------------------ + +# Inside a group 
is also OK: +# ⇒ (a[b-d]) +/(ab|ac|ad)/B +------------------------------------------------------------------ + Bra + CBra 1 + a + [b-d] + Ket + Ket + End +------------------------------------------------------------------ + +# Can recurse into deeply nested groups: +# ⇒ (((a[b-d]))) +/(((ab|ac|ad)))/B +------------------------------------------------------------------ + Bra + CBra 1 + CBra 2 + CBra 3 + a + [b-d] + Ket + Ket + Ket + Ket + End +------------------------------------------------------------------ + +# Another variant +# ⇒ (a((a[b-d]))b) +/(a((ab|ac|ad))b)/B +------------------------------------------------------------------ + Bra + CBra 1 + a + CBra 2 + CBra 3 + a + [b-d] + Ket + Ket + b + Ket + Ket + End +------------------------------------------------------------------ + +# Common prefix covers all of one alternation branch +# ⇒ a(?:|b) +/a|ab/B +------------------------------------------------------------------ + Bra + a + Bra + Alt + b + Ket + Ket + End +------------------------------------------------------------------ + +# Multiple identical alternation branches +# (We don't detect this and prune the redundant ones) +/a|ab|a/B +------------------------------------------------------------------ + Bra + a + Bra + Alt + b + Alt + Ket + Ket + End +------------------------------------------------------------------ + +# Can't be rewritten +/.+a|.+b/B +------------------------------------------------------------------ + Bra + Any+ + a + Alt + Any+ + b + Ket + End +------------------------------------------------------------------ + aaab + 0: aaa + +# Can't be rewritten +/.*b|.*c/B +------------------------------------------------------------------ + Bra + Any* + b + Alt + Any* + c + Ket + End +------------------------------------------------------------------ + bbbc + 0: bbb + +# Can't be rewritten +/.?b|.?c/B +------------------------------------------------------------------ + Bra + Any? + b + Alt + Any? 
+ c + Ket + End +------------------------------------------------------------------ + bc + 0: b + +# However, a quantifier which matches a specific number of times is OK: +# ⇒ .{2}[bc] +/.{2}b|.{2}c/B +------------------------------------------------------------------ + Bra + Any{2} + [bc] + Ket + End +------------------------------------------------------------------ + +# Can't be rewritten +# NOTE: During regex compilation, a{1,2} is converted to aa{0,1} +# If this conversion was done earlier, we could pull out a common prefix +# 🤷 +/a{1,2}b|a{1,2}c/B +------------------------------------------------------------------ + Bra + a + a{0,1}+ + b + Alt + a + a{0,1}+ + c + Ket + End +------------------------------------------------------------------ + +# Possessive quantifiers are OK: +# ⇒ .++[ab] +/.++a|.++b/B +------------------------------------------------------------------ + Bra + Any++ + [ab] + Ket + End +------------------------------------------------------------------ + +# ⇒ .*+[ab] +/.*+a|.*+b/B +------------------------------------------------------------------ + Bra + Any*+ + [ab] + Ket + End +------------------------------------------------------------------ + +# ⇒ .?+[ab] +/.?+a|.?+b/B +------------------------------------------------------------------ + Bra + Any?+ + [ab] + Ket + End +------------------------------------------------------------------ + +# ⇒ .{2,4}+[ab] +/.{2,4}+a|.{2,4}+b/B +------------------------------------------------------------------ + Bra + Any{2} + Any{0,2}+ + [ab] + Ket + End +------------------------------------------------------------------ + +# Non-capturing groups can be pulled out ONLY if they don't contain anything +# which the regex engine can backtrack into +# This includes: alternation, non-possessive quantifiers +# ⇒ (?:aa)[bc] +/(?:aa)b|(?:aa)c/B +------------------------------------------------------------------ + Bra + Bra + aa + Ket + [bc] + Ket + End 
+------------------------------------------------------------------ + +# Can't be rewritten +/(?:\s|\d)b|(?:\s|\d)c/B +------------------------------------------------------------------ + Bra + Bra + \s + Alt + \d + Ket + b + Alt + Bra + \s + Alt + \d + Ket + c + Ket + End +------------------------------------------------------------------ + +# Will be rewritten into character class +# ⇒ [ab]b|[ab]c +/(?:a|b)b|(?:a|b)c/B +------------------------------------------------------------------ + Bra + [ab] + b + Alt + [ab] + c + Ket + End +------------------------------------------------------------------ + +# Can't be rewritten +/(?:.*)b|(?:.*)c/B +------------------------------------------------------------------ + Bra + Bra + Any* + Ket + b + Alt + Bra + Any* + Ket + c + Ket + End +------------------------------------------------------------------ + bbbc + 0: bbb + +# Capturing groups can't, because it would change the capture numbers +/(\d)b|(\d)c/B +------------------------------------------------------------------ + Bra + CBra 1 + \d + Ket + b + Alt + CBra 2 + \d + Ket + c + Ket + End +------------------------------------------------------------------ + 1b + 0: 1b + 1: 1 + 1c + 0: 1c + 1: + 2: 1 + +# Will be rewritten into character class +# ⇒ ([ab])b|([ab])c +/(a|b)b|(a|b)c/B +------------------------------------------------------------------ + Bra + CBra 1 + [ab] + Ket + b + Alt + CBra 2 + [ab] + Ket + c + Ket + End +------------------------------------------------------------------ + ab + 0: ab + 1: a + ac + 0: ac + 1: + 2: a + +# Non-capturing group quantified with *, +, ? +# Can't be pulled out from alternation... 
but (?:a|b) will be converted to (?:[ab]) +/(?:a|b)*b|(?:a|b)*c/B +------------------------------------------------------------------ + Bra + Brazero + Bra + [ab] + KetRmax + b + Alt + Brazero + Bra + [ab] + KetRmax + c + Ket + End +------------------------------------------------------------------ + +# Can't be pulled out, but can be converted to use [ab] +/(?:a|b)+b|(?:a|b)+c/B +------------------------------------------------------------------ + Bra + Bra + [ab] + KetRmax + b + Alt + Bra + [ab] + KetRmax + c + Ket + End +------------------------------------------------------------------ + +# Can't be pulled out, but can be converted to use [ab] +/(?:a|b)?b|(?:a|b)?c/B +------------------------------------------------------------------ + Bra + Brazero + Bra + [ab] + Ket + b + Alt + Brazero + Bra + [ab] + Ket + c + Ket + End +------------------------------------------------------------------ + +# Lookahead can be pulled out +# ⇒ (?=a)a[AB] +/(?=a)aA|(?=a)aB/B +------------------------------------------------------------------ + Bra + Assert + a + Ket + a + [AB] + Ket + End +------------------------------------------------------------------ + +# Lookbehind can be pulled out +# ⇒ (?<=a)[bc] +/(?<=a)b|(?<=a)c/B +------------------------------------------------------------------ + Bra + Assert back + Reverse + a + Ket + [bc] + Ket + End +------------------------------------------------------------------ + +# Non-matching lookbehind can't be pulled out (of course) +/(?<=a)b|(?<=b)c/B +------------------------------------------------------------------ + Bra + Assert back + Reverse + a + Ket + b + Alt + Assert back + Reverse + b + Ket + c + Ket + End +------------------------------------------------------------------ + +# Negative lookahead can be pulled out +# ⇒ (?!a)[bc] +/(?!a)b|(?!a)c/B +------------------------------------------------------------------ + Bra + Assert not + a + Ket + [bc] + Ket + End +------------------------------------------------------------------ 
+ +# Negative lookbehind can be pulled out +# ⇒ (?aa)a[bc] +/(?>aa)ab|(?>aa)ac/B +------------------------------------------------------------------ + Bra + Once + aa + Ket + a + [bc] + Ket + End +------------------------------------------------------------------ + +# Sometimes items are pulled out FROM an atomic group... +# ⇒ (?>a[bc]) +/(?>ab|ac)/B +------------------------------------------------------------------ + Bra + Once + a + [bc] + Ket + Ket + End +------------------------------------------------------------------ + +# Character classes can be pulled out +# ⇒ [a-z][bc] +/[a-z]b|[a-z]c/B +------------------------------------------------------------------ + Bra + [a-z] + [bc] + Ket + End +------------------------------------------------------------------ + +# Semantically identical char classes, written in a different way +# NOTE: If char classes were canonicalized before pattern rewriting, +# we could pull out a common prefix here +# 🤷 Too bad +/[abc]b|[a-c]c/B +------------------------------------------------------------------ + Bra + [a-c] + b + Alt + [a-c] + c + Ket + End +------------------------------------------------------------------ + +# Character types +# ⇒ \d\s\v\w[ab] +/\d\s\v\wa|\d\s\v\wb/B +------------------------------------------------------------------ + Bra + \d + \s + \v + \w + [ab] + Ket + End +------------------------------------------------------------------ + +# Start-of-string anchor +/^a|^b/B +------------------------------------------------------------------ + Bra + ^ + [ab] + Ket + End +------------------------------------------------------------------ + +# Rewriting still works with options at beginning of regex +/(*NOTEMPTY)(*NO_AUTO_POSSESS)ab|ac/B +------------------------------------------------------------------ + Bra + a + [bc] + Ket + End +------------------------------------------------------------------ + +/(*NO_START_OPT)ab|ac/B +------------------------------------------------------------------ + Bra + a + [bc] + 
Ket + End +------------------------------------------------------------------ + +/(*NO_DOTSTAR_ANCHOR)ab|ac/B +------------------------------------------------------------------ + Bra + a + [bc] + Ket + End +------------------------------------------------------------------ + +/(*LIMIT_HEAP=10000)ab|ac/B +------------------------------------------------------------------ + Bra + a + [bc] + Ket + End +------------------------------------------------------------------ + +/(*LIMIT_MATCH=10)ab|ac/B +------------------------------------------------------------------ + Bra + a + [bc] + Ket + End +------------------------------------------------------------------ + +/(*LIMIT_DEPTH=10)ab|ac/B +------------------------------------------------------------------ + Bra + a + [bc] + Ket + End +------------------------------------------------------------------ + +/(*CR)ab|ac/B +------------------------------------------------------------------ + Bra + a + [bc] + Ket + End +------------------------------------------------------------------ + +# Dot +/.a|.b/B +------------------------------------------------------------------ + Bra + Any + [ab] + Ket + End +------------------------------------------------------------------ + +# Extended regex (with embedded whitespace) +# The whitespace doesn't interfere with rewriting +/(?x) a b | a a b/B +------------------------------------------------------------------ + Bra + a + Bra + b + Alt + ab + Ket + Ket + End +------------------------------------------------------------------ + +# \Q..\E literal sequences +/\Q$\Ea|\Q$\Eb/B +------------------------------------------------------------------ + Bra + $ + [ab] + Ket + End +------------------------------------------------------------------ + +# Hex escapes +# 'A' and '\x41' are recognized as equivalent and rewritten +/Aa|\x41b/B +------------------------------------------------------------------ + Bra + A + [ab] + Ket + End +------------------------------------------------------------------ 
+ +# Backreferences can be pulled out +/(\w)(?:\1a|\1b)/B +------------------------------------------------------------------ + Bra + CBra 1 + \w + Ket + \1 + [ab] + Ket + End +------------------------------------------------------------------ + +# Relative backreferences can be pulled out +/(\w)(?:\g{-1}a|\g{-1}b)/B +------------------------------------------------------------------ + Bra + CBra 1 + \w + Ket + \1 + [ab] + Ket + End +------------------------------------------------------------------ + +# Unicode property property escape sequences can be pulled out +/\p{Greek}a|\p{Greek}b/B +------------------------------------------------------------------ + Bra + prop Greek + [ab] + Ket + End +------------------------------------------------------------------ + +# Match reset escape can be pulled out +/a\Kb|a\Kc/B +------------------------------------------------------------------ + Bra + a + \K + [bc] + Ket + End +------------------------------------------------------------------ + +# Group with duplicate capture numbers +# ⇒ (a)[bc] +/(?|(a)b|(a)c)/B +------------------------------------------------------------------ + Bra + CBra 1 + a + Ket + [bc] + Ket + End +------------------------------------------------------------------ + ab + 0: ab + 1: a + ac + 0: ac + 1: a + +# Although capture numbers are the same, capturing groups can't be pulled out +# if they contain anything which the regex engine can backtrack into +/(?|(a*)b|(a*)c)/B +------------------------------------------------------------------ + Bra + CBra 1 + a*+ + Ket + b + Alt + CBra 1 + a*+ + Ket + c + Ket + End +------------------------------------------------------------------ + +/(?|(\d|\s)b|(\d|\s)c)/B +------------------------------------------------------------------ + Bra + CBra 1 + \d + Alt + \s + Ket + b + Alt + CBra 1 + \d + Alt + \s + Ket + c + Ket + End +------------------------------------------------------------------ + +# Non-capturing group with option letters +/(?i:ab|ac)/B 
+------------------------------------------------------------------ + Bra + /i a + [BCbc] + Ket + End +------------------------------------------------------------------ + +# Named capture groups +# This can't be rewritten, because although the capture group +# names are the same, their numbers are different +/(?J)(?:(?a)b|(?a)c)\k/B +------------------------------------------------------------------ + Bra + Bra + CBra 1 + a + Ket + b + Alt + CBra 2 + a + Ket + c + Ket + \k2 + Ket + End +------------------------------------------------------------------ + +# Named capture groups with identical capture group numbers +# These can be rewritten +/(?|(?a)b|(?a)c)\k/B +------------------------------------------------------------------ + Bra + CBra 1 + a + Ket + [bc] + \1 + Ket + End +------------------------------------------------------------------ + +# Script runs can be pulled out +# ⇒ (*sr:..)(?:a|b) +/(*sr:..)a|(*sr:..)b/B +------------------------------------------------------------------ + Bra + Script run + Any + Any + Ket + [ab] + Ket + End +------------------------------------------------------------------ + +# Capture group condition +# Can't be rewritten +/(a)(?(1)bc|bd)/B +------------------------------------------------------------------ + Bra + CBra 1 + a + Ket + Cond + 1 Capture ref + bc + Alt + bd + Ket + Ket + End +------------------------------------------------------------------ + +# Capture group condition (by group name) +# Can't be rewritten +/(?a)(?()bc|bd)/B +------------------------------------------------------------------ + Bra + CBra 1 + a + Ket + Cond + 1 Capture ref + bc + Alt + bd + Ket + Ket + End +------------------------------------------------------------------ + +# Version number condition +# Can't be rewritten +/(?(VERSION>=10.4)ab|ac)/B +------------------------------------------------------------------ + Bra + Cond + Cond true + ab + Alt + ac + Ket + Ket + End +------------------------------------------------------------------ + +# 
Lookahead assertion condition +# Can't be rewritten +/(?(?=a)ab|ac)/B +------------------------------------------------------------------ + Bra + Cond + Assert + a + Ket + ab + Alt + ac + Ket + Ket + End +------------------------------------------------------------------ + +# However, subgroups of a conditional group can be rewritten +/(?(?=a)a(?:bb|bc)|ac)/B +------------------------------------------------------------------ + Bra + Cond + Assert + a + Ket + ab + [bc] + Alt + ac + Ket + Ket + End +------------------------------------------------------------------ + +# Lookbehind assertion condition +# Can't be rewritten +/(?(?<=a)ab|ac)/B +------------------------------------------------------------------ + Bra + Cond + Assert back + Reverse + a + Ket + ab + Alt + ac + Ket + Ket + End +------------------------------------------------------------------ + +# Recursion condition +# Can't be rewritten +/(?(R)ab|ac)/B +------------------------------------------------------------------ + Bra + Cond + Cond recurse any + ab + Alt + ac + Ket + Ket + End +------------------------------------------------------------------ + +# Recursion condition with explicit number +# Can't be rewritten +/(a)(?(R1)ab|ac)/B +------------------------------------------------------------------ + Bra + CBra 1 + a + Ket + Cond + Cond recurse 1 + ab + Alt + ac + Ket + Ket + End +------------------------------------------------------------------ + +# Recursion condition by name +# Can't be rewritten +/(?a)(?(R&n)ab|ac)/B +------------------------------------------------------------------ + Bra + CBra 1 + a + Ket + Cond + Cond recurse 1 + ab + Alt + ac + Ket + Ket + End +------------------------------------------------------------------ + +# Define +/(?(DEFINE) a)b|(?(DEFINE) a)\w/B +------------------------------------------------------------------ + Bra + Cond + Cond false + a + Ket + b + Alt + Cond + Cond false + a + Ket + \w + Ket + End 
+------------------------------------------------------------------ + +# Subroutine call by number +# ⇒ (?|(a)|(b))(?1)[bc] +/(?|(a)|(b))(?:(?1)b|(?1)c)/B +------------------------------------------------------------------ + Bra + Bra + CBra 1 + a + Ket + Alt + CBra 1 + b + Ket + Ket + Recurse + [bc] + Ket + End +------------------------------------------------------------------ + +# Subroutine call by number, but with non-matching number +/(?:(a)|(b))(?:(?1)b|(?2)c)/B +------------------------------------------------------------------ + Bra + Bra + CBra 1 + a + Ket + Alt + CBra 2 + b + Ket + Ket + Bra + Recurse + b + Alt + Recurse + c + Ket + Ket + End +------------------------------------------------------------------ + +# Subroutine call by name +# ⇒ (?a)(?&n)[ab] +/(?a)(?:(?&n)a|(?&n)b)/B +------------------------------------------------------------------ + Bra + CBra 1 + a + Ket + Recurse + [ab] + Ket + End +------------------------------------------------------------------ + +# Subroutine call by name, but with non-matching name +/(?a)(?b)(?:(?&n)a|(?&m)b)/B +------------------------------------------------------------------ + Bra + CBra 1 + a + Ket + CBra 2 + b + Ket + Bra + Recurse + a + Alt + Recurse + b + Ket + Ket + End +------------------------------------------------------------------ + +/(?a)(?b)(?:(?&abce)a|(?&abcd)b)/B +------------------------------------------------------------------ + Bra + CBra 1 + a + Ket + CBra 2 + b + Ket + Bra + Recurse + a + Alt + Recurse + b + Ket + Ket + End +------------------------------------------------------------------ + +# Callouts are never pulled out +/(?C0)a|(?C0)b/B +------------------------------------------------------------------ + Bra + Callout 0 5 1 + a + Alt + Callout 0 12 1 + b + Ket + End +------------------------------------------------------------------ + +/(?C{ab})a|(?C{ab})b/B +------------------------------------------------------------------ + Bra + CalloutStr {ab} 4 8 1 + a + Alt + CalloutStr 
{ab} 14 18 1 + b + Ket + End +------------------------------------------------------------------ + +# Callouts are still not pulled out if they are inside a sub-group +/(?:(?C0))a|(?:(?C0))b/B +------------------------------------------------------------------ + Bra + Bra + Callout 0 8 1 + Ket + a + Alt + Bra + Callout 0 19 1 + Ket + b + Ket + End +------------------------------------------------------------------ + +/(?:(?C{ab}))a|(?:(?C{ab}))b/B +------------------------------------------------------------------ + Bra + Bra + CalloutStr {ab} 7 11 1 + Ket + a + Alt + Bra + CalloutStr {ab} 21 25 1 + Ket + b + Ket + End +------------------------------------------------------------------ + +/(?>(?C0))a|(?>(?C0))b/B +------------------------------------------------------------------ + Bra + Once + Callout 0 8 1 + Ket + a + Alt + Once + Callout 0 19 1 + Ket + b + Ket + End +------------------------------------------------------------------ + +/(?=(?C0))a|(?=(?C0))b/B +------------------------------------------------------------------ + Bra + Assert + Callout 0 8 1 + Ket + a + Alt + Assert + Callout 0 19 1 + Ket + b + Ket + End +------------------------------------------------------------------ + +/(?<=(?C0))a|(?<=(?C0))b/B +------------------------------------------------------------------ + Bra + Assert back + Callout 0 9 1 + Ket + a + Alt + Assert back + Callout 0 21 1 + Ket + b + Ket + End +------------------------------------------------------------------ + +# (*ACCEPT) can be pulled out +/(*ACCEPT)ab|(*ACCEPT)ac/B +------------------------------------------------------------------ + Bra + *ACCEPT + a + [bc] + Ket + End +------------------------------------------------------------------ + +# (*ACCEPT:name) can be pulled out +/(*ACCEPT:hello)ab|(*ACCEPT:hello)ac/B +------------------------------------------------------------------ + Bra + *MARK hello + *ACCEPT + a + [bc] + Ket + End +------------------------------------------------------------------ + +# 
(*ACCEPT:name) but with non-matching names +/(*ACCEPT:hello)ab|(*ACCEPT:goodbye)ac/B +------------------------------------------------------------------ + Bra + *MARK hello + *ACCEPT + ab + Alt + *MARK goodbye + *ACCEPT + ac + Ket + End +------------------------------------------------------------------ + +/(*ACCEPT:a)ab|(*ACCEPT:b)ac/B +------------------------------------------------------------------ + Bra + *MARK a + *ACCEPT + ab + Alt + *MARK b + *ACCEPT + ac + Ket + End +------------------------------------------------------------------ + +# (*FAIL) can be pulled out +/(*FAIL)ab|(*FAIL)ac/B +------------------------------------------------------------------ + Bra + *FAIL + a + [bc] + Ket + End +------------------------------------------------------------------ + +# (*FAIL:name) can be pulled out +/(*FAIL:hello)ab|(*FAIL:hello)ac/B +------------------------------------------------------------------ + Bra + *MARK hello + *FAIL + a + [bc] + Ket + End +------------------------------------------------------------------ + +# (*FAIL:name) but with non-matching names +/(*FAIL:hello)ab|(*FAIL:goodbye)ac/B +------------------------------------------------------------------ + Bra + *MARK hello + *FAIL + ab + Alt + *MARK goodbye + *FAIL + ac + Ket + End +------------------------------------------------------------------ + +# (*MARK:name) can be pulled out +/(*MARK:hello)ab|(*MARK:hello)ac/B +------------------------------------------------------------------ + Bra + *MARK hello + a + [bc] + Ket + End +------------------------------------------------------------------ + +# (*MARK:name) but with non-matching names +/(*MARK:hello)ab|(*MARK:goodbye)ac/B +------------------------------------------------------------------ + Bra + *MARK hello + ab + Alt + *MARK goodbye + ac + Ket + End +------------------------------------------------------------------ + +# (*PRUNE) is never pulled out +/(*PRUNE)a|(*PRUNE)b/B +------------------------------------------------------------------ + 
Bra + *PRUNE + a + Alt + *PRUNE + b + Ket + End +------------------------------------------------------------------ + +/(?:(*PRUNE))a|(?:(*PRUNE))b/B +------------------------------------------------------------------ + Bra + Bra + *PRUNE + Ket + a + Alt + Bra + *PRUNE + Ket + b + Ket + End +------------------------------------------------------------------ + +# (*PRUNE:name) is never pulled out +/(*PRUNE:abc)a|(*PRUNE:abc)b/B +------------------------------------------------------------------ + Bra + *PRUNE abc + a + Alt + *PRUNE abc + b + Ket + End +------------------------------------------------------------------ + +/(?:(*PRUNE:abc))a|(?:(*PRUNE:abc))b/B +------------------------------------------------------------------ + Bra + Bra + *PRUNE abc + Ket + a + Alt + Bra + *PRUNE abc + Ket + b + Ket + End +------------------------------------------------------------------ + +# (*COMMIT) is never pulled out +/(*COMMIT)a|(*COMMIT)b/B +------------------------------------------------------------------ + Bra + *COMMIT + a + Alt + *COMMIT + b + Ket + End +------------------------------------------------------------------ + +/(?:(*COMMIT))a|(?:(*COMMIT))b/B +------------------------------------------------------------------ + Bra + Bra + *COMMIT + Ket + a + Alt + Bra + *COMMIT + Ket + b + Ket + End +------------------------------------------------------------------ + +# (*COMMIT:name) is never pulled out +/(*COMMIT:abc)a|(*COMMIT:abc)b/B +------------------------------------------------------------------ + Bra + *COMMIT abc + a + Alt + *COMMIT abc + b + Ket + End +------------------------------------------------------------------ + +/(?:(*COMMIT:abc))a|(?:(*COMMIT:abc))b/B +------------------------------------------------------------------ + Bra + Bra + *COMMIT abc + Ket + a + Alt + Bra + *COMMIT abc + Ket + b + Ket + End +------------------------------------------------------------------ + +# (*SKIP) is never pulled out +/(*SKIP)a|(*SKIP)b/B 
+------------------------------------------------------------------ + Bra + *SKIP + a + Alt + *SKIP + b + Ket + End +------------------------------------------------------------------ + +/(?:(*SKIP))a|(?:(*SKIP))b/B +------------------------------------------------------------------ + Bra + Bra + *SKIP + Ket + a + Alt + Bra + *SKIP + Ket + b + Ket + End +------------------------------------------------------------------ + +# (*SKIP:name) is never pulled out +/(*SKIP:abc)a|(*SKIP:abc)b/B +------------------------------------------------------------------ + Bra + *SKIP abc + a + Alt + *SKIP abc + b + Ket + End +------------------------------------------------------------------ + +/(?:(*SKIP:abc))a|(?:(*SKIP:abc))b/B +------------------------------------------------------------------ + Bra + Bra + *SKIP abc + Ket + a + Alt + Bra + *SKIP abc + Ket + b + Ket + End +------------------------------------------------------------------ + +# (*THEN) is never pulled out +/(*THEN)a|(*THEN)b/B +------------------------------------------------------------------ + Bra + *THEN + a + Alt + *THEN + b + Ket + End +------------------------------------------------------------------ + +/(?:(*THEN))a|(?:(*THEN))b/B +------------------------------------------------------------------ + Bra + Bra + *THEN + Ket + a + Alt + Bra + *THEN + Ket + b + Ket + End +------------------------------------------------------------------ + +# (*THEN:name) is never pulled out +/(*THEN:abc)a|(*THEN:abc)b/B +------------------------------------------------------------------ + Bra + *THEN abc + a + Alt + *THEN abc + b + Ket + End +------------------------------------------------------------------ + +/(?:(*THEN:abc))a|(?:(*THEN:abc))b/B +------------------------------------------------------------------ + Bra + Bra + *THEN abc + Ket + a + Alt + Bra + *THEN abc + Ket + b + Ket + End +------------------------------------------------------------------ + +# \C escape (single code unit) can be pulled out +# ⇒ \C[ab] 
+/\Ca|\Cb/B +------------------------------------------------------------------ + Bra + AllAny + [ab] + Ket + End +------------------------------------------------------------------ + +# Common prefix which itself has a common prefix +# ⇒ (?>a[bc])[de] +/(?:(?>ab|ac)d|(?>ab|ac)e)/B +------------------------------------------------------------------ + Bra + Once + a + [bc] + Ket + [de] + Ket + End +------------------------------------------------------------------ + +# Rewriting common prefix causes parent group to have a common prefix +# (We don't detect this case) +# ⇒ (?:a[bc]d|a[bc]e) +/(?:a(?:b|c)d|(?:ab|ac)e)/B +------------------------------------------------------------------ + Bra + a + [bc] + d + Alt + a + [bc] + e + Ket + End +------------------------------------------------------------------ + +# Another case: +# ⇒ (?>a[bc]d|a[bc]e) +/(?>(?:ab|ac)d|(?:ab|ac)e)/B +------------------------------------------------------------------ + Bra + Once + a + [bc] + d + Alt + a + [bc] + e + Ket + Ket + End +------------------------------------------------------------------ + +# When rewriting groups which are themselves quantified, +# the extracted items have to stay inside the quantified group +# ⇒ (?:a[bc])? 
+/(?:ab|ac)?/B +------------------------------------------------------------------ + Bra + Brazero + Bra + a + [bc] + Ket + Ket + End +------------------------------------------------------------------ + +/(?:ab|ac)*/B +------------------------------------------------------------------ + Bra + Brazero + Bra + a + [bc] + KetRmax + Ket + End +------------------------------------------------------------------ + +/(?:ab|ac)+/B +------------------------------------------------------------------ + Bra + Bra + a + [bc] + KetRmax + Ket + End +------------------------------------------------------------------ + +/(?:ab|ac)?+/B +------------------------------------------------------------------ + Bra + Once + Brazero + Bra + a + [bc] + Ket + Ket + Ket + End +------------------------------------------------------------------ + +/(?:ab|ac)*+/B +------------------------------------------------------------------ + Bra + Braposzero + BraPos + a + [bc] + KetRpos + Ket + End +------------------------------------------------------------------ + +/(?:ab|ac)++/B +------------------------------------------------------------------ + Bra + BraPos + a + [bc] + KetRpos + Ket + End +------------------------------------------------------------------ + +/(?:ab|ac){2}/B +------------------------------------------------------------------ + Bra + Bra + a + [bc] + Ket + Bra + a + [bc] + Ket + Ket + End +------------------------------------------------------------------ + +/(?:ab|ac){2,4}/B +------------------------------------------------------------------ + Bra + Bra + a + [bc] + Ket + Bra + a + [bc] + Ket + Brazero + Bra + Bra + a + [bc] + Ket + Brazero + Bra + a + [bc] + Ket + Ket + Ket + End +------------------------------------------------------------------ + +/(?:ab|ac){2,4}+/B +------------------------------------------------------------------ + Bra + Once + Bra + a + [bc] + Ket + Bra + a + [bc] + Ket + Brazero + Bra + Bra + a + [bc] + Ket + Brazero + Bra + a + [bc] + Ket + Ket + Ket + Ket 
+ End +------------------------------------------------------------------ + +# Handling of script runs: +# Common prefix can be pulled out from alternation in a script run: +# ⇒ (*sr:ab[cd]) +/(*sr:abc|abd)/B +------------------------------------------------------------------ + Bra + Script run + ab + [cd] + Ket + Ket + End +------------------------------------------------------------------ + +# Script runs themselves can also be part of a common prefix +# ⇒ (*sr:abc)[cd] +/(*sr:abc)c|(*sr:abc)d/B +------------------------------------------------------------------ + Bra + Script run + abc + Ket + [cd] + Ket + End +------------------------------------------------------------------ + +# But if there are backtracking control verbs like (*COMMIT) in a script run +# it won't be pulled out +/(*sr:ab(*COMMIT))c|(*sr:ab(*COMMIT))d/B +------------------------------------------------------------------ + Bra + Script run + ab + *COMMIT + Ket + c + Alt + Script run + ab + *COMMIT + Ket + d + Ket + End +------------------------------------------------------------------ + +# Same deal if there is a callout inside a script run +/(*sr:ab(?C1))c|(*sr:ab(?C1))d/B +------------------------------------------------------------------ + Bra + Script run + ab + Callout 1 12 1 + Ket + c + Alt + Script run + ab + Callout 1 27 1 + Ket + d + Ket + End +------------------------------------------------------------------ + +# All the above applies to ATOMIC script runs +/(*asr:abc|abd)/B +------------------------------------------------------------------ + Bra + Script run + Once + ab + [cd] + Ket + Ket + Ket + End +------------------------------------------------------------------ + +/(*asr:abc)c|(*asr:abc)d/B +------------------------------------------------------------------ + Bra + Script run + Once + abc + Ket + Ket + [cd] + Ket + End +------------------------------------------------------------------ + +/(*asr:ab(*COMMIT))c|(*asr:ab(*COMMIT))d/B 
+------------------------------------------------------------------ + Bra + Script run + Once + ab + *COMMIT + Ket + Ket + c + Alt + Script run + Once + ab + *COMMIT + Ket + Ket + d + Ket + End +------------------------------------------------------------------ + +/(*asr:ab(?C1))c|(*asr:ab(?C1))d/B +------------------------------------------------------------------ + Bra + Script run + Once + ab + Callout 1 13 1 + Ket + Ket + c + Alt + Script run + Once + ab + Callout 1 29 1 + Ket + Ket + d + Ket + End +------------------------------------------------------------------ + +# Regression test: +# Pattern rewriter must properly handle assert conditions which contain alternation +# This should not be rewritten: +/b(?(?!)|b)/B +------------------------------------------------------------------ + Bra + b + Cond + *FAIL + Alt + b + Ket + Ket + End +------------------------------------------------------------------ + +# Regression test: +# For a conditional group which uses an assertion condition, that assertion condition +# cannot be pulled out +# This should not be rewritten (and the subject string should not match): +/(?(?b|c)d(?Pe)/ -Memory allocation - code size : 54 +Memory allocation - code size : 76 Memory allocation - data size : 52 ------------------------------------------------------------------ - 0 24 Bra + 0 35 Bra 2 a - 4 5 CBra 1 - 7 b - 9 4 Alt - 11 c - 13 9 Ket - 15 d - 17 5 CBra 2 - 20 e - 22 5 Ket - 24 24 Ket - 26 End + 4 20 CBra 1 + 7 [bc] + 24 20 Ket + 26 d + 28 5 CBra 2 + 31 e + 33 5 Ket + 35 35 Ket + 37 End ------------------------------------------------------------------ /(?:a(?Pc(?Pd)))(?Pa)/ @@ -694,31 +690,27 @@ Memory allocation - code size : 14 /abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/ ------------------------------------------------------------------ - 0 60 Bra + 0 82 Bra 2 abc - 8 5 CBra 1 - 11 d - 13 4 Alt - 15 e - 17 9 Ket - 19 *THEN - 20 x - 22 12 CBra 2 - 25 123 - 31 *THEN - 32 4 - 34 24 Alt - 36 567 - 42 5 CBra 3 - 45 b - 47 4 
Alt - 49 q - 51 9 Ket - 53 *THEN - 54 xx - 58 36 Ket - 60 60 Ket - 62 End + 8 20 CBra 1 + 11 [de] + 28 20 Ket + 30 *THEN + 31 x + 33 12 CBra 2 + 36 123 + 42 *THEN + 43 4 + 45 35 Alt + 47 567 + 53 20 CBra 3 + 56 [bq] + 73 20 Ket + 75 *THEN + 76 xx + 80 47 Ket + 82 82 Ket + 84 End ------------------------------------------------------------------ /(((a\2)|(a*)\g<-1>))*a?/ diff --git a/testdata/testoutput8-16-3 b/testdata/testoutput8-16-3 index bc7021f8f..d6537be51 100644 --- a/testdata/testoutput8-16-3 +++ b/testdata/testoutput8-16-3 @@ -36,18 +36,16 @@ Memory allocation - code size : 48 ------------------------------------------------------------------ /(?s:.*X|^B)/ -Memory allocation - code size : 46 +Memory allocation - code size : 34 ------------------------------------------------------------------ - 0 19 Bra - 3 7 Bra - 6 AllAny* - 8 X - 10 6 Alt - 13 ^ - 14 B - 16 13 Ket - 19 19 Ket - 22 End + 0 7 Bra + 3 AllAny* + 5 X + 7 6 Alt + 10 ^ + 11 B + 13 13 Ket + 16 End ------------------------------------------------------------------ /^[[:alnum:]]/ @@ -197,22 +195,20 @@ Memory allocation - code size : 54 ------------------------------------------------------------------ /a(?Pb|c)d(?Pe)/ -Memory allocation - code size : 68 +Memory allocation - code size : 88 Memory allocation - data size : 52 ------------------------------------------------------------------ - 0 30 Bra + 0 40 Bra 3 a - 5 6 CBra 1 - 9 b - 11 5 Alt - 14 c - 16 11 Ket - 19 d - 21 6 CBra 2 - 25 e - 27 6 Ket - 30 30 Ket - 33 End + 5 21 CBra 1 + 9 [bc] + 26 21 Ket + 29 d + 31 6 CBra 2 + 35 e + 37 6 Ket + 40 40 Ket + 43 End ------------------------------------------------------------------ /(?:a(?Pc(?Pd)))(?Pa)/ @@ -694,31 +690,27 @@ Memory allocation - code size : 18 /abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/ ------------------------------------------------------------------ - 0 70 Bra + 0 90 Bra 3 abc - 9 6 CBra 1 - 13 d - 15 5 Alt - 18 e - 20 11 Ket - 23 *THEN - 24 x - 26 13 CBra 2 - 30 123 - 36 
*THEN - 37 4 - 39 28 Alt - 42 567 - 48 6 CBra 3 - 52 b - 54 5 Alt - 57 q - 59 11 Ket - 62 *THEN - 63 xx - 67 41 Ket - 70 70 Ket - 73 End + 9 21 CBra 1 + 13 [de] + 30 21 Ket + 33 *THEN + 34 x + 36 13 CBra 2 + 40 123 + 46 *THEN + 47 4 + 49 38 Alt + 52 567 + 58 21 CBra 3 + 62 [bq] + 79 21 Ket + 82 *THEN + 83 xx + 87 51 Ket + 90 90 Ket + 93 End ------------------------------------------------------------------ /(((a\2)|(a*)\g<-1>))*a?/ diff --git a/testdata/testoutput8-16-4 b/testdata/testoutput8-16-4 index bc7021f8f..d6537be51 100644 --- a/testdata/testoutput8-16-4 +++ b/testdata/testoutput8-16-4 @@ -36,18 +36,16 @@ Memory allocation - code size : 48 ------------------------------------------------------------------ /(?s:.*X|^B)/ -Memory allocation - code size : 46 +Memory allocation - code size : 34 ------------------------------------------------------------------ - 0 19 Bra - 3 7 Bra - 6 AllAny* - 8 X - 10 6 Alt - 13 ^ - 14 B - 16 13 Ket - 19 19 Ket - 22 End + 0 7 Bra + 3 AllAny* + 5 X + 7 6 Alt + 10 ^ + 11 B + 13 13 Ket + 16 End ------------------------------------------------------------------ /^[[:alnum:]]/ @@ -197,22 +195,20 @@ Memory allocation - code size : 54 ------------------------------------------------------------------ /a(?Pb|c)d(?Pe)/ -Memory allocation - code size : 68 +Memory allocation - code size : 88 Memory allocation - data size : 52 ------------------------------------------------------------------ - 0 30 Bra + 0 40 Bra 3 a - 5 6 CBra 1 - 9 b - 11 5 Alt - 14 c - 16 11 Ket - 19 d - 21 6 CBra 2 - 25 e - 27 6 Ket - 30 30 Ket - 33 End + 5 21 CBra 1 + 9 [bc] + 26 21 Ket + 29 d + 31 6 CBra 2 + 35 e + 37 6 Ket + 40 40 Ket + 43 End ------------------------------------------------------------------ /(?:a(?Pc(?Pd)))(?Pa)/ @@ -694,31 +690,27 @@ Memory allocation - code size : 18 /abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/ ------------------------------------------------------------------ - 0 70 Bra + 0 90 Bra 3 abc - 9 6 CBra 1 - 13 d - 15 5 Alt - 
18 e - 20 11 Ket - 23 *THEN - 24 x - 26 13 CBra 2 - 30 123 - 36 *THEN - 37 4 - 39 28 Alt - 42 567 - 48 6 CBra 3 - 52 b - 54 5 Alt - 57 q - 59 11 Ket - 62 *THEN - 63 xx - 67 41 Ket - 70 70 Ket - 73 End + 9 21 CBra 1 + 13 [de] + 30 21 Ket + 33 *THEN + 34 x + 36 13 CBra 2 + 40 123 + 46 *THEN + 47 4 + 49 38 Alt + 52 567 + 58 21 CBra 3 + 62 [bq] + 79 21 Ket + 82 *THEN + 83 xx + 87 51 Ket + 90 90 Ket + 93 End ------------------------------------------------------------------ /(((a\2)|(a*)\g<-1>))*a?/ diff --git a/testdata/testoutput8-32-2 b/testdata/testoutput8-32-2 index 69f59316e..08c9bb352 100644 --- a/testdata/testoutput8-32-2 +++ b/testdata/testoutput8-32-2 @@ -36,18 +36,16 @@ Memory allocation - code size : 76 ------------------------------------------------------------------ /(?s:.*X|^B)/ -Memory allocation - code size : 72 +Memory allocation - code size : 56 ------------------------------------------------------------------ - 0 15 Bra - 2 6 Bra - 4 AllAny* - 6 X - 8 5 Alt - 10 ^ - 11 B - 13 11 Ket - 15 15 Ket - 17 End + 0 6 Bra + 2 AllAny* + 4 X + 6 5 Alt + 8 ^ + 9 B + 11 11 Ket + 13 End ------------------------------------------------------------------ /^[[:alnum:]]/ @@ -197,22 +195,20 @@ Memory allocation - code size : 80 ------------------------------------------------------------------ /a(?Pb|c)d(?Pe)/ -Memory allocation - code size : 108 +Memory allocation - code size : 120 Memory allocation - data size : 104 ------------------------------------------------------------------ - 0 24 Bra + 0 27 Bra 2 a - 4 5 CBra 1 - 7 b - 9 4 Alt - 11 c - 13 9 Ket - 15 d - 17 5 CBra 2 - 20 e - 22 5 Ket - 24 24 Ket - 26 End + 4 12 CBra 1 + 7 [bc] + 16 12 Ket + 18 d + 20 5 CBra 2 + 23 e + 25 5 Ket + 27 27 Ket + 29 End ------------------------------------------------------------------ /(?:a(?Pc(?Pd)))(?Pa)/ @@ -694,31 +690,27 @@ Memory allocation - code size : 28 /abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/ ------------------------------------------------------------------ 
- 0 60 Bra + 0 66 Bra 2 abc - 8 5 CBra 1 - 11 d - 13 4 Alt - 15 e - 17 9 Ket - 19 *THEN - 20 x - 22 12 CBra 2 - 25 123 - 31 *THEN - 32 4 - 34 24 Alt - 36 567 - 42 5 CBra 3 - 45 b - 47 4 Alt - 49 q - 51 9 Ket - 53 *THEN - 54 xx - 58 36 Ket - 60 60 Ket - 62 End + 8 12 CBra 1 + 11 [de] + 20 12 Ket + 22 *THEN + 23 x + 25 12 CBra 2 + 28 123 + 34 *THEN + 35 4 + 37 27 Alt + 39 567 + 45 12 CBra 3 + 48 [bq] + 57 12 Ket + 59 *THEN + 60 xx + 64 39 Ket + 66 66 Ket + 68 End ------------------------------------------------------------------ /(((a\2)|(a*)\g<-1>))*a?/ diff --git a/testdata/testoutput8-32-3 b/testdata/testoutput8-32-3 index 69f59316e..08c9bb352 100644 --- a/testdata/testoutput8-32-3 +++ b/testdata/testoutput8-32-3 @@ -36,18 +36,16 @@ Memory allocation - code size : 76 ------------------------------------------------------------------ /(?s:.*X|^B)/ -Memory allocation - code size : 72 +Memory allocation - code size : 56 ------------------------------------------------------------------ - 0 15 Bra - 2 6 Bra - 4 AllAny* - 6 X - 8 5 Alt - 10 ^ - 11 B - 13 11 Ket - 15 15 Ket - 17 End + 0 6 Bra + 2 AllAny* + 4 X + 6 5 Alt + 8 ^ + 9 B + 11 11 Ket + 13 End ------------------------------------------------------------------ /^[[:alnum:]]/ @@ -197,22 +195,20 @@ Memory allocation - code size : 80 ------------------------------------------------------------------ /a(?Pb|c)d(?Pe)/ -Memory allocation - code size : 108 +Memory allocation - code size : 120 Memory allocation - data size : 104 ------------------------------------------------------------------ - 0 24 Bra + 0 27 Bra 2 a - 4 5 CBra 1 - 7 b - 9 4 Alt - 11 c - 13 9 Ket - 15 d - 17 5 CBra 2 - 20 e - 22 5 Ket - 24 24 Ket - 26 End + 4 12 CBra 1 + 7 [bc] + 16 12 Ket + 18 d + 20 5 CBra 2 + 23 e + 25 5 Ket + 27 27 Ket + 29 End ------------------------------------------------------------------ /(?:a(?Pc(?Pd)))(?Pa)/ @@ -694,31 +690,27 @@ Memory allocation - code size : 28 /abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/ 
------------------------------------------------------------------ - 0 60 Bra + 0 66 Bra 2 abc - 8 5 CBra 1 - 11 d - 13 4 Alt - 15 e - 17 9 Ket - 19 *THEN - 20 x - 22 12 CBra 2 - 25 123 - 31 *THEN - 32 4 - 34 24 Alt - 36 567 - 42 5 CBra 3 - 45 b - 47 4 Alt - 49 q - 51 9 Ket - 53 *THEN - 54 xx - 58 36 Ket - 60 60 Ket - 62 End + 8 12 CBra 1 + 11 [de] + 20 12 Ket + 22 *THEN + 23 x + 25 12 CBra 2 + 28 123 + 34 *THEN + 35 4 + 37 27 Alt + 39 567 + 45 12 CBra 3 + 48 [bq] + 57 12 Ket + 59 *THEN + 60 xx + 64 39 Ket + 66 66 Ket + 68 End ------------------------------------------------------------------ /(((a\2)|(a*)\g<-1>))*a?/ diff --git a/testdata/testoutput8-32-4 b/testdata/testoutput8-32-4 index 69f59316e..08c9bb352 100644 --- a/testdata/testoutput8-32-4 +++ b/testdata/testoutput8-32-4 @@ -36,18 +36,16 @@ Memory allocation - code size : 76 ------------------------------------------------------------------ /(?s:.*X|^B)/ -Memory allocation - code size : 72 +Memory allocation - code size : 56 ------------------------------------------------------------------ - 0 15 Bra - 2 6 Bra - 4 AllAny* - 6 X - 8 5 Alt - 10 ^ - 11 B - 13 11 Ket - 15 15 Ket - 17 End + 0 6 Bra + 2 AllAny* + 4 X + 6 5 Alt + 8 ^ + 9 B + 11 11 Ket + 13 End ------------------------------------------------------------------ /^[[:alnum:]]/ @@ -197,22 +195,20 @@ Memory allocation - code size : 80 ------------------------------------------------------------------ /a(?Pb|c)d(?Pe)/ -Memory allocation - code size : 108 +Memory allocation - code size : 120 Memory allocation - data size : 104 ------------------------------------------------------------------ - 0 24 Bra + 0 27 Bra 2 a - 4 5 CBra 1 - 7 b - 9 4 Alt - 11 c - 13 9 Ket - 15 d - 17 5 CBra 2 - 20 e - 22 5 Ket - 24 24 Ket - 26 End + 4 12 CBra 1 + 7 [bc] + 16 12 Ket + 18 d + 20 5 CBra 2 + 23 e + 25 5 Ket + 27 27 Ket + 29 End ------------------------------------------------------------------ /(?:a(?Pc(?Pd)))(?Pa)/ @@ -694,31 +690,27 @@ Memory allocation - code 
size : 28 /abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/ ------------------------------------------------------------------ - 0 60 Bra + 0 66 Bra 2 abc - 8 5 CBra 1 - 11 d - 13 4 Alt - 15 e - 17 9 Ket - 19 *THEN - 20 x - 22 12 CBra 2 - 25 123 - 31 *THEN - 32 4 - 34 24 Alt - 36 567 - 42 5 CBra 3 - 45 b - 47 4 Alt - 49 q - 51 9 Ket - 53 *THEN - 54 xx - 58 36 Ket - 60 60 Ket - 62 End + 8 12 CBra 1 + 11 [de] + 20 12 Ket + 22 *THEN + 23 x + 25 12 CBra 2 + 28 123 + 34 *THEN + 35 4 + 37 27 Alt + 39 567 + 45 12 CBra 3 + 48 [bq] + 57 12 Ket + 59 *THEN + 60 xx + 64 39 Ket + 66 66 Ket + 68 End ------------------------------------------------------------------ /(((a\2)|(a*)\g<-1>))*a?/ diff --git a/testdata/testoutput8-8-2 b/testdata/testoutput8-8-2 index 42a65809e..38ee8c15f 100644 --- a/testdata/testoutput8-8-2 +++ b/testdata/testoutput8-8-2 @@ -36,18 +36,16 @@ Memory allocation - code size : 25 ------------------------------------------------------------------ /(?s:.*X|^B)/ -Memory allocation - code size : 23 +Memory allocation - code size : 17 ------------------------------------------------------------------ - 0 19 Bra - 3 7 Bra - 6 AllAny* - 8 X - 10 6 Alt - 13 ^ - 14 B - 16 13 Ket - 19 19 Ket - 22 End + 0 7 Bra + 3 AllAny* + 5 X + 7 6 Alt + 10 ^ + 11 B + 13 13 Ket + 16 End ------------------------------------------------------------------ /^[[:alnum:]]/ @@ -197,22 +195,20 @@ Memory allocation - code size : 28 ------------------------------------------------------------------ /a(?Pb|c)d(?Pe)/ -Memory allocation - code size : 36 +Memory allocation - code size : 62 Memory allocation - data size : 28 ------------------------------------------------------------------ - 0 32 Bra + 0 58 Bra 3 a - 5 7 CBra 1 - 10 b - 12 5 Alt - 15 c - 17 12 Ket - 20 d - 22 7 CBra 2 - 27 e - 29 7 Ket - 32 32 Ket - 35 End + 5 38 CBra 1 + 10 [bc] + 43 38 Ket + 46 d + 48 7 CBra 2 + 53 e + 55 7 Ket + 58 58 Ket + 61 End ------------------------------------------------------------------ 
/(?:a(?Pc(?Pd)))(?Pa)/ @@ -694,31 +690,27 @@ Memory allocation - code size : 10 /abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/ ------------------------------------------------------------------ - 0 73 Bra + 0 125 Bra 3 abc - 9 7 CBra 1 - 14 d - 16 5 Alt - 19 e - 21 12 Ket - 24 *THEN - 25 x - 27 14 CBra 2 - 32 123 - 38 *THEN - 39 4 - 41 29 Alt - 44 567 - 50 7 CBra 3 - 55 b - 57 5 Alt - 60 q - 62 12 Ket - 65 *THEN - 66 xx - 70 43 Ket - 73 73 Ket - 76 End + 9 38 CBra 1 + 14 [de] + 47 38 Ket + 50 *THEN + 51 x + 53 14 CBra 2 + 58 123 + 64 *THEN + 65 4 + 67 55 Alt + 70 567 + 76 38 CBra 3 + 81 [bq] +114 38 Ket +117 *THEN +118 xx +122 69 Ket +125 125 Ket +128 End ------------------------------------------------------------------ /(((a\2)|(a*)\g<-1>))*a?/ diff --git a/testdata/testoutput8-8-3 b/testdata/testoutput8-8-3 index 8837b7c24..f1b3fefcf 100644 --- a/testdata/testoutput8-8-3 +++ b/testdata/testoutput8-8-3 @@ -36,18 +36,16 @@ Memory allocation - code size : 30 ------------------------------------------------------------------ /(?s:.*X|^B)/ -Memory allocation - code size : 28 +Memory allocation - code size : 20 ------------------------------------------------------------------ - 0 23 Bra - 4 8 Bra - 8 AllAny* - 10 X - 12 7 Alt - 16 ^ - 17 B - 19 15 Ket - 23 23 Ket - 27 End + 0 8 Bra + 4 AllAny* + 6 X + 8 7 Alt + 12 ^ + 13 B + 15 15 Ket + 19 End ------------------------------------------------------------------ /^[[:alnum:]]/ @@ -197,22 +195,20 @@ Memory allocation - code size : 35 ------------------------------------------------------------------ /a(?Pb|c)d(?Pe)/ -Memory allocation - code size : 43 +Memory allocation - code size : 68 Memory allocation - data size : 28 ------------------------------------------------------------------ - 0 38 Bra + 0 63 Bra 4 a - 6 8 CBra 1 - 12 b - 14 6 Alt - 18 c - 20 14 Ket - 24 d - 26 8 CBra 2 - 32 e - 34 8 Ket - 38 38 Ket - 42 End + 6 39 CBra 1 + 12 [bc] + 45 39 Ket + 49 d + 51 8 CBra 2 + 57 e + 59 8 Ket + 63 63 Ket + 67 End 
------------------------------------------------------------------ /(?:a(?Pc(?Pd)))(?Pa)/ @@ -694,31 +690,27 @@ Memory allocation - code size : 12 /abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/ ------------------------------------------------------------------ - 0 83 Bra + 0 133 Bra 4 abc - 10 8 CBra 1 - 16 d - 18 6 Alt - 22 e - 24 14 Ket - 28 *THEN - 29 x - 31 15 CBra 2 - 37 123 - 43 *THEN - 44 4 - 46 33 Alt - 50 567 - 56 8 CBra 3 - 62 b - 64 6 Alt - 68 q - 70 14 Ket - 74 *THEN - 75 xx - 79 48 Ket - 83 83 Ket - 87 End + 10 39 CBra 1 + 16 [de] + 49 39 Ket + 53 *THEN + 54 x + 56 15 CBra 2 + 62 123 + 68 *THEN + 69 4 + 71 58 Alt + 75 567 + 81 39 CBra 3 + 87 [bq] +120 39 Ket +124 *THEN +125 xx +129 73 Ket +133 133 Ket +137 End ------------------------------------------------------------------ /(((a\2)|(a*)\g<-1>))*a?/ diff --git a/testdata/testoutput8-8-4 b/testdata/testoutput8-8-4 index 86ede51b6..c58831c38 100644 --- a/testdata/testoutput8-8-4 +++ b/testdata/testoutput8-8-4 @@ -36,18 +36,16 @@ Memory allocation - code size : 35 ------------------------------------------------------------------ /(?s:.*X|^B)/ -Memory allocation - code size : 33 +Memory allocation - code size : 23 ------------------------------------------------------------------ - 0 27 Bra - 5 9 Bra - 10 AllAny* - 12 X - 14 8 Alt - 19 ^ - 20 B - 22 17 Ket - 27 27 Ket - 32 End + 0 9 Bra + 5 AllAny* + 7 X + 9 8 Alt + 14 ^ + 15 B + 17 17 Ket + 22 End ------------------------------------------------------------------ /^[[:alnum:]]/ @@ -197,22 +195,20 @@ Memory allocation - code size : 42 ------------------------------------------------------------------ /a(?Pb|c)d(?Pe)/ -Memory allocation - code size : 50 +Memory allocation - code size : 74 Memory allocation - data size : 28 ------------------------------------------------------------------ - 0 44 Bra + 0 68 Bra 5 a - 7 9 CBra 1 - 14 b - 16 7 Alt - 21 c - 23 16 Ket - 28 d - 30 9 CBra 2 - 37 e - 39 9 Ket - 44 44 Ket - 49 End + 7 40 CBra 1 + 14 [bc] + 47 
40 Ket + 52 d + 54 9 CBra 2 + 61 e + 63 9 Ket + 68 68 Ket + 73 End ------------------------------------------------------------------ /(?:a(?Pc(?Pd)))(?Pa)/ @@ -694,31 +690,27 @@ Memory allocation - code size : 14 /abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/ ------------------------------------------------------------------ - 0 93 Bra + 0 141 Bra 5 abc - 11 9 CBra 1 - 18 d - 20 7 Alt - 25 e - 27 16 Ket - 32 *THEN - 33 x - 35 16 CBra 2 - 42 123 - 48 *THEN - 49 4 - 51 37 Alt - 56 567 - 62 9 CBra 3 - 69 b - 71 7 Alt - 76 q - 78 16 Ket - 83 *THEN - 84 xx - 88 53 Ket - 93 93 Ket - 98 End + 11 40 CBra 1 + 18 [de] + 51 40 Ket + 56 *THEN + 57 x + 59 16 CBra 2 + 66 123 + 72 *THEN + 73 4 + 75 61 Alt + 80 567 + 86 40 CBra 3 + 93 [bq] +126 40 Ket +131 *THEN +132 xx +136 77 Ket +141 141 Ket +146 End ------------------------------------------------------------------ /(((a\2)|(a*)\g<-1>))*a?/