Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrite regexes where common prefix can be pulled out from alternation branches #464

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -766,6 +766,8 @@ EXTRA_DIST += \
testdata/testinput25 \
testdata/testinput26 \
testdata/testinput27 \
testdata/testinput28 \
testdata/testinput29 \
testdata/testinputEBC \
testdata/testinputheap \
testdata/testoutput1 \
Expand Down Expand Up @@ -811,6 +813,8 @@ EXTRA_DIST += \
testdata/testoutput25 \
testdata/testoutput26 \
testdata/testoutput27 \
testdata/testoutput28 \
testdata/testoutput29 \
testdata/testoutputEBC \
testdata/testoutputheap-16 \
testdata/testoutputheap-32 \
Expand Down
34 changes: 32 additions & 2 deletions RunTest
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,9 @@ title24="Test 24: Non-UTF pattern conversion tests"
title25="Test 25: UTF pattern conversion tests"
title26="Test 26: Unicode property tests (compatible with Perl >= 5.38)"
title27="Test 27: Auto-generated unicode property tests"
maxtest=27
title28="Test 28: Pattern rewriter tests without UTF"
title29="Test 29: Pattern rewriter tests with UTF"
maxtest=29
titleheap="Test 'heap': Environment-specific heap tests"

if [ $# -eq 1 -a "$1" = "list" ]; then
Expand Down Expand Up @@ -122,6 +124,8 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
echo $title25
echo $title26
echo $title27
echo $title28
echo $title29
echo ""
echo $titleheap
echo ""
Expand Down Expand Up @@ -247,6 +251,8 @@ do24=no
do25=no
do26=no
do27=no
do28=no
do29=no
doheap=no
doebcdic=no

Expand Down Expand Up @@ -280,6 +286,8 @@ while [ $# -gt 0 ] ; do
25) do25=yes;;
26) do26=yes;;
27) do27=yes;;
28) do28=yes;;
29) do29=yes;;
heap) doheap=yes;;
ebcdic) doebcdic=yes;;
-8) arg8=yes;;
Expand Down Expand Up @@ -433,7 +441,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
$do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
$do20 = no -a $do21 = no -a $do22 = no -a $do23 = no -a \
$do24 = no -a $do25 = no -a $do26 = no -a $do27 = no -a \
$doheap = no -a $doebcdic = no \
$do28 = no -a $do29 = no -a $doheap = no -a $doebcdic = no \
]; then
do0=yes
do1=yes
Expand Down Expand Up @@ -463,6 +471,8 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
do25=yes
do26=yes
do27=yes
do28=yes
do29=yes
fi

# Handle any explicit skips at this stage, so that an argument list may consist
Expand Down Expand Up @@ -912,6 +922,26 @@ for bmode in "$test8" "$test16" "$test32"; do
fi
fi

# Pattern rewriter tests (without UTF)

if [ $do28 = yes ] ; then
echo $title28
$sim $valgrind ./pcre2test -q $setstack $bmode $testdata/testinput28 testtry
checkresult $? 28 ""
fi

# Pattern rewriter tests (with UTF)

if [ $do29 = yes ] ; then
echo $title29
if [ $utf -eq 0 ] ; then
alexdowad marked this conversation as resolved.
Show resolved Hide resolved
echo " Skipped because UTF-$bits support is not available"
else
$sim $valgrind ./pcre2test -q $setstack $bmode $testdata/testinput29 testtry
checkresult $? 29 ""
fi
fi

# Manually selected heap tests - output may vary in different environments,
# which is why they are not automatically run.

Expand Down
25 changes: 22 additions & 3 deletions RunTest.bat
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
@rem line. Added argument validation and added error reporting.
@rem
@rem Sheri Pierce added logic to skip feature dependent tests
@rem tests 4 5 7 10 12 14 19 22 25 and 26 require Unicode support
@rem tests 4 5 7 10 12 14 19 22 25 26 27 and 29 require Unicode support
@rem 8 requires Unicode and link size 2
@rem 16 requires absence of jit support
@rem 17 requires presence of jit support
Expand Down Expand Up @@ -114,18 +114,20 @@ set do24=no
set do25=no
set do26=no
set do27=no
set do28=no
set do29=no
set all=yes

for %%a in (%*) do (
set valid=no
for %%v in (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27) do if %%v == %%a set valid=yes
for %%v in (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29) do if %%v == %%a set valid=yes
if "!valid!" == "yes" (
set do%%a=yes
set all=no
) else (
echo Invalid test number - %%a!
echo Usage %0 [ test_number ] ...
echo Where test_number is one or more optional test numbers 1 through 27, default is all tests.
echo Where test_number is one or more optional test numbers 1 through 29, default is all tests.
exit /b 1
)
)
Expand Down Expand Up @@ -159,6 +161,8 @@ if "%all%" == "yes" (
set do25=yes
set do26=yes
set do27=yes
set do28=yes
set do29=yes
)

@echo RunTest.bat's pcre2test output is written to newly created subfolders
Expand Down Expand Up @@ -214,6 +218,8 @@ if "%do24%" == "yes" call :do24
if "%do25%" == "yes" call :do25
if "%do26%" == "yes" call :do26
if "%do27%" == "yes" call :do27
if "%do28%" == "yes" call :do28
if "%do29%" == "yes" call :do29
:modeSkip
if "%mode%" == "" (
set mode=-16
Expand Down Expand Up @@ -540,6 +546,19 @@ if %unicode% EQU 0 (
if %jit% EQU 1 call :runsub 27 testoutjit "Test with JIT Override" -q -jit
goto :eof

@rem Test 28 subroutine: pattern rewriter tests without UTF.
@rem Makes a single call to :runsub for test number 28, passing the output
@rem name "testout", the title string, and the -q option. No feature check
@rem (such as %unicode%) is made here before the call.
@rem NOTE(review): how :runsub interprets these arguments is defined in
@rem :runsub itself, which is not shown here - confirm there.
:do28
call :runsub 28 testout "Pattern rewriter tests without UTF" -q
goto :eof

@rem Test 29 subroutine: pattern rewriter tests with UTF.
@rem When %unicode% is 0 a "Skipped" message is echoed and the subroutine
@rem returns without calling :runsub. Otherwise :runsub is called for test
@rem number 29 with the output name "testout" and -q, and, when %jit% is 1,
@rem called a second time with the output name "testoutjit" and -q -jit.
@rem NOTE(review): %unicode% and %jit% are set elsewhere in this script
@rem (presumably from the pcre2test configuration) - confirm at their origin.
:do29
if %unicode% EQU 0 (
echo Test 29 Skipped due to absence of Unicode support.
goto :eof
)
call :runsub 29 testout "Pattern rewriter tests with UTF" -q
if %jit% EQU 1 call :runsub 29 testoutjit "Test with JIT Override" -q -jit
goto :eof

:conferror
@echo.
@echo Either your build is incomplete or you have a configuration error.
Expand Down
14 changes: 14 additions & 0 deletions doc/html/pcre2api.html
Original file line number Diff line number Diff line change
Expand Up @@ -1014,6 +1014,20 @@ <h1>pcre2api man page</h1>
Dotstar anchor optimization is automatically disabled for .* if it is inside an
atomic group or a capture group that is the subject of a backreference, or if
the pattern contains (*PRUNE) or (*SKIP).
<pre>
PCRE2_PATTERN_REWRITE
PCRE2_PATTERN_REWRITE_OFF
</pre>
Enable/disable optimizations which occur during the pattern rewriting
phase (after parsing but before compilation). Pattern rewriting may remove
redundant items, coalesce items, adjust group structure, or replace some
constructs with an equivalent construct. Pattern rewriting will never affect
which strings are and are not matched, or what substrings are captured by
capture groups. However, since it may change the structure of a pattern,
if you are tracing the matching process, you might prefer PCRE2 to use the
original pattern without rewriting. Disabling rewriting may also be useful
for testing. Pattern rewriting is disabled if the compile option
PCRE2_AUTO_CALLOUT is set.
<pre>
PCRE2_START_OPTIMIZE
PCRE2_START_OPTIMIZE_OFF
Expand Down
4 changes: 3 additions & 1 deletion doc/html/pcre2callout.html
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,9 @@ <h1>pcre2callout man page</h1>
program has a pattern qualifier (/auto_callout) that sets automatic callouts.
When any callouts are present, the output from <b>pcre2test</b> indicates how
the pattern is being matched. This is useful information when you are trying to
optimize the performance of a particular pattern.
optimize the performance of a particular pattern. However, note that some
optimizations which adjust the structure of the pattern are disabled when
automatic callouts are enabled.
</P>
<br><a name="SEC3" href="#TOC1">MISSING CALLOUTS</a><br>
<P>
Expand Down
1 change: 1 addition & 0 deletions doc/html/pcre2syntax.html
Original file line number Diff line number Diff line change
Expand Up @@ -487,6 +487,7 @@ <h1>pcre2syntax man page</h1>
(*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS)
(*NO_DOTSTAR_ANCHOR) no .* anchoring (PCRE2_NO_DOTSTAR_ANCHOR)
(*NO_JIT) disable JIT optimization
(*NO_REWRITE) disable pattern rewriting phase of compilation
(*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OPTIMIZE)
(*TURKISH_CASING) set PCRE2_EXTRA_TURKISH_CASING when matching
(*UTF) set appropriate UTF mode for the library in use
Expand Down
2 changes: 2 additions & 0 deletions doc/html/pcre2test.html
Original file line number Diff line number Diff line change
Expand Up @@ -705,6 +705,8 @@ <h1>pcre2test man page</h1>
auto_possess_off don't auto-possessify variable quantifiers
dotstar_anchor anchor patterns starting with .*
dotstar_anchor_off don't anchor patterns starting with .*
pattern_rewrite rewrite some slow constructs
pattern_rewrite_off don't rewrite slow constructs
start_optimize enable pre-scan of subject string
start_optimize_off disable pre-scan of subject string
</pre>
Expand Down
19 changes: 18 additions & 1 deletion doc/pcre2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1040,6 +1040,20 @@ PCRE2 CONTEXTS
inside an atomic group or a capture group that is the subject of a
backreference, or if the pattern contains (*PRUNE) or (*SKIP).

PCRE2_PATTERN_REWRITE
PCRE2_PATTERN_REWRITE_OFF

Enable/disable optimizations which occur during the pattern rewriting
phase (after parsing but before compilation). Pattern rewriting may re-
move redundant items, coalesce items, adjust group structure, or re-
place some constructs with an equivalent construct. Pattern rewriting
will never affect which strings are and are not matched, or what sub-
strings are captured by capture groups. However, since it may change
the structure of a pattern, if you are tracing the matching process,
you might prefer PCRE2 to use the original pattern without rewriting.
Disabling rewriting may also be useful for testing. Pattern rewriting
is disabled if the compile option PCRE2_AUTO_CALLOUT is set.

PCRE2_START_OPTIMIZE
PCRE2_START_OPTIMIZE_OFF

Expand Down Expand Up @@ -5005,7 +5019,9 @@ DESCRIPTION
automatic callouts. When any callouts are present, the output from
pcre2test indicates how the pattern is being matched. This is useful
information when you are trying to optimize the performance of a par-
ticular pattern.
ticular pattern. However, note that some optimizations which adjust the
structure of the pattern are disabled when automatic callouts are en-
abled.


MISSING CALLOUTS
Expand Down Expand Up @@ -11931,6 +11947,7 @@ OPTION SETTING
(*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS)
(*NO_DOTSTAR_ANCHOR) no .* anchoring (PCRE2_NO_DOTSTAR_ANCHOR)
(*NO_JIT) disable JIT optimization
(*NO_REWRITE) disable pattern rewriting phase of compilation
(*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OP-
TIMIZE)
(*TURKISH_CASING) set PCRE2_EXTRA_TURKISH_CASING when matching
Expand Down
14 changes: 14 additions & 0 deletions doc/pcre2api.3
Original file line number Diff line number Diff line change
Expand Up @@ -939,6 +939,20 @@ this can cause callouts to be skipped.
Dotstar anchor optimization is automatically disabled for .* if it is inside an
atomic group or a capture group that is the subject of a backreference, or if
the pattern contains (*PRUNE) or (*SKIP).
.sp
PCRE2_PATTERN_REWRITE
PCRE2_PATTERN_REWRITE_OFF
.sp
Enable/disable optimizations which occur during the pattern rewriting
phase (after parsing but before compilation). Pattern rewriting may remove
redundant items, coalesce items, adjust group structure, or replace some
constructs with an equivalent construct. Pattern rewriting will never affect
which strings are and are not matched, or what substrings are captured by
capture groups. However, since it may change the structure of a pattern,
if you are tracing the matching process, you might prefer PCRE2 to use the
original pattern without rewriting. Disabling rewriting may also be useful
for testing. Pattern rewriting is disabled if the compile option
PCRE2_AUTO_CALLOUT is set.
.sp
PCRE2_START_OPTIMIZE
PCRE2_START_OPTIMIZE_OFF
Expand Down
4 changes: 3 additions & 1 deletion doc/pcre2callout.3
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ Callouts can be useful for tracking the progress of pattern matching. The
program has a pattern qualifier (/auto_callout) that sets automatic callouts.
When any callouts are present, the output from \fBpcre2test\fP indicates how
the pattern is being matched. This is useful information when you are trying to
optimize the performance of a particular pattern.
optimize the performance of a particular pattern. However, note that some
optimizations which adjust the structure of the pattern are disabled when
automatic callouts are enabled.
.
.
.SH "MISSING CALLOUTS"
Expand Down
1 change: 1 addition & 0 deletions doc/pcre2syntax.3
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,7 @@ of them may appear. For the first three, d is a decimal number.
(*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS)
(*NO_DOTSTAR_ANCHOR) no .* anchoring (PCRE2_NO_DOTSTAR_ANCHOR)
(*NO_JIT) disable JIT optimization
(*NO_REWRITE) disable pattern rewriting phase of compilation
(*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OPTIMIZE)
(*TURKISH_CASING) set PCRE2_EXTRA_TURKISH_CASING when matching
(*UTF) set appropriate UTF mode for the library in use
Expand Down
2 changes: 2 additions & 0 deletions doc/pcre2test.1
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,8 @@ calling \fBpcre2_set_optimize()\fP before invoking the regex compiler.
auto_possess_off don't auto-possessify variable quantifiers
dotstar_anchor anchor patterns starting with .*
dotstar_anchor_off don't anchor patterns starting with .*
pattern_rewrite rewrite some slow constructs
pattern_rewrite_off don't rewrite slow constructs
start_optimize enable pre-scan of subject string
start_optimize_off disable pre-scan of subject string
.sp
Expand Down
2 changes: 2 additions & 0 deletions doc/pcre2test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -639,6 +639,8 @@ PATTERN MODIFIERS
auto_possess_off don't auto-possessify variable quantifiers
dotstar_anchor anchor patterns starting with .*
dotstar_anchor_off don't anchor patterns starting with .*
pattern_rewrite rewrite some slow constructs
pattern_rewrite_off don't rewrite slow constructs
start_optimize enable pre-scan of subject string
start_optimize_off disable pre-scan of subject string

Expand Down
4 changes: 4 additions & 0 deletions maint/manifest-tarball
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,8 @@ drwxr-xr-x tarball-dir/pcre2-SNAPSHOT/testdata
-rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testinput25
-rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testinput26
-rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testinput27
-rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testinput28
-rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testinput29
-rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testinput3
-rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testinput4
-rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testinput5
Expand Down Expand Up @@ -436,6 +438,8 @@ drwxr-xr-x tarball-dir/pcre2-SNAPSHOT/testdata
-rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testoutput25
-rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testoutput26
-rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testoutput27
-rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testoutput28
-rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testoutput29
-rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testoutput3
-rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testoutput3A
-rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testoutput3B
Expand Down
2 changes: 2 additions & 0 deletions src/pcre2.h.generic
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,8 @@ For binary compatibility, only add to this list; do not renumber. */
#define PCRE2_DOTSTAR_ANCHOR_OFF 67
#define PCRE2_START_OPTIMIZE 68
#define PCRE2_START_OPTIMIZE_OFF 69
#define PCRE2_PATTERN_REWRITE 70
#define PCRE2_PATTERN_REWRITE_OFF 71

/* Types used in pcre2_set_substitute_case_callout().

Expand Down
2 changes: 2 additions & 0 deletions src/pcre2.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,8 @@ For binary compatibility, only add to this list; do not renumber. */
#define PCRE2_DOTSTAR_ANCHOR_OFF 67
#define PCRE2_START_OPTIMIZE 68
#define PCRE2_START_OPTIMIZE_OFF 69
#define PCRE2_PATTERN_REWRITE 70
#define PCRE2_PATTERN_REWRITE_OFF 71

/* Types used in pcre2_set_substitute_case_callout().

Expand Down
Loading
Loading