Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add re2 multi-mode matching #171

Merged
merged 16 commits into from
Dec 25, 2024
8 changes: 8 additions & 0 deletions experimental/experimental.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,11 @@ func MustCompileLatin1(str string) *re2.Regexp {
}
return regexp
}

// Set is a compiled collection of regular expressions that can be searched for simultaneously.
// It is an alias for internal.Set, so the two types are interchangeable.
type Set = internal.Set

// CompileSet compiles the given regular expressions into a Set in preparation for matching.
// It delegates to internal.CompileSet with zero-valued internal.CompileOptions and
// returns whatever set and error that call produces.
func CompileSet(exprs []string) (*Set, error) {
return internal.CompileSet(exprs, internal.CompileOptions{})
}
206 changes: 206 additions & 0 deletions experimental/experimental_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,12 @@ package experimental

import (
"fmt"
"reflect"
"sort"
"strings"
"testing"

"github.com/wasilibs/go-re2"
)

func TestCompileLatin1(t *testing.T) {
Expand Down Expand Up @@ -55,3 +60,204 @@ func TestCompileLatin1(t *testing.T) {
})
}
}

// goodRe lists patterns that are expected to compile without error.
// TestGoodSetCompile compiles them together as a single set, and the
// benchmarks in this file reuse them as their pattern list.
var goodRe = []string{
``,
`.`,
`^.$`,
`a`,
`a*`,
`a+`,
`a?`,
`a|b`,
`a*|b*`,
`(a*|b)(c*|d)`,
`[a-z]`,
`[a-abc-c\-\]\[]`,
`[a-z]+`,
`[abc]`,
`[^1234]`,
`[^\n]`,
`\!\\`,
}

// stringError pairs a pattern with the error text its compilation should produce.
type stringError struct {
// re is the pattern handed to CompileSet.
re string
// err is the text that must appear in the resulting error message
// (compileSetTest checks it with strings.Contains).
err string
}

// badSet lists patterns that must fail to compile, each with the error text
// expected in the failure. TestBadCompileSet compiles every entry as a
// single-pattern set. The last entry is a very long generated pattern
// (27000 repetitions of `)\pL`).
var badSet = []stringError{
{`*`, "error parsing regexp: no argument for repetition operator: *"},
{`+`, "error parsing regexp: no argument for repetition operator: +"},
{`?`, "error parsing regexp: no argument for repetition operator: ?"},
{`(abc`, "error parsing regexp: missing ): (abc"},
{`abc)`, "error parsing regexp: unexpected ): abc)"},
{`x[a-z`, "error parsing regexp: missing ]: [a-z"},
{`[z-a]`, "error parsing regexp: invalid character class range: z-a"},
{`abc\`, "error parsing regexp: trailing \\"},
{`a**`, "error parsing regexp: bad repetition operator: **"},
{`a*+`, "error parsing regexp: bad repetition operator: *+"},
{`\x`, "error parsing regexp: invalid escape sequence: \\x"},
{strings.Repeat(`)\pL`, 27000), "error parsing regexp: unexpected ): " + strings.Repeat(`)\pL`, 27000)},
}

// compileSetTest compiles exprs with CompileSet and checks the outcome
// against the expectation in error:
//
//   - error == "": compilation must succeed.
//   - error != "": compilation must fail, and the failure message must
//     contain error as a substring.
//
// Mismatches are reported with t.Errorf, so the calling test keeps running.
// The value returned by CompileSet is passed back unchanged, whatever the
// outcome, so callers should nil-check it before use.
//
// Note: the parameter name error shadows the predeclared error type inside
// this function; it is kept for signature stability.
func compileSetTest(t *testing.T, exprs []string, error string) *Set {
	// Attribute failures to the calling test line rather than to this helper.
	t.Helper()

	set, err := CompileSet(exprs)
	switch {
	case error == "" && err != nil:
		t.Errorf("compiling %q; unexpected error: %v", exprs, err)
	case error != "" && err == nil:
		t.Errorf("compiling %q; missing error", exprs)
	case error != "" && !strings.Contains(err.Error(), error):
		// err is non-nil here: the previous case handled err == nil.
		t.Errorf("compiling %q; wrong error: %v; want %s", exprs, err, error)
	}
	return set
}

// TestGoodSetCompile checks that all goodRe patterns compile together as one
// set without error (an empty expected-error string means "must succeed").
func TestGoodSetCompile(t *testing.T) {
compileSetTest(t, goodRe, "")
}

// TestBadCompileSet compiles each badSet pattern as a single-pattern set and
// expects the associated error text to be reported.
func TestBadCompileSet(t *testing.T) {
	for _, bad := range badSet {
		patterns := []string{bad.re}
		compileSetTest(t, patterns, bad.err)
	}
}

// SetTest describes one set-matching scenario used by TestSetFindAll and
// TestSetFindAllString.
type SetTest struct {
// exprs are the patterns compiled into the set.
exprs []string
// matches is the input text that is searched.
matches string
// matched holds the expected, ascending-sorted result IDs for different
// result limits: index 0 is used with limit 0, index 1 with limit 1,
// index 2 with limit 2, and index 3 with limits 7 and 20.
// Presumably each ID is an index into exprs — confirm against Set.FindAll.
matched [4][]int
}

// setTests is the table of scenarios shared by TestSetFindAll and
// TestSetFindAllString. A nil entry in matched means no IDs are expected for
// that limit.
//
// NOTE(review): the text "[email protected]" in the last two inputs looks like
// an e-mail-obfuscation placeholder rather than the original test data; the
// expected IDs for those cases may not hold for the text as written here —
// confirm against the upstream test file.
var setTests = []SetTest{
{
exprs: []string{`(d)(e){0}(f)`, `[a-c]+`, `abc`, `\d+`},
matches: "x",
matched: [4][]int{
nil, nil, nil, nil,
},
},
{
exprs: []string{`(d)(e){0}(f)`, `[a-c]+`, `abc`, `\d+`},
matches: "123",
matched: [4][]int{
nil, {3}, {3}, {3},
},
},
{
exprs: []string{`(d)(e){0}(f)`, `[a-c]+`, `abc`, `\d+`},
matches: "df123abc",
matched: [4][]int{
nil, {0}, {0, 3}, {0, 1, 2, 3},
},
},
{
exprs: []string{`(d)(e){0}(f)`, `[a-c]+`, `abc`, `\d+`, `d{4}-\d{2}-\d{2}$`, `[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`, `1[3-9]\d{9}`, `\.[a-zA-Z0-9]+$`, `<!--[\s\S]*?-->`},
matches: "abcdef123</html><!-- test -->[email protected]",
matched: [4][]int{
nil, {1}, {1, 2}, {1, 2, 3, 5, 6, 7, 8},
},
},
{
exprs: []string{`(d)(e){0}(f)`, `[a-c]+`, `abc`, `\d+`, `d{4}-\d{2}-\d{2}$`, `[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`, `1[3-9]\d{9}`, `\.[a-zA-Z0-9]+$`, `<!--[\s\S]*?-->`},
matches: "[email protected]",
matched: [4][]int{
nil, {0}, {0, 3}, {0, 1, 3, 5, 6, 7},
},
},
}

// setFindAllTest runs set.FindAll on the bytes of matchStr with result limit
// matchNum, sorts the returned IDs in ascending order and compares them with
// matchedIds.
//
// The comparison uses reflect.DeepEqual, which distinguishes a nil slice from
// an empty non-nil one, so a nil expectation only matches a nil result.
// A mismatch is reported with t.Errorf; the message includes the limit so the
// failing call can be identified when the same input is tried with several
// limits.
func setFindAllTest(t *testing.T, set *Set, matchStr string, matchNum int, matchedIds []int) {
	// Attribute failures to the calling test line rather than to this helper.
	t.Helper()

	got := set.FindAll([]byte(matchStr), matchNum)
	sort.Ints(got)
	if !reflect.DeepEqual(got, matchedIds) {
		t.Errorf("FindAll(%q, %d) = %v, want %v", matchStr, matchNum, got, matchedIds)
	}
}

// setFindAllStringTest runs set.FindAllString on matchStr with result limit
// matchNum, sorts the returned IDs in ascending order and compares them with
// matchedIds.
//
// The comparison uses reflect.DeepEqual, which distinguishes a nil slice from
// an empty non-nil one, so a nil expectation only matches a nil result.
// A mismatch is reported with t.Errorf; the message includes the limit so the
// failing call can be identified when the same input is tried with several
// limits.
func setFindAllStringTest(t *testing.T, set *Set, matchStr string, matchNum int, matchedIds []int) {
	// Attribute failures to the calling test line rather than to this helper.
	t.Helper()

	got := set.FindAllString(matchStr, matchNum)
	sort.Ints(got)
	if !reflect.DeepEqual(got, matchedIds) {
		t.Errorf("FindAllString(%q, %d) = %v, want %v", matchStr, matchNum, got, matchedIds)
	}
}

// TestSetFindAll compiles the patterns of every setTests entry and checks
// Set.FindAll against the expected IDs for result limits 0, 1, 2, 7 and 20.
// The test stops at the first entry whose set comes back nil.
func TestSetFindAll(t *testing.T) {
	// Each limit is paired with the index of its expectation in
	// SetTest.matched; limits 7 and 20 share the last expectation.
	limits := [...]struct{ n, slot int }{
		{0, 0},
		{1, 1},
		{2, 2},
		{7, 3},
		{20, 3},
	}

	for _, tc := range setTests {
		compiled := compileSetTest(t, tc.exprs, "")
		if compiled == nil {
			return
		}
		for _, lim := range limits {
			setFindAllTest(t, compiled, tc.matches, lim.n, tc.matched[lim.slot])
		}
	}
}

// TestSetFindAllString compiles the patterns of every setTests entry and
// checks Set.FindAllString against the expected IDs for result limits
// 0, 1, 2, 7 and 20. The test stops at the first entry whose set comes back
// nil.
func TestSetFindAllString(t *testing.T) {
	// Each limit is paired with the index of its expectation in
	// SetTest.matched; limits 7 and 20 share the last expectation.
	limits := [...]struct{ n, slot int }{
		{0, 0},
		{1, 1},
		{2, 2},
		{7, 3},
		{20, 3},
	}

	for _, tc := range setTests {
		compiled := compileSetTest(t, tc.exprs, "")
		if compiled == nil {
			return
		}
		for _, lim := range limits {
			setFindAllStringTest(t, compiled, tc.matches, lim.n, tc.matched[lim.slot])
		}
	}
}

// BenchmarkSet measures Set.FindAll with a result limit of 20 on a set built
// from the goodRe patterns.
//
// Setup (compiling the set and converting the input to bytes) is done once
// and excluded from the timed region, so only the FindAll calls are measured.
func BenchmarkSet(b *testing.B) {
	b.Run("findAll", func(b *testing.B) {
		set, err := CompileSet(goodRe)
		if err != nil {
			// Report through the benchmark framework instead of panicking.
			b.Fatal(err)
		}
		// The input never changes, so convert it once instead of on every iteration.
		input := []byte("abcdef123</html><!-- test -->[email protected]")

		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			set.FindAll(input, 20)
		}
	})
}

// BenchmarkSetMatchWithFindSubmatch compares two ways of matching the goodRe
// patterns against the same input:
//
//   - "set match": Set.FindAll on a set compiled from the patterns.
//   - "findSubmatch": FindAllStringSubmatchIndex on one regular expression
//     that joins the patterns as alternatives, each in its own capture group.
//
// In both sub-benchmarks the setup work is done once and excluded from the
// timed region.
func BenchmarkSetMatchWithFindSubmatch(b *testing.B) {
	b.Run("set match", func(b *testing.B) {
		set, err := CompileSet(goodRe)
		if err != nil {
			// Report through the benchmark framework instead of panicking.
			b.Fatal(err)
		}
		// The input never changes, so convert it once instead of on every iteration.
		input := []byte("abcd123")

		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			set.FindAll(input, 20)
		}
	})
	b.Run("findSubmatch", func(b *testing.B) {
		re, err := re2.Compile("(" + strings.Join(goodRe, ")|(") + ")")
		if err != nil {
			b.Fatal(err)
		}

		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			re.FindAllStringSubmatchIndex("abcd123", 20)
		}
	})
}

// ExampleCompileSet compiles a two-pattern set and prints the IDs reported by
// FindAll for several inputs, using the number of patterns as the result limit.
func ExampleCompileSet() {
	exprs := []string{"abc", "\\d+"}
	set, err := CompileSet(exprs)
	if err != nil {
		panic(err)
	}
	limit := len(exprs)
	for _, input := range []string{"abcd", "123", "abc123", "def"} {
		ids := set.FindAll([]byte(input), limit)
		fmt.Println(ids)
	}
	// Output:
	// [0]
	// [1]
	// [0 1]
	// []
}
30 changes: 29 additions & 1 deletion internal/cre2/cre2.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,20 @@ void cre2_opt_set_posix_syntax(void* opt, int flag);
void cre2_opt_set_case_sensitive(void* opt, int flag);
void cre2_opt_set_latin1_encoding(void* opt);
void cre2_opt_set_max_mem(void* opt, int64_t size);
void* cre2_set_new(void* opt, int anchor);
void* cre2_set_add(void* set, void* pattern, int pattern_len);
int cre2_set_compile(void* set);
int cre2_set_match(void* set, void* text, int text_len, void* match, int nmatch);
void cre2_set_delete(void* set);

void* malloc(size_t size);
void free(void* ptr);
*/
import "C"
import "unsafe"

import (
"unsafe"
)

func New(patternPtr unsafe.Pointer, patternLen int, opts unsafe.Pointer) unsafe.Pointer {
return C.cre2_new(patternPtr, C.int(patternLen), opts)
Expand Down Expand Up @@ -112,6 +120,26 @@ func OptSetMaxMem(opt unsafe.Pointer, size int) {
C.cre2_opt_set_max_mem(opt, C.int64_t(size))
}

// NewSet calls the C function cre2_set_new with the given options pointer and
// anchor value and returns the pointer it produces.
func NewSet(opt unsafe.Pointer, anchor int) unsafe.Pointer {
return C.cre2_set_new(opt, C.int(anchor))
}

// SetAdd calls the C function cre2_set_add with the set pointer and a pattern
// given as pointer plus length, and returns the pointer that call produces.
// patternLen is converted to a C int.
func SetAdd(set unsafe.Pointer, patternPtr unsafe.Pointer, patternLen int) unsafe.Pointer {
return C.cre2_set_add(set, patternPtr, C.int(patternLen))
}

// SetCompile calls the C function cre2_set_compile on the set pointer and
// returns its integer result converted to a Go int.
func SetCompile(set unsafe.Pointer) int {
return int(C.cre2_set_compile(set))
}

// SetMatch calls the C function cre2_set_match with the set pointer, the text
// (pointer plus length), the match output pointer and nMatch, and returns its
// integer result converted to a Go int. textLen and nMatch are converted to
// C ints before the call.
func SetMatch(set unsafe.Pointer, textPtr unsafe.Pointer, textLen int, match unsafe.Pointer, nMatch int) int {
return int(C.cre2_set_match(set, textPtr, C.int(textLen), match, C.int(nMatch)))
}

// SetDelete calls the C function cre2_set_delete on the given pointer.
func SetDelete(ptr unsafe.Pointer) {
C.cre2_set_delete(ptr)
}

// Malloc calls C malloc with size (converted to size_t) and returns the
// resulting pointer.
func Malloc(size int) unsafe.Pointer {
return C.malloc(C.size_t(size))
}
Expand Down
50 changes: 50 additions & 0 deletions internal/re2_re2_cgo.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
package internal

import (
"fmt"
"unsafe"

"github.com/wasilibs/go-re2/internal/cre2"
Expand Down Expand Up @@ -112,6 +113,10 @@ func (a *allocation) newCStringArray(n int) cStringArray {
return cStringArray{ptr: wasmPtr(ptr)}
}

// read returns a byte slice of length and capacity size that views the memory
// starting at ptr. No data is copied: the slice aliases that memory directly.
// The receiver is not used.
//
// The pointer is reinterpreted as a pointer to a 1<<30-byte array before
// slicing, so a size larger than 1<<30 makes the slice expression panic.
func (a *allocation) read(ptr wasmPtr, size int) []byte {
return (*[1 << 30]byte)(unsafe.Pointer(ptr))[:size:size]
}

type cString struct {
ptr unsafe.Pointer
length int
Expand Down Expand Up @@ -164,3 +169,48 @@ func readMatches(alloc *allocation, cs cString, matchesPtr wasmPtr, n int, deliv
}
}
}

// newSet creates a native set configured from opts and returns its pointer.
// The ABI argument is unused in this implementation.
func newSet(_ *libre2ABI, opts CompileOptions) wasmPtr {
	o := cre2.NewOpt()
	// The options object is released when this function returns.
	defer cre2.DeleteOpt(o)

	// Settings applied to every set.
	cre2.OptSetMaxMem(o, maxSize)
	cre2.OptSetLogErrors(o, false)

	// Settings driven by the caller's compile options.
	if opts.Longest {
		cre2.OptSetLongestMatch(o, true)
	}
	if opts.Posix {
		cre2.OptSetPosixSyntax(o, true)
	}
	if opts.CaseInsensitive {
		cre2.OptSetCaseSensitive(o, false)
	}
	if opts.Latin1 {
		cre2.OptSetLatin1Encoding(o)
	}

	setPtr := cre2.NewSet(o, 0)
	return wasmPtr(setPtr)
}

// setAdd adds the pattern s to set and reports the outcome as a string:
//
//   - "" when the native call answers "ok";
//   - unknownCompileError when the native call returns a nil pointer;
//   - "error parsing regexp: <message>" for any other answer, in which case
//     the returned message buffer is freed after being copied.
func setAdd(set *Set, s cString) string {
	res := cre2.SetAdd(unsafe.Pointer(set.ptr), s.ptr, s.length)
	if res == nil {
		return unknownCompileError
	}
	if text := cre2.CopyCString(res); text != "ok" {
		cre2.Free(res)
		return fmt.Sprintf("error parsing regexp: %s", text)
	}
	return ""
}

// setCompile calls cre2.SetCompile on the set's native pointer and returns
// the result converted to int32.
func setCompile(set *Set) int32 {
return int32(cre2.SetCompile(unsafe.Pointer(set.ptr)))
}

// setMatch calls cre2.SetMatch with the set's native pointer, the text in cs
// (pointer and length), the output buffer at matchedPtr and the limit nMatch,
// and returns the integer result of that call.
func setMatch(set *Set, cs cString, matchedPtr wasmPtr, nMatch int) int {
return cre2.SetMatch(unsafe.Pointer(set.ptr), cs.ptr, cs.length, unsafe.Pointer(matchedPtr), nMatch)
}

// deleteSet passes setPtr to cre2.SetDelete. The ABI argument is unused in
// this implementation.
func deleteSet(_ *libre2ABI, setPtr wasmPtr) {
cre2.SetDelete(unsafe.Pointer(setPtr))
}
Loading
Loading