Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore(storage/bloom): support simplifiable regexp matchers #14622

Merged
merged 9 commits into from
Nov 4, 2024
5 changes: 5 additions & 0 deletions docs/sources/query/query_accceleration.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ If [bloom filters][] are enabled, you can write LogQL queries using [structured
Queries will be accelerated for any [label filter expression][] that satisfies _all_ of the following criteria:

* The label filter expression uses **string equality**, such as `| key="value"`.
* `or` and `and` operators can be used to match multiple values, such as `| detected_level="error" or detected_level="warn"`.
* _Basic_ regular expressions are automatically simplified into a supported expression:
* `| key=~"value"` is converted to `| key="value"`.
* `| key=~"value1|value2"` is converted to `| key="value1" or key="value2"`.
* `| key=~".+"` checks for existence of `key`. `.*` is not supported.
rfratto marked this conversation as resolved.
Show resolved Hide resolved
* The label filter expression is querying for structured metadata and not a stream label.
* The label filter expression is placed before any [parser expression][], [labels format expression][], [drop labels expression][], or [keep labels expression][].

Expand Down
6 changes: 3 additions & 3 deletions pkg/bloomgateway/processor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ func TestProcessor(t *testing.T) {
}

matchers := []v1.LabelMatcher{
v1.PlainLabelMatcher{
v1.KeyValueMatcher{
Key: "trace_id",
Value: "nomatch",
},
Expand Down Expand Up @@ -191,7 +191,7 @@ func TestProcessor(t *testing.T) {
day: config.NewDayTime(truncateDay(now)),
}
matchers := []v1.LabelMatcher{
v1.PlainLabelMatcher{
v1.KeyValueMatcher{
Key: "trace_id",
Value: "nomatch",
},
Expand Down Expand Up @@ -238,7 +238,7 @@ func TestProcessor(t *testing.T) {
day: config.NewDayTime(truncateDay(now)),
}
matchers := []v1.LabelMatcher{
v1.PlainLabelMatcher{
v1.KeyValueMatcher{
Key: "trace_id",
Value: "nomatch",
},
Expand Down
200 changes: 189 additions & 11 deletions pkg/storage/bloom/v1/ast_extractor.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,22 @@
package v1

import (
regexsyn "github.com/grafana/regexp/syntax"

"github.com/prometheus/prometheus/model/labels"

"github.com/grafana/loki/v3/pkg/logql/log"
"github.com/grafana/loki/v3/pkg/logql/syntax"
"github.com/grafana/loki/v3/pkg/util"
)

// Simplifiable regexp expressions can quickly expand into very high
// cardinality; we limit the number of matchers to prevent this.
//
// For example, the regex `[0-9]` expands to 10 matchers (0, 1, .. 9), while
// `[0-9][0-9]` expands to 100 matchers (00, 01, .., 99).
//
// expandSubexpr reports failure as soon as an expansion would exceed this
// limit, in which case the regexp is treated as unsupported rather than
// being partially expanded.
const maxRegexMatchers = 25
rfratto marked this conversation as resolved.
Show resolved Hide resolved

// LabelMatcher represents bloom tests for key-value pairs, mapped from
// LabelFilterExprs from the AST.
type LabelMatcher interface{ isLabelMatcher() }
Expand All @@ -15,9 +25,13 @@ type LabelMatcher interface{ isLabelMatcher() }
// mapped. Bloom tests for UnsupportedLabelMatchers must always pass.
type UnsupportedLabelMatcher struct{}

// PlainLabelMatcher represents a direct key-value matcher. Bloom tests
// must only pass if the key-value pair exists in the bloom.
type PlainLabelMatcher struct{ Key, Value string }
// KeyValueMatcher represents a direct key-value matcher. Bloom tests must only
// pass if the key-value pair exists in the bloom.
//
// It is produced for string-equality label filters and for regexp label
// filters that simplify to one or more literal values.
type KeyValueMatcher struct{ Key, Value string }

// KeyMatcher represents a key matcher. Bloom tests must only pass if the key
// exists in the bloom.
//
// It is produced for regexp label filters of the form `.+`, which only
// require the key to be present.
type KeyMatcher struct{ Key string }

// OrLabelMatcher represents a logical OR test. Bloom tests must only pass if
// one of the Left or Right label matcher bloom tests pass.
Expand Down Expand Up @@ -54,21 +68,27 @@ func buildLabelMatcher(filter log.LabelFilterer) LabelMatcher {
switch filter := filter.(type) {

case *log.LineFilterLabelFilter:
if filter.Type != labels.MatchEqual {
return UnsupportedLabelMatcher{}
if filter.Type == labels.MatchEqual {
return KeyValueMatcher{
Key: filter.Name,
Value: filter.Value,
}
} else if filter.Type == labels.MatchRegexp {
reg, err := regexsyn.Parse(filter.Value, regexsyn.Perl)
if err != nil {
return UnsupportedLabelMatcher{}
}
return buildSimplifiedRegexMatcher(filter.Name, reg.Simplify())
}

return PlainLabelMatcher{
Key: filter.Name,
Value: filter.Value,
}
return UnsupportedLabelMatcher{}

case *log.StringLabelFilter:
if filter.Type != labels.MatchEqual {
return UnsupportedLabelMatcher{}
}

return PlainLabelMatcher{
return KeyValueMatcher{
Key: filter.Name,
Value: filter.Value,
}
Expand All @@ -89,11 +109,169 @@ func buildLabelMatcher(filter log.LabelFilterer) LabelMatcher {
}
}

// buildSimplifiedRegexMatcher builds a simplified label matcher from a regex.
// reg may be mutated.
func buildSimplifiedRegexMatcher(key string, reg *regexsyn.Regexp) LabelMatcher {
switch reg.Op {
case regexsyn.OpAlternate:
util.ClearCapture(reg)

left := buildSimplifiedRegexMatcher(key, reg.Sub[0])
if len(reg.Sub) == 1 {
// This shouldn't be possible (even `warn|` has two subexpressions, where
// the latter matches an empty string), but we have a length check here
// anyway just to avoid a potential panic.
return left
}
for _, sub := range reg.Sub[1:] {
rfratto marked this conversation as resolved.
Show resolved Hide resolved
right := buildSimplifiedRegexMatcher(key, sub)
left = OrLabelMatcher{Left: left, Right: right}
}
return left

case regexsyn.OpConcat:
// OpConcat checks for the concatenation of two or more subexpressions. For
// example, value1|value2 simplifies to value[12], with the two
// subexpressions value and [12].
//
// We expand subexpressions back out into full matchers where possible, so
// value[12] becomes value1 OR value2, and value[1-9] becomes value1 OR
// value2 .. OR value9.
util.ClearCapture(reg)

matchers, ok := expandSubexpr(reg)
if !ok || len(matchers) == 0 {
return UnsupportedLabelMatcher{}
}

var left LabelMatcher = KeyValueMatcher{Key: key, Value: matchers[0]}
for _, matcher := range matchers[1:] {
right := KeyValueMatcher{Key: key, Value: matcher}
left = OrLabelMatcher{Left: left, Right: right}
}
return left

case regexsyn.OpCapture:
util.ClearCapture(reg)
return buildSimplifiedRegexMatcher(key, reg)

case regexsyn.OpLiteral:
return KeyValueMatcher{
Key: key,
Value: string(reg.Rune),
}

case regexsyn.OpPlus:
if reg.Sub[0].Op == regexsyn.OpAnyChar || reg.Sub[0].Op == regexsyn.OpAnyCharNotNL { // .+
return KeyMatcher{Key: key}
}

return UnsupportedLabelMatcher{}

default:
return UnsupportedLabelMatcher{}
}
}

// expandSubexpr expands reg into the list of literal strings it matches. reg
// may be mutated.
//
// ok is false when reg contains an operation that cannot be expanded, a
// case-insensitive literal, or when the expansion would produce more than
// maxRegexMatchers strings.
func expandSubexpr(reg *regexsyn.Regexp) (prefixes []string, ok bool) {
	switch reg.Op {
	case regexsyn.OpAlternate:
		util.ClearCapture(reg)

		// The result of an alternation is the union of what each branch expands
		// to.
		for _, sub := range reg.Sub {
			subPrefixes, ok := expandSubexpr(sub)
			if !ok {
				return nil, false
			}
			if len(prefixes)+len(subPrefixes) > maxRegexMatchers {
				return nil, false
			}
			prefixes = append(prefixes, subPrefixes...)
		}
		return prefixes, true

	case regexsyn.OpCharClass:
		// OpCharClass stores ranges of characters, so [12] is the range of bytes
		// []rune('1', '2'), while [15] is represented as []rune('1', '1', '5',
		// '5').
		//
		// To expand OpCharClass, we iterate over each pair of runes.
		if len(reg.Rune)%2 != 0 {
			// Invalid regexp; sequences should be even.
			return nil, false
		}

		for i := 0; i < len(reg.Rune); i += 2 {
			start, end := reg.Rune[i], reg.Rune[i+1]
			for r := start; r <= end; r++ {
				prefixes = append(prefixes, string(r))
				// Bail out early so that wide ranges (such as negated classes)
				// never expand beyond the limit.
				if len(prefixes) > maxRegexMatchers {
					return nil, false
				}
			}
		}

		return prefixes, true

	case regexsyn.OpConcat:
		if len(reg.Sub) == 0 {
			return nil, false
		}

		// We get the prefixes for each subexpression and then iteratively combine
		// them together.
		//
		// For the regexp [12][34]value (which concatenates [12], [34], and value):
		//
		// 1. We get the prefixes for [12], which are 1 and 2.
		// 2. We get the prefixes for [34], which are 3 and 4.
		// 3. We add the prefixes together to get 13, 14, 23, and 24.
		// 4. We get the prefixes for value, which is value.
		// 5. Finally, we add the prefixes together to get 13value, 14value, 23value, and 24value.
		curPrefixes, ok := expandSubexpr(reg.Sub[0])
		if !ok {
			return nil, false
		}

		for _, sub := range reg.Sub[1:] {
			subPrefixes, ok := expandSubexpr(sub)
			if !ok {
				return nil, false
			}

			combined := len(curPrefixes) * len(subPrefixes)
			if combined > maxRegexMatchers {
				return nil, false
			}

			newPrefixes := make([]string, 0, combined)
			for _, curPrefix := range curPrefixes {
				for _, subPrefix := range subPrefixes {
					newPrefixes = append(newPrefixes, curPrefix+subPrefix)
				}
			}

			curPrefixes = newPrefixes
		}

		return curPrefixes, true

	case regexsyn.OpCapture:
		// Capture groups do not affect matching; unwrap and retry.
		util.ClearCapture(reg)
		return expandSubexpr(reg)

	case regexsyn.OpLiteral:
		// A case-insensitive literal stands for several distinct strings (the
		// parser also rewrites classes such as [Aa] into a folded literal).
		// Expanding it to the single stored spelling would silently drop the
		// other variants, so treat it as unsupported.
		if reg.Flags&regexsyn.FoldCase != 0 {
			return nil, false
		}

		prefixes = append(prefixes, string(reg.Rune))
		return prefixes, true

	default:
		return nil, false
	}
}

//
// Implement marker types:
//

func (UnsupportedLabelMatcher) isLabelMatcher() {}
func (PlainLabelMatcher) isLabelMatcher() {}
func (KeyValueMatcher) isLabelMatcher() {}
func (KeyMatcher) isLabelMatcher() {}
func (OrLabelMatcher) isLabelMatcher() {}
func (AndLabelMatcher) isLabelMatcher() {}
Loading
Loading