Skip to content

Commit

Permalink
statistics: BinarySearchRemoveVal should use decoded val instead of e…
Browse files Browse the repository at this point in the history
…ncoded (#59131)

close #59130
  • Loading branch information
winoros authored Jan 23, 2025
1 parent 6812b17 commit 41c3b01
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 31 deletions.
2 changes: 1 addition & 1 deletion pkg/statistics/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ go_test(
data = glob(["testdata/**"]),
embed = [":statistics"],
flaky = True,
shard_count = 38,
shard_count = 37,
deps = [
"//pkg/config",
"//pkg/meta/model",
Expand Down
33 changes: 31 additions & 2 deletions pkg/statistics/handle/globalstats/global_stats_internal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -363,10 +363,39 @@ func testIssues24349(t *testing.T, testKit *testkit.TestKit, store kv.Storage) {
"test t global b 0 2 4",
))
testKit.MustExec("explain select * from t where a > 0 and b > 0")
testKit.MustQuery("show stats_buckets where partition_name='global'").Check(testkit.Rows(
testKit.MustQuery("show stats_topn where table_name = 't'").Sort().Check(testkit.Rows(
"test t global a 0 1 6",
"test t global b 0 2 4",
"test t p0 a 0 0 4",
"test t p0 b 0 3 3",
"test t p1 a 0 1 6",
"test t p1 b 0 2 3",
"test t p2 a 0 2 2",
"test t p2 b 0 1 2",
))
// column a is trival.
// column b:
// TopN:
// p0: b=3, occurs 3 times
// p1: b=2, occurs 3 times
// p2: b=1, occurs 2 times
// Histogram:
// p0: hist of b: [2, 2] count=repeat=2
// p1: hist of b: [1, 3] count=2, repeat=3. [4, 4] count==repeat=1
// After merging global TopN, it should be 2 with 4 as the repeat.(constructed by p1's TopN and p0's histogram)
// Kicking it out, the remained buckets for b are:(consider TopN as a bucket whose lower bound is the same as upper bound and count is the same as repeat)
// [3, 3] count=repeat=4
// [1, 1] count=repeat=2
// [1, 3] count=1, repeat=0(merged into TopN)
// [4, 4] count=repeat=1
// Finally, get one global bucket [1, 4] count=8, repeat=1
testKit.MustQuery("show stats_buckets where table_name='t'").Sort().Check(testkit.Rows(
"test t global a 0 0 4 4 0 0 0",
"test t global a 0 1 6 2 2 2 0",
"test t global b 0 0 10 1 1 4 0",
"test t global b 0 0 8 1 1 4 0",
"test t p0 b 0 0 1 1 2 2 0",
"test t p1 b 0 0 2 1 1 3 0",
"test t p1 b 0 1 3 1 4 4 0",
))
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/statistics/handle/globalstats/merge_worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ func (worker *topnStatsMergeWorker) Run(timeZone *time.Location, isIndex bool, v
count, _ := allHists[j].EqualRowCount(nil, datum, isIndex)
if count != 0 {
// Remove the value corresponding to encodedVal from the histogram.
worker.statsWrapper.AllHg[j].BinarySearchRemoveVal(statistics.TopNMeta{Encoded: datum.GetBytes(), Count: uint64(count)})
worker.statsWrapper.AllHg[j].BinarySearchRemoveVal(&datum, int64(count))
}
worker.shardMutex[j].Unlock()
if count != 0 {
Expand Down
2 changes: 1 addition & 1 deletion pkg/statistics/handle/globalstats/topn.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ func MergePartTopN2GlobalTopN(
if count != 0 {
counter[encodedVal] += count
// Remove the value corresponding to encodedVal from the histogram.
hists[j].BinarySearchRemoveVal(statistics.TopNMeta{Encoded: datum.GetBytes(), Count: uint64(count)})
hists[j].BinarySearchRemoveVal(&datum, int64(count))
}
}
}
Expand Down
15 changes: 7 additions & 8 deletions pkg/statistics/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -296,28 +296,27 @@ func (hg *Histogram) BucketToString(bktID, idxCols int) string {
}

// BinarySearchRemoveVal removes the value from the TopN using binary search.
func (hg *Histogram) BinarySearchRemoveVal(valCntPairs TopNMeta) {
func (hg *Histogram) BinarySearchRemoveVal(val *types.Datum, count int64) {
lowIdx, highIdx := 0, hg.Len()-1
column := hg.Bounds.Column(0)
// if hg is too small, we don't need to check the branch. because the cost is more than binary search.
if hg.Len() > 4 {
if cmpResult := bytes.Compare(column.GetRaw(highIdx*2+1), valCntPairs.Encoded); cmpResult < 0 {
if cmpResult := chunk.Compare(hg.Bounds.GetRow(highIdx*2+1), 0, val); cmpResult < 0 {
return
}
if cmpResult := bytes.Compare(column.GetRaw(lowIdx), valCntPairs.Encoded); cmpResult > 0 {
if cmpResult := chunk.Compare(hg.Bounds.GetRow(lowIdx), 0, val); cmpResult > 0 {
return
}
}
var midIdx = 0
var found bool
for lowIdx <= highIdx {
midIdx = (lowIdx + highIdx) / 2
cmpResult := bytes.Compare(column.GetRaw(midIdx*2), valCntPairs.Encoded)
cmpResult := chunk.Compare(hg.Bounds.GetRow(midIdx*2), 0, val)
if cmpResult > 0 {
highIdx = midIdx - 1
continue
}
cmpResult = bytes.Compare(column.GetRaw(midIdx*2+1), valCntPairs.Encoded)
cmpResult = chunk.Compare(hg.Bounds.GetRow(midIdx*2+1), 0, val)
if cmpResult < 0 {
lowIdx = midIdx + 1
continue
Expand All @@ -330,7 +329,7 @@ func (hg *Histogram) BinarySearchRemoveVal(valCntPairs TopNMeta) {
if cmpResult == 0 {
midbucket.Repeat = 0
}
midbucket.Count -= int64(valCntPairs.Count)
midbucket.Count -= count
if midbucket.Count < 0 {
midbucket.Count = 0
}
Expand All @@ -339,7 +338,7 @@ func (hg *Histogram) BinarySearchRemoveVal(valCntPairs TopNMeta) {
}
if found {
for midIdx++; midIdx <= hg.Len()-1; midIdx++ {
hg.Buckets[midIdx].Count -= int64(valCntPairs.Count)
hg.Buckets[midIdx].Count -= count
if hg.Buckets[midIdx].Count < 0 {
hg.Buckets[midIdx].Count = 0
}
Expand Down
18 changes: 0 additions & 18 deletions pkg/statistics/histogram_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -712,21 +712,3 @@ func generateData(t *testing.T) *Histogram {
}
return genHist4Test(t, data, 0)
}

func TestVerifyHistsBinarySearchRemoveValAndRemoveVals(t *testing.T) {
data1 := generateData(t)
data2 := generateData(t)

require.Equal(t, data1, data2)
ctx := mock.NewContext()
sc := ctx.GetSessionVars().StmtCtx
b, err := codec.EncodeKey(sc.TimeZone(), nil, types.NewIntDatum(150))
require.NoError(t, err)
tmp := TopNMeta{
Encoded: b,
Count: 2,
}
data1.RemoveVals([]TopNMeta{tmp})
data2.BinarySearchRemoveVal(tmp)
require.Equal(t, data1, data2)
}

0 comments on commit 41c3b01

Please sign in to comment.