diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ab56b48..5feb159 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -4,20 +4,20 @@ jobs: test: strategy: matrix: - go-version: [1.17.x] + go-version: [1.22.x] os: [ubuntu-latest, macos-latest, windows-latest] runs-on: ${{ matrix.os }} steps: - - name: Install Go - uses: actions/setup-go@v4 - with: - go-version: ${{ matrix.go-version }} - - name: Checkout code - uses: actions/checkout@v2 - - name: Vet - run: go vet ./... - - name: Test - run: go test ./... + - name: Install Go + uses: actions/setup-go@v4 + with: + go-version: ${{ matrix.go-version }} + - name: Checkout code + uses: actions/checkout@v2 + - name: Vet + run: go vet ./... + - name: Test + run: go test ./... single-ver: runs-on: ubuntu-latest @@ -25,7 +25,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v3 with: - go-version: 1.17.x + go-version: 1.22.x - name: Checkout code uses: actions/checkout@v2 diff --git a/binaryfusefilter.go b/binaryfusefilter.go index 8b83273..5cb97c1 100644 --- a/binaryfusefilter.go +++ b/binaryfusefilter.go @@ -6,89 +6,26 @@ import ( "math/bits" ) -type BinaryFuse8 struct { +type Unsigned interface { + ~uint8 | ~uint16 | ~uint32 +} + +type BinaryFuse[T Unsigned] struct { Seed uint64 SegmentLength uint32 SegmentLengthMask uint32 SegmentCount uint32 SegmentCountLength uint32 - Fingerprints []uint8 -} - -func calculateSegmentLength(arity uint32, size uint32) uint32 { - // These parameters are very sensitive. Replacing 'floor' by 'round' can - // substantially affect the construction time. 
- if size == 0 { - return 4 - } - if arity == 3 { - return uint32(1) << int(math.Floor(math.Log(float64(size))/math.Log(3.33)+2.25)) - } else if arity == 4 { - return uint32(1) << int(math.Floor(math.Log(float64(size))/math.Log(2.91)-0.5)) - } else { - return 65536 - } -} - -func calculateSizeFactor(arity uint32, size uint32) float64 { - if arity == 3 { - return math.Max(1.125, 0.875+0.25*math.Log(1000000)/math.Log(float64(size))) - } else if arity == 4 { - return math.Max(1.075, 0.77+0.305*math.Log(600000)/math.Log(float64(size))) - } else { - return 2.0 - } -} - -func (filter *BinaryFuse8) initializeParameters(size uint32) { - arity := uint32(3) - filter.SegmentLength = calculateSegmentLength(arity, size) - if filter.SegmentLength > 262144 { - filter.SegmentLength = 262144 - } - filter.SegmentLengthMask = filter.SegmentLength - 1 - sizeFactor := calculateSizeFactor(arity, size) - capacity := uint32(0) - if size > 1 { - capacity = uint32(math.Round(float64(size) * sizeFactor)) - } - initSegmentCount := (capacity+filter.SegmentLength-1)/filter.SegmentLength - (arity - 1) - arrayLength := (initSegmentCount + arity - 1) * filter.SegmentLength - filter.SegmentCount = (arrayLength + filter.SegmentLength - 1) / filter.SegmentLength - if filter.SegmentCount <= arity-1 { - filter.SegmentCount = 1 - } else { - filter.SegmentCount = filter.SegmentCount - (arity - 1) - } - arrayLength = (filter.SegmentCount + arity - 1) * filter.SegmentLength - filter.SegmentCountLength = filter.SegmentCount * filter.SegmentLength - filter.Fingerprints = make([]uint8, arrayLength) -} - -func (filter *BinaryFuse8) getHashFromHash(hash uint64) (uint32, uint32, uint32) { - hi, _ := bits.Mul64(hash, uint64(filter.SegmentCountLength)) - h0 := uint32(hi) - h1 := h0 + filter.SegmentLength - h2 := h1 + filter.SegmentLength - h1 ^= uint32(hash>>18) & filter.SegmentLengthMask - h2 ^= uint32(hash) & filter.SegmentLengthMask - return h0, h1, h2 -} - -func mod3(x uint8) uint8 { - if x > 2 { - x -= 3 - } 
- return x + Fingerprints []T } -// PopulateBinaryFuse8 fills the filter with provided keys. For best results, +// NewBinaryFuse builds and returns a new filter with fingerprints of type T from the provided keys. For best results, // the caller should avoid having too many duplicated keys. // The function may return an error if the set is empty. -func PopulateBinaryFuse8(keys []uint64) (*BinaryFuse8, error) { +func NewBinaryFuse[T Unsigned](keys []uint64) (*BinaryFuse[T], error) { size := uint32(len(keys)) - filter := &BinaryFuse8{} + filter := &BinaryFuse[T]{} filter.initializeParameters(size) rngcounter := uint64(1) filter.Seed = splitmix64(&rngcounter) @@ -98,8 +35,8 @@ func PopulateBinaryFuse8(keys []uint64) (*BinaryFuse8, error) { // the lowest 2 bits are the h index (0, 1, or 2) // so we only have 6 bits for counting; // but that's sufficient - t2count := make([]uint8, capacity) - reverseH := make([]uint8, size) + t2count := make([]T, capacity) + reverseH := make([]T, size) t2hash := make([]uint64, capacity) reverseOrder := make([]uint64, size+1) @@ -224,7 +161,7 @@ func PopulateBinaryFuse8(keys []uint64) (*BinaryFuse8, error) { Qsize++ } t2count[other_index1] -= 4 - t2count[other_index1] ^= mod3(found + 1) // could use this instead: tabmod3[found+1] + t2count[other_index1] ^= filter.mod3(found + 1) // could use this instead: tabmod3[found+1] t2hash[other_index1] ^= hash other_index2 := h012[found+2] @@ -233,7 +170,7 @@ func PopulateBinaryFuse8(keys []uint64) (*BinaryFuse8, error) { Qsize++ } t2count[other_index2] -= 4 - t2count[other_index2] ^= mod3(found + 2) // could use this instead: tabmod3[found+2] + t2count[other_index2] ^= filter.mod3(found + 2) // could use this instead: tabmod3[found+2] t2hash[other_index2] ^= hash } } @@ -265,7 +202,7 @@ func PopulateBinaryFuse8(keys []uint64) (*BinaryFuse8, error) { for i := int(size - 1); i >= 0; i-- { // the hash of the key we insert next hash := reverseOrder[i] - xor2 := uint8(fingerprint(hash)) + xor2 := T(fingerprint(hash)) index1, index2, index3 :=
filter.getHashFromHash(hash) found := reverseH[i] h012[0] = index1 @@ -279,11 +216,79 @@ func PopulateBinaryFuse8(keys []uint64) (*BinaryFuse8, error) { return filter, nil } -// Contains returns `true` if key is part of the set with a false positive probability of <0.4%. -func (filter *BinaryFuse8) Contains(key uint64) bool { +func (filter *BinaryFuse[T]) initializeParameters(size uint32) { + arity := uint32(3) + filter.SegmentLength = calculateSegmentLength(arity, size) + if filter.SegmentLength > 262144 { + filter.SegmentLength = 262144 + } + filter.SegmentLengthMask = filter.SegmentLength - 1 + sizeFactor := calculateSizeFactor(arity, size) + capacity := uint32(0) + if size > 1 { + capacity = uint32(math.Round(float64(size) * sizeFactor)) + } + initSegmentCount := (capacity+filter.SegmentLength-1)/filter.SegmentLength - (arity - 1) + arrayLength := (initSegmentCount + arity - 1) * filter.SegmentLength + filter.SegmentCount = (arrayLength + filter.SegmentLength - 1) / filter.SegmentLength + if filter.SegmentCount <= arity-1 { + filter.SegmentCount = 1 + } else { + filter.SegmentCount = filter.SegmentCount - (arity - 1) + } + arrayLength = (filter.SegmentCount + arity - 1) * filter.SegmentLength + filter.SegmentCountLength = filter.SegmentCount * filter.SegmentLength + filter.Fingerprints = make([]T, arrayLength) +} + +func (filter *BinaryFuse[T]) mod3(x T) T { + if x > 2 { + x -= 3 + } + + return x +} + +func (filter *BinaryFuse[T]) getHashFromHash(hash uint64) (uint32, uint32, uint32) { + hi, _ := bits.Mul64(hash, uint64(filter.SegmentCountLength)) + h0 := uint32(hi) + h1 := h0 + filter.SegmentLength + h2 := h1 + filter.SegmentLength + h1 ^= uint32(hash>>18) & filter.SegmentLengthMask + h2 ^= uint32(hash) & filter.SegmentLengthMask + return h0, h1, h2 +} + +// Contains returns `true` if key is part of the set; a key outside the set may also be reported as present, with a small false positive probability that depends on the width of the fingerprint type T (<0.4% for uint8).
+func (filter *BinaryFuse[T]) Contains(key uint64) bool { hash := mixsplit(key, filter.Seed) - f := uint8(fingerprint(hash)) + f := T(fingerprint(hash)) h0, h1, h2 := filter.getHashFromHash(hash) f ^= filter.Fingerprints[h0] ^ filter.Fingerprints[h1] ^ filter.Fingerprints[h2] return f == 0 } + +func calculateSegmentLength(arity uint32, size uint32) uint32 { + // These parameters are very sensitive. Replacing 'floor' by 'round' can + // substantially affect the construction time. + if size == 0 { + return 4 + } + if arity == 3 { + return uint32(1) << int(math.Floor(math.Log(float64(size))/math.Log(3.33)+2.25)) + } else if arity == 4 { + return uint32(1) << int(math.Floor(math.Log(float64(size))/math.Log(2.91)-0.5)) + } else { + return 65536 + } +} + +func calculateSizeFactor(arity uint32, size uint32) float64 { + if arity == 3 { + return math.Max(1.125, 0.875+0.25*math.Log(1000000)/math.Log(float64(size))) + } else if arity == 4 { + return math.Max(1.075, 0.77+0.305*math.Log(600000)/math.Log(float64(size))) + } else { + return 2.0 + } +} diff --git a/binaryfusefilter8.go b/binaryfusefilter8.go new file mode 100644 index 0000000..ee8584a --- /dev/null +++ b/binaryfusefilter8.go @@ -0,0 +1,20 @@ +package xorfilter + +type BinaryFuse8 BinaryFuse[uint8] + +// PopulateBinaryFuse8 fills the filter with provided keys. For best results, +// the caller should avoid having too many duplicated keys. +// The function may return an error if the set is empty. +func PopulateBinaryFuse8(keys []uint64) (*BinaryFuse8, error) { + filter, err := NewBinaryFuse[uint8](keys) + if err != nil { + return nil, err + } + + return (*BinaryFuse8)(filter), nil +} + +// Contains returns `true` if key is part of the set with a false positive probability of <0.4%. 
+func (filter *BinaryFuse8) Contains(key uint64) bool { + return (*BinaryFuse[uint8])(filter).Contains(key) +} diff --git a/binaryfusefilter_test.go b/binaryfusefilter_test.go index abddbc2..1f94b8d 100644 --- a/binaryfusefilter_test.go +++ b/binaryfusefilter_test.go @@ -13,19 +13,21 @@ const NUM_KEYS = 1e6 const MID_NUM_KEYS = 11500 const SMALL_NUM_KEYS = 100 -func TestBinaryFuse8Basic(t *testing.T) { +type testType uint8 + +func TestBinaryFuseNBasic(t *testing.T) { keys := make([]uint64, NUM_KEYS) for i := range keys { keys[i] = rand.Uint64() } - filter, _ := PopulateBinaryFuse8(keys) + filter, _ := NewBinaryFuse[testType](keys) for _, v := range keys { assert.Equal(t, true, filter.Contains(v)) } falsesize := 10000000 matches := 0 bpv := float64(len(filter.Fingerprints)) * 8.0 / float64(NUM_KEYS) - fmt.Println("Binary Fuse8 filter:") + fmt.Println("Binary Fuse filter:") fmt.Println("bits per entry ", bpv) for i := 0; i < falsesize; i++ { v := rand.Uint64() @@ -45,7 +47,7 @@ func TestBinaryFuse8Basic(t *testing.T) { for i := range keys { keys[i] = rand.Uint64() } - filter, _ = PopulateBinaryFuse8(keys) + filter, _ = NewBinaryFuse[testType](keys) for _, v := range keys { assert.Equal(t, true, filter.Contains(v)) } @@ -53,13 +55,13 @@ func TestBinaryFuse8Basic(t *testing.T) { } } -func TestBinaryFuse8Issue23(t *testing.T) { +func TestBinaryFuseNIssue23(t *testing.T) { for trials := 0; trials < 20; trials++ { keys := make([]uint64, MID_NUM_KEYS) for i := range keys { keys[i] = rand.Uint64() } - filter, error := PopulateBinaryFuse8(keys) + filter, error := NewBinaryFuse[testType](keys) assert.Equal(t, nil, error) for _, v := range keys { assert.Equal(t, true, filter.Contains(v)) @@ -67,12 +69,12 @@ func TestBinaryFuse8Issue23(t *testing.T) { } } -func TestBinaryFuse8Small(t *testing.T) { +func TestBinaryFuseNSmall(t *testing.T) { keys := make([]uint64, SMALL_NUM_KEYS) for i := range keys { keys[i] = rand.Uint64() } - filter, _ := PopulateBinaryFuse8(keys) + filter, _ 
:= NewBinaryFuse[testType](keys) for _, v := range keys { assert.Equal(t, true, filter.Contains(v)) } @@ -95,7 +97,7 @@ func TestBinaryFuse8Small(t *testing.T) { for i := range keys { keys[i] = rand.Uint64() } - filter, _ = PopulateBinaryFuse8(keys) + filter, _ = NewBinaryFuse[testType](keys) for _, v := range keys { assert.Equal(t, true, filter.Contains(v)) } @@ -103,35 +105,23 @@ func TestBinaryFuse8Small(t *testing.T) { } } -func BenchmarkBinaryFuse8Populate1000000(b *testing.B) { - keys := make([]uint64, NUM_KEYS) - for i := range keys { - keys[i] = rand.Uint64() - } - - b.ResetTimer() - for n := 0; n < b.N; n++ { - bogusbinary, _ = PopulateBinaryFuse8(keys) - } -} - -func Test_ZeroSet(t *testing.T) { +func TestBinaryFuseN_ZeroSet(t *testing.T) { keys := []uint64{} - _, err := PopulateBinaryFuse8(keys) + _, err := NewBinaryFuse[testType](keys) if err != nil { t.Fatalf("Unexpected error: %v", err) } } -func Test_DuplicateKeysBinaryFuseDup(t *testing.T) { +func TestBinaryFuseN_DuplicateKeysBinaryFuseDup(t *testing.T) { keys := []uint64{303, 1, 77, 31, 241, 303} - _, err := PopulateBinaryFuse8(keys) + _, err := NewBinaryFuse[testType](keys) if err != nil { t.Fatalf("Unexpected error: %v", err) } } -func Test_DuplicateKeysBinaryFuseDup_Issue30(t *testing.T) { +func TestBinaryFuseN_DuplicateKeysBinaryFuseDup_Issue30(t *testing.T) { keys := []uint64{ 14032282262966018013, 14032282273189634013, @@ -255,30 +245,47 @@ func Test_DuplicateKeysBinaryFuseDup_Issue30(t *testing.T) { 4825061103098367168, 4825061103098367168, } - _, err := PopulateBinaryFuse8(keys) + _, err := NewBinaryFuse[testType](keys) if err != nil { t.Fatalf("Unexpected error: %v", err) } } -var bogusbinary *BinaryFuse8 -var bogusbool bool +var ( + bogusbool bool + binaryfusedbig *BinaryFuse8 + bogusbinary *BinaryFuse[testType] +) -func BenchmarkConstructBinaryFuse8(b *testing.B) { +func BenchmarkBinaryFuseNPopulate1000000(b *testing.B) { + keys := make([]uint64, NUM_KEYS) + for i := range keys { + 
keys[i] = rand.Uint64() + } + + b.ResetTimer() + + for n := 0; n < b.N; n++ { + bogusbinary, _ = NewBinaryFuse[testType](keys) + } +} + +func BenchmarkConstructBinaryFuseN(b *testing.B) { bigrandomarrayInit() b.ResetTimer() b.ReportAllocs() + for n := 0; n < b.N; n++ { - bogusbinary, _ = PopulateBinaryFuse8(bigrandomarray) + bogusbinary, _ = NewBinaryFuse[testType](bigrandomarray) } } -func BenchmarkBinaryFuse8Contains1000000(b *testing.B) { +func BenchmarkBinaryFuseNContains1000000(b *testing.B) { keys := make([]uint64, NUM_KEYS) for i := range keys { keys[i] = rand.Uint64() } - filter, _ := PopulateBinaryFuse8(keys) + filter, _ := NewBinaryFuse[testType](keys) b.ResetTimer() for n := 0; n < b.N; n++ { @@ -286,8 +293,6 @@ func BenchmarkBinaryFuse8Contains1000000(b *testing.B) { } } -var binaryfusedbig *BinaryFuse8 - func binaryfusedbigInit() { fmt.Println("Binary Fuse setup") keys := make([]uint64, 50000000) @@ -298,7 +303,7 @@ func binaryfusedbigInit() { fmt.Println("Binary Fuse setup ok") } -func BenchmarkBinaryFuse8Contains50000000(b *testing.B) { +func BenchmarkBinaryFuseNContains50000000(b *testing.B) { if binaryfusedbig == nil { binaryfusedbigInit() } @@ -308,14 +313,14 @@ func BenchmarkBinaryFuse8Contains50000000(b *testing.B) { } } -func Test_Issue35(t *testing.T) { +func TestBinaryFuseN_Issue35(t *testing.T) { for test := 0; test < 100; test++ { hashes := make([]uint64, 0) for i := 0; i < 40000; i++ { v := encode(int32(rand.Intn(10)), int32(rand.Intn(100000))) hashes = append(hashes, xxhash.Sum64(v)) } - inner, err := PopulateBinaryFuse8(hashes) + inner, err := NewBinaryFuse[testType](hashes) if err != nil { panic(err) } diff --git a/go.mod b/go.mod index f24bf2b..5533ace 100644 --- a/go.mod +++ b/go.mod @@ -1,14 +1,14 @@ module github.com/FastFilter/xorfilter -go 1.17 +go 1.22 require ( github.com/cespare/xxhash v1.1.0 - github.com/stretchr/testify v1.7.0 + github.com/stretchr/testify v1.9.0 ) require ( - github.com/davecgh/go-spew v1.1.0 // indirect + 
github.com/davecgh/go-spew v1.1.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 9e81cab..8ac3ff3 100644 --- a/go.sum +++ b/go.sum @@ -2,16 +2,15 @@ github.com/OneOfOne/xxhash v1.2.2 h1:KMrpdQIwFcEqXDklaen+P1axHaj9BSKzvpUUfnHldSE github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= github.com/cespare/xxhash v1.1.0 h1:a6HrQnmkObjyL+Gs60czilIUGqrzKutQD6XZog3p+ko= github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= -github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72 h1:qLC7fQah7D6K1B0ujays3HV9gkFtllcxhzImRR7ArPQ= github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= -github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod 
h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=