Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature: add generics into binary fuse #39

Merged
merged 3 commits into from
Sep 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,28 @@ jobs:
test:
strategy:
matrix:
go-version: [1.17.x]
go-version: [1.22.x]
os: [ubuntu-latest, macos-latest, windows-latest]
runs-on: ${{ matrix.os }}
steps:
- name: Install Go
uses: actions/setup-go@v4
with:
go-version: ${{ matrix.go-version }}
- name: Checkout code
uses: actions/checkout@v2
- name: Vet
run: go vet ./...
- name: Test
run: go test ./...
- name: Install Go
uses: actions/setup-go@v4
with:
go-version: ${{ matrix.go-version }}
- name: Checkout code
uses: actions/checkout@v2
- name: Vet
run: go vet ./...
- name: Test
run: go test ./...

single-ver:
runs-on: ubuntu-latest
steps:
- name: Set up Go
uses: actions/setup-go@v3
with:
go-version: 1.17.x
go-version: 1.22.x

- name: Checkout code
uses: actions/checkout@v2
Expand Down
165 changes: 85 additions & 80 deletions binaryfusefilter.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,89 +6,26 @@ import (
"math/bits"
)

type BinaryFuse8 struct {
// Unsigned is the type constraint used for the fingerprint element type of
// BinaryFuse: it admits any type whose underlying type is uint8, uint16 or
// uint32 (the ~ form also accepts user-defined types built on those).
type Unsigned interface {
~uint8 | ~uint16 | ~uint32
}

type BinaryFuse[T Unsigned] struct {
Seed uint64
SegmentLength uint32
SegmentLengthMask uint32
SegmentCount uint32
SegmentCountLength uint32

Fingerprints []uint8
}

// calculateSegmentLength returns the segment length, always a power of two,
// for a filter of the given arity that will hold size keys.
//
// An empty set (size == 0) yields 4. Arity 3 and arity 4 each use their own
// logarithmic formula; every other arity yields 65536.
//
// These parameters are very sensitive. Replacing 'floor' by 'round' can
// substantially affect the construction time.
func calculateSegmentLength(arity uint32, size uint32) uint32 {
	if size == 0 {
		return 4
	}

	logSize := math.Log(float64(size))

	var exponent int
	switch arity {
	case 3:
		exponent = int(math.Floor(logSize/math.Log(3.33) + 2.25))
	case 4:
		exponent = int(math.Floor(logSize/math.Log(2.91) - 0.5))
	default:
		return 65536
	}

	// With arity 4 and size 1 the formula evaluates to floor(-0.5) = -1.
	// A negative shift count panics at run time, so clamp it to zero.
	if exponent < 0 {
		exponent = 0
	}
	return uint32(1) << exponent
}

// calculateSizeFactor returns the multiplier applied to the key count to
// obtain the filter capacity.
//
// Arity 3 never goes below 1.125 and arity 4 never below 1.075; both grow as
// size shrinks because the term is divided by log(size). Any other arity
// yields 2.0.
func calculateSizeFactor(arity uint32, size uint32) float64 {
	logSize := math.Log(float64(size))

	switch arity {
	case 3:
		return math.Max(1.125, 0.875+0.25*math.Log(1000000)/logSize)
	case 4:
		return math.Max(1.075, 0.77+0.305*math.Log(600000)/logSize)
	}
	return 2.0
}

// initializeParameters computes the segment layout for a filter that will
// hold size keys (arity fixed at 3) and allocates the fingerprint array.
//
// It sets SegmentLength (capped at 262144), SegmentLengthMask, SegmentCount,
// SegmentCountLength and Fingerprints on the receiver.
func (filter *BinaryFuse8) initializeParameters(size uint32) {
	const (
		arity            = uint32(3)
		maxSegmentLength = uint32(262144)
	)

	segLen := calculateSegmentLength(arity, size)
	if segLen > maxSegmentLength {
		segLen = maxSegmentLength
	}
	filter.SegmentLength = segLen
	filter.SegmentLengthMask = segLen - 1

	// Capacity stays zero for sets of at most one key.
	var capacity uint32
	if factor := calculateSizeFactor(arity, size); size > 1 {
		capacity = uint32(math.Round(float64(size) * factor))
	}

	// The subtraction may wrap around in uint32 when capacity covers fewer
	// than arity-1 segments; adding arity-1 back on the next line undoes it.
	initSegmentCount := (capacity+segLen-1)/segLen - (arity - 1)
	arrayLength := (initSegmentCount + arity - 1) * segLen

	segCount := (arrayLength + segLen - 1) / segLen
	if segCount > arity-1 {
		segCount -= arity - 1
	} else {
		segCount = 1
	}

	filter.SegmentCount = segCount
	filter.SegmentCountLength = segCount * segLen
	filter.Fingerprints = make([]uint8, (segCount+arity-1)*segLen)
}

// getHashFromHash derives the three fingerprint-array indexes for a hash.
//
// The first index is the high 64 bits of hash*SegmentCountLength, which is
// always below SegmentCountLength. The second and third start one and two
// segments further on and are then XORed with masked bits of the hash
// (bits 18 and up for the second, the low bits for the third).
func (filter *BinaryFuse8) getHashFromHash(hash uint64) (uint32, uint32, uint32) {
	segLen, mask := filter.SegmentLength, filter.SegmentLengthMask

	high, _ := bits.Mul64(hash, uint64(filter.SegmentCountLength))
	first := uint32(high)
	second := (first + segLen) ^ (uint32(hash>>18) & mask)
	third := (first + segLen + segLen) ^ (uint32(hash) & mask)

	return first, second, third
}

func mod3(x uint8) uint8 {
if x > 2 {
x -= 3
}
return x
Fingerprints []T
}

// PopulateBinaryFuse8 fills the filter with provided keys. For best results,
// NewBinaryFuse fills the filter with provided keys. For best results,
// the caller should avoid having too many duplicated keys.
// The function may return an error if the set is empty.
func PopulateBinaryFuse8(keys []uint64) (*BinaryFuse8, error) {
func NewBinaryFuse[T Unsigned](keys []uint64) (*BinaryFuse[T], error) {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe hide the generic interface and publicly expose only PopulateBinaryFuse8(), PopulateBinaryFuse16() and PopulateBinaryFuse32(), to keep the API consistent with what we already have.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That was intentional: it is a trick to allow configurable (user-defined) fingerprint types:

type MyFuse uint32
...
func main() {
  filter := xorfilter.NewBinaryFuse[MyFuse](keys)
  second(filter)
}

func second(filter *xorfilter.BinaryFuse[MyFuse]) {
  ...
}

This way only one line has to change when trying to find the best binary fuse filter for a given use case. PopulateBinaryFuse8 would then be deprecated.

To be honest, I'm tired of seeing `switch value := value.(type)` in established projects. Generics make it possible to drop the "default" cases, which mostly just return an error, and turn them into compile-time errors instead.

size := uint32(len(keys))
filter := &BinaryFuse8{}
filter := &BinaryFuse[T]{}
filter.initializeParameters(size)
rngcounter := uint64(1)
filter.Seed = splitmix64(&rngcounter)
Expand All @@ -98,8 +35,8 @@ func PopulateBinaryFuse8(keys []uint64) (*BinaryFuse8, error) {
// the lowest 2 bits are the h index (0, 1, or 2)
// so we only have 6 bits for counting;
// but that's sufficient
t2count := make([]uint8, capacity)
reverseH := make([]uint8, size)
t2count := make([]T, capacity)
reverseH := make([]T, size)

t2hash := make([]uint64, capacity)
reverseOrder := make([]uint64, size+1)
Expand Down Expand Up @@ -224,7 +161,7 @@ func PopulateBinaryFuse8(keys []uint64) (*BinaryFuse8, error) {
Qsize++
}
t2count[other_index1] -= 4
t2count[other_index1] ^= mod3(found + 1) // could use this instead: tabmod3[found+1]
t2count[other_index1] ^= filter.mod3(found + 1) // could use this instead: tabmod3[found+1]
t2hash[other_index1] ^= hash

other_index2 := h012[found+2]
Expand All @@ -233,7 +170,7 @@ func PopulateBinaryFuse8(keys []uint64) (*BinaryFuse8, error) {
Qsize++
}
t2count[other_index2] -= 4
t2count[other_index2] ^= mod3(found + 2) // could use this instead: tabmod3[found+2]
t2count[other_index2] ^= filter.mod3(found + 2) // could use this instead: tabmod3[found+2]
t2hash[other_index2] ^= hash
}
}
Expand Down Expand Up @@ -265,7 +202,7 @@ func PopulateBinaryFuse8(keys []uint64) (*BinaryFuse8, error) {
for i := int(size - 1); i >= 0; i-- {
// the hash of the key we insert next
hash := reverseOrder[i]
xor2 := uint8(fingerprint(hash))
xor2 := T(fingerprint(hash))
index1, index2, index3 := filter.getHashFromHash(hash)
found := reverseH[i]
h012[0] = index1
Expand All @@ -279,11 +216,79 @@ func PopulateBinaryFuse8(keys []uint64) (*BinaryFuse8, error) {
return filter, nil
}

// Contains returns `true` if key is part of the set with a false positive probability of <0.4%.
func (filter *BinaryFuse8) Contains(key uint64) bool {
// initializeParameters computes the segment layout for a filter that will
// hold size keys (arity fixed at 3) and allocates the fingerprint array.
//
// It sets SegmentLength (capped at 262144), SegmentLengthMask, SegmentCount,
// SegmentCountLength and Fingerprints on the receiver.
func (filter *BinaryFuse[T]) initializeParameters(size uint32) {
	const (
		arity            = uint32(3)
		maxSegmentLength = uint32(262144)
	)

	segLen := calculateSegmentLength(arity, size)
	if segLen > maxSegmentLength {
		segLen = maxSegmentLength
	}
	filter.SegmentLength = segLen
	filter.SegmentLengthMask = segLen - 1

	// Capacity stays zero for sets of at most one key.
	var capacity uint32
	if factor := calculateSizeFactor(arity, size); size > 1 {
		capacity = uint32(math.Round(float64(size) * factor))
	}

	// The subtraction may wrap around in uint32 when capacity covers fewer
	// than arity-1 segments; adding arity-1 back on the next line undoes it.
	initSegmentCount := (capacity+segLen-1)/segLen - (arity - 1)
	arrayLength := (initSegmentCount + arity - 1) * segLen

	segCount := (arrayLength + segLen - 1) / segLen
	if segCount > arity-1 {
		segCount -= arity - 1
	} else {
		segCount = 1
	}

	filter.SegmentCount = segCount
	filter.SegmentCountLength = segCount * segLen
	filter.Fingerprints = make([]T, (segCount+arity-1)*segLen)
}

// mod3 returns x unchanged when x is at most 2 and x-3 otherwise. That
// equals x modulo 3 only for x in the range 0..5; callers in this file pass
// found+1 and found+2.
func (filter *BinaryFuse[T]) mod3(x T) T {
	if x <= 2 {
		return x
	}
	return x - 3
}

// getHashFromHash derives the three fingerprint-array indexes for a hash.
//
// The first index is the high 64 bits of hash*SegmentCountLength, which is
// always below SegmentCountLength. The second and third start one and two
// segments further on and are then XORed with masked bits of the hash
// (bits 18 and up for the second, the low bits for the third).
func (filter *BinaryFuse[T]) getHashFromHash(hash uint64) (uint32, uint32, uint32) {
	segLen, mask := filter.SegmentLength, filter.SegmentLengthMask

	high, _ := bits.Mul64(hash, uint64(filter.SegmentCountLength))
	first := uint32(high)
	second := (first + segLen) ^ (uint32(hash>>18) & mask)
	third := (first + segLen + segLen) ^ (uint32(hash) & mask)

	return first, second, third
}

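// Contains returns `true` if key is part of the set. It may also return `true` for a key that is not in the set; the false positive probability depends on the fingerprint type T (<0.4% for uint8).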
func (filter *BinaryFuse[T]) Contains(key uint64) bool {
hash := mixsplit(key, filter.Seed)
f := uint8(fingerprint(hash))
f := T(fingerprint(hash))
h0, h1, h2 := filter.getHashFromHash(hash)
f ^= filter.Fingerprints[h0] ^ filter.Fingerprints[h1] ^ filter.Fingerprints[h2]
return f == 0
}

// calculateSegmentLength returns the segment length, always a power of two,
// for a filter of the given arity that will hold size keys.
//
// An empty set (size == 0) yields 4. Arity 3 and arity 4 each use their own
// logarithmic formula; every other arity yields 65536.
//
// These parameters are very sensitive. Replacing 'floor' by 'round' can
// substantially affect the construction time.
func calculateSegmentLength(arity uint32, size uint32) uint32 {
	if size == 0 {
		return 4
	}

	var exponent float64
	switch arity {
	case 3:
		exponent = math.Floor(math.Log(float64(size))/math.Log(3.33) + 2.25)
	case 4:
		exponent = math.Floor(math.Log(float64(size))/math.Log(2.91) - 0.5)
	default:
		return 65536
	}

	// With arity 4 and size 1 the formula evaluates to floor(-0.5) = -1.
	// A negative shift count panics at run time, so clamp it to zero.
	if exponent < 0 {
		exponent = 0
	}
	return uint32(1) << int(exponent)
}

// calculateSizeFactor returns the multiplier applied to the key count to
// obtain the filter capacity.
//
// Arity 3 is floored at 1.125 and arity 4 at 1.075; below that floor the
// factor grows as size shrinks. Any other arity yields 2.0.
func calculateSizeFactor(arity uint32, size uint32) float64 {
	switch arity {
	case 3:
		scaled := 0.875 + 0.25*math.Log(1000000)/math.Log(float64(size))
		return math.Max(1.125, scaled)
	case 4:
		scaled := 0.77 + 0.305*math.Log(600000)/math.Log(float64(size))
		return math.Max(1.075, scaled)
	default:
		return 2.0
	}
}
20 changes: 20 additions & 0 deletions binaryfusefilter8.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package xorfilter

// BinaryFuse8 is a binary fuse filter with 8-bit fingerprints. It is a
// distinct named type whose underlying type is that of BinaryFuse[uint8], so
// it does not inherit BinaryFuse's methods and declares its own wrappers.
type BinaryFuse8 BinaryFuse[uint8]

// PopulateBinaryFuse8 builds an 8-bit binary fuse filter from keys by
// delegating to NewBinaryFuse[uint8] and converting the result to
// *BinaryFuse8. For best results, the caller should avoid having too many
// duplicated keys.
// The function may return an error if the set is empty.
func PopulateBinaryFuse8(keys []uint64) (*BinaryFuse8, error) {
	generic, err := NewBinaryFuse[uint8](keys)
	if err != nil {
		return nil, err
	}
	return (*BinaryFuse8)(generic), nil
}

// Contains reports whether key is part of the set, with a false positive
// probability of <0.4%. It forwards to the generic BinaryFuse[uint8]
// implementation.
func (filter *BinaryFuse8) Contains(key uint64) bool {
	generic := (*BinaryFuse[uint8])(filter)
	return generic.Contains(key)
}
Loading
Loading