From 8620698d668ae0fdf1d2f85f8df2a7500f77cd2a Mon Sep 17 00:00:00 2001 From: Thomas Mueller Date: Fri, 27 Dec 2019 21:31:30 +0100 Subject: [PATCH] CountingBloom: counters can overflow #20 --- .../org/fastfilter/bloom/count/CountingBloom.java | 11 ++++++++++- .../java/org/fastfilter/gcs/GolombCompressedSet.java | 3 +++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fastfilter/src/main/java/org/fastfilter/bloom/count/CountingBloom.java b/fastfilter/src/main/java/org/fastfilter/bloom/count/CountingBloom.java index 1cc0625..d9a2183 100644 --- a/fastfilter/src/main/java/org/fastfilter/bloom/count/CountingBloom.java +++ b/fastfilter/src/main/java/org/fastfilter/bloom/count/CountingBloom.java @@ -36,7 +36,11 @@ public long getBitCount() { entryCount = Math.max(1, entryCount); this.k = k; this.seed = Hash.randomSeed(); - this.bits = (long) (4 * entryCount * bitsPerKey); + // if the entryCount is very small, then there is a relatively high + // probability that one of the counters overflows, so we add + // a fixed number of bits (64 in this case) to reduce the probability of this + // (this is a workaround only) + this.bits = (long) (4 * entryCount * bitsPerKey) + 64; arraySize = (int) ((bits + 63) / 64); counts = new long[arraySize]; } @@ -53,6 +57,11 @@ public void add(long key) { int b = (int) hash; for (int i = 0; i < k; i++) { int index = Hash.reduce(a, arraySize << 4); + int oldCount = (int) (counts[index >>> 4] >>> (index << 2)) & 0xf; + if (oldCount >= 15) { + // TODO we should also undo what was added so far + throw new UnsupportedOperationException("Counter overflow"); + } counts[index >>> 4] += getBit(index); a += b; } diff --git a/fastfilter/src/main/java/org/fastfilter/gcs/GolombCompressedSet.java b/fastfilter/src/main/java/org/fastfilter/gcs/GolombCompressedSet.java index 2f3bad1..3742f9f 100644 --- a/fastfilter/src/main/java/org/fastfilter/gcs/GolombCompressedSet.java +++ b/fastfilter/src/main/java/org/fastfilter/gcs/GolombCompressedSet.java @@ 
-26,6 +26,9 @@ public static GolombCompressedSet construct(long[] keys, int setting) { return new GolombCompressedSet(keys, keys.length, setting); } + // TODO rearrange Rice codes so that buckets have all variable parts first, then fixed part + // this is to speed up lookup with large bucket sizes + GolombCompressedSet(long[] keys, int len, int fingerprintBits) { if (fingerprintBits < 4 || fingerprintBits > 50) { throw new IllegalArgumentException();