Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP experiment: allow extended lz4 lookback window #26

Open
wants to merge 3 commits into
base: fs/branch_9_7
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 52 additions & 8 deletions lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,27 @@ private LZ4() {}
*/
public static final int MAX_DISTANCE = 1 << 16; // maximum distance of a reference

/**
* Longer lookback window size. Cf. {@link #MAX_DISTANCE}. This allows the context window to be
* just under 256k instead of the default 64k, and can provide a substantial compression-ratio and
* performance boost for data where the repetition period is longer.
*/
public static final int EXTENDED_MAX_DISTANCE = (1 << 18) - 1;

/**
* There are some use cases (e.g., 256k block-level compression applied over index files) where
* the period of pattern repetition is longer. Such cases benefit from a combination of {@link
* HighCompressionHashTable} and a longer lookback window ({@link #EXTENDED_MAX_DISTANCE} instead
* of {@link #MAX_DISTANCE}). The benefits are in compression ratio (real-world cases with ~3x
* improved compression!) and also in latency/CPU-efficiency, in some cases with &gt;2x faster
* execution.
*
* <p>We want to support lz4 with {@link #EXTENDED_MAX_DISTANCE} for these special cases, but also
* provide {@link #DEFAULT_EXTENDED_MAX_DISTANCE} to allow tests to be run exercising lz4 with
* {@link #EXTENDED_MAX_DISTANCE}.
*/
public static final boolean DEFAULT_EXTENDED_MAX_DISTANCE = false;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add sys prop here?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could do ... I opted not to because the main use for this would be to run full test suite with EXTENDED_MAX_DISTANCE config, and that would be a bit more involved than simply adding a sysprop. If we're going to make this configurable via sysprop there would be some benefit, but IMO only if we go as far as supporting configuration via project property for running tests.

Practically this only affects the lookback window for existing "inner" lz4 -- it has no impact on the configuration of lz4 as employed by TeeDirectory/CompressingDirectory. For these, EXTENDED_MAX_DISTANCE defaults to true, and is configurable via solrconfig.xml.


static final int MEMORY_USAGE = 14;
static final int MIN_MATCH = 4; // minimum length of a match
static final int LAST_LITERALS = 5; // the last 5 bytes must be encoded as literals
Expand Down Expand Up @@ -88,6 +109,12 @@ private static int commonBytes(byte[] b, int o1, int o2, int limit) {
*/
public static int decompress(DataInput compressed, int decompressedLen, byte[] dest, int dOff)
throws IOException {
// Delegate to the five-argument overload, passing the compile-time default for the
// extended-lookback flag.
final boolean ext = DEFAULT_EXTENDED_MAX_DISTANCE;
return decompress(compressed, decompressedLen, dest, dOff, ext);
}

public static int decompress(
DataInput compressed, int decompressedLen, byte[] dest, int dOff, boolean ext)
throws IOException {
final int destEnd = dOff + decompressedLen;

do {
Expand All @@ -112,7 +139,7 @@ public static int decompress(DataInput compressed, int decompressedLen, byte[] d
}

// matches
final int matchDec = compressed.readShort() & 0xFFFF;
final int matchDec = ext ? compressed.readVInt() : (compressed.readShort() & 0xFFFF);
assert matchDec > 0;

int matchLen = token & 0x0F;
Expand Down Expand Up @@ -170,7 +197,13 @@ private static void encodeLastLiterals(byte[] bytes, int anchor, int literalLen,
}

private static void encodeSequence(
byte[] bytes, int anchor, int matchRef, int matchOff, int matchLen, DataOutput out)
byte[] bytes,
int anchor,
int matchRef,
int matchOff,
int matchLen,
DataOutput out,
boolean ext)
throws IOException {
final int literalLen = matchOff - anchor;
assert matchLen >= 4;
Expand All @@ -180,8 +213,12 @@ private static void encodeSequence(

// encode match dec
final int matchDec = matchOff - matchRef;
assert matchDec > 0 && matchDec < 1 << 16;
out.writeShort((short) matchDec);
assert matchDec > 0 && matchDec < (ext ? EXTENDED_MAX_DISTANCE : MAX_DISTANCE);
if (ext) {
out.writeVInt(matchDec);
} else {
out.writeShort((short) matchDec);
}

// encode match len
if (matchLen >= MIN_MATCH + 0x0F) {
Expand Down Expand Up @@ -524,11 +561,18 @@ public static void compress(byte[] bytes, int off, int len, DataOutput out, Hash
public static void compressWithDictionary(
byte[] bytes, int dictOff, int dictLen, int len, DataOutput out, HashTable ht)
throws IOException {
// Delegate to the seven-argument overload, passing the compile-time default for the
// extended-lookback flag.
final boolean ext = DEFAULT_EXTENDED_MAX_DISTANCE;
compressWithDictionary(bytes, dictOff, dictLen, len, out, ht, ext);
}

public static void compressWithDictionary(
byte[] bytes, int dictOff, int dictLen, int len, DataOutput out, HashTable ht, boolean ext)
throws IOException {
Objects.checkFromIndexSize(dictOff, dictLen, bytes.length);
Objects.checkFromIndexSize(dictOff + dictLen, len, bytes.length);
if (dictLen > MAX_DISTANCE) {
final int maxDistance = ext ? EXTENDED_MAX_DISTANCE : MAX_DISTANCE;
if (dictLen > maxDistance) {
throw new IllegalArgumentException(
"dictLen must not be greater than 64kB, but got " + dictLen);
"dictLen must not be greater than " + (ext ? "256k" : "64k") + ", but got " + dictLen);
}

final int end = dictOff + dictLen + len;
Expand Down Expand Up @@ -564,7 +608,7 @@ public static void compressWithDictionary(
int matchLen = MIN_MATCH + commonBytes(bytes, ref + MIN_MATCH, off + MIN_MATCH, limit);

// try to find a better match
for (int r = ht.previous(ref), min = Math.max(off - MAX_DISTANCE + 1, dictOff);
for (int r = ht.previous(ref), min = Math.max(off - maxDistance + 1, dictOff);
r >= min;
r = ht.previous(r)) {
assert readInt(bytes, r) == readInt(bytes, off);
Expand All @@ -575,7 +619,7 @@ public static void compressWithDictionary(
}
}

encodeSequence(bytes, anchor, ref, off, matchLen, out);
encodeSequence(bytes, anchor, ref, off, matchLen, out, ext);
off += matchLen;
anchor = off;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,41 @@ private void doTest(byte[] data, LZ4.HashTable hashTable) throws IOException {
doTest(copy, offset, data.length, hashTable);
}

/**
 * Decodes a variable-length int from {@code compressed} starting at {@code off}.
 *
 * <p>Each of the first four bytes contributes its low 7 bits (least-significant group first); a
 * set high bit means another byte follows. A fifth byte may contribute only its low 4 bits.
 *
 * @param compressed buffer holding the encoded value
 * @param off index of the first encoded byte
 * @param size single-element out-parameter; {@code size[0]} receives the number of bytes consumed
 * @return the decoded value
 * @throws IOException if the fifth byte has any of its upper four bits set
 */
private static int readVInt(byte[] compressed, int off, int[] size) throws IOException {
  int value = 0;
  // Bytes 1..4: 7 payload bits each, high bit flags continuation.
  for (int n = 0; n < 4; n++) {
    final byte cur = compressed[off + n];
    value |= (cur & 0x7F) << (7 * n);
    if (cur >= 0) {
      size[0] = n + 1;
      return value;
    }
  }
  // Byte 5: only the low nibble is payload (bits 28..31); the high nibble must be zero.
  final byte last = compressed[off + 4];
  if ((last & 0xF0) != 0) {
    throw new IOException("Invalid vInt detected (too many bits)");
  }
  size[0] = 5;
  return value | ((last & 0x0F) << 28);
}

private void doTest(byte[] data, int offset, int length, LZ4.HashTable hashTable)
throws IOException {
ByteBuffersDataOutput out = new ByteBuffersDataOutput();
Expand All @@ -84,6 +119,7 @@ private void doTest(byte[] data, int offset, int length, LZ4.HashTable hashTable

int off = 0;
int decompressedOff = 0;
final int[] vintSize = LZ4.DEFAULT_EXTENDED_MAX_DISTANCE ? new int[1] : null;
for (; ; ) {
final int token = compressed[off++] & 0xFF;
int literalLen = token >>> 4;
Expand All @@ -108,7 +144,13 @@ private void doTest(byte[] data, int offset, int length, LZ4.HashTable hashTable
break;
}

final int matchDec = (compressed[off++] & 0xFF) | ((compressed[off++] & 0xFF) << 8);
final int matchDec;
if (LZ4.DEFAULT_EXTENDED_MAX_DISTANCE) {
matchDec = readVInt(compressed, off, vintSize);
off += vintSize[0];
} else {
matchDec = (compressed[off++] & 0xFF) | ((compressed[off++] & 0xFF) << 8);
}
// check that match dec is not 0
assertTrue(matchDec + " " + decompressedOff, matchDec > 0 && matchDec <= decompressedOff);

Expand Down