Skip to content

Commit

Permalink
convert dwa encoder to use algorithm quantize
Browse files Browse the repository at this point in the history
Convert the quantization from a massive table to a bit-fiddling
algorithm which is more code but significantly faster (in some cases,
almost 2x). This also eliminates the giant lookup tables from the
binary, which reduces the compiled size of the core library. Apart from
the identity table (value in, same value out), the removed quantization
tables are preserved in the unit tests to ensure correct behavior.

The tests default to a lighter-weight test, which should cover
the variety of error tolerance ranges, but there is a deeper test
that can be enabled by switching an ifdef in the compression tables
test.

As a motivation for improvement, with this change, the size of the
core library is reduced by more than half (2.2MB to 0.99MB) while
performance is improved from 0.23s for an encode to 0.155s, at least
on x86_64 with avx2 / f16c enabled. Other platforms will
hopefully experience similar improvements.

Signed-off-by: Kimball Thurston <[email protected]>
  • Loading branch information
kdt3rd authored Jan 1, 2025
1 parent a8dc4f9 commit 22cbb79
Show file tree
Hide file tree
Showing 9 changed files with 56,853 additions and 62,446 deletions.
62,236 changes: 0 additions & 62,236 deletions src/lib/OpenEXRCore/dwaLookups.h

Large diffs are not rendered by default.

251 changes: 121 additions & 130 deletions src/lib/OpenEXRCore/internal_dwa_decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,12 +103,68 @@ static exr_result_t LossyDctDecoder_execute (
// value in the buffer - with the index into zig zag
// order data. If we return 0, we have DC only data.
//
static int LossyDctDecoder_unRleAc (
//
// This assumes that halfZigBlock is zeroed
// prior to calling
//
static inline exr_result_t
LossyDctDecoder_unRleAc (
LossyDctDecoder* d,
int* lastNonZero,
uint16_t** currAcComp,
uint16_t* acBufferEnd,
uint16_t* halfZigBlock);
uint16_t* packedAcEnd,
uint16_t* halfZigBlock)
{
//
// Expand one block's worth of run-length encoded AC symbols
// into halfZigBlock, filling slots 1..63 (slot 0 is never
// touched here).
//
// Each 16-bit symbol is one of:
//   - high byte 0xff, low byte N != 0 : skip N slots (a run of zeros)
//   - 0xff00 (high byte 0xff, N == 0) : skip 64 slots, which ends the block
//   - anything else                   : copied verbatim into the
//                                       current slot
//
// d            - d->_packedAcCount is increased by the number of
//                symbols consumed
// lastNonZero  - receives the slot index of the last verbatim
//                symbol stored, or 0 if none was stored
// currAcComp   - in/out read cursor; advanced past the consumed
//                symbols
// packedAcEnd  - reads at or beyond this pointer are refused
// halfZigBlock - destination block; must already be zeroed by the
//                caller since runs are skipped, not written
//
// Returns EXR_ERR_CORRUPT_CHUNK if the input runs out before slot
// 64 is reached; in that case none of the outputs (the counter in
// d, lastNonZero, currAcComp) are updated. Returns EXR_ERR_SUCCESS
// otherwise.
//
int dctComp = 1;
uint16_t* acComp = *currAcComp;
uint16_t val;
int lnz = 0;
uint64_t ac_count = 0;

//
// The block is expected to arrive zeroed, so we don't have to
// write anything when we hit a run symbol
//

while (dctComp < 64)
{
// never read past the end of the packed AC data
if (acComp >= packedAcEnd) { return EXR_ERR_CORRUPT_CHUNK; }
val = *acComp;
if ((val & 0xff00) == 0xff00)
{
uint8_t count = val & 0xff;

// run of zeros - since the block is pre-zeroed there is nothing
// to store, just advance dctComp. A count of 0 is the end of
// block marker: adding 64 always pushes dctComp past the end.
dctComp += (count == 0) ? 64 : count;
}
else
{
//
// Not a run, just copy over the value and remember
// where we put it
//
lnz = dctComp;
halfZigBlock[dctComp] = val;

++dctComp;
}

// every symbol, run or literal, consumes one input entry
++ac_count;
++acComp;
}

// publish results only once the whole block decoded cleanly
d->_packedAcCount += ac_count;
*lastNonZero = lnz;
*currAcComp = acComp;
return EXR_ERR_SUCCESS;
}

//
// Used to decode a single channel of LOSSY_DCT data.
Expand Down Expand Up @@ -215,7 +271,6 @@ LossyDctDecoder_base_construct (
d->_toLinear = toLinear;
d->_width = width;
d->_height = height;
if (d->_toLinear == NULL) d->_toLinear = dwaCompressorNoOp;

//d->_isNativeXdr = GLOBAL_SYSTEM_LITTLE_ENDIAN;

Expand Down Expand Up @@ -523,27 +578,17 @@ LossyDctDecoder_execute (
//

#ifdef IMF_HAVE_SSE2

uint8_t fastPath = DWA_CLASSIFIER_TRUE;

for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
{
if ((uintptr_t) (chanData[comp]->_rows[y]) &
_SSE_ALIGNMENT_MASK)
fastPath = DWA_CLASSIFIER_FALSE;
}

if (fastPath)
//
// Handle all the full X blocks, in a fast path with sse2 and
//
// test for no-op conversion
//
if (d->_toLinear != NULL)
{
//
// Handle all the full X blocks, in a fast path with sse2 and
// aligned row pointers
//

for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
{
__m128i* dst = (__m128i*) chanData[comp]->_rows[y];
__m128i* src = (__m128i*) &rowBlock[comp][(y & 0x7) * 8];
__m128i* restrict dst = (__m128i *) chanData[comp]->_rows[y];
__m128i const * restrict src = (__m128i const *)&rowBlock[comp][(y & 0x7) * 8];

for (int blockx = 0; blockx < numFullBlocksX; ++blockx)
{
Expand All @@ -553,17 +598,18 @@ LossyDctDecoder_execute (
// Run with multiples of 8
//

_mm_prefetch ((char*) (src + 16), _MM_HINT_NTA);

i0 = (uint16_t) _mm_extract_epi16 (*src, 0);
i1 = (uint16_t) _mm_extract_epi16 (*src, 1);
i2 = (uint16_t) _mm_extract_epi16 (*src, 2);
i3 = (uint16_t) _mm_extract_epi16 (*src, 3);
_mm_prefetch ((const char*) (src + 16*8), _MM_HINT_NTA);
__m128i srcv = _mm_loadu_si128 (src);

i4 = (uint16_t) _mm_extract_epi16 (*src, 4);
i5 = (uint16_t) _mm_extract_epi16 (*src, 5);
i6 = (uint16_t) _mm_extract_epi16 (*src, 6);
i7 = (uint16_t) _mm_extract_epi16 (*src, 7);
// TODO: avx2 scatter gather
i0 = (uint16_t) _mm_extract_epi16 (srcv, 0);
i1 = (uint16_t) _mm_extract_epi16 (srcv, 1);
i2 = (uint16_t) _mm_extract_epi16 (srcv, 2);
i3 = (uint16_t) _mm_extract_epi16 (srcv, 3);
i4 = (uint16_t) _mm_extract_epi16 (srcv, 4);
i5 = (uint16_t) _mm_extract_epi16 (srcv, 5);
i6 = (uint16_t) _mm_extract_epi16 (srcv, 6);
i7 = (uint16_t) _mm_extract_epi16 (srcv, 7);

i0 = d->_toLinear[i0];
i1 = d->_toLinear[i1];
Expand All @@ -575,37 +621,53 @@ LossyDctDecoder_execute (
i6 = d->_toLinear[i6];
i7 = d->_toLinear[i7];

*dst = _mm_insert_epi16 (_mm_setzero_si128 (), i0, 0);
*dst = _mm_insert_epi16 (*dst, i1, 1);
*dst = _mm_insert_epi16 (*dst, i2, 2);
*dst = _mm_insert_epi16 (*dst, i3, 3);
__m128i dstv = _mm_insert_epi16 (_mm_setzero_si128 (), i0, 0);
dstv = _mm_insert_epi16 (dstv, i1, 1);
dstv = _mm_insert_epi16 (dstv, i2, 2);
dstv = _mm_insert_epi16 (dstv, i3, 3);
dstv = _mm_insert_epi16 (dstv, i4, 4);
dstv = _mm_insert_epi16 (dstv, i5, 5);
dstv = _mm_insert_epi16 (dstv, i6, 6);
dstv = _mm_insert_epi16 (dstv, i7, 7);

*dst = _mm_insert_epi16 (*dst, i4, 4);
*dst = _mm_insert_epi16 (*dst, i5, 5);
*dst = _mm_insert_epi16 (*dst, i6, 6);
*dst = _mm_insert_epi16 (*dst, i7, 7);
_mm_storeu_si128 (dst, dstv);

++dst;
src += 8;
dst++;
}
}
}
else
{
// no-op conversion to linear
for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
{
__m128i* restrict dst = (__m128i *) chanData[comp]->_rows[y];
__m128i const * restrict src = (__m128i const *)&rowBlock[comp][(y & 0x7) * 8];

#endif /* IMF_HAVE_SSE2 */
for (int blockx = 0; blockx < numFullBlocksX; ++blockx)
{
_mm_storeu_si128 (dst, _mm_loadu_si128 (src));

src += 8 * 8;
dst += 8;
}
}
}
#else
if (d->_toLinear)
{
//
// Basic scalar kinda slow path for handling the full X blocks
//

for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
{
uint16_t* dst = (uint16_t*) chanData[comp]->_rows[y];
uint16_t* restrict dst = (uint16_t*) chanData[comp]->_rows[y];

for (int blockx = 0; blockx < numFullBlocksX; ++blockx)
{
uint16_t* src =
uint16_t* restrict src =
&rowBlock[comp][blockx * 64 + ((y & 0x7) * 8)];

dst[0] = d->_toLinear[src[0]];
Expand All @@ -621,10 +683,23 @@ LossyDctDecoder_execute (
dst += 8;
}
}

#ifdef IMF_HAVE_SSE2
}
else
{
// no-op conversion to linear
for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
{
uint16_t* dst = (uint16_t*) chanData[comp]->_rows[y];

for (int blockx = 0; blockx < numFullBlocksX; ++blockx)
{
uint16_t* src =
&rowBlock[comp][blockx * 64 + ((y & 0x7) * 8)];
memcpy (dst, src, 8*sizeof(uint16_t));
dst += 8;
}
}
}
#endif /* IMF_HAVE_SSE2 */

//
Expand Down Expand Up @@ -683,87 +758,3 @@ LossyDctDecoder_execute (
}

/**************************************/

//
// Un-RLE the packed AC components into
// a half buffer. The half block should
// be the full 8x8 block (in zig-zag order
// still), not the first AC component.
//
// currAcComp is advanced as bytes are decoded.
//
// This returns the index of the last non-zero
// value in the buffer - with the index into zig zag
// order data. If we return 0, we have DC only data.
//
// This assumes that halfZigBlock is zeroed
// prior to calling
//
exr_result_t
LossyDctDecoder_unRleAc (
    LossyDctDecoder* d,
    int*             lastNonZero,
    uint16_t**       currAcComp,
    uint16_t*        packedAcEnd,
    uint16_t*        halfZigBlock)
{
    //
    // Walk the packed AC stream one 16-bit symbol at a time until
    // slot 64 of the zig-zag block is reached:
    //
    //   0xff00       -> end of block
    //   0xffNN       -> run of NN zeros; the block is expected to
    //                   be zeroed already, so only the slot moves
    //   other values -> stored verbatim in the current slot
    //
    // Running out of input before the block is complete yields
    // EXR_ERR_CORRUPT_CHUNK and leaves every output untouched.
    //
    uint16_t* cursor   = *currAcComp;
    uint64_t  consumed = 0;
    int       lastSlot = 0;
    int       slot     = 1;

    for (; slot < 64; ++cursor, ++consumed)
    {
        uint16_t symbol;

        if (cursor >= packedAcEnd) return EXR_ERR_CORRUPT_CHUNK;

        symbol = *cursor;

        if ((symbol & 0xff00) != 0xff00)
        {
            // literal coefficient: store it and remember its slot
            halfZigBlock[slot] = symbol;
            lastSlot           = slot;
            slot += 1;
        }
        else if (symbol != 0xff00)
        {
            // zero run: nothing to write, skip ahead
            slot += symbol & 0xff;
        }
        else
        {
            // explicit end-of-block marker
            slot = 64;
        }
    }

    d->_packedAcCount += consumed;
    *lastNonZero = lastSlot;
    *currAcComp  = cursor;
    return EXR_ERR_SUCCESS;
}
Loading

0 comments on commit 22cbb79

Please sign in to comment.