convert dwa encoder to use algorithm quantize

Convert the quantization from a massive table to a bit-fiddling algorithm which is more code but significantly faster (in some cases, almost 2x). This also eliminates the giant lookup tables from the binary, which reduces the compiled size of the core library. Beside the identity table (value in, same value out), those quantization tables are preserved in the unit tests to ensure correct behavior. The tests are defaulted to a light-er weight test, which should test the variety of error tolerance ranges, but there is a deeper test that can be enabled by switching an ifdef in the compression tables test. As a motivation for improvement, with this change, the size for the core library is reduced by more than half (2.2MB to 0.99MB) while performance is improved from 0.23s for an encode to 0.155s on x86_64 with avx2 / f16c enabled anyway. Other platforms will hopefully experience similar improvements. Signed-off-by: Kimball Thurston <[email protected]>
AcademySoftwareFoundation · Jan 1, 2025 · 22cbb79 · 22cbb79
1 parent a8dc4f9
commit 22cbb79
Show file tree

Hide file tree

Showing 9 changed files with 56,853 additions and 62,446 deletions.
diff --git a/src/lib/OpenEXRCore/dwaLookups.h b/src/lib/OpenEXRCore/dwaLookups.h
diff --git a/src/lib/OpenEXRCore/internal_dwa_decoder.h b/src/lib/OpenEXRCore/internal_dwa_decoder.h
@@ -103,12 +103,68 @@ static exr_result_t LossyDctDecoder_execute (
 // value in the buffer - with the index into zig zag
 // order data. If we return 0, we have DC only data.
 //
-static int LossyDctDecoder_unRleAc (
+//
+// This is assuminging that halfZigBlock is zero'ed
+// prior to calling
+//
+static inline exr_result_t
+LossyDctDecoder_unRleAc (
     LossyDctDecoder* d,
     int*             lastNonZero,
     uint16_t**       currAcComp,
-    uint16_t*        acBufferEnd,
-    uint16_t*        halfZigBlock);
+    uint16_t*        packedAcEnd,
+    uint16_t*        halfZigBlock)
+{
+    //
+    // Un-RLE the RLE'd blocks. If we find an item whose
+    // high byte is 0xff, then insert the number of 0's
+    // as indicated by the low byte.
+    //
+    // Otherwise, just copy the number verbatim.
+    //
+    int       dctComp = 1;
+    uint16_t* acComp  = *currAcComp;
+    uint16_t  val;
+    int       lnz      = 0;
+    uint64_t  ac_count = 0;
+
+    //
+    // Start with a zero'ed block, so we don't have to
+    // write when we hit a run symbol
+    //
+
+    while (dctComp < 64)
+    {
+        if (acComp >= packedAcEnd) { return EXR_ERR_CORRUPT_CHUNK; }
+        val = *acComp;
+        if ((val & 0xff00) == 0xff00)
+        {
+            uint8_t count = val & 0xff;
+
+            // run, insert 0s - since block pre-zeroed, nothing to do
+            // just increment dctComp but test for end of block...
+            dctComp += (count == 0) ? 64 : count;
+        }
+        else
+        {
+            //
+            // Not a run, just copy over the value
+            //
+            lnz                   = dctComp;
+            halfZigBlock[dctComp] = val;
+
+            ++dctComp;
+        }
+
+        ++ac_count;
+        ++acComp;
+    }
+
+    d->_packedAcCount += ac_count;
+    *lastNonZero = lnz;
+    *currAcComp  = acComp;
+    return EXR_ERR_SUCCESS;
+}
 
 //
 // Used to decode a single channel of LOSSY_DCT data.
@@ -215,7 +271,6 @@ LossyDctDecoder_base_construct (
     d->_toLinear      = toLinear;
     d->_width         = width;
     d->_height        = height;
-    if (d->_toLinear == NULL) d->_toLinear = dwaCompressorNoOp;
 
     //d->_isNativeXdr = GLOBAL_SYSTEM_LITTLE_ENDIAN;
 
@@ -523,27 +578,17 @@ LossyDctDecoder_execute (
             //
 
 #ifdef IMF_HAVE_SSE2
-
-            uint8_t fastPath = DWA_CLASSIFIER_TRUE;
-
-            for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
-            {
-                if ((uintptr_t) (chanData[comp]->_rows[y]) &
-                    _SSE_ALIGNMENT_MASK)
-                    fastPath = DWA_CLASSIFIER_FALSE;
-            }
-
-            if (fastPath)
+            //
+            // Handle all the full X blocks, in a fast path with sse2 and
+            //
+            // test for no-op conversion
+            //
+            if (d->_toLinear != NULL)
             {
-                //
-                // Handle all the full X blocks, in a fast path with sse2 and
-                // aligned row pointers
-                //
-
                 for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
                 {
-                    __m128i* dst = (__m128i*) chanData[comp]->_rows[y];
-                    __m128i* src = (__m128i*) &rowBlock[comp][(y & 0x7) * 8];
+                    __m128i* restrict dst = (__m128i *) chanData[comp]->_rows[y];
+                    __m128i const * restrict src = (__m128i const *)&rowBlock[comp][(y & 0x7) * 8];
 
                     for (int blockx = 0; blockx < numFullBlocksX; ++blockx)
                     {
@@ -553,17 +598,18 @@ LossyDctDecoder_execute (
                         // Run with multiples of 8
                         //
 
-                        _mm_prefetch ((char*) (src + 16), _MM_HINT_NTA);
-
-                        i0 = (uint16_t) _mm_extract_epi16 (*src, 0);
-                        i1 = (uint16_t) _mm_extract_epi16 (*src, 1);
-                        i2 = (uint16_t) _mm_extract_epi16 (*src, 2);
-                        i3 = (uint16_t) _mm_extract_epi16 (*src, 3);
+                        _mm_prefetch ((const char*) (src + 16*8), _MM_HINT_NTA);
+                        __m128i srcv = _mm_loadu_si128 (src);
 
-                        i4 = (uint16_t) _mm_extract_epi16 (*src, 4);
-                        i5 = (uint16_t) _mm_extract_epi16 (*src, 5);
-                        i6 = (uint16_t) _mm_extract_epi16 (*src, 6);
-                        i7 = (uint16_t) _mm_extract_epi16 (*src, 7);
+                        // TODO: avx2 scatter gather
+                        i0 = (uint16_t) _mm_extract_epi16 (srcv, 0);
+                        i1 = (uint16_t) _mm_extract_epi16 (srcv, 1);
+                        i2 = (uint16_t) _mm_extract_epi16 (srcv, 2);
+                        i3 = (uint16_t) _mm_extract_epi16 (srcv, 3);
+                        i4 = (uint16_t) _mm_extract_epi16 (srcv, 4);
+                        i5 = (uint16_t) _mm_extract_epi16 (srcv, 5);
+                        i6 = (uint16_t) _mm_extract_epi16 (srcv, 6);
+                        i7 = (uint16_t) _mm_extract_epi16 (srcv, 7);
 
                         i0 = d->_toLinear[i0];
                         i1 = d->_toLinear[i1];
@@ -575,37 +621,53 @@ LossyDctDecoder_execute (
                         i6 = d->_toLinear[i6];
                         i7 = d->_toLinear[i7];
 
-                        *dst = _mm_insert_epi16 (_mm_setzero_si128 (), i0, 0);
-                        *dst = _mm_insert_epi16 (*dst, i1, 1);
-                        *dst = _mm_insert_epi16 (*dst, i2, 2);
-                        *dst = _mm_insert_epi16 (*dst, i3, 3);
+                        __m128i dstv = _mm_insert_epi16 (_mm_setzero_si128 (), i0, 0);
+                        dstv = _mm_insert_epi16 (dstv, i1, 1);
+                        dstv = _mm_insert_epi16 (dstv, i2, 2);
+                        dstv = _mm_insert_epi16 (dstv, i3, 3);
+                        dstv = _mm_insert_epi16 (dstv, i4, 4);
+                        dstv = _mm_insert_epi16 (dstv, i5, 5);
+                        dstv = _mm_insert_epi16 (dstv, i6, 6);
+                        dstv = _mm_insert_epi16 (dstv, i7, 7);
 
-                        *dst = _mm_insert_epi16 (*dst, i4, 4);
-                        *dst = _mm_insert_epi16 (*dst, i5, 5);
-                        *dst = _mm_insert_epi16 (*dst, i6, 6);
-                        *dst = _mm_insert_epi16 (*dst, i7, 7);
+                        _mm_storeu_si128 (dst, dstv);
 
+                        ++dst;
                         src += 8;
-                        dst++;
                     }
                 }
             }
             else
             {
+                // no-op conversion to linear
+                for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
+                {
+                    __m128i* restrict dst = (__m128i *) chanData[comp]->_rows[y];
+                    __m128i const * restrict src = (__m128i const *)&rowBlock[comp][(y & 0x7) * 8];
 
-#endif /* IMF_HAVE_SSE2 */
+                    for (int blockx = 0; blockx < numFullBlocksX; ++blockx)
+                    {
+                        _mm_storeu_si128 (dst, _mm_loadu_si128 (src));
 
+                        src += 8 * 8;
+                        dst += 8;
+                    }
+                }
+            }
+#else
+            if (d->_toLinear)
+            {
                 //
                 // Basic scalar kinda slow path for handling the full X blocks
                 //
 
                 for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
                 {
-                    uint16_t* dst = (uint16_t*) chanData[comp]->_rows[y];
+                    uint16_t* restrict dst = (uint16_t*) chanData[comp]->_rows[y];
 
                     for (int blockx = 0; blockx < numFullBlocksX; ++blockx)
                     {
-                        uint16_t* src =
+                        uint16_t* restrict src =
                             &rowBlock[comp][blockx * 64 + ((y & 0x7) * 8)];
 
                         dst[0] = d->_toLinear[src[0]];
@@ -621,10 +683,23 @@ LossyDctDecoder_execute (
                         dst += 8;
                     }
                 }
-
-#ifdef IMF_HAVE_SSE2
             }
+            else
+            {
+                // no-op conversion to linear
+                for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
+                {
+                    uint16_t* dst = (uint16_t*) chanData[comp]->_rows[y];
 
+                    for (int blockx = 0; blockx < numFullBlocksX; ++blockx)
+                    {
+                        uint16_t* src =
+                            &rowBlock[comp][blockx * 64 + ((y & 0x7) * 8)];
+                        memcpy (dst, src, 8*sizeof(uint16_t));
+                        dst += 8;
+                    }
+                }
+            }
 #endif /* IMF_HAVE_SSE2 */
 
             //
@@ -683,87 +758,3 @@ LossyDctDecoder_execute (
 }
 
 /**************************************/
-
-//
-// Un-RLE the packed AC components into
-// a half buffer. The half block should
-// be the full 8x8 block (in zig-zag order
-// still), not the first AC component.
-//
-// currAcComp is advanced as bytes are decoded.
-//
-// This returns the index of the last non-zero
-// value in the buffer - with the index into zig zag
-// order data. If we return 0, we have DC only data.
-//
-// This is assuminging that halfZigBlock is zero'ed
-// prior to calling
-//
-exr_result_t
-LossyDctDecoder_unRleAc (
-    LossyDctDecoder* d,
-    int*             lastNonZero,
-    uint16_t**       currAcComp,
-    uint16_t*        packedAcEnd,
-    uint16_t*        halfZigBlock)
-{
-    //
-    // Un-RLE the RLE'd blocks. If we find an item whose
-    // high byte is 0xff, then insert the number of 0's
-    // as indicated by the low byte.
-    //
-    // Otherwise, just copy the number verbatim.
-    //
-    int       dctComp = 1;
-    uint16_t* acComp  = *currAcComp;
-    uint16_t  val;
-    int       lnz      = 0;
-    uint64_t  ac_count = 0;
-
-    //
-    // Start with a zero'ed block, so we don't have to
-    // write when we hit a run symbol
-    //
-
-    while (dctComp < 64)
-    {
-        if (acComp >= packedAcEnd) { return EXR_ERR_CORRUPT_CHUNK; }
-        val = *acComp;
-        if (val == 0xff00)
-        {
-            //
-            // End of block
-            //
-
-            dctComp = 64;
-        }
-        else if ((val >> 8) == 0xff)
-        {
-            //
-            // Run detected! Insert 0's.
-            //
-            // Since the block has been zeroed, just advance the ptr
-            //
-
-            dctComp += val & 0xff;
-        }
-        else
-        {
-            //
-            // Not a run, just copy over the value
-            //
-            lnz                   = dctComp;
-            halfZigBlock[dctComp] = val;
-
-            dctComp++;
-        }
-
-        ac_count++;
-        acComp++;
-    }
-
-    d->_packedAcCount += ac_count;
-    *lastNonZero = lnz;
-    *currAcComp  = acComp;
-    return EXR_ERR_SUCCESS;
-}