diff --git a/src/layer/x86/dequantize_x86.cpp b/src/layer/x86/dequantize_x86.cpp index 6152c9dbc91..3cc4b1805aa 100644 --- a/src/layer/x86/dequantize_x86.cpp +++ b/src/layer/x86/dequantize_x86.cpp @@ -32,552 +32,263 @@ Dequantize_x86::Dequantize_x86() #endif // __SSE2__ } -int Dequantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +static void dequantize(const int* intptr, float* ptr, const Mat& scale_data, const Mat& bias_data, int elemcount, int elempack) { - int dims = bottom_blob.dims; - int elempack = bottom_blob.elempack; + const int scale_data_size = scale_data.w; + const int bias_data_size = bias_data.w; + const int size = elemcount * elempack; + + // NCNN_LOGE("dequantize %d %d %d %d", scale_data_size, bias_data_size, elemcount, elempack); + const float* scale_ptr = scale_data; + + float scale = 0.f; #if __SSE2__ + __m128 _scale = _mm_setzero_ps(); #if __AVX__ + __m256 _scale_avx = _mm256_setzero_ps(); #if __AVX512F__ - if (elempack == 16) - { - Mat tmp; - convert_packing(bottom_blob, tmp, 8, opt); - - Mat tmpout; - forward(tmp, tmpout, opt); - - convert_packing(tmpout, top_blob, 16, opt); + __m512 _scale_avx512 = _mm512_setzero_ps(); +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ - return 0; + if (scale_data_size == 1 || elempack == 1) + { + scale = scale_ptr[0]; +#if __SSE2__ + _scale = _mm_set1_ps(scale); +#if __AVX__ + _scale_avx = _mm256_set1_ps(scale); +#if __AVX512F__ + _scale_avx512 = _mm512_set1_ps(scale); +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ } + else + { +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (elempack == 16) + { + _scale_avx512 = _mm512_loadu_ps(scale_ptr); + } +#endif // __AVX512F__ + if (elempack == 8) + { + _scale_avx = _mm256_loadu_ps(scale_ptr); +#if __AVX512F__ + _scale_avx512 = combine8x2_ps(_scale_avx, _scale_avx); #endif // __AVX512F__ + } +#endif // __AVX__ + if (elempack == 4) + { + _scale = _mm_loadu_ps(scale_ptr); +#if __AVX__ + _scale_avx = combine4x2_ps(_scale, _scale); +#if __AVX512F__ + _scale_avx512 = combine8x2_ps(_scale_avx, _scale_avx); +#endif // __AVX512F__ +#endif // __AVX__ + } +#endif // __SSE2__ + } - if (elempack == 8) + if (bias_data_size == 0) { - if (dims == 1) + int i = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + for (; i + 15 < size; i += 16) { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)32u, 8, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_data_size == 1) - { - __m256 _scale = _mm256_set1_ps(scale_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - float* ptr = (float*)top_blob + i * 8; - - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_mul_ps(_v, _scale); - _mm256_storeu_ps(ptr, _v); - } - } - else if (bias_data_size == 1) - { - __m256 _bias = _mm256_set1_ps(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - float* ptr = (float*)top_blob + i * 8; - - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_comp_fmadd_ps(_v, _scale, _bias); - _mm256_storeu_ps(ptr, _v); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - float* ptr = (float*)top_blob + i * 8; - - __m256 _bias = 
_mm256_loadu_ps((const float*)bias_data + i * 8); - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_comp_fmadd_ps(_v, _scale, _bias); - _mm256_storeu_ps(ptr, _v); - } - } - } - else - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - float* ptr = (float*)top_blob + i * 8; - - __m256 _scale = _mm256_loadu_ps((const float*)scale_data + i * 8); - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_mul_ps(_v, _scale); - _mm256_storeu_ps(ptr, _v); - } - } - else if (bias_data_size == 1) - { - __m256 _bias = _mm256_set1_ps(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - float* ptr = (float*)top_blob + i * 8; - - __m256 _scale = _mm256_loadu_ps((const float*)scale_data + i * 8); - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_comp_fmadd_ps(_v, _scale, _bias); - _mm256_storeu_ps(ptr, _v); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - float* ptr = (float*)top_blob + i * 8; - - __m256 _scale = _mm256_loadu_ps((const float*)scale_data + i * 8); - __m256 _bias = _mm256_loadu_ps((const float*)bias_data + i * 8); - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_comp_fmadd_ps(_v, _scale, _bias); - _mm256_storeu_ps(ptr, _v); - } - } - } + __m512 _v = _mm512_cvtepi32_ps(_mm512_loadu_si512((const __m512i*)intptr)); + _v = _mm512_mul_ps(_v, _scale_avx512); + _mm512_storeu_ps(ptr, _v); + intptr += 16; + ptr += 16; } - - if (dims == 2) +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)32u, 8, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr = top_blob.row(i); - - __m256 _scale = scale_data_size == 1 ? _mm256_set1_ps(scale_data[0]) : _mm256_loadu_ps((const float*)scale_data + i * 8); - - for (int j = 0; j < w; j++) - { - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_mul_ps(_v, _scale); - _mm256_storeu_ps(ptr, _v); - - intptr += 8; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr = top_blob.row(i); - - __m256 _scale = scale_data_size == 1 ? _mm256_set1_ps(scale_data[0]) : _mm256_loadu_ps((const float*)scale_data + i * 8); - __m256 _bias = bias_data_size == 1 ? 
_mm256_set1_ps(bias_data[0]) : _mm256_loadu_ps((const float*)bias_data + i * 8); - - for (int j = 0; j < w; j++) - { - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_comp_fmadd_ps(_v, _scale, _bias); - _mm256_storeu_ps(ptr, _v); - - intptr += 8; - ptr += 8; - } - } - } + __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); + _v = _mm256_mul_ps(_v, _scale_avx); + _mm256_storeu_ps(ptr, _v); + intptr += 8; + ptr += 8; } - - if (dims == 3) +#endif // __AVX__ + for (; i + 3 < size; i += 4) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)32u, 8, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr = top_blob.channel(q); - - __m256 _scale = scale_data_size == 1 ? _mm256_set1_ps(scale_data[0]) : _mm256_loadu_ps((const float*)scale_data + q * 8); - - for (int i = 0; i < size; i++) - { - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_mul_ps(_v, _scale); - _mm256_storeu_ps(ptr, _v); - - intptr += 8; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr = top_blob.channel(q); - - __m256 _scale = scale_data_size == 1 ? _mm256_set1_ps(scale_data[0]) : _mm256_loadu_ps((const float*)scale_data + q * 8); - __m256 _bias = bias_data_size == 1 ? _mm256_set1_ps(bias_data[0]) : _mm256_loadu_ps((const float*)bias_data + q * 8); - - for (int i = 0; i < size; i++) - { - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_comp_fmadd_ps(_v, _scale, _bias); - _mm256_storeu_ps(ptr, _v); - - intptr += 8; - ptr += 8; - } - } - } + __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); + _v = _mm_mul_ps(_v, _scale); + _mm_storeu_ps(ptr, _v); + intptr += 4; + ptr += 4; + } +#endif // __SSE2__ + for (; i < size; i++) + { + *ptr = *intptr * scale; + intptr++; + ptr++; } - - return 0; } -#else // __AVX__ - if (elempack == 8) + else { - if (dims == 1) - { - int w = bottom_blob.w; - int outw = w * 2; - - top_blob.create(outw, (size_t)16u, 4, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_data_size == 1) - { - __m128 _scale = _mm_set1_ps(scale_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_mul_ps(_v, _scale); - _mm_storeu_ps(ptr, _v); - } - } - else if (bias_data_size == 1) - { - __m128 _bias = _mm_set1_ps(bias_data[0]); + const float* bias_ptr = bias_data; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale)); - _mm_storeu_ps(ptr, _v); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; 
- float* ptr = (float*)top_blob + i * 4; - - __m128 _bias = _mm_loadu_ps((const float*)bias_data + i * 4); - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale)); - _mm_storeu_ps(ptr, _v); - } - } - } - else - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - __m128 _scale = _mm_loadu_ps((const float*)scale_data + i * 4); - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_mul_ps(_v, _scale); - _mm_storeu_ps(ptr, _v); - } - } - else if (bias_data_size == 1) - { - __m128 _bias = _mm_set1_ps(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - __m128 _scale = _mm_loadu_ps((const float*)scale_data + i * 4); - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale)); - _mm_storeu_ps(ptr, _v); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; + float bias = 0.f; +#if __SSE2__ + __m128 _bias = _mm_setzero_ps(); +#if __AVX__ + __m256 _bias_avx = _mm256_setzero_ps(); +#if __AVX512F__ + __m512 _bias_avx512 = _mm512_setzero_ps(); +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ - __m128 _scale = _mm_loadu_ps((const float*)scale_data + i * 4); - __m128 _bias = _mm_loadu_ps((const float*)bias_data + i * 4); - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale)); - _mm_storeu_ps(ptr, _v); - } - } - } + if (bias_data_size == 1 || elempack == 1) + { + bias = bias_ptr[0]; +#if __SSE2__ + _bias = _mm_set1_ps(bias); +#if __AVX__ + _bias_avx = _mm256_set1_ps(bias); +#if __AVX512F__ + _bias_avx512 = _mm512_set1_ps(bias); +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ } - - if (dims == 2) + else { - int w = bottom_blob.w; - int h = bottom_blob.h; - int outh = h * 2; - - top_blob.create(w, outh, (size_t)16u, 4, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (elempack == 16) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr0 = top_blob.row(i * 2); - float* ptr1 = top_blob.row(i * 2 + 1); - - __m128 _scale0 = scale_data_size == 1 ? _mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + i * 8); - __m128 _scale1 = scale_data_size == 1 ? 
_mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + i * 8 + 4); - - for (int j = 0; j < w; j++) - { - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); - _v0 = _mm_mul_ps(_v0, _scale0); - _v1 = _mm_mul_ps(_v1, _scale1); - _mm_storeu_ps(ptr0, _v0); - _mm_storeu_ps(ptr1, _v1); - - intptr += 8; - ptr0 += 4; - ptr1 += 4; - } - } + _bias_avx512 = _mm512_loadu_ps(bias_ptr); } - else +#endif // __AVX512F__ + if (elempack == 8) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr0 = top_blob.row(i * 2); - float* ptr1 = top_blob.row(i * 2 + 1); - - __m128 _scale0 = scale_data_size == 1 ? _mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + i * 8); - __m128 _scale1 = scale_data_size == 1 ? _mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + i * 8 + 4); - __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8); - __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8 + 4); - - for (int j = 0; j < w; j++) - { - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); - _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale0)); - _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale1)); - _mm_storeu_ps(ptr0, _v0); - _mm_storeu_ps(ptr1, _v1); - - intptr += 8; - ptr0 += 4; - ptr1 += 4; - } - } + _bias_avx = _mm256_loadu_ps(bias_ptr); +#if __AVX512F__ + _bias_avx512 = combine8x2_ps(_bias_avx, _bias_avx); +#endif // __AVX512F__ } +#endif // __AVX__ + if (elempack == 4) + { + _bias = _mm_loadu_ps(bias_ptr); +#if __AVX__ + _bias_avx = combine4x2_ps(_bias, _bias); +#if __AVX512F__ + _bias_avx512 = combine8x2_ps(_bias_avx, _bias_avx); +#endif // __AVX512F__ +#endif // __AVX__ + } +#endif // __SSE2__ } - if (dims == 3) + int i = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + for (; i + 15 < size; i += 16) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - int outc = channels * 2; - - top_blob.create(w, h, outc, (size_t)16u, 4, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr0 = top_blob.channel(q * 2); - float* ptr1 = top_blob.channel(q * 2 + 1); - - __m128 _scale0 = scale_data_size == 1 ? _mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + q * 8); - __m128 _scale1 = scale_data_size == 1 ? 
_mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + q * 8 + 4); - - for (int i = 0; i < size; i++) - { - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); - _v0 = _mm_mul_ps(_v0, _scale0); - _v1 = _mm_mul_ps(_v1, _scale1); - _mm_storeu_ps(ptr0, _v0); - _mm_storeu_ps(ptr1, _v1); + __m512 _v = _mm512_cvtepi32_ps(_mm512_loadu_si512((const __m512i*)intptr)); + _v = _mm512_fmadd_ps(_v, _scale_avx512, _bias_avx512); + _mm512_storeu_ps(ptr, _v); + intptr += 16; + ptr += 16; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); + _v = _mm256_comp_fmadd_ps(_v, _scale_avx, _bias_avx); + _mm256_storeu_ps(ptr, _v); + intptr += 8; + ptr += 8; + } +#endif // __AVX__ + for (; i + 3 < size; i += 4) + { + __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); + _v = _mm_comp_fmadd_ps(_v, _scale, _bias); + _mm_storeu_ps(ptr, _v); + intptr += 4; + ptr += 4; + } +#endif // __SSE2__ + for (; i < size; i++) + { + *ptr = *intptr * scale + bias; + intptr++; + ptr++; + } + } +} - intptr += 8; - ptr0 += 4; - ptr1 += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr0 = top_blob.channel(q * 2); - float* ptr1 = top_blob.channel(q * 2 + 1); +int Dequantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int dims = bottom_blob.dims; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; - __m128 _scale0 = scale_data_size == 1 ? _mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + q * 8); - __m128 _scale1 = scale_data_size == 1 ? _mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + q * 8 + 4); - __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + q * 8); - __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + q * 8 + 4); + top_blob.create_like(bottom_blob, opt.blob_allocator); + if (top_blob.empty()) + return -100; - for (int i = 0; i < size; i++) - { - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); - _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale0)); - _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale1)); - _mm_storeu_ps(ptr0, _v0); - _mm_storeu_ps(ptr1, _v1); + if (dims == 1) + { + const int wp = std::max(1, w / opt.num_threads); + const int nn_w = (w + wp - 1) / wp; - intptr += 8; - ptr0 += 4; - ptr1 += 4; - } - } - } - } + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_w; ii++) + { + const int i = ii * wp; - return 0; - } -#endif // __AVX__ + const int* intptr = (const int*)bottom_blob + i * elempack; + float* ptr = (float*)top_blob + i * elempack; - if (elempack == 4) - { - if (dims == 1) - { - int w = bottom_blob.w; + const float* scale_ptr = scale_data_size > 1 ? (const float*)scale_data + i * elempack : scale_data; + const float* bias_ptr = bias_data_size > 1 ? 
(const float*)bias_data + i * elempack : bias_data; - top_blob.create(w, (size_t)16u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + const int size = std::min(w - i, wp) * elempack; if (scale_data_size == 1) { - __m128 _scale = _mm_set1_ps(scale_data[0]); - + const float scale = scale_ptr[0]; if (bias_data_size == 0) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) + for (int j = 0; j < size; j++) { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_mul_ps(_v, _scale); - _mm_storeu_ps(ptr, _v); + ptr[j] = intptr[j] * scale; } } else if (bias_data_size == 1) { - __m128 _bias = _mm_set1_ps(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) + const float bias = bias_ptr[0]; + for (int j = 0; j < size; j++) { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale)); - _mm_storeu_ps(ptr, _v); + ptr[j] = intptr[j] * scale + bias; } } else { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) + for (int j = 0; j < size; j++) { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - __m128 _bias = _mm_loadu_ps((const float*)bias_data + i * 4); - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale)); - _mm_storeu_ps(ptr, _v); + ptr[j] = intptr[j] * scale + bias_ptr[j]; } } } @@ -585,380 +296,57 @@ int Dequantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& { if (bias_data_size == 0) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) + for (int j = 0; j < size; j++) { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - __m128 _scale = _mm_loadu_ps((const float*)scale_data + i * 4); - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_mul_ps(_v, _scale); - _mm_storeu_ps(ptr, _v); + ptr[j] = intptr[j] * scale_ptr[j]; } } else if (bias_data_size == 1) { - __m128 _bias = _mm_set1_ps(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) + const float bias = bias_ptr[0]; + for (int j = 0; j < size; j++) { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - __m128 _scale = _mm_loadu_ps((const float*)scale_data + i * 4); - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale)); - _mm_storeu_ps(ptr, _v); + ptr[j] = intptr[j] * scale_ptr[j] + bias; } } else { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - __m128 _scale = _mm_loadu_ps((const float*)scale_data + i * 4); - __m128 _bias = _mm_loadu_ps((const float*)bias_data + i * 4); - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale)); - _mm_storeu_ps(ptr, _v); - } - } - } - } - - if (dims == 2) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)16u, elempack, opt.blob_allocator); - if 
(top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr = top_blob.row(i); - - __m128 _scale = scale_data_size == 1 ? _mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + i * 4); - - for (int j = 0; j < w; j++) + for (int j = 0; j < size; j++) { - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_mul_ps(_v, _scale); - _mm_storeu_ps(ptr, _v); - - intptr += 4; - ptr += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr = top_blob.row(i); - - __m128 _scale = scale_data_size == 1 ? _mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + i * 4); - __m128 _bias = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 4); - - for (int j = 0; j < w; j++) - { - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale)); - _mm_storeu_ps(ptr, _v); - - intptr += 4; - ptr += 4; + ptr[j] = intptr[j] * scale_ptr[j] + bias_ptr[j]; } } } } - - if (dims == 3) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)16u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr = top_blob.channel(q); - - __m128 _scale = scale_data_size == 1 ? _mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + q * 4); - - for (int i = 0; i < size; i++) - { - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_mul_ps(_v, _scale); - _mm_storeu_ps(ptr, _v); - - intptr += 4; - ptr += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr = top_blob.channel(q); - - __m128 _scale = scale_data_size == 1 ? _mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + q * 4); - __m128 _bias = bias_data_size == 1 ? 
_mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + q * 4); - - for (int i = 0; i < size; i++) - { - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale)); - _mm_storeu_ps(ptr, _v); - - intptr += 4; - ptr += 4; - } - } - } - } - - return 0; - } -#endif // __SSE2__ - - if (dims == 1) - { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)4u, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - const int* intptr = bottom_blob; - float* ptr = top_blob; - - if (scale_data_size == 1) - { - const float scale = scale_data[0]; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale; - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale + bias; - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale + bias_data[i]; - } - } - } - else - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale_data[i]; - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale_data[i] + bias; - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale_data[i] + bias_data[i]; - } - } - } } if (dims == 2) { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)4u, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr = top_blob.row(i); - - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; - - int j = 0; -#if __SSE2__ - __m128 _scale = _mm_set1_ps(scale); - for (; j + 3 < w; j += 4) - { - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_mul_ps(_v, _scale); - _mm_storeu_ps(ptr, _v); - - intptr += 4; - ptr += 4; - } -#endif // __SSE2__ - for (; j < w; j++) - { - *ptr++ = *intptr++ * scale; - } - } - } - else + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr = top_blob.row(i); + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; - const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i]; + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; + const Mat bias_data_i = bias_data_size > 1 ? 
bias_data.range(i * elempack, elempack) : bias_data;
 
-            int j = 0;
-#if __SSE2__
-            __m128 _scale = _mm_set1_ps(scale);
-            __m128 _bias = _mm_set1_ps(bias);
-            for (; j + 3 < w; j += 4)
-            {
-                __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
-                _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale));
-                _mm_storeu_ps(ptr, _v);
-
-                intptr += 4;
-                ptr += 4;
-            }
-#endif // __SSE2__
-            for (; j < w; j++)
-            {
-                *ptr++ = *intptr++ * scale + bias;
-            }
-        }
+            dequantize(intptr, ptr, scale_data_i, bias_data_i, w, elempack);
         }
     }
 
     if (dims == 3)
     {
-        int w = bottom_blob.w;
-        int h = bottom_blob.h;
-        int channels = bottom_blob.c;
-        int size = w * h;
-
-        top_blob.create(w, h, channels, (size_t)4u, opt.blob_allocator);
-        if (top_blob.empty())
-            return -100;
-
-        if (bias_data_size == 0)
-        {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int q = 0; q < channels; q++)
-            {
-                const int* intptr = bottom_blob.channel(q);
-                float* ptr = top_blob.channel(q);
-
-                const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q];
-
-                int i = 0;
-#if __SSE2__
-                __m128 _scale = _mm_set1_ps(scale);
-                for (; i + 3 < size; i += 4)
-                {
-                    __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
-                    _v = _mm_mul_ps(_v, _scale);
-                    _mm_storeu_ps(ptr, _v);
-
-                    intptr += 4;
-                    ptr += 4;
-                }
-#endif // __SSE2__
-                for (; i < size; i++)
-                {
-                    *ptr++ = *intptr++ * scale;
-                }
-            }
-        }
-        else
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
         {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int q = 0; q < channels; q++)
-            {
-                const int* intptr = bottom_blob.channel(q);
-                float* ptr = top_blob.channel(q);
+            const int* intptr = bottom_blob.channel(q);
+            float* ptr = top_blob.channel(q);
 
-                const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q];
-                const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q];
+            const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
+            const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data;
 
-                int i = 0;
-#if __SSE2__
-                __m128 _scale = _mm_set1_ps(scale);
-                __m128 _bias = _mm_set1_ps(bias);
-                for (; i + 3 < size; i += 4)
-                {
-                    __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
-                    _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale));
-                    _mm_storeu_ps(ptr, _v);
-
-                    intptr += 4;
-                    ptr += 4;
-                }
-#endif // __SSE2__
-                for (; i < size; i++)
-                {
-                    *ptr++ = *intptr++ * scale + bias;
-                }
-            }
+            dequantize(intptr, ptr, scale_data_q, bias_data_q, w * h, elempack);
         }
     }
 
diff --git a/tests/test_dequantize.cpp b/tests/test_dequantize.cpp
index ca05059fa45..8ac6382762a 100644
--- a/tests/test_dequantize.cpp
+++ b/tests/test_dequantize.cpp
@@ -35,30 +35,15 @@ static int test_dequantize(const ncnn::Mat& a, int scale_data_size, int bias_dat
     return ret;
 }
 
-static int test_dequantize_pack8(const ncnn::Mat& a, int scale_data_size, int bias_data_size)
-{
-    ncnn::ParamDict pd;
-    pd.set(0, scale_data_size);
-    pd.set(1, bias_data_size);
-
-    std::vector<ncnn::Mat> weights(bias_data_size ? 
2 : 1); - weights[0] = RandomMat(scale_data_size); - if (bias_data_size) - weights[1] = RandomMat(bias_data_size); - - int flag = TEST_LAYER_DISABLE_AUTO_INPUT_CASTING | TEST_LAYER_ENABLE_FORCE_INPUT_PACK8; - int ret = test_layer("Dequantize", pd, weights, a, 0.001, 0, flag); - if (ret != 0) - { - fprintf(stderr, "test_dequantize_pack8 failed a.dims=%d a=(%d %d %d) scale_data_size=%d bias_data_size=%d\n", a.dims, a.w, a.h, a.c, scale_data_size, bias_data_size); - } - - return ret; -} - static int test_dequantize_0() { return 0 + || test_dequantize(RandomIntMat(11, 13, 48), 1, 48) + || test_dequantize(RandomIntMat(11, 13, 48), 1, 1) + || test_dequantize(RandomIntMat(11, 13, 48), 1, 0) + || test_dequantize(RandomIntMat(11, 13, 48), 48, 48) + || test_dequantize(RandomIntMat(11, 13, 48), 48, 1) + || test_dequantize(RandomIntMat(11, 13, 48), 48, 0) || test_dequantize(RandomIntMat(5, 7, 24), 1, 24) || test_dequantize(RandomIntMat(5, 7, 24), 1, 1) || test_dequantize(RandomIntMat(5, 7, 24), 1, 0) @@ -82,6 +67,12 @@ static int test_dequantize_0() static int test_dequantize_1() { return 0 + || test_dequantize(RandomIntMat(127, 48), 1, 48) + || test_dequantize(RandomIntMat(127, 48), 1, 1) + || test_dequantize(RandomIntMat(127, 48), 1, 0) + || test_dequantize(RandomIntMat(127, 48), 48, 48) + || test_dequantize(RandomIntMat(127, 48), 48, 1) + || test_dequantize(RandomIntMat(127, 48), 48, 0) || test_dequantize(RandomIntMat(15, 24), 1, 24) || test_dequantize(RandomIntMat(15, 24), 1, 1) || test_dequantize(RandomIntMat(15, 24), 1, 0) @@ -111,6 +102,12 @@ static int test_dequantize_2() || test_dequantize(RandomIntMat(128), 128, 128) || test_dequantize(RandomIntMat(128), 128, 1) || test_dequantize(RandomIntMat(128), 128, 0) + || test_dequantize(RandomIntMat(120), 1, 120) + || test_dequantize(RandomIntMat(120), 1, 1) + || test_dequantize(RandomIntMat(120), 1, 0) + || test_dequantize(RandomIntMat(120), 120, 120) + || test_dequantize(RandomIntMat(120), 120, 1) + || test_dequantize(RandomIntMat(120), 120, 0) || test_dequantize(RandomIntMat(124), 1, 124) || test_dequantize(RandomIntMat(124), 1, 1) || test_dequantize(RandomIntMat(124), 1, 0) @@ -125,29 +122,6 @@ static int test_dequantize_2() || test_dequantize(RandomIntMat(127), 127, 0); } -static int test_dequantize_3() -{ - return 0 - || test_dequantize_pack8(RandomIntMat(5, 7, 24), 1, 24) - || test_dequantize_pack8(RandomIntMat(5, 7, 24), 1, 1) - || test_dequantize_pack8(RandomIntMat(5, 7, 24), 1, 0) - || test_dequantize_pack8(RandomIntMat(5, 7, 24), 24, 24) - || test_dequantize_pack8(RandomIntMat(5, 7, 24), 24, 1) - || test_dequantize_pack8(RandomIntMat(5, 7, 24), 24, 0) - || test_dequantize_pack8(RandomIntMat(15, 24), 1, 24) - || test_dequantize_pack8(RandomIntMat(15, 24), 1, 1) - || test_dequantize_pack8(RandomIntMat(15, 24), 1, 0) - || test_dequantize_pack8(RandomIntMat(15, 24), 24, 24) - || test_dequantize_pack8(RandomIntMat(15, 24), 24, 1) - || test_dequantize_pack8(RandomIntMat(15, 24), 24, 0) - || test_dequantize_pack8(RandomIntMat(128), 1, 128) - || test_dequantize_pack8(RandomIntMat(128), 1, 1) - || test_dequantize_pack8(RandomIntMat(128), 1, 0) - || test_dequantize_pack8(RandomIntMat(128), 128, 128) - || test_dequantize_pack8(RandomIntMat(128), 128, 1) - || test_dequantize_pack8(RandomIntMat(128), 128, 0); -} - int main() { SRAND(7767517); @@ -155,6 +129,5 @@ int main() return 0 || test_dequantize_0() || test_dequantize_1() - || test_dequantize_2() - || test_dequantize_3(); + || test_dequantize_2(); }
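
Reviewer note: every SIMD path in the new dequantize() kernel must match the
scalar tails, i.e. out = in * scale (+ bias), where scale and bias are either
one shared value or one value per packed lane. A minimal reference oracle for
that contract (hypothetical helper, not part of this patch; it assumes the
scale/bias sizes are 0, 1 or elempack, which is how forward() slices
scale_data/bias_data with Mat::range before calling dequantize()):

    // Dequantize elemcount groups of elempack int32 lanes.
    // scale_size == 1 means one shared scale, scale_size == elempack means
    // one scale per lane; bias_size == 0 means no bias term at all.
    static void dequantize_ref(const int* intptr, float* ptr,
                               const float* scale, int scale_size,
                               const float* bias, int bias_size,
                               int elemcount, int elempack)
    {
        for (int i = 0; i < elemcount * elempack; i++)
        {
            // per-lane values repeat with period elempack when size > 1
            const float s = scale_size == 1 ? scale[0] : scale[i % elempack];
            const float b = bias_size == 0 ? 0.f : (bias_size == 1 ? bias[0] : bias[i % elempack]);
            ptr[i] = intptr[i] * s + b;
        }
    }

Because size = elemcount * elempack is always a multiple of elempack, the
16/8/4-wide loops consume every element whenever elempack > 1; the scalar
tail only ever runs with elempack == 1, so its single scale/bias values are
safe there.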
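Reviewer note on the dims == 1 path: rather than one OpenMP iteration per
element, forward() now splits w into nn_w contiguous slices of at most wp
elements, so each thread handles a cache-friendly run. With w = 127 and
opt.num_threads = 4, wp = max(1, 127 / 4) = 31 and nn_w = (127 + 30) / 31 = 5,
giving slices of 31, 31, 31, 31 and 3 elements; the short last slice comes
from size = std::min(w - i, wp) * elempack. The same slicing in isolation
(hypothetical standalone sketch):

    // Split [0, w) into ceil(w / wp) contiguous runs of at most wp items.
    const int wp = std::max(1, w / num_threads);
    const int nn_w = (w + wp - 1) / wp;
    for (int ii = 0; ii < nn_w; ii++)
    {
        const int i = ii * wp;                  // first item of this slice
        const int count = std::min(w - i, wp);  // last slice may be short
        // dequantize count * elempack values starting at i * elempack
    }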