Skip to content

Commit

Permalink
PReLU layer uses the SSE instruction _mm_load_ps, but the data can be misaligned, so it must use _mm_loadu_ps (#5149)
Browse files Browse the repository at this point in the history
  • Loading branch information
AlOa authored Nov 15, 2023
1 parent 465debe commit 9f26eeb
Showing 1 changed file with 3 additions and 3 deletions.
6 changes: 3 additions & 3 deletions src/layer/x86/prelu_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ int PReLU_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int i = remain_size_start + ii * 4;
__m128 _p128 = _mm_load_ps(ptr + i);
__m128 _slope128 = _mm_load_ps(slope + i);
__m128 _slope128 = _mm_loadu_ps(slope + i);
_mm_store_ps(ptr + i, prelu_sse(_p128, _slope128));
}
remain_size_start += nn_size * 4;
Expand Down Expand Up @@ -157,7 +157,7 @@ int PReLU_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const

float slope = num_slope > 1 ? slope_data[i] : slope_data[0];
#if __SSE2__
__m128 _slope128 = num_slope > 1 && (elempack == 4) ? _mm_load_ps((const float*)slope_data + i * 4) : _mm_set1_ps(slope);
__m128 _slope128 = num_slope > 1 && (elempack == 4) ? _mm_loadu_ps((const float*)slope_data + i * 4) : _mm_set1_ps(slope);
#if __AVX__
__m256 _slope256 = num_slope > 1 && (elempack == 8) ? _mm256_loadu_ps((const float*)slope_data + i * 8) : _mm256_insertf128_ps(_mm256_castps128_ps256(_slope128), _slope128, 1);
#if __AVX512F__
Expand Down Expand Up @@ -205,7 +205,7 @@ int PReLU_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const

float slope = num_slope > 1 ? slope_data[q] : slope_data[0];
#if __SSE2__
__m128 _slope128 = num_slope > 1 && (elempack == 4) ? _mm_load_ps((const float*)slope_data + q * 4) : _mm_set1_ps(slope);
__m128 _slope128 = num_slope > 1 && (elempack == 4) ? _mm_loadu_ps((const float*)slope_data + q * 4) : _mm_set1_ps(slope);
#if __AVX__
__m256 _slope256 = num_slope > 1 && (elempack == 8) ? _mm256_loadu_ps((const float*)slope_data + q * 8) : _mm256_insertf128_ps(_mm256_castps128_ps256(_slope128), _slope128, 1);
#if __AVX512F__
Expand Down

0 comments on commit 9f26eeb

Please sign in to comment.