diff --git a/src/layer/arm/dequantize_arm.cpp b/src/layer/arm/dequantize_arm.cpp
index 77992eb52e0..bf2bd17adfc 100644
--- a/src/layer/arm/dequantize_arm.cpp
+++ b/src/layer/arm/dequantize_arm.cpp
@@ -283,13 +283,14 @@ int Dequantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const O
     const int h = bottom_blob.h;
     const int channels = bottom_blob.c;
     const int elempack = bottom_blob.elempack;
-
-    top_blob.create_like(bottom_blob, opt.blob_allocator);
-    if (top_blob.empty())
-        return -100;
+    const size_t out_elemsize = elempack * 2u;
 
     if (dims == 1)
     {
+        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
         const int wp = std::max(1, w / opt.num_threads);
         const int nn_w = (w + wp - 1) / wp;
 
@@ -312,6 +313,10 @@ int Dequantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const O
 
     if (dims == 2)
     {
+        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
         #pragma omp parallel for num_threads(opt.num_threads)
         for (int i = 0; i < h; i++)
         {
@@ -327,6 +332,10 @@ int Dequantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const O
 
     if (dims == 3)
     {
+        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
         #pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < channels; q++)
         {
diff --git a/src/layer/arm/dequantize_arm_asimdhp.cpp b/src/layer/arm/dequantize_arm_asimdhp.cpp
index cfd6ce36b25..18404104c42 100644
--- a/src/layer/arm/dequantize_arm_asimdhp.cpp
+++ b/src/layer/arm/dequantize_arm_asimdhp.cpp
@@ -129,13 +129,14 @@ int Dequantize_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const O
     const int h = bottom_blob.h;
     const int channels = bottom_blob.c;
     const int elempack = bottom_blob.elempack;
-
-    top_blob.create_like(bottom_blob, opt.blob_allocator);
-    if (top_blob.empty())
-        return -100;
+    const size_t out_elemsize = elempack * 2u;
 
     if (dims == 1)
     {
+        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
         const int wp = std::max(1, w / opt.num_threads);
         const int nn_w = (w + wp - 1) / wp;
 
@@ -158,6 +159,10 @@ int Dequantize_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const O
 
     if (dims == 2)
     {
+        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
         #pragma omp parallel for num_threads(opt.num_threads)
         for (int i = 0; i < h; i++)
         {
@@ -173,6 +178,10 @@ int Dequantize_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const O
 
     if (dims == 3)
     {
+        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
         #pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < channels; q++)
         {