Skip to content
This repository has been archived by the owner on Dec 1, 2021. It is now read-only.

Commit

Permalink
Further optimize the DLK runtime (#390)
Browse files Browse the repository at this point in the history
* Optimize matmul for arm neon

* Optimize quantizer

* Optimize pack_input

* Optimize ApplyThresholdsAndPack for arm neon

* Use float instead of double

* Optimize extract_image_patches

* Optimize tensor convert

* Optimize memcpy

* Optimize lookup

* Optimize matmul

* Copy thresholds in Network::init()

* Fuse conv2d, apply_thresholds, pack_16bit, convert_tensor

* Remove unused sources

* Fix includes

* Remove unused includes

* Optimize and refactor quantized_conv2d_tiling

* Fuse conv2d, apply_thresholds, pack_16bit, convert_tensor for x86 avx

* Remove unused sources

* Remove unused include

* Fix options

* Optimize matmul for x86 avx

* Optimize quantized conv2d tiling for x86 avx

* Remove unused dependency

* Fix type

* Make quantizer.cpp

* Refactor assert

* Fix a compiler warning

* Put B_buf on heap area

* Fix for x86 avx

* Fix CMakeLists and remove unused files

* Remove unused include

* Add test for Arm w/o FPGA and x86 with AVX

* Fix index

* Optimize packing

* Fix pack_input_to_qwords

* Optimize de-interleave

* Remove unused lines

* Remove comment
  • Loading branch information
primenumber authored and tkng committed Sep 10, 2019
1 parent 4801a2f commit 3937a64
Show file tree
Hide file tree
Showing 28 changed files with 1,770 additions and 1,912 deletions.
3 changes: 3 additions & 0 deletions dlk/python/dlk/core/view.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,11 +130,13 @@ def run(self):

if op.has_thresholds:
threshold = f'{op.name}_thresholds'
thresholds_addr = f'THRESHOLD_ADDR + {op.name}_thresholds_offset'
conv_func = 'func_QuantizedConv2DWithThreshold'
nbit_aqtz = self.op.a_quantizer[0].nbit
max_value = self.op.a_quantizer[0].max_v
else:
threshold = 'nullptr'
thresholds_addr = '0'
conv_func = 'func_QuantizedConv2D'
nbit_aqtz = 2
max_value = 2.0
Expand Down Expand Up @@ -169,6 +171,7 @@ def run(self):
binConv2D_struct.debug_name = "{op.name}";
#ifdef RUN_ON_FPGA
binConv2D_struct.device_kernel_phys_addr = KERNEL_ADDR + {op.name}_kernel_offset;
binConv2D_struct.device_thresholds_phys_addr = {thresholds_addr};
#endif
{conv_func}({inputs_string}, {op.name}, scaling_factors::{op.name}, binConv2D_struct);
Expand Down
22 changes: 14 additions & 8 deletions dlk/python/dlk/templates/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,17 +32,20 @@ file(GLOB SRC_LIB_ALL "src/inputs/*.cpp")
list(APPEND SRC_LIB_ALL
src/func/average_pool.cpp
src/func/conv2d.cpp
src/func/lookup.cpp
src/func/max_pool.cpp
src/func/pad.cpp
src/func/matmul.cpp
src/func/quantize.cpp
src/func/softmax.cpp
src/func/unpooling.cpp
src/matrix/shift_add.cpp
src/matrix/multiplication.cpp
src/network_c_interface.cpp
src/network.cpp
src/pack_input_to_qwords.cpp
src/time_measurement.cpp
src/quantizer.cpp
)

if(EXISTS ${CMAKE_SOURCE_DIR}/src/scaling_factors.cpp)
Expand All @@ -54,20 +57,17 @@ if(EXISTS ${CMAKE_SOURCE_DIR}/src/thresholds.cpp)
endif()

if(RUN_ON_FPGA)
list(APPEND SRC_LIB_ALL src/pack2b_neonv7.S)
list(APPEND SRC_LIB_ALL src/func/arm_neon/batch_normalization.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/fpga/quantized_conv2d_kn2row.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/arm_neon/pop_count.cpp)
list(APPEND SRC_LIB_ALL src/matrix/arm_neon/quantized_multiplication.cpp)
elseif(USE_NEON)
list(APPEND SRC_LIB_ALL src/pack2b_neonv7.S)
list(APPEND SRC_LIB_ALL src/func/arm_neon/batch_normalization.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/arm_neon/quantized_conv2d_tiling.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/generic/quantized_conv2d_kn2row.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/arm_neon/pop_count.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/arm_neon/pack_16bit.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/arm_neon/apply_thresholds.cpp)
list(APPEND SRC_LIB_ALL src/matrix/arm_neon/quantized_multiplication.cpp)
elseif(USE_AVX)
list(APPEND SRC_LIB_ALL src/func/generic/batch_normalization.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/x86_avx/quantized_conv2d_tiling.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/generic/pop_count.cpp)
else()
list(APPEND SRC_LIB_ALL src/func/generic/batch_normalization.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/generic/quantized_conv2d_kn2row.cpp)
Expand All @@ -90,14 +90,20 @@ macro(add_dlk_target_compile_properties target)
target_include_directories(${target} PUBLIC include)
if(USE_NEON)
target_compile_definitions(${target} PUBLIC -DUSE_NEON)
target_compile_options(${target} PUBLIC -mcpu=cortex-a9 -mfpu=neon -mthumb -fopenmp)
target_compile_options(${target} PUBLIC -fopenmp)
target_link_libraries(${target} PUBLIC -fopenmp)
endif()
if(USE_AVX)
target_compile_definitions(${target} PUBLIC -DUSE_AVX)
target_compile_options(${target} PUBLIC -mavx2 -mfma -fopenmp)
target_link_libraries(${target} PUBLIC -fopenmp)
endif()
if(RUN_ON_FPGA)
target_compile_definitions(${target} PUBLIC -DRUN_ON_FPGA)
endif()
if(AARCH32)
target_compile_definitions(${target} PUBLIC -DAARCH32)
target_compile_options(${target} PUBLIC -mcpu=cortex-a9 -mfpu=neon -mthumb)
endif()
endmacro()

Expand Down
27 changes: 9 additions & 18 deletions dlk/python/dlk/templates/Makefile.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -25,41 +25,35 @@ LIB_SRC := $(wildcard $(INPUTS_SRC_DIR)/*.cpp) \
$(SRC_DIR)/func/unpooling.cpp \
$(SRC_DIR)/func/lookup.cpp \
$(SRC_DIR)/matrix/shift_add.cpp \
$(SRC_DIR)/matrix/multiplication.cpp \
$(SRC_DIR)/network_c_interface.cpp \
$(SRC_DIR)/network.cpp \
$(SRC_DIR)/pack_input_to_qwords.cpp \
$(SRC_DIR)/time_measurement.cpp \
$(SRC_DIR)/write_to_file.cpp
$(SRC_DIR)/write_to_file.cpp \
$(SRC_DIR)/quantizer.cpp

SRC := $(LIB_SRC) $(wildcard $(DLK_TEST_SRC_DIR)/*.cpp) mains/main.cpp
SRC := $(filter-out ./src/network_c_interface.cpp, $(SRC))

LIB_ARM_SRC := $(wildcard $(SRC_DIR)/*.S) \
$(SRC_DIR)/func/arm_neon/batch_normalization.cpp \
$(SRC_DIR)/func/impl/arm_neon/quantized_conv2d_tiling.cpp \
$(SRC_DIR)/func/impl/generic/quantized_conv2d_kn2row.cpp \
$(SRC_DIR)/func/impl/arm_neon/pop_count.cpp \
$(SRC_DIR)/func/impl/arm_neon/pack_16bit.cpp \
$(SRC_DIR)/func/impl/arm_neon/apply_thresholds.cpp \
$(SRC_DIR)/matrix/arm_neon/quantized_multiplication.cpp
$(SRC_DIR)/func/impl/arm_neon/pop_count.cpp
LIB_ARM_OBJ := $(patsubst %.S, %.o, $(LIB_ARM_SRC))
LIB_ARM_OBJ := $(patsubst %.cpp, %.o, $(LIB_ARM_OBJ))

LIB_FPGA_SRC := $(wildcard $(SRC_DIR)/*.S) \
$(SRC_DIR)/func/arm_neon/batch_normalization.cpp \
$(SRC_DIR)/func/impl/fpga/quantized_conv2d_kn2row.cpp \
$(SRC_DIR)/func/impl/arm_neon/pop_count.cpp \
$(SRC_DIR)/matrix/arm_neon/quantized_multiplication.cpp
$(SRC_DIR)/func/impl/arm_neon/pop_count.cpp
LIB_FPGA_OBJ := $(patsubst %.S, %.o, $(LIB_FPGA_SRC))
LIB_FPGA_OBJ := $(patsubst %.cpp, %.o, $(LIB_FPGA_OBJ))

LIB_AARCH64_SRC := \
$(SRC_DIR)/func/arm_neon/batch_normalization.cpp \
$(SRC_DIR)/func/impl/arm_neon/quantized_conv2d_tiling.cpp \
$(SRC_DIR)/matrix/arm_neon/quantized_multiplication.cpp \
$(SRC_DIR)/func/impl/arm_neon/pop_count.cpp \
$(SRC_DIR)/func/impl/arm_neon/apply_thresholds.cpp \
$(SRC_DIR)/func/impl/arm_neon/pack_16bit.cpp
$(SRC_DIR)/func/impl/arm_neon/pop_count.cpp
LIB_AARCH64_OBJ := $(patsubst %.S, %.o, $(LIB_AARCH64_SRC))
LIB_AARCH64_OBJ := $(patsubst %.cpp, %.o, $(LIB_AARCH64_OBJ))

Expand All @@ -75,10 +69,7 @@ LIB_X86_OBJ := $(patsubst %.cpp, %.o, $(LIB_X86_SRC))
LIB_X86_AVX_SRC := \
$(SRC_DIR)/func/generic/batch_normalization.cpp \
$(SRC_DIR)/func/impl/x86_avx/quantized_conv2d_tiling.cpp \
$(SRC_DIR)/matrix/generic/quantized_multiplication.cpp \
$(SRC_DIR)/func/impl/generic/pop_count.cpp \
$(SRC_DIR)/func/impl/x86_avx/apply_thresholds.cpp \
$(SRC_DIR)/func/impl/x86_avx/pack_16bit.cpp
$(SRC_DIR)/func/impl/generic/pop_count.cpp
LIB_X86_AVX_OBJ := $(patsubst %.cpp, %.o, $(LIB_X86_AVX_SRC))

LIB_OBJ := $(patsubst %.cpp, %.o, $(LIB_SRC))
Expand Down Expand Up @@ -145,7 +136,7 @@ lm_x86: FLAGS += $(INCLUDES) -O3 -std=c++14 -DUSE_PNG -pthread -g
lm_x86: CXXFLAGS +=

lm_x86_avx: CXX = g++
lm_x86_avx: FLAGS += $(INCLUDES) -O3 -std=c++14 -mavx2 -DUSE_AVX -DUSE_PNG -pthread -g -fopenmp
lm_x86_avx: FLAGS += $(INCLUDES) -O3 -std=c++14 -mavx2 -mfma -DUSE_AVX -DUSE_PNG -pthread -g -fopenmp
lm_x86_avx: CXXFLAGS +=

lm_aarch64: CXX = aarch64-linux-gnu-g++
Expand All @@ -165,7 +156,7 @@ lib_x86: FLAGS += $(INCLUDES) -O3 -std=c++14 -fPIC -fvisibility=hidden
lib_x86: CXXFLAGS +=

lib_x86_avx: CXX = g++
lib_x86_avx: FLAGS += $(INCLUDES) -O3 -std=c++14 -fPIC -fvisibility=hidden -DUSE_AVX -pthread -g -fopenmp
lib_x86_avx: FLAGS += $(INCLUDES) -O3 -std=c++14 -fPIC -fvisibility=hidden -DUSE_AVX -mavx2 -mfma -pthread -g -fopenmp
lib_x86_avx: CXXFLAGS +=

lib_aarch64: CXX = aarch64-linux-gnu-g++
Expand Down
16 changes: 3 additions & 13 deletions dlk/python/dlk/templates/include/de10_nano.h
Original file line number Diff line number Diff line change
Expand Up @@ -505,28 +505,18 @@ Parameters calcParameters(uint32_t inputHeight, uint32_t inputWidth, uint32_t in
}

void RunTCA(unsigned long input_addr, unsigned long output_addr, unsigned long kernel_addr,
BIN_CONV_OUTPUT th_data[], unsigned in_w, unsigned in_h, unsigned in_c, unsigned nbits_in_data,
unsigned long thresholds_addr, unsigned in_w, unsigned in_h, unsigned in_c, unsigned nbits_in_data,
unsigned out_w, unsigned out_h, unsigned out_c, unsigned k_w, unsigned k_h, unsigned pad, unsigned stride) {

const unsigned k_size = (k_h * k_w * in_c * out_c) / 32;
// MappedMem k_data_mem(KERNEL_ADDR, k_size, sizeof(T_UINT));
// k_data_mem.Write(k_data_packed, k_size);

unsigned use_threshold = (th_data != NULL) ? 1 : 0;

if (use_threshold == 1) {
const unsigned th_size = out_c * NUM_OF_A2W1_THRESHOLD;
MappedMem th_data_mem(THRESHOLD_ADDR, th_size, sizeof(BIN_CONV_OUTPUT));
th_data_mem.Write(th_data, th_size);
}
unsigned use_threshold = (thresholds_addr != 0) ? 1 : 0;

static volatile uint32_t* csr = nullptr;
if (csr == nullptr) {
csr = reinterpret_cast<uint32_t*>(mapPhysicalMemory(HPS_TO_FPGA_LW_BASE, 0xFF));
}
auto tileWidth = 32u;
auto tileHeight = 32u;
auto p = calcParameters(in_h, in_w, in_c, tileWidth, tileHeight, out_c, k_h, k_w, input_addr, kernel_addr, THRESHOLD_ADDR, output_addr, use_threshold == 1);
auto p = calcParameters(in_h, in_w, in_c, tileWidth, tileHeight, out_c, k_h, k_w, input_addr, kernel_addr, thresholds_addr, output_addr, use_threshold == 1);

csr[Csr::admaInputAddress] = p.admaInputAddress;
csr[Csr::admaInputHCount] = p.admaInputHCount;
Expand Down
81 changes: 46 additions & 35 deletions dlk/python/dlk/templates/include/func/extract_image_patches.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,46 +183,64 @@ inline void func_ExtractImagePatches(
T_UINT output_index = 0;

if (out_depth < kernel_size * kernel_size) {
int bit_shift = out_depth * QUANTIZED_PACKED::BitCount / (kernel_size * kernel_size);
const T_UINT kernel_area = kernel_size * kernel_size;
const T_UINT bit_shift = out_depth * QUANTIZED_PACKED::BitCount / kernel_area;
const QUANTIZED_PACKED::base_t mask((QUANTIZED_PACKED::base_t(1) << bit_shift) - 1);
const T_UINT lb_kernel_size = __builtin_ctz(kernel_size);
const T_UINT kernel_mask = (1 << lb_kernel_size) - 1;
#ifdef USE_NEON
const auto shift_ref = vcombine_s32(vdup_n_s32(0), vdup_n_s32(bit_shift));
const auto add = vdupq_n_s32(bit_shift * 2);
const auto mask_v = vdupq_n_u32(mask);
#else
const uint64_t mask64 = mask * 0x1'0000'0001ull;
std::fill(output.data(), output.data() + output.size(), QUANTIZED_PACKED(0));
#endif
const T_UINT blocks = kernel_area / out_depth;
#pragma omp parallel for
for(T_UINT wi = 0; wi < out_height; wi++)
for(T_UINT wj = 0; wj < out_width; wj++)
for(T_UINT ki = 0; ki < kernel_size; ki++)
for(T_UINT kj = 0; kj < kernel_size; kj++)
{
#ifdef USE_NEON
for(T_UINT k = 0; k < out_depth; ++k) {
auto tmp = vdupq_n_u32(0);
auto shift = shift_ref;
for(T_UINT i = 0; i < blocks; i += 2) {
T_UINT ki = (k * blocks + i) >> lb_kernel_size;
T_UINT kj = (k * blocks + i) & kernel_mask;
T_INT row = (wi * stride) + ki;
T_INT col = (wj * stride) + kj;
T_UINT ch = (ki * kernel_size + kj) * bit_shift;
T_UINT ch_high = ch / QUANTIZED_PACKED::BitCount;
T_UINT ch_low = ch % QUANTIZED_PACKED::BitCount;
#ifdef USE_NEON
const auto out_idx = ch_high * out_height * out_width * bits_per_input
+ wi * out_width * bits_per_input
+ wj * bits_per_input;
const auto in_idx = row * input_width * bits_per_input
+ col * bits_per_input;
const auto in = vld1_u32(reinterpret_cast<uint32_t*>(input.data() + in_idx));
const auto masked = vand_u32(vdup_n_u32(mask), in);
#ifdef AARCH32
const auto shifted = vshl_u32(masked, vdup_n_s32(ch_low));
#else
const auto shifted = vshl_n_u32(masked, ch_low);
#endif
const auto out_old = vld1_u32(reinterpret_cast<uint32_t*>(output.data() + out_idx));
const auto out_new = vorr_u32(out_old, shifted);
vst1_u32(reinterpret_cast<uint32_t*>(output.data() + out_idx), out_new);
const auto in = vld1q_u32(reinterpret_cast<uint32_t*>(input.data() + in_idx));
const auto masked = vandq_u32(mask_v, in);
const auto shifted = vshlq_u32(masked, shift);
shift += add;
tmp |= shifted;
}
const auto out = vorr_u32(vget_low_u32(tmp), vget_high_u32(tmp));
const auto out_idx = k * out_height * out_width * bits_per_input
+ wi * out_width * bits_per_input
+ wj * bits_per_input;
vst1_u32(reinterpret_cast<uint32_t*>(output.data() + out_idx), out);
}
#else
const auto out_idx = ch_high * out_height * out_width * bits_per_input
+ wi * out_width * bits_per_input
+ wj * bits_per_input;
for(T_UINT k = 0; k < out_depth; ++k) {
uint64_t out = 0;
for(T_UINT i = 0; i < blocks; ++i) {
T_UINT ki = (k * blocks + i) >> lb_kernel_size;
T_UINT kj = (k * blocks + i) & kernel_mask;
T_INT row = (wi * stride) + ki;
T_INT col = (wj * stride) + kj;
const auto in_idx = row * input_width * bits_per_input
+ col * bits_per_input;
const auto in = *reinterpret_cast<uint64_t*>(input.data() + in_idx);
*reinterpret_cast<uint64_t*>(output.data() + out_idx) |= (mask64 & in) << ch_low;
#endif
out |= (mask64 & in) << (i * bit_shift);
}
const auto out_idx = k * out_height * out_width * bits_per_input
+ wi * out_width * bits_per_input
+ wj * bits_per_input;
*reinterpret_cast<uint64_t*>(output.data() + out_idx) = out;
}
#endif
} else {
for(T_UINT ih = 0; ih < input_depth; ++ih)
for(T_UINT wi = 0; wi < out_height; wi++)
Expand All @@ -232,24 +250,17 @@ inline void func_ExtractImagePatches(
{
T_INT row = (wi * stride) + ki;
T_INT col = (wj * stride) + kj;
#ifdef USE_NEON
const auto ch_high = ih + (ki * kernel_size + kj) * input_depth;
const auto out_idx = ch_high * out_height * out_width * bits_per_input
+ wi * out_width * bits_per_input
+ wj * bits_per_input;
const auto in_idx = ih * input_height * input_width * bits_per_input
+ row * input_width * bits_per_input
+ col * bits_per_input;
#ifdef USE_NEON
const auto in = vld1_u32(reinterpret_cast<uint32_t*>(input.data() + in_idx));
vst1_u32(reinterpret_cast<uint32_t*>(output.data() + out_idx), in);
#else
const auto ch_high = ih + (ki * kernel_size + kj) * input_depth;
const auto out_idx = ch_high * out_height * out_width * bits_per_input
+ wi * out_width * bits_per_input
+ wj * bits_per_input;
const auto in_idx = ih * input_height * input_width * bits_per_input
+ row * input_width * bits_per_input
+ col * bits_per_input;
*reinterpret_cast<uint64_t*>(output.data() + out_idx) =
*reinterpret_cast<uint64_t*>(input.data() + in_idx);
#endif
Expand Down
17 changes: 16 additions & 1 deletion dlk/python/dlk/templates/include/func/quantized_conv2d.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,11 @@ limitations under the License.
#include "tensor_convert.h"
#include "operators.h"
#include "time_measurement.h"
#include "func/impl/apply_thresholds.h"
#include "func/impl/quantized_conv2d_tiling.h"
#include "func/impl/quantized_conv2d_kn2row.h"
#ifdef _OPENMP
#include <omp.h>
#endif

template <typename T, MemoryLayout layout>
void QuantizedConv2D(const TensorView<T, layout>& input,
Expand Down Expand Up @@ -178,7 +181,19 @@ void func_QuantizedConv2DWithThreshold(

Measurement::Start("Memcpy");

#ifdef _OPENMP
const int num_blocks = bytes / sizeof(QUANTIZED_PACKED);
const int num_threads = omp_get_max_threads();
const int chunk_size = (num_blocks + num_threads - 1) / num_threads;
#pragma omp parallel for
for (int i = 0; i < num_blocks; i += chunk_size) {
memcpy(output.data() + i,
(QUANTIZED_PACKED*)(p.device_output_buf) + i,
std::min(chunk_size, num_blocks - i) * sizeof(QUANTIZED_PACKED));
}
#else
memcpy(output.data(), (void*)p.device_output_buf, bytes);
#endif

Measurement::Stop();
}
Expand Down
Loading

0 comments on commit 3937a64

Please sign in to comment.