Skip to content
This repository has been archived by the owner on Dec 1, 2021. It is now read-only.

Commit

Permalink
Further optimize the DLK runtime (#390)
Browse files Browse the repository at this point in the history
* Optimize matmul for arm neon

* Optimize quantizer

* Optimize pack_input

* Optimize ApplyThresholdsAndPack for arm neon

* Use float instead of double

* Optimize extract_image_patches

* Optimize tensor convert

* Optimize memcpy

* Optimize lookup

* Optimize matmul

* Copy thresholds in Network::init()

* Fuse conv2d, apply_thresholds, pack_16bit, convert_tensor

* Remove unused sources

* Fix includes

* Remove unused includes

* Optimize and refactor quantized_conv2d_tiling

* Fuse conv2d, apply_thresholds, pack_16bit, convert_tensor for x86 avx

* Remove unused sources

* Remove unused include

* Fix options

* Optimize matmul for x86 avx

* Optimize quantized conv2d tiling for x86 avx

* Remove unused dependency

* Fix type

* Make quantizer.cpp

* Refactor assert

* Fix a compiler warning

* Put B_buf on heap area

* Fix for x86 avx

* Fix CMakeLists and remove unused files

* Remove unused include

* Add test for Arm w/o FPGA and x86 with AVX

* Fix index

* Optimize packing

* Fix pack_input_to_qwords

* Optimize de-interleave

* Remove unused lines

* Remove comment
  • Loading branch information
primenumber authored and tkng committed Sep 10, 2019
1 parent 4801a2f commit 3937a64
Show file tree
Hide file tree
Showing 28 changed files with 1,770 additions and 1,912 deletions.
3 changes: 3 additions & 0 deletions dlk/python/dlk/core/view.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,11 +130,13 @@ def run(self):

if op.has_thresholds:
threshold = f'{op.name}_thresholds'
thresholds_addr = f'THRESHOLD_ADDR + {op.name}_thresholds_offset'
conv_func = 'func_QuantizedConv2DWithThreshold'
nbit_aqtz = self.op.a_quantizer[0].nbit
max_value = self.op.a_quantizer[0].max_v
else:
threshold = 'nullptr'
thresholds_addr = '0'
conv_func = 'func_QuantizedConv2D'
nbit_aqtz = 2
max_value = 2.0
Expand Down Expand Up @@ -169,6 +171,7 @@ def run(self):
binConv2D_struct.debug_name = "{op.name}";
#ifdef RUN_ON_FPGA
binConv2D_struct.device_kernel_phys_addr = KERNEL_ADDR + {op.name}_kernel_offset;
binConv2D_struct.device_thresholds_phys_addr = {thresholds_addr};
#endif
{conv_func}({inputs_string}, {op.name}, scaling_factors::{op.name}, binConv2D_struct);
Expand Down
22 changes: 14 additions & 8 deletions dlk/python/dlk/templates/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,17 +32,20 @@ file(GLOB SRC_LIB_ALL "src/inputs/*.cpp")
list(APPEND SRC_LIB_ALL
src/func/average_pool.cpp
src/func/conv2d.cpp
src/func/lookup.cpp
src/func/max_pool.cpp
src/func/pad.cpp
src/func/matmul.cpp
src/func/quantize.cpp
src/func/softmax.cpp
src/func/unpooling.cpp
src/matrix/shift_add.cpp
src/matrix/multiplication.cpp
src/network_c_interface.cpp
src/network.cpp
src/pack_input_to_qwords.cpp
src/time_measurement.cpp
src/quantizer.cpp
)

if(EXISTS ${CMAKE_SOURCE_DIR}/src/scaling_factors.cpp)
Expand All @@ -54,20 +57,17 @@ if(EXISTS ${CMAKE_SOURCE_DIR}/src/thresholds.cpp)
endif()

if(RUN_ON_FPGA)
list(APPEND SRC_LIB_ALL src/pack2b_neonv7.S)
list(APPEND SRC_LIB_ALL src/func/arm_neon/batch_normalization.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/fpga/quantized_conv2d_kn2row.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/arm_neon/pop_count.cpp)
list(APPEND SRC_LIB_ALL src/matrix/arm_neon/quantized_multiplication.cpp)
elseif(USE_NEON)
list(APPEND SRC_LIB_ALL src/pack2b_neonv7.S)
list(APPEND SRC_LIB_ALL src/func/arm_neon/batch_normalization.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/arm_neon/quantized_conv2d_tiling.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/generic/quantized_conv2d_kn2row.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/arm_neon/pop_count.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/arm_neon/pack_16bit.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/arm_neon/apply_thresholds.cpp)
list(APPEND SRC_LIB_ALL src/matrix/arm_neon/quantized_multiplication.cpp)
elseif(USE_AVX)
list(APPEND SRC_LIB_ALL src/func/generic/batch_normalization.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/x86_avx/quantized_conv2d_tiling.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/generic/pop_count.cpp)
else()
list(APPEND SRC_LIB_ALL src/func/generic/batch_normalization.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/generic/quantized_conv2d_kn2row.cpp)
Expand All @@ -90,14 +90,20 @@ macro(add_dlk_target_compile_properties target)
target_include_directories(${target} PUBLIC include)
if(USE_NEON)
target_compile_definitions(${target} PUBLIC -DUSE_NEON)
target_compile_options(${target} PUBLIC -mcpu=cortex-a9 -mfpu=neon -mthumb -fopenmp)
target_compile_options(${target} PUBLIC -fopenmp)
target_link_libraries(${target} PUBLIC -fopenmp)
endif()
if(USE_AVX)
target_compile_definitions(${target} PUBLIC -DUSE_AVX)
target_compile_options(${target} PUBLIC -mavx2 -mfma -fopenmp)
target_link_libraries(${target} PUBLIC -fopenmp)
endif()
if(RUN_ON_FPGA)
target_compile_definitions(${target} PUBLIC -DRUN_ON_FPGA)
endif()
if(AARCH32)
target_compile_definitions(${target} PUBLIC -DAARCH32)
target_compile_options(${target} PUBLIC -mcpu=cortex-a9 -mfpu=neon -mthumb)
endif()
endmacro()

Expand Down
27 changes: 9 additions & 18 deletions dlk/python/dlk/templates/Makefile.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -25,41 +25,35 @@ LIB_SRC := $(wildcard $(INPUTS_SRC_DIR)/*.cpp) \
$(SRC_DIR)/func/unpooling.cpp \
$(SRC_DIR)/func/lookup.cpp \
$(SRC_DIR)/matrix/shift_add.cpp \
$(SRC_DIR)/matrix/multiplication.cpp \
$(SRC_DIR)/network_c_interface.cpp \
$(SRC_DIR)/network.cpp \
$(SRC_DIR)/pack_input_to_qwords.cpp \
$(SRC_DIR)/time_measurement.cpp \
$(SRC_DIR)/write_to_file.cpp
$(SRC_DIR)/write_to_file.cpp \
$(SRC_DIR)/quantizer.cpp

SRC := $(LIB_SRC) $(wildcard $(DLK_TEST_SRC_DIR)/*.cpp) mains/main.cpp
SRC := $(filter-out ./src/network_c_interface.cpp, $(SRC))

LIB_ARM_SRC := $(wildcard $(SRC_DIR)/*.S) \
$(SRC_DIR)/func/arm_neon/batch_normalization.cpp \
$(SRC_DIR)/func/impl/arm_neon/quantized_conv2d_tiling.cpp \
$(SRC_DIR)/func/impl/generic/quantized_conv2d_kn2row.cpp \
$(SRC_DIR)/func/impl/arm_neon/pop_count.cpp \
$(SRC_DIR)/func/impl/arm_neon/pack_16bit.cpp \
$(SRC_DIR)/func/impl/arm_neon/apply_thresholds.cpp \
$(SRC_DIR)/matrix/arm_neon/quantized_multiplication.cpp
$(SRC_DIR)/func/impl/arm_neon/pop_count.cpp
LIB_ARM_OBJ := $(patsubst %.S, %.o, $(LIB_ARM_SRC))
LIB_ARM_OBJ := $(patsubst %.cpp, %.o, $(LIB_ARM_OBJ))

LIB_FPGA_SRC := $(wildcard $(SRC_DIR)/*.S) \
$(SRC_DIR)/func/arm_neon/batch_normalization.cpp \
$(SRC_DIR)/func/impl/fpga/quantized_conv2d_kn2row.cpp \
$(SRC_DIR)/func/impl/arm_neon/pop_count.cpp \
$(SRC_DIR)/matrix/arm_neon/quantized_multiplication.cpp
$(SRC_DIR)/func/impl/arm_neon/pop_count.cpp
LIB_FPGA_OBJ := $(patsubst %.S, %.o, $(LIB_FPGA_SRC))
LIB_FPGA_OBJ := $(patsubst %.cpp, %.o, $(LIB_FPGA_OBJ))

LIB_AARCH64_SRC := \
$(SRC_DIR)/func/arm_neon/batch_normalization.cpp \
$(SRC_DIR)/func/impl/arm_neon/quantized_conv2d_tiling.cpp \
$(SRC_DIR)/matrix/arm_neon/quantized_multiplication.cpp \
$(SRC_DIR)/func/impl/arm_neon/pop_count.cpp \
$(SRC_DIR)/func/impl/arm_neon/apply_thresholds.cpp \
$(SRC_DIR)/func/impl/arm_neon/pack_16bit.cpp
$(SRC_DIR)/func/impl/arm_neon/pop_count.cpp
LIB_AARCH64_OBJ := $(patsubst %.S, %.o, $(LIB_AARCH64_SRC))
LIB_AARCH64_OBJ := $(patsubst %.cpp, %.o, $(LIB_AARCH64_OBJ))

Expand All @@ -75,10 +69,7 @@ LIB_X86_OBJ := $(patsubst %.cpp, %.o, $(LIB_X86_SRC))
LIB_X86_AVX_SRC := \
$(SRC_DIR)/func/generic/batch_normalization.cpp \
$(SRC_DIR)/func/impl/x86_avx/quantized_conv2d_tiling.cpp \
$(SRC_DIR)/matrix/generic/quantized_multiplication.cpp \
$(SRC_DIR)/func/impl/generic/pop_count.cpp \
$(SRC_DIR)/func/impl/x86_avx/apply_thresholds.cpp \
$(SRC_DIR)/func/impl/x86_avx/pack_16bit.cpp
$(SRC_DIR)/func/impl/generic/pop_count.cpp
LIB_X86_AVX_OBJ := $(patsubst %.cpp, %.o, $(LIB_X86_AVX_SRC))

LIB_OBJ := $(patsubst %.cpp, %.o, $(LIB_SRC))
Expand Down Expand Up @@ -145,7 +136,7 @@ lm_x86: FLAGS += $(INCLUDES) -O3 -std=c++14 -DUSE_PNG -pthread -g
lm_x86: CXXFLAGS +=

lm_x86_avx: CXX = g++
lm_x86_avx: FLAGS += $(INCLUDES) -O3 -std=c++14 -mavx2 -DUSE_AVX -DUSE_PNG -pthread -g -fopenmp
lm_x86_avx: FLAGS += $(INCLUDES) -O3 -std=c++14 -mavx2 -mfma -DUSE_AVX -DUSE_PNG -pthread -g -fopenmp
lm_x86_avx: CXXFLAGS +=

lm_aarch64: CXX = aarch64-linux-gnu-g++
Expand All @@ -165,7 +156,7 @@ lib_x86: FLAGS += $(INCLUDES) -O3 -std=c++14 -fPIC -fvisibility=hidden
lib_x86: CXXFLAGS +=

lib_x86_avx: CXX = g++
lib_x86_avx: FLAGS += $(INCLUDES) -O3 -std=c++14 -fPIC -fvisibility=hidden -DUSE_AVX -pthread -g -fopenmp
lib_x86_avx: FLAGS += $(INCLUDES) -O3 -std=c++14 -fPIC -fvisibility=hidden -DUSE_AVX -mavx2 -mfma -pthread -g -fopenmp
lib_x86_avx: CXXFLAGS +=

lib_aarch64: CXX = aarch64-linux-gnu-g++
Expand Down
16 changes: 3 additions & 13 deletions dlk/python/dlk/templates/include/de10_nano.h
Original file line number Diff line number Diff line change
Expand Up @@ -505,28 +505,18 @@ Parameters calcParameters(uint32_t inputHeight, uint32_t inputWidth, uint32_t in
}

void RunTCA(unsigned long input_addr, unsigned long output_addr, unsigned long kernel_addr,
BIN_CONV_OUTPUT th_data[], unsigned in_w, unsigned in_h, unsigned in_c, unsigned nbits_in_data,
unsigned long thresholds_addr, unsigned in_w, unsigned in_h, unsigned in_c, unsigned nbits_in_data,
unsigned out_w, unsigned out_h, unsigned out_c, unsigned k_w, unsigned k_h, unsigned pad, unsigned stride) {

const unsigned k_size = (k_h * k_w * in_c * out_c) / 32;
// MappedMem k_data_mem(KERNEL_ADDR, k_size, sizeof(T_UINT));
// k_data_mem.Write(k_data_packed, k_size);

unsigned use_threshold = (th_data != NULL) ? 1 : 0;

if (use_threshold == 1) {
const unsigned th_size = out_c * NUM_OF_A2W1_THRESHOLD;
MappedMem th_data_mem(THRESHOLD_ADDR, th_size, sizeof(BIN_CONV_OUTPUT));
th_data_mem.Write(th_data, th_size);
}
unsigned use_threshold = (thresholds_addr != 0) ? 1 : 0;

static volatile uint32_t* csr = nullptr;
if (csr == nullptr) {
csr = reinterpret_cast<uint32_t*>(mapPhysicalMemory(HPS_TO_FPGA_LW_BASE, 0xFF));
}
auto tileWidth = 32u;
auto tileHeight = 32u;
auto p = calcParameters(in_h, in_w, in_c, tileWidth, tileHeight, out_c, k_h, k_w, input_addr, kernel_addr, THRESHOLD_ADDR, output_addr, use_threshold == 1);
auto p = calcParameters(in_h, in_w, in_c, tileWidth, tileHeight, out_c, k_h, k_w, input_addr, kernel_addr, thresholds_addr, output_addr, use_threshold == 1);

csr[Csr::admaInputAddress] = p.admaInputAddress;
csr[Csr::admaInputHCount] = p.admaInputHCount;
Expand Down
81 changes: 46 additions & 35 deletions dlk/python/dlk/templates/include/func/extract_image_patches.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,46 +183,64 @@ inline void func_ExtractImagePatches(
T_UINT output_index = 0;

if (out_depth < kernel_size * kernel_size) {
int bit_shift = out_depth * QUANTIZED_PACKED::BitCount / (kernel_size * kernel_size);
const T_UINT kernel_area = kernel_size * kernel_size;
const T_UINT bit_shift = out_depth * QUANTIZED_PACKED::BitCount / kernel_area;
const QUANTIZED_PACKED::base_t mask((QUANTIZED_PACKED::base_t(1) << bit_shift) - 1);
const T_UINT lb_kernel_size = __builtin_ctz(kernel_size);
const T_UINT kernel_mask = (1 << lb_kernel_size) - 1;
#ifdef USE_NEON
const auto shift_ref = vcombine_s32(vdup_n_s32(0), vdup_n_s32(bit_shift));
const auto add = vdupq_n_s32(bit_shift * 2);
const auto mask_v = vdupq_n_u32(mask);
#else
const uint64_t mask64 = mask * 0x1'0000'0001ull;
std::fill(output.data(), output.data() + output.size(), QUANTIZED_PACKED(0));
#endif
const T_UINT blocks = kernel_area / out_depth;
#pragma omp parallel for
for(T_UINT wi = 0; wi < out_height; wi++)
for(T_UINT wj = 0; wj < out_width; wj++)
for(T_UINT ki = 0; ki < kernel_size; ki++)
for(T_UINT kj = 0; kj < kernel_size; kj++)
{
#ifdef USE_NEON
for(T_UINT k = 0; k < out_depth; ++k) {
auto tmp = vdupq_n_u32(0);
auto shift = shift_ref;
for(T_UINT i = 0; i < blocks; i += 2) {
T_UINT ki = (k * blocks + i) >> lb_kernel_size;
T_UINT kj = (k * blocks + i) & kernel_mask;
T_INT row = (wi * stride) + ki;
T_INT col = (wj * stride) + kj;
T_UINT ch = (ki * kernel_size + kj) * bit_shift;
T_UINT ch_high = ch / QUANTIZED_PACKED::BitCount;
T_UINT ch_low = ch % QUANTIZED_PACKED::BitCount;
#ifdef USE_NEON
const auto out_idx = ch_high * out_height * out_width * bits_per_input
+ wi * out_width * bits_per_input
+ wj * bits_per_input;
const auto in_idx = row * input_width * bits_per_input
+ col * bits_per_input;
const auto in = vld1_u32(reinterpret_cast<uint32_t*>(input.data() + in_idx));
const auto masked = vand_u32(vdup_n_u32(mask), in);
#ifdef AARCH32
const auto shifted = vshl_u32(masked, vdup_n_s32(ch_low));
#else
const auto shifted = vshl_n_u32(masked, ch_low);
#endif
const auto out_old = vld1_u32(reinterpret_cast<uint32_t*>(output.data() + out_idx));
const auto out_new = vorr_u32(out_old, shifted);
vst1_u32(reinterpret_cast<uint32_t*>(output.data() + out_idx), out_new);
const auto in = vld1q_u32(reinterpret_cast<uint32_t*>(input.data() + in_idx));
const auto masked = vandq_u32(mask_v, in);
const auto shifted = vshlq_u32(masked, shift);
shift += add;
tmp |= shifted;
}
const auto out = vorr_u32(vget_low_u32(tmp), vget_high_u32(tmp));
const auto out_idx = k * out_height * out_width * bits_per_input
+ wi * out_width * bits_per_input
+ wj * bits_per_input;
vst1_u32(reinterpret_cast<uint32_t*>(output.data() + out_idx), out);
}
#else
const auto out_idx = ch_high * out_height * out_width * bits_per_input
+ wi * out_width * bits_per_input
+ wj * bits_per_input;
for(T_UINT k = 0; k < out_depth; ++k) {
uint64_t out = 0;
for(T_UINT i = 0; i < blocks; ++i) {
T_UINT ki = (k * blocks + i) >> lb_kernel_size;
T_UINT kj = (k * blocks + i) & kernel_mask;
T_INT row = (wi * stride) + ki;
T_INT col = (wj * stride) + kj;
const auto in_idx = row * input_width * bits_per_input
+ col * bits_per_input;
const auto in = *reinterpret_cast<uint64_t*>(input.data() + in_idx);
*reinterpret_cast<uint64_t*>(output.data() + out_idx) |= (mask64 & in) << ch_low;
#endif
out |= (mask64 & in) << (i * bit_shift);
}
const auto out_idx = k * out_height * out_width * bits_per_input
+ wi * out_width * bits_per_input
+ wj * bits_per_input;
*reinterpret_cast<uint64_t*>(output.data() + out_idx) = out;
}
#endif
} else {
for(T_UINT ih = 0; ih < input_depth; ++ih)
for(T_UINT wi = 0; wi < out_height; wi++)
Expand All @@ -232,24 +250,17 @@ inline void func_ExtractImagePatches(
{
T_INT row = (wi * stride) + ki;
T_INT col = (wj * stride) + kj;
#ifdef USE_NEON
const auto ch_high = ih + (ki * kernel_size + kj) * input_depth;
const auto out_idx = ch_high * out_height * out_width * bits_per_input
+ wi * out_width * bits_per_input
+ wj * bits_per_input;
const auto in_idx = ih * input_height * input_width * bits_per_input
+ row * input_width * bits_per_input
+ col * bits_per_input;
#ifdef USE_NEON
const auto in = vld1_u32(reinterpret_cast<uint32_t*>(input.data() + in_idx));
vst1_u32(reinterpret_cast<uint32_t*>(output.data() + out_idx), in);
#else
const auto ch_high = ih + (ki * kernel_size + kj) * input_depth;
const auto out_idx = ch_high * out_height * out_width * bits_per_input
+ wi * out_width * bits_per_input
+ wj * bits_per_input;
const auto in_idx = ih * input_height * input_width * bits_per_input
+ row * input_width * bits_per_input
+ col * bits_per_input;
*reinterpret_cast<uint64_t*>(output.data() + out_idx) =
*reinterpret_cast<uint64_t*>(input.data() + in_idx);
#endif
Expand Down
17 changes: 16 additions & 1 deletion dlk/python/dlk/templates/include/func/quantized_conv2d.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,11 @@ limitations under the License.
#include "tensor_convert.h"
#include "operators.h"
#include "time_measurement.h"
#include "func/impl/apply_thresholds.h"
#include "func/impl/quantized_conv2d_tiling.h"
#include "func/impl/quantized_conv2d_kn2row.h"
#ifdef _OPENMP
#include <omp.h>
#endif

template <typename T, MemoryLayout layout>
void QuantizedConv2D(const TensorView<T, layout>& input,
Expand Down Expand Up @@ -178,7 +181,19 @@ void func_QuantizedConv2DWithThreshold(

Measurement::Start("Memcpy");

#ifdef _OPENMP
const int num_blocks = bytes / sizeof(QUANTIZED_PACKED);
const int num_threads = omp_get_max_threads();
const int chunk_size = (num_blocks + num_threads - 1) / num_threads;
#pragma omp parallel for
for (int i = 0; i < num_blocks; i += chunk_size) {
memcpy(output.data() + i,
(QUANTIZED_PACKED*)(p.device_output_buf) + i,
std::min(chunk_size, num_blocks - i) * sizeof(QUANTIZED_PACKED));
}
#else
memcpy(output.data(), (void*)p.device_output_buf, bytes);
#endif

Measurement::Stop();
}
Expand Down
Loading

0 comments on commit 3937a64

Please sign in to comment.