
Commit 5329d32
check vulkan fp16 uniform support and implement lfp conversion without fp16u (#5287)

nihui authored Jan 17, 2024
1 parent 656b082 commit 5329d32
Showing 8 changed files with 87 additions and 33 deletions.
8 changes: 4 additions & 4 deletions docs/developer-guide/glsl-extension.md
@@ -170,10 +170,10 @@ declare variable in shared local memory
shared lfp tmp_a[8][4][2];
```

-|local type|fp32|fp16p / fp16s|fp16s + fp16a|
-|---|---|---|---|
-|lfp|float|float|float16_t|
-|lfpvec4|vec4|uvec2|f16vec4|
+|local type|fp32|fp16p / fp16s only|fp16s+fp16a|fp16s+fp16u|
+|---|---|---|---|---|
+|lfp|float|float|float|float16_t|
+|lfpvec4|vec4|uvec2|uint64_t|f16vec4|

## image format and precision hint type

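The new column matters because `lfpvec4` must still fit one shared-memory slot when fp16 storage and arithmetic are available but 16-bit uniform/storage access is not. A hedged C++ analogy (the struct names are illustrative stand-ins, not ncnn types): every non-fp32 `lfpvec4` spelling is the same 64-bit container for four half floats, so switching between them only changes how the bits are declared.

```cpp
#include <cstdint>

struct uvec2_like { uint32_t x, y; };   // stand-in for GLSL uvec2 (two packed half pairs)
struct f16vec4_like { uint16_t h[4]; }; // stand-in for GLSL f16vec4 (four native halves)

static_assert(sizeof(uvec2_like) == 8, "uvec2: 2 x 32 bits");
static_assert(sizeof(uint64_t) == 8, "uint64_t: 64 bits in one scalar");
static_assert(sizeof(f16vec4_like) == 8, "f16vec4: 4 x 16 bits");
```
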
8 changes: 4 additions & 4 deletions docs/developer-guide/glsl-extension.zh.md
@@ -170,10 +170,10 @@ void main()
shared lfp tmp_a[8][4][2];
```

-|local type|fp32|fp16p / fp16s|fp16s + fp16a|
-|---|---|---|---|
-|lfp|float|float|float16_t|
-|lfpvec4|vec4|uvec2|f16vec4|
+|local type|fp32|fp16p / fp16s only|fp16s+fp16a|fp16s+fp16u|
+|---|---|---|---|---|
+|lfp|float|float|float|float16_t|
+|lfpvec4|vec4|uvec2|uint64_t|f16vec4|

## image format type and precision hint type

71 changes: 48 additions & 23 deletions src/gpu.cpp
@@ -321,9 +321,11 @@ class GpuInfoPrivate
// fp16 and int8 feature
bool support_fp16_packed;
bool support_fp16_storage;
+bool support_fp16_uniform;
bool support_fp16_arithmetic;
bool support_int8_packed;
bool support_int8_storage;
+bool support_int8_uniform;
bool support_int8_arithmetic;

// ycbcr conversion feature
@@ -604,6 +606,11 @@ bool GpuInfo::support_fp16_storage() const
return d->support_fp16_storage;
}

+bool GpuInfo::support_fp16_uniform() const
+{
+return d->support_fp16_uniform;
+}

bool GpuInfo::support_fp16_arithmetic() const
{
return d->support_fp16_arithmetic;
@@ -619,6 +626,11 @@ bool GpuInfo::support_int8_storage() const
return d->support_int8_storage;
}

+bool GpuInfo::support_int8_uniform() const
+{
+return d->support_int8_uniform;
+}

bool GpuInfo::support_int8_arithmetic() const
{
return d->support_int8_arithmetic;
@@ -1763,9 +1775,11 @@ int create_gpu_instance(const char* driver_path)
// check features
gpu_info.support_fp16_packed = true;
gpu_info.support_fp16_storage = false;
+gpu_info.support_fp16_uniform = false;
gpu_info.support_fp16_arithmetic = false;
gpu_info.support_int8_packed = true;
gpu_info.support_int8_storage = false;
+gpu_info.support_int8_uniform = false;
gpu_info.support_int8_arithmetic = false;
gpu_info.support_ycbcr_conversion = false;
gpu_info.support_cooperative_matrix = false;
@@ -1843,30 +1857,18 @@ int create_gpu_instance(const char* driver_path)
if (gpu_info.support_VK_KHR_8bit_storage)
{
gpu_info.support_int8_storage = query8BitStorageFeatures.storageBuffer8BitAccess;
+gpu_info.support_int8_uniform = query8BitStorageFeatures.uniformAndStorageBuffer8BitAccess;
}
if (gpu_info.support_VK_KHR_16bit_storage && queryFeatures.features.shaderStorageImageExtendedFormats)
{
// shaderStorageImageExtendedFormats enables r16f format in storage image
gpu_info.support_fp16_storage = query16BitStorageFeatures.storageBuffer16BitAccess;
+gpu_info.support_fp16_uniform = query16BitStorageFeatures.uniformAndStorageBuffer16BitAccess;
}
if (gpu_info.support_VK_KHR_shader_float16_int8)
{
-if (gpu_info.support_fp16_storage)
-{
-gpu_info.support_fp16_arithmetic = queryFloat16Int8Features.shaderFloat16 && query16BitStorageFeatures.uniformAndStorageBuffer16BitAccess;
-}
-else
-{
-gpu_info.support_fp16_arithmetic = queryFloat16Int8Features.shaderFloat16;
-}
-if (gpu_info.support_int8_storage)
-{
-gpu_info.support_int8_arithmetic = queryFloat16Int8Features.shaderInt8 && query8BitStorageFeatures.uniformAndStorageBuffer8BitAccess;
-}
-else
-{
-gpu_info.support_int8_arithmetic = queryFloat16Int8Features.shaderInt8;
-}
+gpu_info.support_fp16_arithmetic = queryFloat16Int8Features.shaderFloat16;
+gpu_info.support_int8_arithmetic = queryFloat16Int8Features.shaderInt8;
}
if (gpu_info.support_VK_KHR_sampler_ycbcr_conversion)
{
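
The rewrite above decouples the feature bits: arithmetic capability now comes straight from `shaderFloat16`/`shaderInt8`, while uniform-buffer access is tracked separately in the new `support_*_uniform` flags. A minimal sketch of reading those bits on the host, assuming Vulkan 1.1 core (ncnn's real query chains more feature structs than shown here):

```cpp
#include <vulkan/vulkan.h>

// storageBuffer16BitAccess and uniformAndStorageBuffer16BitAccess are
// distinct capabilities; a device may support the former without the latter
void query_fp16_bits(VkPhysicalDevice physical_device, bool* fp16_storage, bool* fp16_uniform)
{
    VkPhysicalDevice16BitStorageFeatures features_16bit = {};
    features_16bit.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES;

    VkPhysicalDeviceFeatures2 features2 = {};
    features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
    features2.pNext = &features_16bit;

    // one call fills every struct in the pNext chain
    vkGetPhysicalDeviceFeatures2(physical_device, &features2);

    *fp16_storage = features_16bit.storageBuffer16BitAccess == VK_TRUE;
    *fp16_uniform = features_16bit.uniformAndStorageBuffer16BitAccess == VK_TRUE;
}
```
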
@@ -2018,9 +2020,9 @@ int create_gpu_instance(const char* driver_path)
NCNN_LOGE("[%u %s] bugsbn1=%d bugbilz=%d bugcopc=%d bugihfa=%d", i, physicalDeviceProperties.deviceName,
gpu_info.bug_storage_buffer_no_l1, gpu_info.bug_buffer_image_load_zero, gpu_info.bug_corrupted_online_pipeline_cache, gpu_info.bug_implicit_fp16_arithmetic);

NCNN_LOGE("[%u %s] fp16-p/s/a=%d/%d/%d int8-p/s/a=%d/%d/%d", i, physicalDeviceProperties.deviceName,
gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic,
gpu_info.support_int8_packed, gpu_info.support_int8_storage, gpu_info.support_int8_arithmetic);
NCNN_LOGE("[%u %s] fp16-p/s/u/a=%d/%d/%d/%d int8-p/s/u/a=%d/%d/%d/%d", i, physicalDeviceProperties.deviceName,
gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_uniform, gpu_info.support_fp16_arithmetic,
gpu_info.support_int8_packed, gpu_info.support_int8_storage, gpu_info.support_int8_uniform, gpu_info.support_int8_arithmetic);

NCNN_LOGE("[%u %s] subgroup=%u basic/vote/ballot/shuffle=%d/%d/%d/%d", i, physicalDeviceProperties.deviceName,
gpu_info.subgroup_size, gpu_info.support_subgroup_basic, gpu_info.support_subgroup_vote,
@@ -2470,7 +2472,7 @@ VulkanDevice::VulkanDevice(int device_index)
enabled8BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR;
enabled8BitStorageFeatures.pNext = 0;
enabled8BitStorageFeatures.storageBuffer8BitAccess = info.support_int8_storage();
-enabled8BitStorageFeatures.uniformAndStorageBuffer8BitAccess = info.support_int8_storage() && info.support_int8_arithmetic();
+enabled8BitStorageFeatures.uniformAndStorageBuffer8BitAccess = info.support_int8_uniform();
enabled8BitStorageFeatures.storagePushConstant8 = VK_FALSE;
if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_8bit_storage())
{
@@ -2483,7 +2485,7 @@ VulkanDevice::VulkanDevice(int device_index)
enabled16BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR;
enabled16BitStorageFeatures.pNext = 0;
enabled16BitStorageFeatures.storageBuffer16BitAccess = info.support_fp16_storage();
-enabled16BitStorageFeatures.uniformAndStorageBuffer16BitAccess = info.support_fp16_storage() && info.support_fp16_arithmetic();
+enabled16BitStorageFeatures.uniformAndStorageBuffer16BitAccess = info.support_fp16_uniform();
enabled16BitStorageFeatures.storagePushConstant16 = VK_FALSE;
enabled16BitStorageFeatures.storageInputOutput16 = VK_FALSE;
if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_16bit_storage())
@@ -3857,11 +3859,16 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
custom_defines.push_back(std::make_pair("afpmat4", "mat4"));
}

-if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
+if (opt.use_fp16_storage && opt.use_fp16_uniform && opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("lfp", "float16_t"));
custom_defines.push_back(std::make_pair("lfpvec4", "f16vec4"));
}
+else if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
+{
+custom_defines.push_back(std::make_pair("lfp", "float"));
+custom_defines.push_back(std::make_pair("lfpvec4", "uint64_t"));
+}
else if (opt.use_fp16_storage || opt.use_fp16_packed)
{
custom_defines.push_back(std::make_pair("lfp", "float"));
@@ -3873,14 +3880,22 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
custom_defines.push_back(std::make_pair("lfpvec4", "vec4"));
}

-if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
+if (opt.use_fp16_storage && opt.use_fp16_uniform && opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("sfp2lfp(v)", "v"));
custom_defines.push_back(std::make_pair("sfp2lfpvec4(v)", "v"));

custom_defines.push_back(std::make_pair("lfp2afp(v)", "v"));
custom_defines.push_back(std::make_pair("lfp2afpvec4(v)", "v"));
}
+else if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
+{
+custom_defines.push_back(std::make_pair("sfp2lfp(v)", "float(v)"));
+custom_defines.push_back(std::make_pair("sfp2lfpvec4(v)", "pack64(halfBitsToUInt16(v))"));
+
+custom_defines.push_back(std::make_pair("lfp2afp(v)", "float16_t(v)"));
+custom_defines.push_back(std::make_pair("lfp2afpvec4(v)", "int16BitsToHalf(unpack16(v))"));
+}
else if (opt.use_fp16_packed && opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("sfp2lfp(v)", "v"));
@@ -4208,6 +4223,11 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
custom_defines.push_back(std::make_pair("NCNN_fp16_packed", "1"));
}

+if (opt.use_fp16_uniform)
+{
+custom_defines.push_back(std::make_pair("NCNN_fp16_uniform", "1"));
+}

if (opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("NCNN_fp16_arithmetic", "1"));
@@ -4222,6 +4242,11 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
custom_defines.push_back(std::make_pair("NCNN_int8_packed", "1"));
}

+if (opt.use_int8_uniform)
+{
+custom_defines.push_back(std::make_pair("NCNN_int8_uniform", "1"));
+}

if (opt.use_int8_arithmetic)
{
custom_defines.push_back(std::make_pair("NCNN_int8_arithmetic", "1"));
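
The fallback macros added above are the heart of the commit: without fp16 uniform access, `lfp` widens each scalar to `float` (lossless for fp16 data), and `lfpvec4` carries all four halves through shared memory as a single `uint64_t` via `pack64(halfBitsToUInt16(v))`, reversed by `int16BitsToHalf(unpack16(v))`. A hedged host-side C++ model of that 64-bit round trip (function names here are illustrative, not ncnn code):

```cpp
#include <cstdint>
#include <cstdio>

// models pack64: four raw fp16 bit patterns into one 64-bit word
uint64_t pack_half4(const uint16_t h[4])
{
    return (uint64_t)h[0] | ((uint64_t)h[1] << 16) | ((uint64_t)h[2] << 32) | ((uint64_t)h[3] << 48);
}

// models unpack16: split the word back into four 16-bit patterns;
// the halves are only reinterpreted, never converted, so nothing is lost
void unpack_half4(uint64_t packed, uint16_t h[4])
{
    for (int i = 0; i < 4; i++)
        h[i] = (uint16_t)(packed >> (16 * i));
}

int main()
{
    const uint16_t in[4] = { 0x3c00, 0xc000, 0x3555, 0x0000 }; // fp16 bits of 1.0, -2.0, ~0.333, 0.0
    uint16_t out[4];
    unpack_half4(pack_half4(in), out);
    for (int i = 0; i < 4; i++)
        printf("%04x %s\n", (unsigned)out[i], in[i] == out[i] ? "ok" : "mismatch");
    return 0;
}
```
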
2 changes: 2 additions & 0 deletions src/gpu.h
@@ -260,9 +260,11 @@ class NCNN_EXPORT GpuInfo
// fp16 and int8 feature
bool support_fp16_packed() const;
bool support_fp16_storage() const;
+bool support_fp16_uniform() const;
bool support_fp16_arithmetic() const;
bool support_int8_packed() const;
bool support_int8_storage() const;
+bool support_int8_uniform() const;
bool support_int8_arithmetic() const;

// ycbcr conversion feature
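
A short usage sketch for the two new getters (hypothetical caller code; the instance helpers used are existing gpu.h entry points):

```cpp
#include "gpu.h"
#include <cstdio>

int main()
{
    ncnn::create_gpu_instance();

    const ncnn::GpuInfo& info = ncnn::get_gpu_info(ncnn::get_default_gpu_index());

    // uniform support is now reported independently of packed/storage/arithmetic
    fprintf(stderr, "fp16 uniform = %d  int8 uniform = %d\n",
            (int)info.support_fp16_uniform(), (int)info.support_int8_uniform());

    ncnn::destroy_gpu_instance();
    return 0;
}
```
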
12 changes: 12 additions & 0 deletions src/net.cpp
@@ -1347,8 +1347,11 @@ int Net::load_param(const DataReader& dr)
// sanitize use options
if (!d->vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
if (!d->vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;
+if (!d->vkdev->info.support_fp16_uniform()) opt.use_fp16_uniform = false;
if (!d->vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false;
if (!d->vkdev->info.support_int8_packed()) opt.use_int8_packed = false;
if (!d->vkdev->info.support_int8_storage()) opt.use_int8_storage = false;
+if (!d->vkdev->info.support_int8_uniform()) opt.use_int8_uniform = false;
if (!d->vkdev->info.support_int8_arithmetic()) opt.use_int8_arithmetic = false;
if (!d->vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false;

@@ -1359,6 +1362,9 @@ int Net::load_param(const DataReader& dr)

// fp16a makes no sense when fp16 storage disabled
if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false;

+// fp16 uniform makes no sense when fp16 arithmetic disabled
+if (!opt.use_fp16_arithmetic) opt.use_fp16_uniform = false;
}
else
{
@@ -1637,8 +1643,11 @@ int Net::load_param_bin(const DataReader& dr)
// sanitize use options
if (!d->vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
if (!d->vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;
+if (!d->vkdev->info.support_fp16_uniform()) opt.use_fp16_uniform = false;
if (!d->vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false;
if (!d->vkdev->info.support_int8_packed()) opt.use_int8_packed = false;
if (!d->vkdev->info.support_int8_storage()) opt.use_int8_storage = false;
+if (!d->vkdev->info.support_int8_uniform()) opt.use_int8_uniform = false;
if (!d->vkdev->info.support_int8_arithmetic()) opt.use_int8_arithmetic = false;
if (!d->vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false;

@@ -1649,6 +1658,9 @@ int Net::load_param_bin(const DataReader& dr)

// fp16a makes no sense when fp16 storage disabled
if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false;

+// fp16 uniform makes no sense when fp16 arithmetic disabled
+if (!opt.use_fp16_arithmetic) opt.use_fp16_uniform = false;
}
else
{
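
Because of this sanitize step, user code can request the new options unconditionally and let `load_param()` clamp them to what the device and the other options allow. A hedged usage sketch (paths are placeholders):

```cpp
#include "net.h"

int load(const char* parampath, const char* binpath)
{
    ncnn::Net net;
    net.opt.use_vulkan_compute = true;

    // request both; load_param() drops them on unsupported devices,
    // and use_fp16_uniform is also dropped whenever fp16 arithmetic is off
    net.opt.use_fp16_uniform = true;
    net.opt.use_int8_uniform = true;

    if (net.load_param(parampath))
        return -1;
    if (net.load_model(binpath))
        return -1;

    return 0;
}
```
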
3 changes: 3 additions & 0 deletions src/option.cpp
@@ -74,6 +74,9 @@ Option::Option()
use_winograd63_convolution = true;

use_a53_a55_optimized_kernel = is_current_thread_running_on_a53_a55();

+use_fp16_uniform = true;
+use_int8_uniform = true;
}

} // namespace ncnn
6 changes: 4 additions & 2 deletions src/option.h
@@ -144,8 +144,10 @@ class NCNN_EXPORT Option
// but you can force this on/off if you wish
bool use_a53_a55_optimized_kernel;

-bool use_reserved_7;
-bool use_reserved_8;
+// enable options for shared variables in gpu shader
+bool use_fp16_uniform;
+bool use_int8_uniform;

bool use_reserved_9;
bool use_reserved_10;
bool use_reserved_11;
10 changes: 10 additions & 0 deletions tests/testutil.cpp
@@ -684,7 +684,12 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n

if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;
+if (!vkdev->info.support_fp16_uniform()) opt.use_fp16_uniform = false;
if (!vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false;
+if (!vkdev->info.support_int8_packed()) opt.use_int8_packed = false;
+if (!vkdev->info.support_int8_storage()) opt.use_int8_storage = false;
+if (!vkdev->info.support_int8_uniform()) opt.use_int8_uniform = false;
+if (!vkdev->info.support_int8_arithmetic()) opt.use_int8_arithmetic = false;
if (!vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false;

// FIXME fp16a may produce large error
@@ -1179,7 +1184,12 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n

if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;
+if (!vkdev->info.support_fp16_uniform()) opt.use_fp16_uniform = false;
if (!vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false;
+if (!vkdev->info.support_int8_packed()) opt.use_int8_packed = false;
+if (!vkdev->info.support_int8_storage()) opt.use_int8_storage = false;
+if (!vkdev->info.support_int8_uniform()) opt.use_int8_uniform = false;
+if (!vkdev->info.support_int8_arithmetic()) opt.use_int8_arithmetic = false;
if (!vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false;

// FIXME fp16a may produce large error
