Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 140 additions & 3 deletions src/gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3575,6 +3575,134 @@ VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, siz
return shader_module;
}

// Rewrite a SPIR-V compute module so that 32-bit float operations default to the
// given fast-math mode.
//
// The rewritten module
//   - declares the FloatControls2 capability and the SPV_KHR_float_controls2
//     extension when the input does not already declare them,
//   - gains one OpConstant of the module's 32-bit unsigned int type holding
//     fast_math_flag (it takes the old id bound as its id, the bound grows by one),
//   - gains an FPFastMathDefault execution mode on the GLCompute entry point that
//     names the 32-bit float type and that constant.
//
// code            SPIR-V words
// size            size of code in BYTES
// dstcode         receives the rewritten module, previous content is discarded
// fast_math_flag  bitmask stored verbatim as the constant value
//
// The input is copied through unchanged when it is not SPIR-V, or when the part
// before the first OpFunction is malformed or lacks any of: OpMemoryModel,
// a GLCompute OpEntryPoint, the 32-bit float type, the 32-bit unsigned int type.
static void inject_fast_math(const uint32_t* code, size_t size, std::vector<uint32_t>& dstcode, uint32_t fast_math_flag)
{
    const size_t total_words = size / sizeof(uint32_t);

    // too short for the 5-word header, or wrong magic number
    if (size < 20 || code[0] != 0x07230203)
    {
        dstcode.assign(code, code + total_words);
        return;
    }

    static const char ext_name[] = "SPV_KHR_float_controls2";
    const size_t ext_word_count = (sizeof(ext_name) + 3) / 4;

    // analyze spv, only the declarations in front of the first function matter
    const uint32_t bound = code[3];
    uint32_t entry_point_id = 0;
    uint32_t float32_type_id = 0;
    uint32_t uint32_type_id = 0;
    bool has_float_controls2_capability = false;
    bool has_float_controls2_extension = false;

    // SPIR-V logical layout: all OpCapability, then OpExtension, then OpExtInstImport,
    // then OpMemoryModel, then all OpEntryPoint, then all execution modes.
    // New instructions have to be placed at the end of their own section.
    const uint32_t* capability_insert_ptr = nullptr; // first instruction that is not OpCapability
    const uint32_t* extension_insert_ptr = nullptr;  // first instruction that is neither OpCapability nor OpExtension
    const uint32_t* memory_model_ptr = nullptr;
    const uint32_t* last_entry_point_ptr = nullptr;
    const uint32_t* first_function_ptr = nullptr;

    const uint32_t* const begin = code + 5;
    const uint32_t* const end = code + total_words;

    const uint32_t* p = begin;
    while (p < end)
    {
        const uint32_t wordcount = p[0] >> 16;
        if (wordcount == 0 || wordcount > static_cast<size_t>(end - p)) break; // truncated or corrupt
        const uint32_t op = p[0] & 0xffff;

        if (!capability_insert_ptr && op != 17) capability_insert_ptr = p;
        if (!extension_insert_ptr && op != 17 && op != 10) extension_insert_ptr = p;

        // every operand read below is guarded by the instruction's own word count
        switch (op)
        {
        case 14: // OpMemoryModel
            if (!memory_model_ptr) memory_model_ptr = p;
            break;
        case 15: // OpEntryPoint
            last_entry_point_ptr = p;
            if (wordcount >= 3 && p[1] == 5 /* GLCompute */) entry_point_id = p[2];
            break;
        case 21: // OpTypeInt
            if (wordcount == 4 && p[2] == 32 && p[3] == 0) uint32_type_id = p[1];
            break;
        case 22: // OpTypeFloat
            if (wordcount == 3 && p[2] == 32) float32_type_id = p[1];
            break;
        case 54: // OpFunction
            first_function_ptr = p;
            break;
        case 17: // OpCapability
            if (wordcount >= 2 && p[1] == 6029 /* FloatControls2 */) has_float_controls2_capability = true;
            break;
        case 10: // OpExtension
            // compare including the terminating NUL, never past the instruction
            if (wordcount >= 1 + ext_word_count && memcmp(p + 1, ext_name, sizeof(ext_name)) == 0) has_float_controls2_extension = true;
            break;
        }

        // fin
        if (first_function_ptr) break;

        p += wordcount;
    }

    // cannot find key elements
    if (entry_point_id == 0 || float32_type_id == 0 || uint32_type_id == 0 || !memory_model_ptr || !first_function_ptr)
    {
        dstcode.assign(code, code + total_words);
        return;
    }

    // build spirv
    dstcode.clear();
    dstcode.reserve(total_words + 20); // at most 2 + 7 + 5 + 4 new words

    // the new constant takes the first unused id
    const uint32_t fast_math_constant_id = bound;

    // header
    dstcode.insert(dstcode.end(), code, code + 5);
    dstcode[3] = bound + 1;

    // declarations, every instruction in [begin, first_function_ptr) was bounds-checked above
    for (p = begin; p != first_function_ptr; p += p[0] >> 16)
    {
        if (p == capability_insert_ptr && !has_float_controls2_capability)
        {
            dstcode.push_back((2u << 16) | 17 /* OpCapability */);
            dstcode.push_back(6029 /* FloatControls2 */);
        }
        if (p == extension_insert_ptr && !has_float_controls2_extension)
        {
            dstcode.push_back((static_cast<uint32_t>(ext_word_count + 1) << 16) | 10 /* OpExtension */);
            const size_t name_offset = dstcode.size();
            dstcode.resize(name_offset + ext_word_count, 0); // zero padded literal string
            memcpy(dstcode.data() + name_offset, ext_name, sizeof(ext_name));
        }

        // Pass
        dstcode.insert(dstcode.end(), p, p + (p[0] >> 16));

        // execution modes go behind ALL entry points
        if (p == last_entry_point_ptr)
        {
            // NOTE(review): the SPIR-V spec reserves OpExecutionMode for modes without <id>
            // operands and names OpExecutionModeId (331, SPIR-V 1.2+) for the others.
            // The opcode is kept as is here, confirm against the target drivers.
            dstcode.push_back((5u << 16) | 16 /* OpExecutionMode */);
            dstcode.push_back(entry_point_id);
            dstcode.push_back(6028 /* FPFastMathDefault */);
            dstcode.push_back(float32_type_id);
            dstcode.push_back(fast_math_constant_id);
        }
    }

    // the constant lives at the end of the declarations, right in front of the first function
    dstcode.push_back((4u << 16) | 43 /* OpConstant */);
    dstcode.push_back(uint32_type_id);
    dstcode.push_back(fast_math_constant_id);
    dstcode.push_back(fast_math_flag);

    // nothing is injected into function bodies, copy them verbatim without parsing
    dstcode.insert(dstcode.end(), first_function_ptr, end);
}

static void inject_local_size_xyz(const uint32_t* code, size_t size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t* dstcode, size_t* dstsize)
{
uint32_t local_size_x_id = -1;
Expand Down Expand Up @@ -3672,16 +3800,25 @@ static void inject_local_size_xyz(const uint32_t* code, size_t size, uint32_t lo
*dstsize = (unsigned char*)dp - (unsigned char*)dstcode;
}

// Build a shader module from SPIR-V with a fixed workgroup size.
//
// spv_data / spv_data_size   input SPIR-V words and their size in bytes
// local_size_x/y/z           workgroup size handed to inject_local_size_xyz
// fast_math_flag             when non-zero, the patched code is additionally run through
//                            inject_fast_math before the module is created
//
// Returns the handle produced by the two-argument compile_shader_module overload,
// or 0 when the scratch buffer cannot be allocated.
VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t fast_math_flag) const
{
    uint32_t* spv_data_modified = (uint32_t*)malloc(spv_data_size);
    if (!spv_data_modified)
    {
        // inject_local_size_xyz would write through a null pointer
        return 0;
    }

    size_t spv_data_size_modified = spv_data_size;
    inject_local_size_xyz(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z, spv_data_modified, &spv_data_size_modified);

    VkShaderModule shader_module;
    if (fast_math_flag != 0)
    {
        std::vector<uint32_t> buffer;
        inject_fast_math(spv_data_modified, spv_data_size_modified, buffer, fast_math_flag);

        shader_module = compile_shader_module(buffer.data(), buffer.size() * sizeof(uint32_t));
    }
    else
    {
        shader_module = compile_shader_module(spv_data_modified, spv_data_size_modified);
    }

    free(spv_data_modified);

    return shader_module;
}

Expand Down
2 changes: 1 addition & 1 deletion src/gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,7 @@ class NCNN_EXPORT VulkanDevice
VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const;

// with fixed workgroup size
VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const;
VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t fast_math_flag = 0) const;

// helper for creating pipeline
int create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const;
Expand Down
3 changes: 2 additions & 1 deletion src/option.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ Option::Option()
workspace_vkallocator = 0;
staging_vkallocator = 0;
pipeline_cache = 0;
#endif // NCNN_VULKAN
vk_fast_math_flag = 0; // default no fast math
#endif // NCNN_VULKAN

openmp_blocktime = 20;

Expand Down
18 changes: 18 additions & 0 deletions src/option.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,24 @@ class NCNN_EXPORT Option

// pipeline cache
PipelineCache* pipeline_cache;

// Floating-point relaxation bits for vulkan shaders, 0 means no relaxation at all.
enum VK_FAST_MATH_FLAG
{
    VK_FAST_MATH_FLAG_DISABLE = 0,

    // ---- base bits ----

    // Operands and results are assumed to never be NaN,
    // the result is undefined when that does not hold.
    VK_FAST_MATH_FLAG_NotNaN = 1 << 0,

    // Operands and results are assumed to never be +/- Inf,
    // the result is undefined when that does not hold.
    VK_FAST_MATH_FLAG_NotInf = 1 << 1,

    // The sign of a zero operand or result carries no meaning.
    VK_FAST_MATH_FLAG_NSZ = 1 << 2,

    // A division may be replaced by a multiplication with the reciprocal.
    VK_FAST_MATH_FLAG_AllowRecip = 1 << 3,

    // Algebraic transformations following real-number associative and distributive
    // algebra are allowed. Implies all of the bits above.
    VK_FAST_MATH_FLAG_Fast = 1 << 4,

    // ---- FloatControls2 bits ----
    // None of these require the instructions producing the operands to be decorated.

    // An operation may be contracted with the operation(s) producing its operands.
    // Rounding steps may be dropped or kept at a higher bit-depth than the specified types.
    VK_FAST_MATH_FLAG_AllowContract = 1 << 16,

    // An operation may be reordered with the operation(s) producing its operands
    // according to real-number associativity rules.
    VK_FAST_MATH_FLAG_AllowReassoc = 1 << 17,

    // An operation may be transformed with the operation(s) producing its operands
    // according to real-number rules. Superset of AllowContract and AllowReassoc,
    // both of those bits must be set whenever this one is set. Non-trivial
    // transformations may need several instructions to be decorated.
    VK_FAST_MATH_FLAG_AllowTransform = 1 << 18,
};

// vk fast math mode, 0 is disable
int vk_fast_math_flag;
#endif // NCNN_VULKAN

// the time openmp threads busy-wait for more work before going to sleep
Expand Down
6 changes: 3 additions & 3 deletions src/pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -216,14 +216,14 @@ void Pipeline::set_local_size_xyz(int w, int h, int c)
// NCNN_LOGE("local size = %d %d %d", local_size_x, local_size_y, local_size_z);
}

int Pipeline::create(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations)
int Pipeline::create(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations, uint32_t fast_math_flag)
{
const PipelineCache* pipeline_cache = vkdev->get_pipeline_cache();

// get from pipeline cache
return pipeline_cache->get_pipeline(spv_data, spv_data_size, specializations, d->local_size_x, d->local_size_y, d->local_size_z, d->subgroup_size,
&d->shader_module, &d->descriptorset_layout, &d->pipeline_layout, &d->pipeline, &d->descriptor_update_template,
d->shader_info);
d->shader_info, fast_math_flag);
}

int Pipeline::create(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations)
Expand Down Expand Up @@ -461,7 +461,7 @@ int ImportAndroidHardwareBufferPipeline::create_shader_module(const Option& opt)

set_shader_info(shader_info);

VkShaderModule shader_module = vkdev->compile_shader_module(spv_data, spv_data_size, local_size_x(), local_size_y(), local_size_z());
VkShaderModule shader_module = vkdev->compile_shader_module(spv_data, spv_data_size, local_size_x(), local_size_y(), local_size_z(), opt.vk_fast_math_flag);
set_shader_module(shader_module);

return 0;
Expand Down
2 changes: 1 addition & 1 deletion src/pipeline.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class NCNN_EXPORT Pipeline
void set_local_size_xyz(int w, int h, int c);
void set_subgroup_size(uint32_t subgroup_size);

int create(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations);
int create(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations, uint32_t fast_math_flag = 0);

int create(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations);

Expand Down
33 changes: 25 additions & 8 deletions src/pipelinecache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,18 +58,18 @@ class PipelineCachePrivate
struct pipeline_cache_digest
{
pipeline_cache_digest(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations,
uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t subgroup_size);
uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t subgroup_size, uint32_t fast_math_flag = 0);
pipeline_cache_digest(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations,
uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t subgroup_size);

bool operator==(const pipeline_cache_digest& rhs) const
{
return d0 == rhs.d0 && d1 == rhs.d1 && d2 == rhs.d2 && d3 == rhs.d3;
return d0 == rhs.d0 && d1 == rhs.d1 && d2 == rhs.d2 && d3 == rhs.d3 && d4 == rhs.d4;
}

bool operator!=(const pipeline_cache_digest& rhs) const
{
return d0 != rhs.d0 || d1 != rhs.d1 || d2 != rhs.d2 || d3 != rhs.d3;
return d0 != rhs.d0 || d1 != rhs.d1 || d2 != rhs.d2 || d3 != rhs.d3 || d4 != rhs.d4;
}

union
Expand All @@ -88,6 +88,8 @@ class PipelineCachePrivate
uint32_t subgroup_size;
uint32_t specializations_murmur3;
uint32_t specializations_fnv1a;
uint32_t fast_math_flag;
uint32_t reserved_0; // for future use
};

struct
Expand All @@ -96,6 +98,7 @@ class PipelineCachePrivate
uint64_t d1;
uint64_t d2;
uint64_t d3;
uint64_t d4;
};
};
};
Expand All @@ -116,7 +119,7 @@ class PipelineCachePrivate
};

PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations,
uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size)
uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size, uint32_t _fast_math_flag)
{
spv_data_murmur3 = murmur3_32(spv_data, spv_data_size / 4);

Expand All @@ -126,6 +129,8 @@ PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(const uint32_
local_size_y = _local_size_y;
local_size_z = _local_size_z;
subgroup_size = _subgroup_size;
fast_math_flag = _fast_math_flag;
reserved_0 = 0; // for future use

// encode specializations
const int specialization_count = specializations.size();
Expand All @@ -150,6 +155,8 @@ PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(int _shader_t
local_size_y = _local_size_y;
local_size_z = _local_size_z;
subgroup_size = _subgroup_size;
fast_math_flag = opt.vk_fast_math_flag;
reserved_0 = 0; // for future use

// encode specializations
const int specialization_count = specializations.size();
Expand Down Expand Up @@ -227,11 +234,11 @@ int PipelineCache::get_pipeline(const uint32_t* spv_data, size_t spv_data_size,
VkPipelineLayout* pipeline_layout,
VkPipeline* pipeline,
VkDescriptorUpdateTemplateKHR* descriptor_update_template,
ShaderInfo& shader_info) const
ShaderInfo& shader_info, uint32_t fast_math_flag) const
{
MutexLockGuard lock(d->cache_lock);

PipelineCachePrivate::pipeline_cache_digest key(spv_data, spv_data_size, specializations, local_size_x, local_size_y, local_size_z, subgroup_size);
PipelineCachePrivate::pipeline_cache_digest key(spv_data, spv_data_size, specializations, local_size_x, local_size_y, local_size_z, subgroup_size, fast_math_flag);

if (!vkdev->info.bug_corrupted_online_pipeline_cache())
{
Expand Down Expand Up @@ -266,7 +273,12 @@ int PipelineCache::get_pipeline(const uint32_t* spv_data, size_t spv_data_size,
return -1;
}

VkShaderModule shader_module = vkdev->compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z);
if (fast_math_flag != 0 && !vkdev->info.support_VK_KHR_shader_float_controls2())
{
NCNN_LOGE("fast_math_flag is not supported on this device");
return -1;
}
VkShaderModule shader_module = vkdev->compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z, fast_math_flag);
if (!shader_module)
{
NCNN_LOGE("create_shader_module failed");
Expand Down Expand Up @@ -351,6 +363,11 @@ int PipelineCache::get_pipeline(int shader_type_index, const Option& opt, const
return -1;
}

if (opt.vk_fast_math_flag != 0 && !vkdev->info.support_VK_KHR_shader_float_controls2())
{
NCNN_LOGE("fast_math_flag is not supported on this device");
return -1;
}
ret = new_pipeline(shader_module, shader_info, specializations, subgroup_size, descriptorset_layout, pipeline_layout, pipeline, descriptor_update_template);
if (ret != 0)
{
Expand Down Expand Up @@ -403,7 +420,7 @@ int PipelineCache::create_shader_module(int shader_type_index, const Option& opt
return -1;
}

VkShaderModule shader_module = vkdev->compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z);
VkShaderModule shader_module = vkdev->compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z, opt.vk_fast_math_flag);

if (!shader_module)
{
Expand Down
2 changes: 1 addition & 1 deletion src/pipelinecache.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class NCNN_EXPORT PipelineCache
VkPipelineLayout* pipeline_layout,
VkPipeline* pipeline,
VkDescriptorUpdateTemplateKHR* descriptor_update_template,
ShaderInfo& shader_info) const;
ShaderInfo& shader_info, uint32_t fast_math_flag = 0) const;

int get_pipeline(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations,
uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t subgroup_size,
Expand Down
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ ncnn_add_test(paramdict)

if(NCNN_VULKAN)
ncnn_add_test(command)
ncnn_add_test(fast_math)
endif()

if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
Expand Down
Loading
Loading