Tencent · futz12 · Aug 1, 2025 · Aug 1, 2025 · Aug 1, 2025 · Aug 1, 2025
diff --git a/src/gpu.cpp b/src/gpu.cpp
@@ -3575,6 +3575,134 @@ VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, siz
     return shader_module;
 }
 
+// Rewrite a SPIR-V module so its GLCompute entry point declares FPFastMathDefault (SPV_KHR_float_controls2)
+// for the 32-bit float type. code/size: source module and its size in bytes; dstcode: rewritten words;
+// fast_math_flag: mask emitted as a new uint OpConstant. The capability/extension are declared when absent.
+// dstcode is a verbatim copy of the input when it is not SPIR-V or a required element is missing.
+static void inject_fast_math(const uint32_t* code, size_t size, std::vector<uint32_t>& dstcode, uint32_t fast_math_flag)
+{
+    const uint32_t* const end = code + size / sizeof(uint32_t);
+
+    // check header size and spv magic number
+    if (size < 20 || code[0] != 0x07230203)
+    {
+        dstcode.assign(code, end);
+        return;
+    }
+
+    static const char ext_name[] = "SPV_KHR_float_controls2";
+    const size_t ext_word_count = (sizeof(ext_name) + 3) / 4;
+
+    // pass 1: analyze spv up to the first function
+    const uint32_t bound = code[3];
+    uint32_t entry_point_id = 0;
+    uint32_t float32_type_id = 0;
+    uint32_t uint32_type_id = 0;
+    bool has_float_controls2_capability = false;
+    bool has_float_controls2_extension = false;
+    const uint32_t* memory_model_ptr = nullptr;
+    const uint32_t* first_function_ptr = nullptr;
+    const uint32_t* p = code + 5;
+    while (p < end && !first_function_ptr)
+    {
+        const uint32_t wordcount = p[0] >> 16;
+        if (wordcount == 0 || wordcount > (size_t)(end - p)) break; // malformed, stop scanning
+        const uint32_t op = p[0] & 0xffff;
+
+        switch (op)
+        {
+        case 10: // OpExtension, compared including the terminating nul so only an exact name matches
+            if (wordcount > ext_word_count && memcmp(&p[1], ext_name, sizeof(ext_name)) == 0) has_float_controls2_extension = true;
+            break;
+        case 14: // OpMemoryModel
+            if (!memory_model_ptr) memory_model_ptr = p;
+            break;
+        case 15: // OpEntryPoint
+            if (wordcount >= 3 && p[1] == 5 /* GLCompute */) entry_point_id = p[2];
+            break;
+        case 17: // OpCapability
+            if (wordcount >= 2 && p[1] == 6029 /* FloatControls2 */) has_float_controls2_capability = true;
+            break;
+        case 21: // OpTypeInt, 32-bit unsigned
+            if (wordcount == 4 && p[2] == 32 && p[3] == 0) uint32_type_id = p[1];
+            break;
+        case 22: // OpTypeFloat, 32-bit
+            if (wordcount == 3 && p[2] == 32) float32_type_id = p[1];
+            break;
+        case 54: // OpFunction
+            first_function_ptr = p;
+            break;
+        }
+        p += wordcount;
+    }
+
+    // cannot find key elements
+    if (entry_point_id == 0 || float32_type_id == 0 || uint32_type_id == 0 || !memory_model_ptr || !first_function_ptr)
+    {
+        dstcode.assign(code, end);
+        return;
+    }
+
+    // pass 2: build spirv
+    dstcode.clear();
+    dstcode.reserve(size / sizeof(uint32_t) + 20);
+
+    // header, the id bound grows by one for the new OpConstant
+    const uint32_t fast_math_constant_id = bound;
+    dstcode.insert(dstcode.end(), code, code + 5);
+    dstcode[3] = bound + 1;
+
+    bool capability_done = has_float_controls2_capability;
+    bool extension_done = has_float_controls2_extension;
+    bool entry_point_seen = false;
+    bool execution_mode_done = false;
+
+    // every instruction before first_function_ptr had its word count validated in pass 1
+    for (p = code + 5; p != first_function_ptr; p += p[0] >> 16)
+    {
+        const uint32_t op = p[0] & 0xffff;
+
+        // module layout wants all OpCapability first, then all OpExtension, then the rest
+        if (!capability_done && op != 17 /* OpCapability */)
+        {
+            dstcode.push_back((2u << 16) | 17 /* OpCapability */);
+            dstcode.push_back(6029 /* FloatControls2 */);
+            capability_done = true;
+        }
+        if (!extension_done && op != 17 /* OpCapability */ && op != 10 /* OpExtension */)
+        {
+            uint32_t ext_words[ext_word_count] = {0}; // zero padded literal string
+            memcpy(ext_words, ext_name, sizeof(ext_name));
+            dstcode.push_back((uint32_t)((ext_word_count + 1) << 16) | 10 /* OpExtension */);
+            dstcode.insert(dstcode.end(), ext_words, ext_words + ext_word_count);
+            extension_done = true;
+        }
+
+        // execution modes follow the last OpEntryPoint, modes with id operands need OpExecutionModeId
+        if (entry_point_seen && !execution_mode_done && op != 15 /* OpEntryPoint */)
+        {
+            dstcode.push_back((5u << 16) | 331 /* OpExecutionModeId */);
+            dstcode.push_back(entry_point_id);
+            dstcode.push_back(6028 /* FPFastMathDefault */);
+            dstcode.push_back(float32_type_id);
+            dstcode.push_back(fast_math_constant_id);
+            execution_mode_done = true;
+        }
+        if (op == 15 /* OpEntryPoint */) entry_point_seen = true;
+
+        dstcode.insert(dstcode.end(), p, p + (p[0] >> 16));
+    }
+
+    // constant need before first function
+    dstcode.push_back((4u << 16) | 43 /* OpConstant */);
+    dstcode.push_back(uint32_type_id);
+    dstcode.push_back(fast_math_constant_id);
+    dstcode.push_back(fast_math_flag);
+
+    // function bodies are untouched, copy the rest verbatim without re-parsing it
+    dstcode.insert(dstcode.end(), first_function_ptr, end);
+}
+
 static void inject_local_size_xyz(const uint32_t* code, size_t size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t* dstcode, size_t* dstsize)
 {
     uint32_t local_size_x_id = -1;
@@ -3672,16 +3800,25 @@ static void inject_local_size_xyz(const uint32_t* code, size_t size, uint32_t lo
     *dstsize = (unsigned char*)dp - (unsigned char*)dstcode;
 }
 
-VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const
+VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t fast_math_flag) const
 {
     uint32_t* spv_data_modified = (uint32_t*)malloc(spv_data_size);
     size_t spv_data_size_modified = spv_data_size;
     inject_local_size_xyz(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z, spv_data_modified, &spv_data_size_modified);
 
-    VkShaderModule shader_module = compile_shader_module(spv_data_modified, spv_data_size_modified);
+    VkShaderModule shader_module;
+    if (fast_math_flag != 0) // non-zero flag: run a second rewrite pass over the local-size-injected module
+    {
+        std::vector<uint32_t> buffer; // receives the rewritten SPIR-V words
+        inject_fast_math(spv_data_modified, spv_data_size_modified, buffer, fast_math_flag);
 
+        shader_module = compile_shader_module(buffer.data(), buffer.size() * sizeof(uint32_t)); // word count converted to a byte size
+    }
+    else // zero flag: compile the local-size-injected module unchanged
+    {
+        shader_module = compile_shader_module(spv_data_modified, spv_data_size_modified);
+    }
     free(spv_data_modified);
-
     return shader_module;
 }
 

diff --git a/src/gpu.h b/src/gpu.h
@@ -418,7 +418,7 @@ class NCNN_EXPORT VulkanDevice
     VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const;
 
     // with fixed workgroup size
-    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const;
+    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t fast_math_flag = 0) const;
 
     // helper for creating pipeline
     int create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const;

diff --git a/src/option.cpp b/src/option.cpp
@@ -23,7 +23,8 @@ Option::Option()
     workspace_vkallocator = 0;
     staging_vkallocator = 0;
     pipeline_cache = 0;
-#endif // NCNN_VULKAN
+    vk_fast_math_flag = 0; // default no fast math
+#endif                     // NCNN_VULKAN
 
     openmp_blocktime = 20;
 

diff --git a/src/option.h b/src/option.h
@@ -55,6 +55,24 @@ class NCNN_EXPORT Option
 
     // pipeline cache
     PipelineCache* pipeline_cache;
+
+    enum VK_FAST_MATH_FLAG
+    {
+        // Base bits; the selected value is emitted as the mask constant of the FPFastMathDefault execution mode
+        VK_FAST_MATH_FLAG_DISABLE = 0x0,    // No fast math; shader modules are compiled without the fast-math injection.
+        VK_FAST_MATH_FLAG_NotNaN = 0x1,     // Assume parameters and result are not NaN. If this assumption does not hold then the operation returns an undefined value.
+        VK_FAST_MATH_FLAG_NotInf = 0x2,     // Assume parameters and result are not +/- Inf. If this assumption does not hold then the operation returns an undefined value.
+        VK_FAST_MATH_FLAG_NSZ = 0x4,        // Treat the sign of a zero parameter or result as insignificant.
+        VK_FAST_MATH_FLAG_AllowRecip = 0x8, // Allow the usage of a reciprocal rather than performing a division.
+        VK_FAST_MATH_FLAG_Fast = 0x10,      // Allow algebraic transformations according to real-number associative and distributive algebra. This flag implies all of the flags above. NOTE(review): SPV_KHR_float_controls2 appears to deprecate this bit - confirm it is valid with FPFastMathDefault.
+        // Bits added by FloatControls2
+        VK_FAST_MATH_FLAG_AllowContract = 0x10000,  // Allows a floating-point operation to be contracted with any operation(s) producing its operands. Rounding steps may be eliminated or may preserve higher bit-depth than the specified types. The instructions producing the operands do not need to be decorated to allow this transformation.
+        VK_FAST_MATH_FLAG_AllowReassoc = 0x20000,   // Allows a floating-point operation to be reordered with any operation(s) producing its operands according to real-number associativity rules. The instructions producing the operands do not need to be decorated to allow this transformation.
+        VK_FAST_MATH_FLAG_AllowTransform = 0x40000, // Allows a floating-point operation to be transformed with any operation(s) producing its operands according to real-number rules. This is a superset of AllowContract and AllowReassoc and those bits must be set whenever this bit is set. The instructions producing the operands do not need to be decorated to allow this transformation, but note that non-trivial transformations may require multiple instructions to be decorated.
+    };
+
+    // vk fast math mode, a combination of VK_FAST_MATH_FLAG bits, 0 disables it
+    int vk_fast_math_flag;
 #endif // NCNN_VULKAN
 
     // the time openmp threads busy-wait for more work before going to sleep

diff --git a/src/pipeline.cpp b/src/pipeline.cpp
@@ -216,14 +216,14 @@ void Pipeline::set_local_size_xyz(int w, int h, int c)
     // NCNN_LOGE("local size = %d %d %d", local_size_x, local_size_y, local_size_z);
 }
 
-int Pipeline::create(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations)
+int Pipeline::create(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations, uint32_t fast_math_flag)
 {
     const PipelineCache* pipeline_cache = vkdev->get_pipeline_cache();
 
     // get from pipeline cache
     return pipeline_cache->get_pipeline(spv_data, spv_data_size, specializations, d->local_size_x, d->local_size_y, d->local_size_z, d->subgroup_size,
                                         &d->shader_module, &d->descriptorset_layout, &d->pipeline_layout, &d->pipeline, &d->descriptor_update_template,
-                                        d->shader_info);
+                                        d->shader_info, fast_math_flag); // the flag is part of the cache key and is forwarded to shader compilation
 }
 
 int Pipeline::create(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations)
@@ -461,7 +461,7 @@ int ImportAndroidHardwareBufferPipeline::create_shader_module(const Option& opt)
 
     set_shader_info(shader_info);
 
-    VkShaderModule shader_module = vkdev->compile_shader_module(spv_data, spv_data_size, local_size_x(), local_size_y(), local_size_z());
+    VkShaderModule shader_module = vkdev->compile_shader_module(spv_data, spv_data_size, local_size_x(), local_size_y(), local_size_z(), opt.vk_fast_math_flag);
     set_shader_module(shader_module);
 
     return 0;

diff --git a/src/pipeline.h b/src/pipeline.h
@@ -27,7 +27,7 @@ class NCNN_EXPORT Pipeline
     void set_local_size_xyz(int w, int h, int c);
     void set_subgroup_size(uint32_t subgroup_size);
 
-    int create(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations);
+    int create(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations, uint32_t fast_math_flag = 0);
 
     int create(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations);
 

diff --git a/src/pipelinecache.cpp b/src/pipelinecache.cpp
@@ -58,18 +58,18 @@ class PipelineCachePrivate
     struct pipeline_cache_digest
     {
         pipeline_cache_digest(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations,
-                              uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t subgroup_size);
+                              uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t subgroup_size, uint32_t fast_math_flag = 0);
         pipeline_cache_digest(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations,
                               uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t subgroup_size);
 
         bool operator==(const pipeline_cache_digest& rhs) const
         {
-            return d0 == rhs.d0 && d1 == rhs.d1 && d2 == rhs.d2 && d3 == rhs.d3;
+            return d0 == rhs.d0 && d1 == rhs.d1 && d2 == rhs.d2 && d3 == rhs.d3 && d4 == rhs.d4;
         }
 
         bool operator!=(const pipeline_cache_digest& rhs) const
         {
-            return d0 != rhs.d0 || d1 != rhs.d1 || d2 != rhs.d2 || d3 != rhs.d3;
+            return d0 != rhs.d0 || d1 != rhs.d1 || d2 != rhs.d2 || d3 != rhs.d3 || d4 != rhs.d4;
         }
 
         union
@@ -88,6 +88,8 @@ class PipelineCachePrivate
                 uint32_t subgroup_size;
                 uint32_t specializations_murmur3;
                 uint32_t specializations_fnv1a;
+                uint32_t fast_math_flag;
+                uint32_t reserved_0; // for future use
             };
 
             struct
@@ -96,6 +98,7 @@ class PipelineCachePrivate
                 uint64_t d1;
                 uint64_t d2;
                 uint64_t d3;
+                uint64_t d4;
             };
         };
     };
@@ -116,7 +119,7 @@ class PipelineCachePrivate
 };
 
 PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations,
-        uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size)
+        uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size, uint32_t _fast_math_flag)
 {
     spv_data_murmur3 = murmur3_32(spv_data, spv_data_size / 4);
 
@@ -126,6 +129,8 @@ PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(const uint32_
     local_size_y = _local_size_y;
     local_size_z = _local_size_z;
     subgroup_size = _subgroup_size;
+    fast_math_flag = _fast_math_flag;
+    reserved_0 = 0; // for future use
 
     // encode specializations
     const int specialization_count = specializations.size();
@@ -150,6 +155,8 @@ PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(int _shader_t
     local_size_y = _local_size_y;
     local_size_z = _local_size_z;
     subgroup_size = _subgroup_size;
+    fast_math_flag = opt.vk_fast_math_flag;
+    reserved_0 = 0; // for future use
 
     // encode specializations
     const int specialization_count = specializations.size();
@@ -227,11 +234,11 @@ int PipelineCache::get_pipeline(const uint32_t* spv_data, size_t spv_data_size,
                                 VkPipelineLayout* pipeline_layout,
                                 VkPipeline* pipeline,
                                 VkDescriptorUpdateTemplateKHR* descriptor_update_template,
-                                ShaderInfo& shader_info) const
+                                ShaderInfo& shader_info, uint32_t fast_math_flag) const
 {
     MutexLockGuard lock(d->cache_lock);
 
-    PipelineCachePrivate::pipeline_cache_digest key(spv_data, spv_data_size, specializations, local_size_x, local_size_y, local_size_z, subgroup_size);
+    PipelineCachePrivate::pipeline_cache_digest key(spv_data, spv_data_size, specializations, local_size_x, local_size_y, local_size_z, subgroup_size, fast_math_flag);
 
     if (!vkdev->info.bug_corrupted_online_pipeline_cache())
     {
@@ -266,7 +273,12 @@ int PipelineCache::get_pipeline(const uint32_t* spv_data, size_t spv_data_size,
         return -1;
     }
 
-    VkShaderModule shader_module = vkdev->compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z);
+    if (fast_math_flag != 0 && !vkdev->info.support_VK_KHR_shader_float_controls2())
+    {
+        NCNN_LOGE("fast_math_flag is not supported on this device");
+        return -1;
+    }
+    VkShaderModule shader_module = vkdev->compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z, fast_math_flag);
     if (!shader_module)
     {
         NCNN_LOGE("create_shader_module failed");
@@ -351,6 +363,11 @@ int PipelineCache::get_pipeline(int shader_type_index, const Option& opt, const
         return -1;
     }
 
+    if (opt.vk_fast_math_flag != 0 && !vkdev->info.support_VK_KHR_shader_float_controls2())
+    {
+        NCNN_LOGE("fast_math_flag is not supported on this device");
+        return -1;
+    }
     ret = new_pipeline(shader_module, shader_info, specializations, subgroup_size, descriptorset_layout, pipeline_layout, pipeline, descriptor_update_template);
     if (ret != 0)
     {
@@ -403,7 +420,7 @@ int PipelineCache::create_shader_module(int shader_type_index, const Option& opt
         return -1;
     }
 
-    VkShaderModule shader_module = vkdev->compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z);
+    VkShaderModule shader_module = vkdev->compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z, opt.vk_fast_math_flag);
 
     if (!shader_module)
     {

diff --git a/src/pipelinecache.h b/src/pipelinecache.h
@@ -31,7 +31,7 @@ class NCNN_EXPORT PipelineCache
                      VkPipelineLayout* pipeline_layout,
                      VkPipeline* pipeline,
                      VkDescriptorUpdateTemplateKHR* descriptor_update_template,
-                     ShaderInfo& shader_info) const;
+                     ShaderInfo& shader_info, uint32_t fast_math_flag = 0) const;
 
     int get_pipeline(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations,
                      uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t subgroup_size,

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -65,6 +65,7 @@ ncnn_add_test(paramdict)
 
 if(NCNN_VULKAN)
     ncnn_add_test(command)
+    ncnn_add_test(fast_math)
 endif()
 
 if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")