[MathToVecLib] Add support for setting bit-widths for AVX512, AVX, and SSE to prevent "Illegal instruction (core dumped)" (#234)

Artlesbol · web-flow · commit 0625715c2714 · 2025-04-25T23:24:21.000-05:00
* [MathToVecLib] Add support for setting bit-widths for AVX512, AVX, and SSE to prevent "Illegal instruction (core dumped)"

* [MathToVecLib] Fix incorrect vec_size_in_bits update method and initialization

* [MathToVecLib] Add tests for generating SLEEF functions for different ISAs.

* [MathToVecLib] Remove unrelated huge vector contact from test

* [MathToVecLib] Fix coding style issues and apply necessary adjustments

* [MathToVecLib] Fix code formatting issues updated by pre-commit
diff --git a/test/TritonCPU/math-to-vec-lib.mlir b/test/TritonCPU/math-to-vec-lib.mlir
@@ -0,0 +1,41 @@
+// RUN: triton-opt %s -split-input-file -triton-cpu-math-to-vec-lib="cpu_features=sse" | FileCheck %s --check-prefix=CHECK-SSE
+// RUN: triton-opt %s -split-input-file -triton-cpu-math-to-vec-lib="cpu_features=sse,sse2,sse3" | FileCheck %s --check-prefix=CHECK-SSE
+// RUN: triton-opt %s -split-input-file -triton-cpu-math-to-vec-lib="cpu_features=avx" | FileCheck %s --check-prefix=CHECK-AVX
+// RUN: triton-opt %s -split-input-file -triton-cpu-math-to-vec-lib="cpu_features=avx,avx2" | FileCheck %s --check-prefix=CHECK-AVX
+// RUN: triton-opt %s -split-input-file -triton-cpu-math-to-vec-lib="cpu_features=avx,sse" | FileCheck %s --check-prefix=CHECK-AVX
+// RUN: triton-opt %s -split-input-file -triton-cpu-math-to-vec-lib="cpu_features=avx512f" | FileCheck %s --check-prefix=CHECK-AVX512F
+// RUN: triton-opt %s -split-input-file -triton-cpu-math-to-vec-lib="cpu_features=avx512f,avx" | FileCheck %s --check-prefix=CHECK-AVX512F
+// RUN: triton-opt %s -split-input-file -triton-cpu-math-to-vec-lib="cpu_features=avx512f,avx,sse" | FileCheck %s --check-prefix=CHECK-AVX512F
+
+// Convert math ops to VecLib ops.
+
+// CHECK-SSE-LABEL: @exp_kernel
+// CHECK-SSE: %[[EXTRACTED:.*]] = vector.extract %{{.*}}[0] : vector<4xf32> from vector<256x4xf32>
+// CHECK-SSE-NEXT: %[[CALLED:.*]] = func.call @Sleef_expf4_u10(%[[EXTRACTED]]) : (vector<4xf32>) -> vector<4xf32>
+// CHECK-SSE-NEXT: %[[INSERTED:.*]] = vector.insert %[[CALLED]], %{{.*}}[0] : vector<4xf32> into vector<256x4xf32>
+
+// CHECK-AVX-LABEL: @exp_kernel
+// CHECK-AVX: %[[EXTRACTED:.*]] = vector.extract %{{.*}}[0] : vector<8xf32> from vector<128x8xf32>
+// CHECK-AVX-NEXT: %[[CALLED:.*]] = func.call @Sleef_expf8_u10(%[[EXTRACTED]]) : (vector<8xf32>) -> vector<8xf32>
+// CHECK-AVX-NEXT: %[[INSERTED:.*]] = vector.insert %[[CALLED]], %{{.*}}[0] : vector<8xf32> into vector<128x8xf32>
+
+// CHECK-AVX512F-LABEL: @exp_kernel
+// CHECK-AVX512F: %[[EXTRACTED:.*]] = vector.extract %{{.*}}[0] : vector<16xf32> from vector<64x16xf32>
+// CHECK-AVX512F-NEXT: %[[CALLED:.*]] = func.call @Sleef_expf16_u10(%[[EXTRACTED]]) : (vector<16xf32>) -> vector<16xf32>
+// CHECK-AVX512F-NEXT: %[[INSERTED:.*]] = vector.insert %[[CALLED]], %{{.*}}[0] : vector<16xf32> into vector<64x16xf32>
+
+module {  // Test input: element-wise exp over a single vector<1024xf32>.
+  tt.func public @exp_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32} , %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32} , %arg2: i32 {tt.divisibility = 16 : i32} ) attributes {noinline = false} {
+    %c0 = arith.constant 0 : index
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %arg2 : i32  // offset = program_id * %arg2
+    %2 = tt.addptr %arg1, %1 : !tt.ptr<f32>, i32  // source pointer: %arg1 + offset
+    %3 = triton_cpu.ptr_to_memref %2 : <f32> -> memref<1024xf32>
+    %4 = vector.load %3[%c0] : memref<1024xf32>, vector<1024xf32>  // load all 1024 elements
+    %5 = math.exp %4 : vector<1024xf32>  // op under test; the CHECK lines above expect Sleef_expf{4,8,16}_u10 calls on 4/8/16-lane slices
+    %6 = tt.addptr %arg0, %1 : !tt.ptr<f32>, i32  // destination pointer: %arg0 + offset
+    %7 = triton_cpu.ptr_to_memref %6 : <f32> -> memref<1024xf32>
+    vector.store %5, %7[%c0] : memref<1024xf32>, vector<1024xf32>  // write the exp result back
+    tt.return
+  }
+}
diff --git a/third_party/cpu/include/TritonCPUToLLVM/Passes.td b/third_party/cpu/include/TritonCPUToLLVM/Passes.td
@@ -104,6 +104,8 @@ def MathToVecLib : Pass<"triton-cpu-math-to-vec-lib", "mlir::ModuleOp"> {
                clEnumValN(mlir::triton::cpu::VecLib::Mvec, "mvec",
                 "Use Mvec as mm lib")
               )}]>,
+        ListOption<"cpu_features", "cpu_features", "std::string",
+             "A list of available CPU features to choose proper vector functions">,
     ];
 
     let dependentDialects = ["mlir::vector::VectorDialect",
diff --git a/third_party/cpu/lib/TritonCPUToLLVM/MathToVecLib.cpp b/third_party/cpu/lib/TritonCPUToLLVM/MathToVecLib.cpp
@@ -346,7 +346,8 @@ void populatePatternsForOp(RewritePatternSet &patterns,
 struct MathToVecLibPass
     : public mlir::triton::cpu::impl::MathToVecLibBase<MathToVecLibPass> {
   MathToVecLibPass() = default;
-  size_t vec_size_in_bits;
+  // Default to 128-bit if no features are specified.
+  size_t vec_size_in_bits = 128;
 
   explicit MathToVecLibPass(VecLib lib, std::set<std::string> cpu_features) {
     this->lib = lib;
@@ -358,10 +359,15 @@ struct MathToVecLibPass
     //  Refactor this as an independent function.
     //  And improve this to support other x86 SIMD ISAs and also for arm SVE
     //  (VLA)
-    vec_size_in_bits = 512;
     for (auto feature : cpu_features) {
-      // Arm NEON is fixed 128-bit SIMD ISA.
-      if (feature == "neon") {
+      if (feature == "avx512f") {
+        vec_size_in_bits = std::max<size_t>(vec_size_in_bits, 512);
+      } else if (feature == "avx") {
+        vec_size_in_bits = std::max<size_t>(vec_size_in_bits, 256);
+      } else if (feature == "sse") {
+        vec_size_in_bits = std::max<size_t>(vec_size_in_bits, 128);
+      } else if (feature == "neon") {
+        // Arm NEON is fixed 128-bit SIMD ISA.
         vec_size_in_bits = 128;
         break;
       }
@@ -374,6 +380,12 @@ struct MathToVecLibPass
 
     RewritePatternSet patterns(context);
 
+    if (!cpu_features.empty()) {
+      std::set<std::string> cpu_features_set{cpu_features.begin(),
+                                             cpu_features.end()};
+      update_vec_size(cpu_features_set);
+    }
+
     switch (lib) {
     case VecLib::Mvec: {
       populateCommonPatterns<MvecNameGenerator>(patterns);