jkawamoto · jkawamoto · Oct 12, 2025 · Oct 12, 2025
diff --git a/README.md b/README.md
@@ -55,10 +55,12 @@ The above features require setting the `CUDA_TOOLKIT_ROOT_DIR` environment varia
 - `accelerate`: Enables [Apple Accelerate](https://developer.apple.com/documentation/accelerate) support (macOS only)
 - `openmp-runtime-comp`: Enables OpenMP runtime support
 - `openmp-runtime-intel`: Enables OpenMP runtime support for Intel compilers
-- `msse4_1`: Enables MSSE4.1 support
 
 Multiple features can be enabled at the same time.
 
+To enable Streaming SIMD Extensions 4.1 (SSE4.1), add the `-C target-feature=+sse4.1` flag to the `RUSTFLAGS`
+environment variable.
+
 By default, the `ruy` feature is enabled.
 
 If you want to use platform-specific default features, use the `ct2rs-platform` crate.
@@ -93,8 +95,7 @@ When `ct2rs-platform` is used, the following features are automatically selected
 - Windows: `openmp-runtime-intel`, `dnnl`, `cuda`, `cudnn`, `cuda-dynamic-loading`, `mkl`
 - Intel MacOS: `dnnl`, `mkl`
 - Apple Silicon MacOS: `accelerate`, `ruy`
-- Linux (non-ARM): `dnnl`, `openmp-runtime-comp`, `cuda`, `cudnn`, `cuda-dynamic-loading`, `mkl`, `tensor-parallel`,
-  `msse4_1`
+- Linux (non-ARM): `dnnl`, `openmp-runtime-comp`, `cuda`, `cudnn`, `cuda-dynamic-loading`, `mkl`, `tensor-parallel`
 - Linux (ARM): `openmp-runtime-comp`, `openblas`, `ruy`
 
 ## Supported Models

diff --git a/ct2rs-platform/Cargo.toml b/ct2rs-platform/Cargo.toml
@@ -32,7 +32,7 @@ features = ["accelerate", "ruy"]
 version = "=0.9.10"
 path = "../ct2rs"
 default-features = false
-features = ["dnnl", "openmp-runtime-comp", "cuda", "cudnn", "cuda-dynamic-loading", "mkl", "tensor-parallel", "msse4_1"]
+features = ["dnnl", "openmp-runtime-comp", "cuda", "cudnn", "cuda-dynamic-loading", "mkl", "tensor-parallel"]
 
 [target.'cfg(all(target_os = "linux", target_arch = "aarch64"))'.dependencies.ct2rs]
 version = "=0.9.10"
@@ -56,7 +56,6 @@ openmp-runtime-intel = ["ct2rs/openmp-runtime-intel"]
 ruy = ["ct2rs/ruy"]
 accelerate = ["ct2rs/accelerate"]
 cuda = ["ct2rs/cuda"]
-msse4_1 = ["ct2rs/msse4_1"]
 cudnn = ["ct2rs/cudnn"]
 
 # Features to enable GPU functionality.

diff --git a/ct2rs/Cargo.toml b/ct2rs/Cargo.toml
@@ -88,7 +88,6 @@ ruy = []
 accelerate = []
 openmp-runtime-comp = []
 openmp-runtime-intel = []
-msse4_1 = []
 
 # Features to enable GPU functionality.
 flash-attention = []

diff --git a/ct2rs/build.rs b/ct2rs/build.rs
@@ -63,7 +63,7 @@ fn build_ctranslate2() {
     let dnnl = cfg!(feature = "dnnl");
     let mut openmp_comp: bool = cfg!(feature = "openmp-runtime-comp");
     let openmp_intel = cfg!(feature = "openmp-runtime-intel");
-    let msse4_1 = cfg!(feature = "msse4_1");
+    let sse4_1 = cfg!(target_feature = "sse4.1");
     if !openmp_intel && !openmp_comp && dnnl {
         if os == Os::Linux {
             openmp_comp = true;
@@ -156,7 +156,7 @@ fn build_ctranslate2() {
     if tensor_parallel {
         cmake.define("WITH_TENSOR_PARALLEL", "ON");
     }
-    if msse4_1 {
+    if sse4_1 {
         cmake.define("CMAKE_CXX_FLAGS", "-msse4.1");
     }
     if dnnl {