diff --git a/README.md b/README.md index c24f1d2..ccfa5bf 100644 --- a/README.md +++ b/README.md @@ -55,10 +55,12 @@ The above features require setting the `CUDA_TOOLKIT_ROOT_DIR` environment varia - `accelerate`: Enables [Apple Accelerate](https://developer.apple.com/documentation/accelerate) support (macOS only) - `openmp-runtime-comp`: Enables OpenMP runtime support - `openmp-runtime-intel`: Enables OpenMP runtime support for Intel compilers -- `msse4_1`: Enables MSSE4.1 support Multiple features can be enabled at the same time. +To enable Streaming SIMD Extensions 4.1 (SSE4.1), add the `-C target-feature=+sse4.1` flag to the `RUSTFLAGS` +environment variable. + By default, the `ruy` feature is enabled. If you want to use platform-specific default features, use the `ct2rs-platform` crate. @@ -93,8 +95,7 @@ When `ct2rs-platform` is used, the following features are automatically selected - Windows: `openmp-runtime-intel`, `dnnl`, `cuda`, `cudnn`, `cuda-dynamic-loading`, `mkl` - Intel MacOS: `dnnl`, `mkl` - Apple Silicon MacOS: `accelerate`, `ruy` -- Linux (non-ARM): `dnnl`, `openmp-runtime-comp`, `cuda`, `cudnn`, `cuda-dynamic-loading`, `mkl`, `tensor-parallel`, - `msse4_1` +- Linux (non-ARM): `dnnl`, `openmp-runtime-comp`, `cuda`, `cudnn`, `cuda-dynamic-loading`, `mkl`, `tensor-parallel` - Linux (ARM): `openmp-runtime-comp`, `openblas`, `ruy` ## Supported Models diff --git a/ct2rs-platform/Cargo.toml b/ct2rs-platform/Cargo.toml index 73581f4..4aa6f75 100644 --- a/ct2rs-platform/Cargo.toml +++ b/ct2rs-platform/Cargo.toml @@ -32,7 +32,7 @@ features = ["accelerate", "ruy"] version = "=0.9.10" path = "../ct2rs" default-features = false -features = ["dnnl", "openmp-runtime-comp", "cuda", "cudnn", "cuda-dynamic-loading", "mkl", "tensor-parallel", "msse4_1"] +features = ["dnnl", "openmp-runtime-comp", "cuda", "cudnn", "cuda-dynamic-loading", "mkl", "tensor-parallel"] [target.'cfg(all(target_os = "linux", target_arch = "aarch64"))'.dependencies.ct2rs] version = "=0.9.10" @@ 
-56,7 +56,6 @@ openmp-runtime-intel = ["ct2rs/openmp-runtime-intel"] ruy = ["ct2rs/ruy"] accelerate = ["ct2rs/accelerate"] cuda = ["ct2rs/cuda"] -msse4_1 = ["ct2rs/msse4_1"] cudnn = ["ct2rs/cudnn"] # Features to enable GPU functionality. diff --git a/ct2rs/Cargo.toml b/ct2rs/Cargo.toml index 8d5c5e4..6a2d8fe 100644 --- a/ct2rs/Cargo.toml +++ b/ct2rs/Cargo.toml @@ -88,7 +88,6 @@ ruy = [] accelerate = [] openmp-runtime-comp = [] openmp-runtime-intel = [] -msse4_1 = [] # Features to enable GPU functionality. flash-attention = [] diff --git a/ct2rs/build.rs b/ct2rs/build.rs index a1d8fec..399b6ab 100644 --- a/ct2rs/build.rs +++ b/ct2rs/build.rs @@ -63,7 +63,7 @@ fn build_ctranslate2() { let dnnl = cfg!(feature = "dnnl"); let mut openmp_comp: bool = cfg!(feature = "openmp-runtime-comp"); let openmp_intel = cfg!(feature = "openmp-runtime-intel"); - let msse4_1 = cfg!(feature = "msse4_1"); + let sse4_1 = cfg!(target_feature = "sse4.1"); if !openmp_intel && !openmp_comp && dnnl { if os == Os::Linux { openmp_comp = true; @@ -156,7 +156,7 @@ fn build_ctranslate2() { if tensor_parallel { cmake.define("WITH_TENSOR_PARALLEL", "ON"); } - if msse4_1 { + if sse4_1 { cmake.define("CMAKE_CXX_FLAGS", "-msse4.1"); } if dnnl {