Generalize the ARM64-only build path to ARM (adds armv7-a).

* CMakeLists.txt: also match "armv7-a", define CT2_ARM_BUILD instead of
  CT2_ARM64_BUILD, and set CT2_BUILD_ARCH to "arm".
* src/cpu/*, src/utils.cc: rename the CT2_ARM64_BUILD guards to CT2_ARM_BUILD.
* src/cpu/vec_neon.h: keep the AArch64-only intrinsics behind __aarch64__ and
  use ARMv7 NEON intrinsics for the fallbacks.

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 447370c13..3bdb587fe 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -208,10 +208,10 @@ macro(ct2_compile_kernels_for_isa isa flag)
   list(APPEND SOURCES ${CMAKE_CURRENT_BINARY_DIR}/kernels_${isa}.cc)
 endmacro()

-if(CMAKE_SYSTEM_PROCESSOR MATCHES "(arm64)|(aarch64)"
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "(arm64)|(aarch64)|(armv7-a)"
    OR (APPLE AND CMAKE_OSX_ARCHITECTURES STREQUAL "arm64"))
-  add_definitions(-DCT2_ARM64_BUILD)
-  set(CT2_BUILD_ARCH "arm64")
+  add_definitions(-DCT2_ARM_BUILD)
+  set(CT2_BUILD_ARCH "arm")
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(amd64)|(AMD64)")
   add_definitions(-DCT2_X86_BUILD)
   set(CT2_BUILD_ARCH "x86_64")
@@ -240,7 +240,7 @@ if(ENABLE_CPU_DISPATCH)
       ct2_compile_kernels_for_isa(avx2 "-mavx2 -mfma")
       ct2_compile_kernels_for_isa(avx512 "-mavx512f -mavx512cd -mavx512vl -mavx512bw -mavx512dq")
     endif()
-  elseif(CT2_BUILD_ARCH STREQUAL "arm64")
+  elseif(CT2_BUILD_ARCH STREQUAL "arm")
     ct2_compile_kernels_for_isa(neon "-DUSE_NEON")
   endif()
 endif()
diff --git a/src/cpu/cpu_info.cc b/src/cpu/cpu_info.cc
index 9030ac7a4..f2e58085e 100644
--- a/src/cpu/cpu_info.cc
+++ b/src/cpu/cpu_info.cc
@@ -42,7 +42,7 @@ namespace ctranslate2 {
   }
 }

-#elif defined(CT2_ARM64_BUILD)
+#elif defined(CT2_ARM_BUILD)

 namespace ctranslate2 {
   namespace cpu {
diff --git a/src/cpu/cpu_info.h b/src/cpu/cpu_info.h
index c2951bcc0..0ece94ced 100644
--- a/src/cpu/cpu_info.h
+++ b/src/cpu/cpu_info.h
@@ -12,7 +12,7 @@ namespace ctranslate2 {
     bool cpu_supports_avx();
     bool cpu_supports_avx2();
     bool cpu_supports_avx512();
-#elif defined(CT2_ARM64_BUILD)
+#elif defined(CT2_ARM_BUILD)
     bool cpu_supports_neon();
 #endif

diff --git a/src/cpu/cpu_isa.cc b/src/cpu/cpu_isa.cc
index c16aeda22..c07268ce8 100644
--- a/src/cpu/cpu_isa.cc
+++ b/src/cpu/cpu_isa.cc
@@ -32,7 +32,7 @@ namespace ctranslate2 {
         return "AVX2";
       case CpuIsa::AVX512:
         return "AVX512";
-#elif defined(CT2_ARM64_BUILD)
+#elif defined(CT2_ARM_BUILD)
       case CpuIsa::NEON:
         return "NEON";
 #endif
@@ -51,7 +51,7 @@ namespace ctranslate2 {
         return try_isa(env_isa, CpuIsa::AVX2, cpu_supports_avx2());
       if (env_isa == "AVX")
         return try_isa(env_isa, CpuIsa::AVX, cpu_supports_avx());
-#elif defined(CT2_ARM64_BUILD)
+#elif defined(CT2_ARM_BUILD)
       if (env_isa == "NEON")
         return try_isa(env_isa, CpuIsa::NEON, cpu_supports_neon());
 #endif
@@ -68,7 +68,7 @@ namespace ctranslate2 {
         return CpuIsa::AVX2;
       if (cpu_supports_avx())
         return CpuIsa::AVX;
-#  elif defined(CT2_ARM64_BUILD)
+#  elif defined(CT2_ARM_BUILD)
       if (cpu_supports_neon())
         return CpuIsa::NEON;
 #  endif
diff --git a/src/cpu/cpu_isa.h b/src/cpu/cpu_isa.h
index 4f42bdf26..85f8b16c3 100644
--- a/src/cpu/cpu_isa.h
+++ b/src/cpu/cpu_isa.h
@@ -11,7 +11,7 @@ namespace ctranslate2 {
       AVX,
       AVX2,
       AVX512,
-#elif defined(CT2_ARM64_BUILD)
+#elif defined(CT2_ARM_BUILD)
       NEON,
 #endif
     };
@@ -48,7 +48,7 @@ namespace ctranslate2 {
     CPU_ISA_CASE(cpu::CpuIsa::AVX, SINGLE_ARG(STMTS)) \
     CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \
   }
-#elif defined(CT2_ARM64_BUILD)
+#elif defined(CT2_ARM_BUILD)
 #  define CPU_ISA_DISPATCH(STMTS) \
   switch (cpu::get_cpu_isa()) { \
     CPU_ISA_CASE(cpu::CpuIsa::NEON, SINGLE_ARG(STMTS)) \
diff --git a/src/cpu/vec_neon.h b/src/cpu/vec_neon.h
index 4ffb20773..8ea83b2dd 100644
--- a/src/cpu/vec_neon.h
+++ b/src/cpu/vec_neon.h
@@ -144,19 +144,44 @@ namespace ctranslate2 {
       }

       static inline value_type div(value_type a, value_type b) {
+#ifdef __aarch64__
         return vdivq_f32(a, b);
+#else
+        // ARMv7 NEON has no vector divide intrinsic, so this relies on the
+        // GCC/Clang vector extension to divide lane by lane.
+        return a / b;
+#endif
       }

       static inline value_type mul_add(value_type a, value_type b, value_type c) {
+#ifdef __aarch64__
         return vfmaq_f32(c, a, b);
+#else
+        // Multiply-accumulate intrinsic available on ARMv7 NEON: c + a * b.
+        return vmlaq_f32(c, a, b);
+#endif
       }

       static inline float reduce_add(value_type a) {
+#ifdef __aarch64__
         return vaddvq_f32(a);
+#else
+        // Two pairwise adds: (a0 + a1) + (a2 + a3).
+        float32x2_t pair_sum = vpadd_f32(vget_low_f32(a), vget_high_f32(a));
+        pair_sum = vpadd_f32(pair_sum, pair_sum);
+        return vget_lane_f32(pair_sum, 0);
+#endif
       }

       static inline float reduce_max(value_type a) {
+#ifdef __aarch64__
         return vmaxvq_f32(a);
+#else
+        // Two pairwise maxima: max(max(a0, a1), max(a2, a3)).
+        float32x2_t pair_max = vpmax_f32(vget_low_f32(a), vget_high_f32(a));
+        pair_max = vpmax_f32(pair_max, pair_max);
+        return vget_lane_f32(pair_max, 0);
+#endif
       }
     };

diff --git a/src/utils.cc b/src/utils.cc
index f0eb29509..25c890e13 100644
--- a/src/utils.cc
+++ b/src/utils.cc
@@ -38,7 +38,7 @@ namespace ctranslate2 {
                  cpu::cpu_supports_avx(),
                  cpu::cpu_supports_avx2(),
                  cpu::cpu_supports_avx512());
-#elif defined(CT2_ARM64_BUILD)
+#elif defined(CT2_ARM_BUILD)
     spdlog::info("CPU: {} (NEON={})",
                  cpu::cpu_vendor(),
                  cpu::cpu_supports_neon());