diff --git a/.github/workflows/rapfi.yml b/.github/workflows/rapfi.yml index 835c553c..28eaec71 100644 --- a/.github/workflows/rapfi.yml +++ b/.github/workflows/rapfi.yml @@ -20,7 +20,7 @@ jobs: target: - { name: linux-gcc, - os: ubuntu-20.04, + os: ubuntu-22.04, c_compiler: gcc, cxx_compiler: g++, cmake_command: "", @@ -29,8 +29,8 @@ jobs: - { name: linux-clang, os: ubuntu-20.04, - c_compiler: clang, - cxx_compiler: clang++, + c_compiler: clang-18, + cxx_compiler: clang++-18, cmake_command: "", shell: "bash {0}" } @@ -55,9 +55,11 @@ jobs: shell: "msys2 {0}" } arch: - - { name: avx2, cmake_command: "-DUSE_BMI2=OFF -DUSE_AVX2=ON -DUSE_AVX=ON -DUSE_SSE=ON" } - - { name: avx, cmake_command: "-DUSE_BMI2=OFF -DUSE_AVX2=OFF -DUSE_AVX=ON -DUSE_SSE=ON" } - - { name: sse, cmake_command: "-DUSE_BMI2=OFF -DUSE_AVX2=OFF -DUSE_AVX=OFF -DUSE_SSE=ON" } + - { name: sse, cmake_command: "-DUSE_AVX2=OFF" } + - { name: avx2, cmake_command: "-DUSE_AVX2=ON" } + - { name: avxvnni, cmake_command: "-DUSE_AVX2=ON -DUSE_VNNI=ON" } + - { name: avx512, cmake_command: "-DUSE_AVX2=ON -DUSE_AVX512=ON" } + - { name: avx512vnni, cmake_command: "-DUSE_AVX2=ON -DUSE_AVX512=ON -DUSE_VNNI=ON" } defaults: run: @@ -71,18 +73,20 @@ jobs: fetch-depth: 0 submodules: true - - name: Download required linux packages - if: runner.os == 'Linux' + - name: Download latest clang compiler on linux + if: matrix.target.name == 'linux-clang' run: | - sudo apt update - sudo apt install -y libtbb-dev + wget https://apt.llvm.org/llvm.sh + chmod u+x llvm.sh + echo | sudo ./llvm.sh 18 + rm llvm.sh - name: Setup msys and install required packages if: runner.os == 'Windows' uses: msys2/setup-msys2@v2 with: msystem: ${{matrix.target.msys_sys}} - install: mingw-w64-${{matrix.target.msys_env}}-${{matrix.target.c_compiler}} mingw-w64-${{matrix.target.msys_env}}-tbb mingw-w64-${{matrix.target.msys_env}}-cmake make git + install: mingw-w64-${{matrix.target.msys_env}}-${{matrix.target.c_compiler}} mingw-w64-${{matrix.target.msys_env}}-cmake make git - name: Extract the bench hash from the commit history run: | diff --git a/Rapfi/CMakeLists.txt b/Rapfi/CMakeLists.txt index 2daa9e64..a2fef98c 100644 --- a/Rapfi/CMakeLists.txt +++ b/Rapfi/CMakeLists.txt @@ -332,7 +332,7 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "C if(USE_AVX512) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512dq -mavx512bw") if(USE_VNNI) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vnni") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vnni -mavx512vl") endif() elseif(USE_AVX2) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -mfma") diff --git a/Rapfi/CMakeSettings.json b/Rapfi/CMakeSettings.json index 9a5fe428..fd4f0ab3 100644 --- a/Rapfi/CMakeSettings.json +++ b/Rapfi/CMakeSettings.json @@ -121,6 +121,34 @@ } ] }, + { + "name": "x64-Release-AVX512VNNI", + "generator": "Ninja", + "configurationType": "Release", + "buildRoot": "${projectDir}\\build\\${name}", + "installRoot": "${projectDir}\\build\\install\\${name}", + "cmakeCommandArgs": "", + "buildCommandArgs": "", + "ctestCommandArgs": "", + "inheritEnvironments": [ "msvc_x64_x64" ], + "variables": [ + { + "name": "ENABLE_LTO", + "value": "True", + "type": "BOOL" + }, + { + "name": "USE_AVX512", + "value": "True", + "type": "BOOL" + }, + { + "name": "USE_VNNI", + "value": "True", + "type": "BOOL" + } + ] + }, { "name": "x64-Release-ST", "generator": "Ninja", @@ -219,6 +247,29 @@ } ] }, + { + "name": "x64-Clang-Release-AVX512VNNI", + "generator": "Ninja", + "configurationType": "Release", + "buildRoot": "${projectDir}\\build\\${name}", + "installRoot": "${projectDir}\\build\\install\\${name}", + "cmakeCommandArgs": "", + "buildCommandArgs": "", + "ctestCommandArgs": "", + "inheritEnvironments": [ "clang_cl_x64_x64" ], + "variables": [ + { + "name": "USE_AVX512", + "value": "True", + "type": "BOOL" + }, + { + "name": "USE_VNNI", + "value": "True", + "type": "BOOL" + } + ] + }, { "name": "x64-Clang-Release-ST", "generator": "Ninja", diff --git a/Rapfi/eval/mix9litennue.cpp b/Rapfi/eval/mix9litennue.cpp index 56fdf1fc..58235551 100644 --- a/Rapfi/eval/mix9litennue.cpp +++ b/Rapfi/eval/mix9litennue.cpp @@ -54,17 +54,13 @@ constexpr int MaxOuterChanges[23] = {5, 11, 33, 107, 293, 675, 1 static Evaluation::WeightRegistry Mix9LiteWeightRegistry; constexpr int Alignment = 16; -constexpr simd::InstructionType IT512 = getInstTypeOfWidth(simd::NativeInstType, 512); constexpr simd::InstructionType IT256 = getInstTypeOfWidth(simd::NativeInstType, 256); constexpr simd::InstructionType IT128 = getInstTypeOfWidth(simd::NativeInstType, 128); template -using Batch = std::conditional_t< - simd::detail::VecBatch::NumExtra == 0, - simd::detail::VecBatch, - std::conditional_t::NumExtra == 0, - simd::detail::VecBatch, - simd::detail::VecBatch>>; +using Batch = std::conditional_t::NumExtra == 0, + simd::detail::VecBatch, + simd::detail::VecBatch>; template using Convert = simd::detail::VecCvt; diff --git a/Rapfi/eval/simdops.h b/Rapfi/eval/simdops.h index c01fde50..b931db5c 100644 --- a/Rapfi/eval/simdops.h +++ b/Rapfi/eval/simdops.h @@ -823,7 +823,11 @@ namespace detail { static FORCE_INLINE void dot4_u7i8_accum(R &acc, R a, R b) { #if defined(USE_VNNI) + #if !defined(USE_AVX512) acc = _mm_dpbusd_avx_epi32(acc, a, b); + #else + acc = _mm_dpbusd_epi32(acc, a, b); + #endif #else R product0 = simde_mm_maddubs_epi16(a, b); product0 = simde_mm_madd_epi16(product0, simde_mm_set1_epi16(1)); @@ -840,8 +844,13 @@ namespace detail { R low7 = simde_mm_andnot_si128(highest_bit, a); #if defined(USE_VNNI) + #if !defined(USE_AVX512) msb = _mm_dpbusd_avx_epi32(_mm_setzero_si128(), msb, b); // 0 or 128 low7 = _mm_dpbusd_avx_epi32(_mm_setzero_si128(), low7, b); + #else + msb = _mm_dpbusd_epi32(_mm_setzero_si128(), msb, b); // 0 or 128 + low7 = _mm_dpbusd_epi32(_mm_setzero_si128(), low7, b); + #endif #else // Multiply a * b in two parts and accumulate neighbouring outputs into int16 values msb = simde_mm_maddubs_epi16(msb, b); // 0 or 128 @@ -877,7 +886,11 @@ namespace detail { static FORCE_INLINE void dot4_u7i8_accum(R &acc, R a, R b) { #if defined(USE_VNNI) + #if !defined(USE_AVX512) acc = _mm256_dpbusd_avx_epi32(acc, a, b); + #else + acc = _mm256_dpbusd_epi32(acc, a, b); + #endif #else R product0 = simde_mm256_maddubs_epi16(a, b); product0 = simde_mm256_madd_epi16(product0, simde_mm256_set1_epi16(1)); @@ -894,8 +907,13 @@ namespace detail { R low7 = simde_mm256_andnot_si256(highest_bit, a); #if defined(USE_VNNI) + #if !defined(USE_AVX512) msb = _mm256_dpbusd_avx_epi32(_mm256_setzero_si256(), msb, b); // 0 or 128 low7 = _mm256_dpbusd_avx_epi32(_mm256_setzero_si256(), low7, b); + #else + msb = _mm256_dpbusd_epi32(_mm256_setzero_si256(), msb, b); // 0 or 128 + low7 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), low7, b); + #endif #else // Multiply a * b in two parts and accumulate neighbouring outputs into int16 values msb = simde_mm256_maddubs_epi16(msb, b); // 0 or 128