diff --git a/.github/workflows/rapfi.yml b/.github/workflows/rapfi.yml
index 835c553c..28eaec71 100644
--- a/.github/workflows/rapfi.yml
+++ b/.github/workflows/rapfi.yml
@@ -20,7 +20,7 @@ jobs:
         target:
           - {
               name: linux-gcc,
-              os: ubuntu-20.04,
+              os: ubuntu-22.04,
               c_compiler: gcc,
               cxx_compiler: g++,
               cmake_command: "",
@@ -29,8 +29,8 @@ jobs:
           - {
               name: linux-clang,
               os: ubuntu-20.04,
-              c_compiler: clang,
-              cxx_compiler: clang++,
+              c_compiler: clang-18,
+              cxx_compiler: clang++-18,
               cmake_command: "",
               shell: "bash {0}"
             }
@@ -55,9 +55,11 @@ jobs:
               shell: "msys2 {0}"
             }
         arch:
-          - { name: avx2, cmake_command: "-DUSE_BMI2=OFF -DUSE_AVX2=ON -DUSE_AVX=ON -DUSE_SSE=ON" }
-          - { name: avx, cmake_command: "-DUSE_BMI2=OFF -DUSE_AVX2=OFF -DUSE_AVX=ON -DUSE_SSE=ON" }
-          - { name: sse, cmake_command: "-DUSE_BMI2=OFF -DUSE_AVX2=OFF -DUSE_AVX=OFF -DUSE_SSE=ON" }
+          - { name: sse, cmake_command: "-DUSE_AVX2=OFF" }
+          - { name: avx2, cmake_command: "-DUSE_AVX2=ON" }
+          - { name: avxvnni, cmake_command: "-DUSE_AVX2=ON -DUSE_VNNI=ON" }
+          - { name: avx512, cmake_command: "-DUSE_AVX2=ON -DUSE_AVX512=ON" }
+          - { name: avx512vnni, cmake_command: "-DUSE_AVX2=ON -DUSE_AVX512=ON -DUSE_VNNI=ON" }
 
     defaults:
       run:
@@ -71,18 +73,23 @@ jobs:
           fetch-depth: 0
           submodules: true
 
-      - name: Download required linux packages
-        if: runner.os == 'Linux'
+      - name: Download latest clang compiler on linux
+        if: matrix.target.name == 'linux-clang'
         run: |
-          sudo apt update
-          sudo apt install -y libtbb-dev
+          # Fail fast: the "bash {0}" shell template of this matrix target has no
+          # -e flag, so without this a failed install is masked by the final rm.
+          set -e
+          wget https://apt.llvm.org/llvm.sh
+          chmod u+x llvm.sh
+          echo | sudo ./llvm.sh 18
+          rm llvm.sh
 
       - name: Setup msys and install required packages
         if: runner.os == 'Windows'
         uses: msys2/setup-msys2@v2
         with:
           msystem: ${{matrix.target.msys_sys}}
-          install: mingw-w64-${{matrix.target.msys_env}}-${{matrix.target.c_compiler}} mingw-w64-${{matrix.target.msys_env}}-tbb mingw-w64-${{matrix.target.msys_env}}-cmake make git
+          install: mingw-w64-${{matrix.target.msys_env}}-${{matrix.target.c_compiler}} mingw-w64-${{matrix.target.msys_env}}-cmake make git
 
       - name: Extract the bench hash from the commit history
         run: |
diff --git a/Rapfi/CMakeLists.txt b/Rapfi/CMakeLists.txt
index 2daa9e64..a2fef98c 100644
--- a/Rapfi/CMakeLists.txt
+++ b/Rapfi/CMakeLists.txt
@@ -332,7 +332,7 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "C
 	if(USE_AVX512)
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512dq -mavx512bw")
         if(USE_VNNI)
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vnni")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vnni -mavx512vl")
         endif()
     elseif(USE_AVX2)
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -mfma")
diff --git a/Rapfi/CMakeSettings.json b/Rapfi/CMakeSettings.json
index 9a5fe428..fd4f0ab3 100644
--- a/Rapfi/CMakeSettings.json
+++ b/Rapfi/CMakeSettings.json
@@ -121,6 +121,34 @@
         }
       ]
     },
+    {
+      "name": "x64-Release-AVX512VNNI",
+      "generator": "Ninja",
+      "configurationType": "Release",
+      "buildRoot": "${projectDir}\\build\\${name}",
+      "installRoot": "${projectDir}\\build\\install\\${name}",
+      "cmakeCommandArgs": "",
+      "buildCommandArgs": "",
+      "ctestCommandArgs": "",
+      "inheritEnvironments": [ "msvc_x64_x64" ],
+      "variables": [
+        {
+          "name": "ENABLE_LTO",
+          "value": "True",
+          "type": "BOOL"
+        },
+        {
+          "name": "USE_AVX512",
+          "value": "True",
+          "type": "BOOL"
+        },
+        {
+          "name": "USE_VNNI",
+          "value": "True",
+          "type": "BOOL"
+        }
+      ]
+    },
     {
       "name": "x64-Release-ST",
       "generator": "Ninja",
@@ -219,6 +247,29 @@
         }
       ]
     },
+    {
+      "name": "x64-Clang-Release-AVX512VNNI",
+      "generator": "Ninja",
+      "configurationType": "Release",
+      "buildRoot": "${projectDir}\\build\\${name}",
+      "installRoot": "${projectDir}\\build\\install\\${name}",
+      "cmakeCommandArgs": "",
+      "buildCommandArgs": "",
+      "ctestCommandArgs": "",
+      "inheritEnvironments": [ "clang_cl_x64_x64" ],
+      "variables": [
+        {
+          "name": "USE_AVX512",
+          "value": "True",
+          "type": "BOOL"
+        },
+        {
+          "name": "USE_VNNI",
+          "value": "True",
+          "type": "BOOL"
+        }
+      ]
+    },
     {
       "name": "x64-Clang-Release-ST",
       "generator": "Ninja",
diff --git a/Rapfi/eval/mix9litennue.cpp b/Rapfi/eval/mix9litennue.cpp
index 56fdf1fc..58235551 100644
--- a/Rapfi/eval/mix9litennue.cpp
+++ b/Rapfi/eval/mix9litennue.cpp
@@ -54,17 +54,13 @@ constexpr int MaxOuterChanges[23] = {5,     11,    33,    107,   293,   675,   1
 static Evaluation::WeightRegistry<Mix9LiteWeight> Mix9LiteWeightRegistry;
 
 constexpr int                   Alignment = 16;
-constexpr simd::InstructionType IT512     = getInstTypeOfWidth(simd::NativeInstType, 512);
 constexpr simd::InstructionType IT256     = getInstTypeOfWidth(simd::NativeInstType, 256);
 constexpr simd::InstructionType IT128     = getInstTypeOfWidth(simd::NativeInstType, 128);
 
 template <size_t Size, typename T>
-using Batch = std::conditional_t<
-    simd::detail::VecBatch<Size, T, IT512, true>::NumExtra == 0,
-    simd::detail::VecBatch<Size, T, IT512>,
-    std::conditional_t<simd::detail::VecBatch<Size, T, IT256, true>::NumExtra == 0,
-                       simd::detail::VecBatch<Size, T, IT256>,
-                       simd::detail::VecBatch<Size, T, IT128>>>;
+using Batch = std::conditional_t<simd::detail::VecBatch<Size, T, IT256, true>::NumExtra == 0,
+                                 simd::detail::VecBatch<Size, T, IT256>,
+                                 simd::detail::VecBatch<Size, T, IT128>>;
 
 template <typename FT, typename TT, typename Batch>
 using Convert = simd::detail::VecCvt<FT, TT, Batch::Inst>;
diff --git a/Rapfi/eval/simdops.h b/Rapfi/eval/simdops.h
index c01fde50..b931db5c 100644
--- a/Rapfi/eval/simdops.h
+++ b/Rapfi/eval/simdops.h
@@ -823,7 +823,11 @@ namespace detail {
         static FORCE_INLINE void dot4_u7i8_accum(R &acc, R a, R b)
         {
 #if defined(USE_VNNI)
+    #if !defined(USE_AVX512)
             acc = _mm_dpbusd_avx_epi32(acc, a, b);
+    #else
+            acc = _mm_dpbusd_epi32(acc, a, b);
+    #endif
 #else
             R product0 = simde_mm_maddubs_epi16(a, b);
             product0   = simde_mm_madd_epi16(product0, simde_mm_set1_epi16(1));
@@ -840,8 +844,13 @@ namespace detail {
             R low7 = simde_mm_andnot_si128(highest_bit, a);
 
 #if defined(USE_VNNI)
+    #if !defined(USE_AVX512)
             msb  = _mm_dpbusd_avx_epi32(_mm_setzero_si128(), msb, b);  // 0 or 128
             low7 = _mm_dpbusd_avx_epi32(_mm_setzero_si128(), low7, b);
+    #else
+            msb  = _mm_dpbusd_epi32(_mm_setzero_si128(), msb, b);  // 0 or 128
+            low7 = _mm_dpbusd_epi32(_mm_setzero_si128(), low7, b);
+    #endif
 #else
             // Multiply a * b in two parts and accumulate neighbouring outputs into int16 values
             msb  = simde_mm_maddubs_epi16(msb, b);  // 0 or 128
@@ -877,7 +886,11 @@ namespace detail {
         static FORCE_INLINE void dot4_u7i8_accum(R &acc, R a, R b)
         {
 #if defined(USE_VNNI)
+    #if !defined(USE_AVX512)
             acc = _mm256_dpbusd_avx_epi32(acc, a, b);
+    #else
+            acc = _mm256_dpbusd_epi32(acc, a, b);
+    #endif
 #else
             R product0 = simde_mm256_maddubs_epi16(a, b);
             product0   = simde_mm256_madd_epi16(product0, simde_mm256_set1_epi16(1));
@@ -894,8 +907,13 @@ namespace detail {
             R low7 = simde_mm256_andnot_si256(highest_bit, a);
 
 #if defined(USE_VNNI)
+    #if !defined(USE_AVX512)
             msb  = _mm256_dpbusd_avx_epi32(_mm256_setzero_si256(), msb, b);  // 0 or 128
             low7 = _mm256_dpbusd_avx_epi32(_mm256_setzero_si256(), low7, b);
+    #else
+            msb  = _mm256_dpbusd_epi32(_mm256_setzero_si256(), msb, b);  // 0 or 128
+            low7 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), low7, b);
+    #endif
 #else
             // Multiply a * b in two parts and accumulate neighbouring outputs into int16 values
             msb  = simde_mm256_maddubs_epi16(msb, b);  // 0 or 128