diff --git a/candle-core/src/cpu/avx.rs b/candle-core/src/cpu/avx.rs
index 9398a3460a..113fc14ced 100644
--- a/candle-core/src/cpu/avx.rs
+++ b/candle-core/src/cpu/avx.rs
@@ -1,10 +1,10 @@
-use super::{Cpu, CpuF16};
+use super::{Cpu, CpuBF16, CpuF16};
 #[cfg(target_arch = "x86")]
 use core::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use core::arch::x86_64::*;
-use half::f16;
+use half::{bf16, f16};
 
 pub struct CurrentCpu {}
 
@@ -146,3 +146,76 @@ impl CpuF16 for CurrentCpuF16 {
         *y = _mm_cvtss_f32(_mm_hadd_ps(t1, t1));
     }
 }
+
+pub struct CurrentCpuBF16 {}
+impl CpuBF16 for CurrentCpuBF16 {
+    type Unit = __m256;
+    type Array = [__m256; ARR];
+
+    const STEP: usize = STEP;
+    const EPR: usize = EPR;
+
+    fn n() -> usize {
+        ARR
+    }
+
+    unsafe fn zero() -> Self::Unit {
+        _mm256_setzero_ps()
+    }
+
+    unsafe fn zero_array() -> Self::Array {
+        [Self::zero(); ARR]
+    }
+
+    unsafe fn from_f32(v: f32) -> Self::Unit {
+        _mm256_set1_ps(v)
+    }
+
+    unsafe fn load(mem_addr: *const bf16) -> Self::Unit {
+        // bf16 -> f32: the f32 bit pattern is the bf16 bits shifted into the
+        // upper 16 bits, so widen each lane by interleaving with zeros. Note
+        // that the f16c `_mm256_cvtph_ps` path used for f16 would be wrong
+        // here: it decodes IEEE half-precision, not bf16.
+        let v = _mm_loadu_si128(mem_addr as *const __m128i);
+        let zero = _mm_setzero_si128();
+        let lo = _mm_unpacklo_epi16(zero, v);
+        let hi = _mm_unpackhi_epi16(zero, v);
+        _mm256_set_m128(_mm_castsi128_ps(hi), _mm_castsi128_ps(lo))
+    }
+
+    unsafe fn vec_add(a: Self::Unit, b: Self::Unit) -> Self::Unit {
+        _mm256_add_ps(a, b)
+    }
+
+    unsafe fn vec_fma(a: Self::Unit, b: Self::Unit, c: Self::Unit) -> Self::Unit {
+        _mm256_add_ps(_mm256_mul_ps(b, c), a)
+    }
+
+    unsafe fn vec_store(mem_addr: *mut bf16, a: Self::Unit) {
+        // f32 -> bf16 with round-to-nearest-even via bf16::from_f32; the f16c
+        // `_mm256_cvtps_ph` used for f16 would produce f16 bits, not bf16.
+        let mut tmp = [0.0f32; 8];
+        _mm256_storeu_ps(tmp.as_mut_ptr(), a);
+        for (i, v) in tmp.iter().enumerate() {
+            *mem_addr.add(i) = bf16::from_f32(*v);
+        }
+    }
+
+    unsafe fn vec_reduce(mut x: Self::Array, y: *mut f32) {
+        let mut offset = ARR >> 1;
+        for i in 0..offset {
+            x[i] = _mm256_add_ps(x[i], x[offset + i]);
+        }
+        offset >>= 1;
+        for i in 0..offset {
+            x[i] = _mm256_add_ps(x[i], x[offset + i]);
+        }
+        offset >>= 1;
+        for i in 0..offset {
+            x[i] = _mm256_add_ps(x[i], x[offset + i]);
+        }
+        let t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), _mm256_extractf128_ps(x[0], 1));
+        let t1 = _mm_hadd_ps(t0, t0);
+        *y = _mm_cvtss_f32(_mm_hadd_ps(t1, t1));
+    }
+}
diff --git a/candle-core/src/cpu/kernels.rs b/candle-core/src/cpu/kernels.rs
index 527646d62b..64f728f63f 100644
--- a/candle-core/src/cpu/kernels.rs
+++ b/candle-core/src/cpu/kernels.rs
@@ -121,6 +121,14 @@ impl VecOps for half::bf16 {
     fn max(self, other: Self) -> Self {
         Self::max(self, other)
     }
+
+    #[inline(always)]
+    unsafe fn vec_dot(lhs: *const Self, rhs: *const Self, res: *mut Self, len: usize) {
+        // Accumulate in f32 and round once at the end to limit bf16 error.
+        let mut res_f32 = 0f32;
+        super::vec_dot_bf16(lhs, rhs, &mut res_f32, len);
+        *res = half::bf16::from_f32(res_f32);
+    }
 }
 impl VecOps for u8 {
     #[inline(always)]
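Note (not part of the patch): the scalar conversion paths above rely on bf16 being the top half of an f32. A minimal standalone check of that relationship, using the `half` crate:

```rust
use half::bf16;

// bf16 keeps an f32's sign bit, all 8 exponent bits and the top 7 mantissa
// bits, so decoding is just a 16-bit left shift of the raw bits.
fn bf16_bits_to_f32(bits: u16) -> f32 {
    f32::from_bits((bits as u32) << 16)
}

fn main() {
    let x = 3.14159_f32;
    let b = bf16::from_f32(x); // round-to-nearest-even, as used in vec_store
    assert_eq!(bf16_bits_to_f32(b.to_bits()), b.to_f32());
    println!("{x} -> {} ({})", b.to_f32(), b);
}
```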
diff --git a/candle-core/src/cpu/mod.rs b/candle-core/src/cpu/mod.rs
index be5b99128e..1ad47ff5cd 100644
--- a/candle-core/src/cpu/mod.rs
+++ b/candle-core/src/cpu/mod.rs
@@ -38,14 +38,33 @@ trait CpuF16 {
     unsafe fn from_f32(v: f32) -> Self::Unit;
     unsafe fn vec_store(mem_addr: *mut f16, a: Self::Unit);
 }
-use half::f16;
+
+#[allow(unused)]
+trait CpuBF16 {
+    type Unit;
+    type Array;
+    const STEP: usize;
+    const EPR: usize;
+
+    fn n() -> usize;
+    unsafe fn zero() -> Self::Unit;
+    unsafe fn zero_array() -> Self::Array;
+    unsafe fn load(mem_addr: *const bf16) -> Self::Unit;
+    unsafe fn vec_add(a: Self::Unit, b: Self::Unit) -> Self::Unit;
+    unsafe fn vec_fma(a: Self::Unit, b: Self::Unit, c: Self::Unit) -> Self::Unit;
+    unsafe fn vec_reduce(x: Self::Array, y: *mut f32);
+    unsafe fn from_f32(v: f32) -> Self::Unit;
+    unsafe fn vec_store(mem_addr: *mut bf16, a: Self::Unit);
+}
+
+use half::{bf16, f16};
 
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 #[cfg(target_feature = "avx")]
 pub mod avx;
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 #[cfg(target_feature = "avx")]
-pub use avx::{CurrentCpu, CurrentCpuF16};
+pub use avx::{CurrentCpu, CurrentCpuBF16, CurrentCpuF16};
 
 #[cfg(target_arch = "wasm32")]
 #[cfg(target_feature = "simd128")]
@@ -172,6 +191,34 @@ pub(crate) unsafe fn vec_dot_f16(a_row: *const f16, b_row: *const f16, c: *mut f
     *c = sumf;
 }
 
+#[cfg(target_feature = "avx")]
+#[inline(always)]
+pub(crate) unsafe fn vec_dot_bf16(a_row: *const bf16, b_row: *const bf16, c: *mut f32, k: usize) {
+    let mut sumf = 0.0f32;
+    let np = k & !(CurrentCpuBF16::STEP - 1);
+
+    let mut sum = CurrentCpuBF16::zero_array();
+    let mut ax = CurrentCpuBF16::zero_array();
+    let mut ay = CurrentCpuBF16::zero_array();
+
+    for i in (0..np).step_by(CurrentCpuBF16::STEP) {
+        for j in 0..CurrentCpuBF16::n() {
+            ax[j] = CurrentCpuBF16::load(a_row.add(i + j * CurrentCpuBF16::EPR));
+            ay[j] = CurrentCpuBF16::load(b_row.add(i + j * CurrentCpuBF16::EPR));
+
+            sum[j] = CurrentCpuBF16::vec_fma(sum[j], ax[j], ay[j]);
+        }
+    }
+
+    CurrentCpuBF16::vec_reduce(sum, &mut sumf);
+
+    // leftovers
+    for i in np..k {
+        sumf += (*a_row.add(i)).to_f32() * (*b_row.add(i)).to_f32();
+    }
+    *c = sumf;
+}
+
 #[cfg(not(target_feature = "avx"))]
 #[inline(always)]
 pub(crate) unsafe fn vec_dot_f16(a_row: *const f16, b_row: *const f16, c: *mut f32, k: usize) {
@@ -182,3 +229,14 @@ pub(crate) unsafe fn vec_dot_f16(a_row: *const f16, b_row: *const f16, c: *mut f
     }
     *c = sum;
 }
+
+#[cfg(not(target_feature = "avx"))]
+#[inline(always)]
+pub(crate) unsafe fn vec_dot_bf16(a_row: *const bf16, b_row: *const bf16, c: *mut f32, k: usize) {
+    // Scalar fallback: no SIMD main loop, every element is a "leftover".
+    let mut sum = 0.0;
+    for i in 0..k {
+        sum += (*a_row.add(i)).to_f32() * (*b_row.add(i)).to_f32();
+    }
+    *c = sum;
+}
diff --git a/candle-core/src/quantized/cuda.rs b/candle-core/src/quantized/cuda.rs
index 3c24c0e546..08df2d5bf7 100644
--- a/candle-core/src/quantized/cuda.rs
+++ b/candle-core/src/quantized/cuda.rs
@@ -423,6 +423,7 @@ impl QCudaStorage {
         match self.dtype {
             GgmlDType::F32 => deq::<f32>(&buffer, block_len, &mut out)?,
             GgmlDType::F16 => deq::<half::f16>(&buffer, block_len, &mut out)?,
+            GgmlDType::BF16 => deq::<half::bf16>(&buffer, block_len, &mut out)?,
             GgmlDType::Q4_0 => deq::<crate::quantized::BlockQ4_0>(&buffer, block_len, &mut out)?,
             GgmlDType::Q4_1 => deq::<crate::quantized::BlockQ4_1>(&buffer, block_len, &mut out)?,
             GgmlDType::Q5_0 => deq::<crate::quantized::BlockQ5_0>(&buffer, block_len, &mut out)?,
diff --git a/candle-core/src/quantized/ggml_file.rs b/candle-core/src/quantized/ggml_file.rs
index 0f7e9c118c..6108030afd 100644
--- a/candle-core/src/quantized/ggml_file.rs
+++ b/candle-core/src/quantized/ggml_file.rs
@@ -153,6 +153,7 @@ pub fn qtensor_from_ggml(
     match ggml_dtype {
         GgmlDType::F32 => from_raw_data::<f32>(raw_data, size_in_bytes, dims, device),
         GgmlDType::F16 => from_raw_data::<half::f16>(raw_data, size_in_bytes, dims, device),
+        GgmlDType::BF16 => from_raw_data::<half::bf16>(raw_data, size_in_bytes, dims, device),
         GgmlDType::Q4_0 => {
             from_raw_data::<k_quants::BlockQ4_0>(raw_data, size_in_bytes, dims, device)
         }
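Note (illustration only): `vec_dot_bf16` splits the dot product into a SIMD main loop and a scalar tail via `np = k & !(STEP - 1)`, which rounds `k` down to a multiple of `STEP`; this works because `STEP` is a power of two. A small sketch with an illustrative `STEP`:

```rust
fn main() {
    const STEP: usize = 32; // illustrative; the real value is CurrentCpuBF16::STEP
    for k in [0usize, 7, 32, 100] {
        let np = k & !(STEP - 1); // round k down to a multiple of STEP
        assert_eq!(np, (k / STEP) * STEP);
        println!("k = {k:3} -> SIMD main loop 0..{np}, scalar leftovers {np}..{k}");
    }
}
```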
diff --git a/candle-core/src/quantized/k_quants.rs b/candle-core/src/quantized/k_quants.rs
index 6210ac1e9f..2e92921954 100644
--- a/candle-core/src/quantized/k_quants.rs
+++ b/candle-core/src/quantized/k_quants.rs
@@ -5,7 +5,7 @@ use super::utils::{
 use super::GgmlDType;
 use crate::Result;
 use byteorder::{ByteOrder, LittleEndian};
-use half::f16;
+use half::{bf16, f16};
 use rayon::prelude::*;
 
 // Default to QK_K 256 rather than 64.
@@ -1963,3 +1963,47 @@ impl GgmlType for f16 {
         Ok(())
     }
 }
+
+impl GgmlType for bf16 {
+    const DTYPE: GgmlDType = GgmlDType::BF16;
+    const BLCK_SIZE: usize = 1;
+    type VecDotType = bf16;
+
+    fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
+        Self::vec_dot_unopt(n, xs, ys)
+    }
+
+    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
+        if xs.len() < n {
+            crate::bail!("size mismatch {} < {n}", xs.len())
+        }
+        if ys.len() < n {
+            crate::bail!("size mismatch {} < {n}", ys.len())
+        }
+        let mut res = 0f32;
+        unsafe { crate::cpu::vec_dot_bf16(xs.as_ptr(), ys.as_ptr(), &mut res, n) };
+        Ok(res)
+    }
+
+    fn from_float(xs: &[f32], ys: &mut [Self]) -> Result<()> {
+        if xs.len() != ys.len() {
+            crate::bail!("size mismatch {} {}", xs.len(), ys.len());
+        }
+        // TODO: vectorize
+        for (x, y) in xs.iter().zip(ys.iter_mut()) {
+            *y = bf16::from_f32(*x)
+        }
+        Ok(())
+    }
+
+    fn to_float(xs: &[Self], ys: &mut [f32]) -> Result<()> {
+        if xs.len() != ys.len() {
+            crate::bail!("size mismatch {} {}", xs.len(), ys.len());
+        }
+        // TODO: vectorize
+        for (x, y) in xs.iter().zip(ys.iter_mut()) {
+            *y = x.to_f32()
+        }
+        Ok(())
+    }
+}
diff --git a/candle-core/src/quantized/metal.rs b/candle-core/src/quantized/metal.rs
index f7f5b68ac2..031f429b99 100644
--- a/candle-core/src/quantized/metal.rs
+++ b/candle-core/src/quantized/metal.rs
@@ -55,6 +55,10 @@ impl QMetalStorage {
                 let vec: Vec<half::f16> = read_to_vec(&buffer, block_len);
                 half::f16::to_float(&vec, &mut out)?;
             }
+            GgmlDType::BF16 => {
+                let vec: Vec<half::bf16> = read_to_vec(&buffer, block_len);
+                half::bf16::to_float(&vec, &mut out)?;
+            }
             GgmlDType::Q4_0 => {
                 let vec: Vec<crate::quantized::BlockQ4_0> = read_to_vec(&buffer, block_len);
                 crate::quantized::BlockQ4_0::to_float(&vec, &mut out)?;
@@ -225,6 +229,9 @@ impl From<GgmlDType> for candle_metal_kernels::GgmlDType {
             GgmlDType::Q8K => candle_metal_kernels::GgmlDType::Q8K,
             GgmlDType::F16 => candle_metal_kernels::GgmlDType::F16,
             GgmlDType::F32 => candle_metal_kernels::GgmlDType::F32,
+            // Map BF16 to the BF16 kernels added in this patch rather than to
+            // F16: reinterpreting bf16 bits as f16 would decode garbage.
+            GgmlDType::BF16 => candle_metal_kernels::GgmlDType::BF16,
         }
     }
 }
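Usage sketch (assumes candle-core with this patch applied; `GgmlType` is re-exported from `candle_core::quantized`): a bf16 roundtrip plus the new `vec_dot` path:

```rust
use candle_core::quantized::GgmlType;
use half::bf16;

fn main() -> candle_core::Result<()> {
    let xs: Vec<f32> = (0..8).map(|i| i as f32 * 0.5).collect();
    let mut bs = vec![bf16::ZERO; xs.len()];
    bf16::from_float(&xs, &mut bs)?; // element-wise f32 -> bf16
    let mut ys = vec![0f32; bs.len()];
    bf16::to_float(&bs, &mut ys)?; // and back
    let dot = bf16::vec_dot(bs.len(), &bs, &bs)?; // accumulated in f32
    println!("roundtrip: {ys:?}, dot: {dot}");
    Ok(())
}
```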
diff --git a/candle-core/src/quantized/mod.rs b/candle-core/src/quantized/mod.rs
index 236f5a9811..d9476942b9 100644
--- a/candle-core/src/quantized/mod.rs
+++ b/candle-core/src/quantized/mod.rs
@@ -28,7 +28,7 @@ pub mod neon;
 #[cfg(target_feature = "simd128")]
 pub mod simd128;
 pub mod utils;
-use half::f16;
+use half::{bf16, f16};
 
 pub use k_quants::GgmlType;
 
@@ -134,6 +134,7 @@ impl QStorage {
 pub enum GgmlDType {
     F32,
     F16,
+    BF16,
     Q4_0,
     Q4_1,
     Q5_0,
@@ -165,6 +166,8 @@ impl GgmlDType {
             13 => Self::Q5K,
             14 => Self::Q6K,
             15 => Self::Q8K,
+            // https://github.com/ggerganov/ggml/blob/29d87fc6676e7ed0cdfdec0804b06001d9c2bb44/include/ggml.h#L389
+            30 => Self::BF16,
             _ => crate::bail!("unknown dtype for tensor {u}"),
         };
         Ok(dtype)
@@ -186,6 +189,8 @@ impl GgmlDType {
             Self::Q5K => 13,
             Self::Q6K => 14,
             Self::Q8K => 15,
+            // https://github.com/ggerganov/ggml/blob/29d87fc6676e7ed0cdfdec0804b06001d9c2bb44/include/ggml.h#L389
+            Self::BF16 => 30,
         }
     }
 
@@ -206,6 +211,7 @@
             Self::Q5K => Box::new(vec![BlockQ5K::zeros(); elem_count / BlockQ5K::BLCK_SIZE]),
             Self::Q6K => Box::new(vec![BlockQ6K::zeros(); elem_count / BlockQ6K::BLCK_SIZE]),
             Self::Q8K => Box::new(vec![BlockQ8K::zeros(); elem_count / BlockQ8K::BLCK_SIZE]),
+            Self::BF16 => Box::new(vec![bf16::zeros(); elem_count]),
         }
     }
     /// The type size for blocks in bytes.
@@ -213,7 +219,7 @@
         use k_quants::*;
         match self {
             Self::F32 => 4,
-            Self::F16 => 2,
+            Self::F16 | Self::BF16 => 2,
             Self::Q4_0 => std::mem::size_of::<BlockQ4_0>(),
             Self::Q4_1 => std::mem::size_of::<BlockQ4_1>(),
             Self::Q5_0 => std::mem::size_of::<BlockQ5_0>(),
@@ -234,7 +240,7 @@
     pub fn block_size(&self) -> usize {
         match self {
             Self::F32 => 1,
-            Self::F16 => 1,
+            Self::F16 | Self::BF16 => 1,
             Self::Q4_0 => k_quants::QK4_0,
             Self::Q4_1 => k_quants::QK4_1,
             Self::Q5_0 => k_quants::QK5_0,
@@ -422,7 +428,7 @@ thread_local! {
 impl QMatMul {
     pub fn from_arc(qtensor: std::sync::Arc<QTensor>) -> Result<Self> {
         let dequantize = match qtensor.dtype() {
-            GgmlDType::F32 | GgmlDType::F16 => true,
+            GgmlDType::F32 | GgmlDType::F16 | GgmlDType::BF16 => true,
             _ => DEQUANTIZE_ALL.with(|b| *b),
         };
         let t = if dequantize {
diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs
index 0843cc1179..dd7e9153bf 100644
--- a/candle-metal-kernels/src/lib.rs
+++ b/candle-metal-kernels/src/lib.rs
@@ -2164,6 +2164,7 @@ pub enum GgmlDType {
     Q8K,
     F16,
     F32,
+    BF16,
 }
 
 #[allow(clippy::too_many_arguments)]
@@ -2241,7 +2242,7 @@ pub fn call_quantized_matmul_mv_t(
             let align = 2;
             (nth0, nth1, align)
         }
-        GgmlDType::F16 | GgmlDType::Q8K => {
+        GgmlDType::F16 | GgmlDType::BF16 | GgmlDType::Q8K => {
             // Original implem uses rows
             let nth0 = 32;
             let nth1 = 1;
@@ -2279,6 +2280,7 @@ pub fn call_quantized_matmul_mv_t(
         GgmlDType::Q6K => "kernel_mul_mv_q6_K_f32",
         GgmlDType::Q8K => "kernel_mul_mv_q8_K_f32",
         GgmlDType::F16 => "kernel_mul_mv_f16_f32",
+        GgmlDType::BF16 => "kernel_mul_mv_bf16_f32",
         GgmlDType::F32 => "kernel_mul_mv_f32_f32",
     };
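In-crate test sketch (the `u32` helpers are module-internal, so this would live inside candle-core): id 30 matches GGML_TYPE_BF16 in the ggml header linked above, and BF16 stays unblocked at two bytes per element:

```rust
#[test]
fn bf16_dtype_roundtrip() -> crate::Result<()> {
    use crate::quantized::GgmlDType;
    let dt = GgmlDType::from_u32(30)?;
    assert_eq!(dt.to_u32(), 30);
    // Unblocked storage: block_size 1, type_size 2 bytes (like F16).
    assert_eq!((dt.block_size(), dt.type_size()), (1, 2));
    Ok(())
}
```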
diff --git a/candle-metal-kernels/src/quantized.metal b/candle-metal-kernels/src/quantized.metal
index fef6ac54f8..162b7a2d19 100644
--- a/candle-metal-kernels/src/quantized.metal
+++ b/candle-metal-kernels/src/quantized.metal
@@ -1495,8 +1495,206 @@ kernel void kernel_mul_mv_f16_f32(
     kernel_mul_mv_f16_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg);
 }
 
+#if defined(__HAVE_BFLOAT__)
+void kernel_mul_mv_bf16_f32_1row_impl(
+        device const char * src0,
+        device const char * src1,
+        device float * dst,
+        constant int64_t & ne00,
+        constant int64_t & ne01,
+        constant int64_t & ne02,
+        constant uint64_t & nb00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant int64_t & ne10,
+        constant int64_t & ne11,
+        constant int64_t & ne12,
+        constant uint64_t & nb10,
+        constant uint64_t & nb11,
+        constant uint64_t & nb12,
+        constant int64_t & ne0,
+        constant int64_t & ne1,
+        constant uint & r2,
+        constant uint & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]]) {
+
+    const int64_t r0 = tgpig.x;
+    const int64_t r1 = tgpig.y;
+    const int64_t im = tgpig.z;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
+
+    device const bfloat * x = (device const bfloat *) (src0 + offset0);
+    device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+
+    float sumf = 0;
+    if (ne00 < 128) {
+        for (int i = tiisg; i < ne00; i += 32) {
+            sumf += (float) x[i] * (float) y[i];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    } else {
+        device const bfloat4 * x4 = (device const bfloat4 *) x;
+        device const float4 * y4 = (device const float4 *) y;
+        for (int i = tiisg; i < ne00/4; i += 32) {
+            for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_bf16_f32_1row")]]
+kernel void kernel_mul_mv_bf16_f32_1row(
+        device const char * src0,
+        device const char * src1,
+        device float * dst,
+        constant int64_t & ne00,
+        constant int64_t & ne01,
+        constant int64_t & ne02,
+        constant uint64_t & nb00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant int64_t & ne10,
+        constant int64_t & ne11,
+        constant int64_t & ne12,
+        constant uint64_t & nb10,
+        constant uint64_t & nb11,
+        constant uint64_t & nb12,
+        constant int64_t & ne0,
+        constant int64_t & ne1,
+        constant uint & r2,
+        constant uint & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]]) {
+    kernel_mul_mv_bf16_f32_1row_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg);
+}
+#endif
+
+#define N_BF16_F32 4
+
+#if defined(__HAVE_BFLOAT__)
+void kernel_mul_mv_bf16_f32_impl(
+        device const char * src0,
+        device const char * src1,
+        device float * dst,
+        constant int64_t & ne00,
+        constant int64_t & ne01,
+        constant int64_t & ne02,
+        constant uint64_t & nb00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant int64_t & ne10,
+        constant int64_t & ne11,
+        constant int64_t & ne12,
+        constant uint64_t & nb10,
+        constant uint64_t & nb11,
+        constant uint64_t & nb12,
+        constant int64_t & ne0,
+        constant int64_t & ne1,
+        constant uint & r2,
+        constant uint & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]]) {
+
+    const int64_t r0 = tgpig.x;
+    const int64_t rb = tgpig.y*N_BF16_F32;
+    const int64_t im = tgpig.z;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
+
+    device const bfloat * x = (device const bfloat *) (src0 + offset0);
+
+    if (ne00 < 128) {
+        for (int row = 0; row < N_BF16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00; i += 32) {
+                sumf += (float) x[i] * (float) y[i];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    } else {
+        device const bfloat4 * x4 = (device const bfloat4 *) x;
+        for (int row = 0; row < N_BF16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+            device const float4 * y4 = (device const float4 *) y;
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00/4; i += 32) {
+                for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    }
+}
+
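+// One threadgroup handles N_BF16_F32 consecutive rows of src1; within it each
+// simdgroup of 32 threads strides over the shared src0 row and simd_sum folds
+// the per-lane partial products into one f32 that lane 0 writes out.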
+[[host_name("kernel_mul_mv_bf16_f32")]]
+kernel void kernel_mul_mv_bf16_f32(
+        device const char * src0,
+        device const char * src1,
+        device float * dst,
+        constant int64_t & ne00,
+        constant int64_t & ne01,
+        constant int64_t & ne02,
+        constant uint64_t & nb00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant int64_t & ne10,
+        constant int64_t & ne11,
+        constant int64_t & ne12,
+        constant uint64_t & nb10,
+        constant uint64_t & nb11,
+        constant uint64_t & nb12,
+        constant int64_t & ne0,
+        constant int64_t & ne1,
+        constant uint & r2,
+        constant uint & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]]) {
+    kernel_mul_mv_bf16_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg);
+}
+#endif
+
+#if defined(__HAVE_BFLOAT__)
 // Assumes row size (ne00) is a multiple of 4
-kernel void kernel_mul_mv_f16_f32_l4(
+kernel void kernel_mul_mv_bf16_f32_l4(
         device const char * src0,
         device const char * src1,
         device float * dst,
@@ -1528,7 +1726,7 @@ kernel void kernel_mul_mv_f16_f32_l4(
     const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
 
-    device const half4 * x4 = (device const half4 *) (src0 + offset0);
+    device const bfloat4 * x4 = (device const bfloat4 *) (src0 + offset0);
 
     for (int r1 = 0; r1 < nrows; ++r1) {
         device const float4 * y4 = (device const float4 *) (src1 + r1*nb11 + im*nb12);
@@ -1544,6 +1742,7 @@ kernel void kernel_mul_mv_f16_f32_l4(
         }
     }
 }
+#endif
 
 kernel void kernel_alibi_f32(
     device const float * src0,
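End-to-end sketch (assumes this patch applied; `QTensor::quantize` and `dequantize` are existing candle-core APIs): quantizing an f32 tensor to BF16 on the CPU and reading it back goes through the `bf16::from_float`/`to_float` implementations added above:

```rust
use candle_core::quantized::{GgmlDType, QTensor};
use candle_core::{Device, Tensor};

fn main() -> candle_core::Result<()> {
    let dev = Device::Cpu;
    let t = Tensor::arange(0f32, 8f32, &dev)?.reshape((2, 4))?;
    let qt = QTensor::quantize(&t, GgmlDType::BF16)?; // stores raw bf16 values
    let back = qt.dequantize(&dev)?; // f32 tensor again
    println!("{back}");
    Ok(())
}
```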