Adjust local memory requirement for single-group scan, sort and reduce (

#833) * Adjust local memory requirement for single-group scan, sort and reduce * Remove unused variable * auto -> ::std::size_t * Change floating point multiply to integer divide * Fix underflow in SLM size check in radix sort * Guard single-work-group radix sort with _ONEDPL_USE_SINGLE_GROUP_RADIX_SORT macro set to 0
oneapi-src · Mar 6, 2023 · f58ae41 · f58ae41
1 parent 6e196bc
commit f58ae41
Show file tree

Hide file tree

Showing 6 changed files with 13 additions and 6 deletions.
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -517,7 +517,6 @@ struct __parallel_transform_scan_static_single_group_submitter<_Inclusive, _Elem
                     // This kernel is only launched for sizes less than 2^16
                     const ::std::uint16_t __item_id = __self_item.get_local_linear_id();
                     const ::std::uint16_t __subgroup_id = __subgroup.get_group_id();
-                    const ::std::uint16_t __id_in_subgroup = __subgroup.get_local_id();
                     const ::std::uint16_t __subgroup_size = __subgroup.get_local_linear_range();
 
 #if _ONEDPL_SYCL_SUB_GROUP_LOAD_STORE_PRESENT

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h
@@ -694,7 +694,7 @@ __parallel_radix_sort(_ExecutionPolicy&& __exec, _Range&& __in_rng)
 
     const auto __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec);
 
-#if __SYCL_COMPILER_VERSION >= 20230101 //for Intel(R) oneAPI C++ Compiler Classic 2023 and later
+#if _ONEDPL_USE_SINGLE_GROUP_RADIX_SORT
     //TODO: 1.to reduce number of the kernels; 2.to define work group size in runtime, depending on number of elements
     constexpr auto __wg_size = 64;
 

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h
@@ -128,13 +128,15 @@ struct __subgroup_radix_sort
     {
         assert(__n <= 1 << (sizeof(uint16_t) * 8)); //the kernel is designed for data size <= 64K
 
-        const auto __max_slm_size = __q.get_device().template get_info<sycl::info::device::local_mem_size>();
+        // Pessimistically only use half of the memory to take into account memory used by compiled kernel
+        const ::std::size_t __max_slm_size =
+            __q.get_device().template get_info<sycl::info::device::local_mem_size>() / 2;
 
         const auto __n_uniform = 1 << (::std::uint32_t(log2(__n - 1)) + 1);
         const auto __req_slm_size_val = sizeof(_T) * __n_uniform;
         const auto __req_slm_size_counters = __counter_buf_sz * sizeof(uint32_t);
 
-        return __req_slm_size_val <= __max_slm_size - __req_slm_size_counters; //counters should be placed in SLM
+        return __req_slm_size_val + __req_slm_size_counters <= __max_slm_size; //counters should be placed in SLM
     }
 
     template <typename _KernelName>

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
@@ -314,8 +314,9 @@ __parallel_transform_reduce(_ExecutionPolicy&& __exec, _ReduceOp __reduce_op, _T
     // TODO: find a way to generalize getting of reliable work-group size
     ::std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec);
     // change __work_group_size according to local memory limit
+    // Pessimistically double the memory requirement to take into account memory used by compiled kernel
     __work_group_size = oneapi::dpl::__internal::__max_local_allocation_size(::std::forward<_ExecutionPolicy>(__exec),
-                                                                             sizeof(_Tp), __work_group_size);
+                                                                             sizeof(_Tp) * 2, __work_group_size);
     if (__n <= 65536 && __work_group_size >= 512)
     {
         if (__n <= 128)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h
@@ -62,6 +62,9 @@
 #    define _ONEDPL_SYCL_REQD_SUB_GROUP_SIZE(SIZE) intel::reqd_sub_group_size(SIZE)
 #endif
 
+// TODO: re-enable when sort test passes for all devices of interest
+#define _ONEDPL_USE_SINGLE_GROUP_RADIX_SORT 0
+
 namespace __dpl_sycl
 {
 

diff --git a/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h
@@ -120,7 +120,9 @@ __pattern_transform_scan_base_impl(_ExecutionPolicy&& __exec, _Iterator1 __first
     if ((__n_uniform & (__n_uniform - 1)) != 0)
         __n_uniform = __par_backend_hetero::__dpl_bit_floor(__n) << 1;
 
-    const auto __max_slm_size = __exec.queue().get_device().template get_info<sycl::info::device::local_mem_size>();
+    // Pessimistically only use half of the memory to take into account memory used by compiled kernel
+    const ::std::size_t __max_slm_size =
+        __exec.queue().get_device().template get_info<sycl::info::device::local_mem_size>() / 2;
     const auto __req_slm_size = sizeof(_Type) * __n_uniform;
 
     constexpr int __single_group_upper_limit = 16384;