Skip to content

Commit

Permalink
Adjust local memory requirement for single-group scan, sort and reduce (
Browse files Browse the repository at this point in the history
#833)

* Adjust local memory requirement for single-group scan, sort and reduce

* Remove unused variable

* auto -> ::std::size_t

* Change floating point multiply to integer divide

* Fix underflow in SLM size check in radix sort

* Guard single-work-group radix sort with _ONEDPL_USE_SINGLE_GROUP_RADIX_SORT macro set to 0
  • Loading branch information
adamfidel authored and timmiesmith committed Mar 6, 2023
1 parent 6e196bc commit f58ae41
Show file tree
Hide file tree
Showing 6 changed files with 13 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,6 @@ struct __parallel_transform_scan_static_single_group_submitter<_Inclusive, _Elem
// This kernel is only launched for sizes less than 2^16
const ::std::uint16_t __item_id = __self_item.get_local_linear_id();
const ::std::uint16_t __subgroup_id = __subgroup.get_group_id();
const ::std::uint16_t __id_in_subgroup = __subgroup.get_local_id();
const ::std::uint16_t __subgroup_size = __subgroup.get_local_linear_range();

#if _ONEDPL_SYCL_SUB_GROUP_LOAD_STORE_PRESENT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -694,7 +694,7 @@ __parallel_radix_sort(_ExecutionPolicy&& __exec, _Range&& __in_rng)

const auto __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec);

#if __SYCL_COMPILER_VERSION >= 20230101 //for Intel(R) oneAPI C++ Compiler Classic 2023 and later
#if _ONEDPL_USE_SINGLE_GROUP_RADIX_SORT
//TODO: 1.to reduce number of the kernels; 2.to define work group size in runtime, depending on number of elements
constexpr auto __wg_size = 64;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,13 +128,15 @@ struct __subgroup_radix_sort
{
assert(__n <= 1 << (sizeof(uint16_t) * 8)); //the kernel is designed for data size <= 64K

const auto __max_slm_size = __q.get_device().template get_info<sycl::info::device::local_mem_size>();
// Pessimistically only use half of the memory to take into account memory used by compiled kernel
const ::std::size_t __max_slm_size =
__q.get_device().template get_info<sycl::info::device::local_mem_size>() / 2;

const auto __n_uniform = 1 << (::std::uint32_t(log2(__n - 1)) + 1);
const auto __req_slm_size_val = sizeof(_T) * __n_uniform;
const auto __req_slm_size_counters = __counter_buf_sz * sizeof(uint32_t);

return __req_slm_size_val <= __max_slm_size - __req_slm_size_counters; //counters should be placed in SLM
return __req_slm_size_val + __req_slm_size_counters <= __max_slm_size; //counters should be placed in SLM
}

template <typename _KernelName>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -314,8 +314,9 @@ __parallel_transform_reduce(_ExecutionPolicy&& __exec, _ReduceOp __reduce_op, _T
// TODO: find a way to generalize getting of reliable work-group size
::std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec);
// change __work_group_size according to local memory limit
// Pessimistically double the memory requirement to take into account memory used by compiled kernel
__work_group_size = oneapi::dpl::__internal::__max_local_allocation_size(::std::forward<_ExecutionPolicy>(__exec),
sizeof(_Tp), __work_group_size);
sizeof(_Tp) * 2, __work_group_size);
if (__n <= 65536 && __work_group_size >= 512)
{
if (__n <= 128)
Expand Down
3 changes: 3 additions & 0 deletions include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@
# define _ONEDPL_SYCL_REQD_SUB_GROUP_SIZE(SIZE) intel::reqd_sub_group_size(SIZE)
#endif

// TODO: re-enable when sort test passes for all devices of interest
#define _ONEDPL_USE_SINGLE_GROUP_RADIX_SORT 0

namespace __dpl_sycl
{

Expand Down
4 changes: 3 additions & 1 deletion include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,9 @@ __pattern_transform_scan_base_impl(_ExecutionPolicy&& __exec, _Iterator1 __first
if ((__n_uniform & (__n_uniform - 1)) != 0)
__n_uniform = __par_backend_hetero::__dpl_bit_floor(__n) << 1;

const auto __max_slm_size = __exec.queue().get_device().template get_info<sycl::info::device::local_mem_size>();
// Pessimistically only use half of the memory to take into account memory used by compiled kernel
const ::std::size_t __max_slm_size =
__exec.queue().get_device().template get_info<sycl::info::device::local_mem_size>() / 2;
const auto __req_slm_size = sizeof(_Type) * __n_uniform;

constexpr int __single_group_upper_limit = 16384;
Expand Down

0 comments on commit f58ae41

Please sign in to comment.