From c4a729082fbb98ea6f9b8c9eed9086433006589d Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Fri, 10 Dec 2021 20:09:50 +0300 Subject: [PATCH] [SYCL] Switch to using blocking USM free for OpenCL GPU (#4928) Whenever a kernel is enqueued on GPU, the GPU driver records the state of all USM pointers that might be used in an indirect fashion. Because of this, these pointers cannot be freed until the execution of the kernel is finished. This change addresses this problem for OpenCL by using a blocking version of free, while Level Zero already handles this by deferring USM release. The change is temporarily limited to OpenCL GPU until a bug in OpenCL CPU runtime is resolved. --- sycl/include/CL/sycl/detail/pi.h | 4 ++- sycl/plugins/opencl/pi_opencl.cpp | 52 +++++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index c27be2edb35e8..b7c15f4aa288e 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -1635,7 +1635,9 @@ __SYCL_EXPORT pi_result piextUSMSharedAlloc(void **result_ptr, pi_usm_mem_properties *properties, size_t size, pi_uint32 alignment); -/// Frees allocated USM memory +/// Indicates that the allocated USM memory is no longer needed on the runtime +/// side. The actual freeing of the memory may be done in a blocking or deferred +/// manner, e.g. to avoid issues with indirect memory access from kernels. /// /// \param context is the pi_context of the allocation /// \param ptr is the memory to be freed diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 57d2e15b96317..060d27d6ece23 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -57,6 +57,7 @@ CONSTFIX char clHostMemAllocName[] = "clHostMemAllocINTEL"; CONSTFIX char clDeviceMemAllocName[] = "clDeviceMemAllocINTEL"; CONSTFIX char clSharedMemAllocName[] = "clSharedMemAllocINTEL"; CONSTFIX char clMemFreeName[] = "clMemFreeINTEL"; +CONSTFIX char clMemBlockingFreeName[] = "clMemBlockingFreeINTEL"; CONSTFIX char clCreateBufferWithPropertiesName[] = "clCreateBufferWithPropertiesINTEL"; CONSTFIX char clSetKernelArgMemPointerName[] = "clSetKernelArgMemPointerINTEL"; @@ -969,11 +970,56 @@ pi_result piextUSMSharedAlloc(void **result_ptr, pi_context context, /// \param context is the pi_context of the allocation /// \param ptr is the memory to be freed pi_result piextUSMFree(pi_context context, void *ptr) { + // Use a blocking free to avoid issues with indirect access from kernels that + // might be still running. + clMemBlockingFreeINTEL_fn FuncPtr = nullptr; + + // We need to use clMemBlockingFreeINTEL here, however, due to a bug in OpenCL + // CPU runtime this call fails with CL_INVALID_EVENT on CPU devices in certain + // cases. As a temporary workaround, this function replicates caching of + // extension function pointers in getExtFuncFromContext, while choosing + // clMemBlockingFreeINTEL for GPU and clMemFreeINTEL for other device types. + // TODO remove this workaround when the new OpenCL CPU runtime version is + // uplifted in CI. + static_assert( + std::is_same::value); + cl_uint deviceCount; + cl_int ret_err = + clGetContextInfo(cast(context), CL_CONTEXT_NUM_DEVICES, + sizeof(cl_uint), &deviceCount, nullptr); + + if (ret_err != CL_SUCCESS || deviceCount < 1) { + return PI_INVALID_CONTEXT; + } + + std::vector devicesInCtx(deviceCount); + ret_err = clGetContextInfo(cast(context), CL_CONTEXT_DEVICES, + deviceCount * sizeof(cl_device_id), + devicesInCtx.data(), nullptr); + + if (ret_err != CL_SUCCESS) { + return PI_INVALID_CONTEXT; + } + + bool useBlockingFree = true; + for (const cl_device_id &dev : devicesInCtx) { + cl_device_type devType = CL_DEVICE_TYPE_DEFAULT; + ret_err = clGetDeviceInfo(dev, CL_DEVICE_TYPE, sizeof(cl_device_type), + &devType, nullptr); + if (ret_err != CL_SUCCESS) { + return PI_INVALID_DEVICE; + } + useBlockingFree &= devType == CL_DEVICE_TYPE_GPU; + } - clMemFreeINTEL_fn FuncPtr = nullptr; pi_result RetVal = PI_INVALID_OPERATION; - RetVal = getExtFuncFromContext(context, - &FuncPtr); + if (useBlockingFree) + RetVal = + getExtFuncFromContext( + context, &FuncPtr); + else + RetVal = getExtFuncFromContext(context, + &FuncPtr); if (FuncPtr) { RetVal = cast(FuncPtr(cast(context), ptr));