From 780faefe73b66cb655978715ab264bc25eb4e55a Mon Sep 17 00:00:00 2001 From: Shengchen Date: Tue, 17 Dec 2024 09:13:51 +0800 Subject: [PATCH 1/4] [SYCLomatic] Add 3 surface driver API migration. (#2544) Signed-off-by: Chen, Sheng S --- clang/lib/DPCT/AnalysisInfo.cpp | 2 +- clang/lib/DPCT/RuleInfra/MapNames.cpp | 7 +++++ clang/lib/DPCT/RulesLang/APINamesTexture.inc | 29 +++++++++++++++++++ clang/lib/DPCT/RulesLang/RulesLangTexture.cpp | 28 +++++++++--------- clang/lib/DPCT/SrcAPI/APINames.inc | 6 ++-- clang/runtime/dpct-rt/include/dpct/image.hpp | 22 ++++++++++++-- clang/test/dpct/surface.cu | 28 ++++++++++++++++++ .../texture/surface_object_bindless_image.cu | 14 +++++++++ 8 files changed, 117 insertions(+), 19 deletions(-) diff --git a/clang/lib/DPCT/AnalysisInfo.cpp b/clang/lib/DPCT/AnalysisInfo.cpp index ea00d8ecbe09..d410a7e8ba57 100644 --- a/clang/lib/DPCT/AnalysisInfo.cpp +++ b/clang/lib/DPCT/AnalysisInfo.cpp @@ -5036,7 +5036,7 @@ void DeviceFunctionDecl::buildTextureObjectParamsInfo( std::string ParamName = DpctGlobalInfo::getUnqualifiedTypeName(Param->getType()); if (ParamName == "cudaTextureObject_t" || - ParamName == "cudaSurfaceObject_t") { + ParamName == "cudaSurfaceObject_t" || ParamName == "CUsurfObject") { TextureObjectList[Idx] = std::make_shared(Param); } } diff --git a/clang/lib/DPCT/RuleInfra/MapNames.cpp b/clang/lib/DPCT/RuleInfra/MapNames.cpp index 4453d03da8d8..58249f1b6d94 100644 --- a/clang/lib/DPCT/RuleInfra/MapNames.cpp +++ b/clang/lib/DPCT/RuleInfra/MapNames.cpp @@ -524,6 +524,13 @@ void MapNames::setExplicitNamespaceMap( "ext::oneapi::experimental::unsampled_image_handle" : getDpctNamespace() + "image_wrapper_base_p", HelperFeatureEnum::device_ext)}, + {"CUsurfObject", + std::make_shared( + DpctGlobalInfo::useExtBindlessImages() + ? 
getClNamespace() + + "ext::oneapi::experimental::unsampled_image_handle" + : getDpctNamespace() + "image_wrapper_base_p", + HelperFeatureEnum::device_ext)}, {"textureReference", std::make_shared(getDpctNamespace() + "image_wrapper_base", HelperFeatureEnum::device_ext)}, diff --git a/clang/lib/DPCT/RulesLang/APINamesTexture.inc b/clang/lib/DPCT/RulesLang/APINamesTexture.inc index 77d26077e84a..4d9bf13f820b 100644 --- a/clang/lib/DPCT/RulesLang/APINamesTexture.inc +++ b/clang/lib/DPCT/RulesLang/APINamesTexture.inc @@ -72,6 +72,14 @@ ASSIGNABLE_FACTORY(CONDITIONAL_FACTORY_ENTRY( ARG(0), QUEUESTR)), DELETER_FACTORY_ENTRY("cudaDestroySurfaceObject", ARG(0)))) +ASSIGNABLE_FACTORY(CONDITIONAL_FACTORY_ENTRY( + UseExtBindlessImages, + CALL_FACTORY_ENTRY("cuSurfObjectDestroy", + CALL(MapNames::getDpctNamespace() + + "experimental::destroy_bindless_image", + ARG(0), QUEUESTR)), + DELETER_FACTORY_ENTRY("cuSurfObjectDestroy", ARG(0)))) + CONDITIONAL_FACTORY_ENTRY( UseExtBindlessImages, ASSIGNABLE_FACTORY(ASSIGN_FACTORY_ENTRY( @@ -120,6 +128,17 @@ CONDITIONAL_FACTORY_ENTRY( ASSIGNABLE_FACTORY(ASSIGN_FACTORY_ENTRY( "cudaGetSurfaceObjectResourceDesc", DEREF(0), MEMBER_CALL(ARG(1), true, "get_data"))))) + +CONDITIONAL_FACTORY_ENTRY( + UseExtBindlessImages, + ASSIGNABLE_FACTORY(ASSIGN_FACTORY_ENTRY( + "cuSurfObjectGetResourceDesc", DEREF(0), + CALL(MapNames::getDpctNamespace() + "experimental::get_data", ARG(1)))), + FEATURE_REQUEST_FACTORY(HelperFeatureEnum::device_ext, + ASSIGNABLE_FACTORY(ASSIGN_FACTORY_ENTRY( + "cuSurfObjectGetResourceDesc", DEREF(0), + MEMBER_CALL(ARG(1), true, "get_data"))))) + CONDITIONAL_FACTORY_ENTRY( UseExtBindlessImages, FEATURE_REQUEST_FACTORY( @@ -263,6 +282,16 @@ FEATURE_REQUEST_FACTORY(HelperFeatureEnum::device_ext, "create_image_wrapper", DEREF(1))))) +FEATURE_REQUEST_FACTORY(HelperFeatureEnum::device_ext, + ASSIGNABLE_FACTORY(ASSIGN_FACTORY_ENTRY( + "cuSurfObjectCreate", DEREF(0), + CALL(DpctGlobalInfo::useExtBindlessImages() + ? 
MapNames::getDpctNamespace() +
+                             "experimental::create_bindless_image"
+                       : MapNames::getDpctNamespace() +
+                             "create_image_wrapper",
+                   DEREF(1)))))
+
 ENTRY_UNSUPPORTED("cudaGetTextureObjectResourceViewDesc",
                   Diagnostics::API_NOT_MIGRATED)
 
diff --git a/clang/lib/DPCT/RulesLang/RulesLangTexture.cpp b/clang/lib/DPCT/RulesLang/RulesLangTexture.cpp index 1eaa360b63cc..c6bf90aa86e9 100644 --- a/clang/lib/DPCT/RulesLang/RulesLangTexture.cpp +++ b/clang/lib/DPCT/RulesLang/RulesLangTexture.cpp @@ -496,12 +496,11 @@ void TextureRule::registerMatcher(MatchFinder &MF) { ) .bind("tex"), this); - MF.addMatcher( - typeLoc( - loc(qualType(hasDeclaration(typedefDecl(hasAnyName( - "cudaTextureObject_t", "cudaSurfaceObject_t", "CUtexObject")))))) - .bind("texObj"), - this); + MF.addMatcher(typeLoc(loc(qualType(hasDeclaration(typedefDecl(hasAnyName( + "cudaTextureObject_t", "cudaSurfaceObject_t", + "CUsurfObject", "CUtexObject")))))) + .bind("texObj"), + this); MF.addMatcher( memberExpr( hasObjectExpression(hasType(type(hasUnqualifiedDesugaredType( @@ -516,13 +515,13 @@ void TextureRule::registerMatcher(MatchFinder &MF) { loc(qualType(hasDeclaration(namedDecl(hasAnyName( "cudaChannelFormatDesc", "cudaChannelFormatKind", "cudaTextureDesc", "cudaResourceDesc", "cudaResourceType", - "cudaTextureAddressMode", "cudaTextureFilterMode", "cudaArray", - "cudaArray_t", "CUarray_st", "CUarray", "CUarray_format", - "CUarray_format_enum", "CUresourcetype", "CUresourcetype_enum", - "CUaddress_mode", "CUaddress_mode_enum", "CUfilter_mode", - "CUfilter_mode_enum", "CUDA_RESOURCE_DESC", "CUDA_TEXTURE_DESC", - "CUtexref", "textureReference", "cudaMipmappedArray", - "cudaMipmappedArray_t")))))) + "CUDA_RESOURCE_DESC", "cudaTextureAddressMode", + "cudaTextureFilterMode", "cudaArray", "cudaArray_t", "CUarray_st", + "CUarray", "CUarray_format", "CUarray_format_enum", + "CUresourcetype", "CUresourcetype_enum", "CUaddress_mode", + "CUaddress_mode_enum", "CUfilter_mode", "CUfilter_mode_enum", + "CUDA_TEXTURE_DESC", "CUtexref", "textureReference", + "cudaMipmappedArray", "cudaMipmappedArray_t")))))) .bind("texType"), this); @@ -571,6 +570,9 @@ void TextureRule::registerMatcher(MatchFinder &MF) { "cudaCreateSurfaceObject", "cudaDestroySurfaceObject", "cudaGetSurfaceObjectResourceDesc", + "cuSurfObjectCreate", + "cuSurfObjectGetResourceDesc", + "cuSurfObjectDestroy", "cuArray3DCreate_v2", "cuArrayCreate_v2", "cuArrayDestroy", diff --git a/clang/lib/DPCT/SrcAPI/APINames.inc b/clang/lib/DPCT/SrcAPI/APINames.inc index 4741aa840d03..082346b1bf31 100644 --- a/clang/lib/DPCT/SrcAPI/APINames.inc +++ b/clang/lib/DPCT/SrcAPI/APINames.inc @@ -1992,9 +1992,9 @@ ENTRY(cuTexObjectGetResourceViewDesc, cuTexObjectGetResourceViewDesc, false, NO_ ENTRY(cuTexObjectGetTextureDesc, cuTexObjectGetTextureDesc, true, NO_FLAG, P4, "Successful") // Surface Object Management -ENTRY(cuSurfObjectCreate, cuSurfObjectCreate, false, NO_FLAG, P4, "comment") -ENTRY(cuSurfObjectDestroy, cuSurfObjectDestroy, false, NO_FLAG, P4, "comment") -ENTRY(cuSurfObjectGetResourceDesc, cuSurfObjectGetResourceDesc, false, NO_FLAG, P4, "comment") +ENTRY(cuSurfObjectCreate, cuSurfObjectCreate, true, NO_FLAG, P4, "Successful") +ENTRY(cuSurfObjectDestroy, cuSurfObjectDestroy, true, NO_FLAG, P4, "Successful") +ENTRY(cuSurfObjectGetResourceDesc, cuSurfObjectGetResourceDesc, true, NO_FLAG, P4, "Successful") // Tensor Map Object Managment ENTRY(cuTensorMapEncodeIm2col, cuTensorMapEncodeIm2col, false, NO_FLAG, P4, "comment") diff --git a/clang/runtime/dpct-rt/include/dpct/image.hpp 
b/clang/runtime/dpct-rt/include/dpct/image.hpp index 2907423fced7..d9acf485e3c2 100644 --- a/clang/runtime/dpct-rt/include/dpct/image.hpp +++ b/clang/runtime/dpct-rt/include/dpct/image.hpp @@ -894,6 +894,13 @@ class image_accessor_ext { return detail::fetch_data()(_img_acc.read(x, _sampler)); } + /// Read data from accessor. + template + typename std::enable_if::type read_byte(float x, float y, + float z) { + return detail::fetch_data()( + _img_acc.read(sycl::float4(x / sizeof(T), y, z, 0), _sampler)); + } /// Read data from accessor. template ()( _img_acc.read(sycl::int2(x / sizeof(T), y), _sampler)); } - + /// Read data from accessor. + template + typename std::enable_if::type read_byte(float x, float y) { + return detail::fetch_data()( + _img_acc.read(sycl::float2(x / sizeof(T), y), _sampler)); + } /// Read data from accessor. template ::value> typename std::enable_if::type read_byte(CoordT x) { return detail::fetch_data()(_img_acc.read(x / sizeof(T), _sampler)); } + + /// Read data from accessor. + template + typename std::enable_if::type read_byte(float x) { + return detail::fetch_data()(_img_acc.read(x / sizeof(T), _sampler)); + } }; template class image_accessor_ext { @@ -970,7 +988,7 @@ template class image_accessor_ext /// \param info Image sampling info used to create image wrapper. /// \returns Pointer to base class of created image wrapper object. static inline image_wrapper_base *create_image_wrapper(image_data data, - sampling_info info) { + sampling_info info = {}) { image_channel channel; int dims = 1; if (data.get_data_type() == image_data_type::matrix) { diff --git a/clang/test/dpct/surface.cu b/clang/test/dpct/surface.cu index 5e1e5431705b..9aa3fbc03b1b 100644 --- a/clang/test/dpct/surface.cu +++ b/clang/test/dpct/surface.cu @@ -19,6 +19,34 @@ template __global__ void kernel(cudaSurfaceObject_t surf) { surf3Dread(&i, surf, k, j, i); } +template __global__ void test_kernel_driver(CUsurfObject surf) { + int i; + float j, k, l, m; + // CHECK: surf.read_byte(i); + surf1Dread(surf, i); + // CHECK: i = surf.read_byte(i); + surf1Dread(&i, surf, i); + // CHECK: surf.read_byte(j, i); + surf2Dread(surf, j, i); + // CHECK: i = surf.read_byte(j, i); + surf2Dread(&i, surf, j, i); + // CHECK: surf.read_byte(k, j, i); + surf3Dread(surf, k, j, i); + // CHECK: i = surf.read_byte(k, j, i); + surf3Dread(&i, surf, k, j, i); +} +void test_driver() { + // CHECK: dpct::image_wrapper_base_p surf; + CUsurfObject surf; + // CHECK: dpct::image_data pResDesc; + CUDA_RESOURCE_DESC pResDesc; + // CHECK: surf = dpct::create_image_wrapper(pResDesc); + cuSurfObjectCreate(&surf, &pResDesc); + // CHECK: delete surf; + cuSurfObjectDestroy(surf); + // CHECK: pResDesc = surf->get_data(); + cuSurfObjectGetResourceDesc(&pResDesc, surf); +} static texture tex21; __device__ void device01() { diff --git a/clang/test/dpct/texture/surface_object_bindless_image.cu b/clang/test/dpct/texture/surface_object_bindless_image.cu index eb5f49ddbf4e..7b6cc4ba0645 100644 --- a/clang/test/dpct/texture/surface_object_bindless_image.cu +++ b/clang/test/dpct/texture/surface_object_bindless_image.cu @@ -1,6 +1,8 @@ // RUN: dpct --format-range=none --use-experimental-features=bindless_images -out-root %T/texture/surface_object_bindless_image %s --cuda-include-path="%cuda-path/include" -- -x cuda --cuda-host-only -std=c++14 // RUN: FileCheck --input-file %T/texture/surface_object_bindless_image/surface_object_bindless_image.dp.cpp --match-full-lines %s // RUN: %if build_lit %{icpx -c -fsycl 
%T/texture/surface_object_bindless_image/surface_object_bindless_image.dp.cpp -o %T/texture/surface_object_bindless_image/surface_object_bindless_image.dp.o %} +#include +#include #include #include @@ -46,6 +48,18 @@ template __global__ void kernel2(cudaSurfaceObject_t surf) { // CHECK: i = dpct::experimental::fetch_image_by_byte(surf, sycl::int3(k, j, i)); surf3Dread(&i, surf, k, j, i); } +void surface_driver_function() { + // CHECK: sycl::ext::oneapi::experimental::unsampled_image_handle surf; + CUsurfObject surf; + // CHECK: dpct::image_data pResDesc; + CUDA_RESOURCE_DESC pResDesc; + // CHECK: surf = dpct::experimental::create_bindless_image(pResDesc); + cuSurfObjectCreate(&surf, &pResDesc); + // CHECK: dpct::experimental::destroy_bindless_image(surf, dpct::get_in_order_queue()); + cuSurfObjectDestroy(surf); + // CHECK: pResDesc = dpct::experimental::get_data(surf); + cuSurfObjectGetResourceDesc(&pResDesc, surf); +} int main() { // CHECK: sycl::ext::oneapi::experimental::unsampled_image_handle surf; cudaSurfaceObject_t surf; From e4dbf0048201fee3e89399d8bd0683e06498a8ea Mon Sep 17 00:00:00 2001 From: "Jiang, Zhiwei" Date: Tue, 17 Dec 2024 13:57:37 +0800 Subject: [PATCH 2/4] [SYCLomatic] Migrate CUBLASLT_EPILOGUE_DGELU to dpct::blas_gemm::experimental::epilogue_t::dgelu (#2569) Signed-off-by: Jiang, Zhiwei --- clang/lib/DPCT/RulesMathLib/MapNamesBlas.cpp | 3 +++ clang/test/dpct/cublaslt.cu | 2 ++ 2 files changed, 5 insertions(+) diff --git a/clang/lib/DPCT/RulesMathLib/MapNamesBlas.cpp b/clang/lib/DPCT/RulesMathLib/MapNamesBlas.cpp index 6bd5cc987e88..42f932c052aa 100644 --- a/clang/lib/DPCT/RulesMathLib/MapNamesBlas.cpp +++ b/clang/lib/DPCT/RulesMathLib/MapNamesBlas.cpp @@ -1642,6 +1642,9 @@ void MapNamesBlas::setExplicitNamespaceMap( {"CUBLASLT_EPILOGUE_GELU_AUX_BIAS", MapNames::getLibraryHelperNamespace() + "blas_gemm::experimental::epilogue_t::gelu_aux_bias"}, + {"CUBLASLT_EPILOGUE_DGELU", + MapNames::getLibraryHelperNamespace() + + "blas_gemm::experimental::epilogue_t::dgelu"}, {"CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE", MapNames::getLibraryHelperNamespace() + "blas_gemm::experimental::transform_desc_t::attribute::scale_type"}, diff --git a/clang/test/dpct/cublaslt.cu b/clang/test/dpct/cublaslt.cu index 888094cbd218..38a1a460b26a 100644 --- a/clang/test/dpct/cublaslt.cu +++ b/clang/test/dpct/cublaslt.cu @@ -240,6 +240,7 @@ void foo3() { // CHECK-NEXT: e = dpct::blas_gemm::experimental::epilogue_t::gelu_aux; // CHECK-NEXT: e = dpct::blas_gemm::experimental::epilogue_t::gelu_aux_bias; // CHECK-NEXT: e = dpct::blas_gemm::experimental::epilogue_t::bgradb; + // CHECK-NEXT: e = dpct::blas_gemm::experimental::epilogue_t::dgelu; cublasLtEpilogue_t e; e = CUBLASLT_EPILOGUE_DEFAULT; e = CUBLASLT_EPILOGUE_RELU; @@ -249,6 +250,7 @@ void foo3() { e = CUBLASLT_EPILOGUE_GELU_AUX; e = CUBLASLT_EPILOGUE_GELU_AUX_BIAS; e = CUBLASLT_EPILOGUE_BGRADB; + e = CUBLASLT_EPILOGUE_DGELU; } void foo4() { From a5b0ce36da6ba9c8557619ceefe92eb57dfb703d Mon Sep 17 00:00:00 2001 From: "Jiang, Zhiwei" Date: Tue, 17 Dec 2024 16:36:28 +0800 Subject: [PATCH 3/4] [SYCLomatic] Fix some bugs for migrating APPs using Pytorch as a library (#2568) Signed-off-by: Jiang, Zhiwei --- clang/lib/DPCT/AnalysisInfo.cpp | 3 +- .../DPCT/RulesInclude/InclusionHeaders.cpp | 3 +- .../DPCT/RulesInclude/InclusionHeaders.inc | 3 +- clang/lib/DPCT/RulesLang/RulesLang.cpp | 7 ++++ clang/test/dpct/pytorch/ATen.cu | 2 +- .../pytorch_api_type/pytoch_api_test1.cpp | 11 +++--- .../dpct/pytorch_api_type/pytoch_api_test2.cu | 32 ++++++++++++---- 
.../test/dpct/two_analysis_scopes/app/test.cu | 5 ++- .../dpct/two_analysis_scopes/app/test.cuh | 5 +++ .../tools/dpct/DpctOptRules/pytorch_api.yaml | 37 +++++++++++++------ 10 files changed, 77 insertions(+), 31 deletions(-) create mode 100644 clang/test/dpct/two_analysis_scopes/app/test.cuh diff --git a/clang/lib/DPCT/AnalysisInfo.cpp b/clang/lib/DPCT/AnalysisInfo.cpp index d410a7e8ba57..458724d38b1a 100644 --- a/clang/lib/DPCT/AnalysisInfo.cpp +++ b/clang/lib/DPCT/AnalysisInfo.cpp @@ -246,7 +246,8 @@ std::shared_ptr makeTextureObjectInfo(const ValueDecl *D, if (auto VD = dyn_cast(D)) { return std::make_shared(VD); } - } else if (auto PVD = dyn_cast(D)) { + } else if (const auto *PVD = dyn_cast(D); + PVD && PVD->getTypeSourceInfo()) { return std::make_shared(PVD); } return std::shared_ptr(); diff --git a/clang/lib/DPCT/RulesInclude/InclusionHeaders.cpp b/clang/lib/DPCT/RulesInclude/InclusionHeaders.cpp index 79673ca76c2e..672a76e3f5b2 100644 --- a/clang/lib/DPCT/RulesInclude/InclusionHeaders.cpp +++ b/clang/lib/DPCT/RulesInclude/InclusionHeaders.cpp @@ -200,7 +200,8 @@ void IncludesCallbacks::InclusionDirective( DpctGlobalInfo::getIncludeMapSet().push_back({IncludedFile, Repl}); } } - return; + if (Global.isInRoot(IncludedFile)) + return; } if (!Global.isInAnalysisScope(LocInfo.first) && diff --git a/clang/lib/DPCT/RulesInclude/InclusionHeaders.inc b/clang/lib/DPCT/RulesInclude/InclusionHeaders.inc index 87991dc3ae14..dad32862ed64 100644 --- a/clang/lib/DPCT/RulesInclude/InclusionHeaders.inc +++ b/clang/lib/DPCT/RulesInclude/InclusionHeaders.inc @@ -50,8 +50,7 @@ REGIST_INCLUSION("curand_kernel.h", FullMatch, Rng, Replace, false, REGIST_INCLUSION("cusparse.h", FullMatch, Sparse, Replace, false, HeaderType::HT_DPCT_SPBLAS_Utils) REGIST_INCLUSION("cusparse_v2.h", FullMatch, Sparse, Replace, false, - HeaderType::HT_DPCT_SPBLAS_Utils, - HeaderType::HT_DPCT_BLAS_Utils) + HeaderType::HT_DPCT_SPBLAS_Utils) REGIST_INCLUSION("cufft.h", FullMatch, FFT, Replace, false, HeaderType::HT_DPCT_FFT_Utils) diff --git a/clang/lib/DPCT/RulesLang/RulesLang.cpp b/clang/lib/DPCT/RulesLang/RulesLang.cpp index af65bd17b184..68ffdaf523b7 100644 --- a/clang/lib/DPCT/RulesLang/RulesLang.cpp +++ b/clang/lib/DPCT/RulesLang/RulesLang.cpp @@ -1082,6 +1082,13 @@ void TypeInDeclRule::runRule(const MatchFinder::MatchResult &Result) { } Str = Itr->second; } + } else if (llvm::StringRef(TypeStr).starts_with("cublas")) { + // In most cases, we do not need to insert blas_utils.hpp manually since + // the cublas_v2.h will be migrated. However, when the include directive + // of cublas_v2.h is not in the in-root, the migrated code cannot be + // built successfully. 
+ DpctGlobalInfo::getInstance().insertHeader( + TL->getBeginLoc(), HeaderType::HT_DPCT_BLAS_Utils); } // Add '#include ' directive to the file only once diff --git a/clang/test/dpct/pytorch/ATen.cu b/clang/test/dpct/pytorch/ATen.cu index d613ff66d474..8df6fd9be66b 100644 --- a/clang/test/dpct/pytorch/ATen.cu +++ b/clang/test/dpct/pytorch/ATen.cu @@ -5,7 +5,7 @@ // RUN: cp -r %S/pytorch_cuda_inc %T/pytorch/ATen/ // RUN: cd %T/pytorch/ATen // RUN: mkdir dpct_out -// RUN: dpct -out-root dpct_out %T/pytorch/ATen/src/ATen.cu --extra-arg="-I%T/pytorch/ATen/pytorch_cuda_inc" --cuda-include-path="%cuda-path/include" --rule-file=%T/pytorch/ATen/user_defined_rule_pytorch.yaml -- -x cuda --cuda-host-only +// RUN: dpct --out-root dpct_out %T/pytorch/ATen/src/ATen.cu --extra-arg="-I%T/pytorch/ATen/pytorch_cuda_inc" --cuda-include-path="%cuda-path/include" --rule-file=%T/pytorch/ATen/user_defined_rule_pytorch.yaml --analysis-scope-path %T/pytorch/ATen/pytorch_cuda_inc --analysis-scope-path %T/pytorch/ATen/src --in-root %T/pytorch/ATen/src // RUN: FileCheck --input-file %T/pytorch/ATen/dpct_out/ATen.dp.cpp --match-full-lines %T/pytorch/ATen/src/ATen.cu // RUN: %if build_lit %{icpx -c -fsycl -DNO_BUILD_TEST %T/pytorch/ATen/dpct_out/ATen.dp.cpp -o %T/pytorch/ATen/dpct_out/ATen.dp.o %} diff --git a/clang/test/dpct/pytorch_api_type/pytoch_api_test1.cpp b/clang/test/dpct/pytorch_api_type/pytoch_api_test1.cpp index 68bf1013d60f..fb0828bfee5c 100644 --- a/clang/test/dpct/pytorch_api_type/pytoch_api_test1.cpp +++ b/clang/test/dpct/pytorch_api_type/pytoch_api_test1.cpp @@ -8,15 +8,17 @@ #include #include -// CHECK: #include "c10/xpu/XPUStream.h" +// CHECK: #include "ATen/xpu/XPUContext.h" #include "ATen/cuda/CUDAContext.h" -class TensorStub { +namespace torch { +class Tensor { public: bool is_cuda() const { return true; } }; +} // namespace torch #define MY_CHECK(condition, message) \ do { \ @@ -25,9 +27,8 @@ class TensorStub { } \ } while (0) -int main() { - TensorStub x; - // CHECK: MY_CHECK(x.is_xpu(), "x must reside on device"); +void foo(torch::Tensor x) { + // CHECK: MY_CHECK(x. 
is_xpu(), "x must reside on device"); MY_CHECK(x.is_cuda(), "x must reside on device"); return 0; diff --git a/clang/test/dpct/pytorch_api_type/pytoch_api_test2.cu b/clang/test/dpct/pytorch_api_type/pytoch_api_test2.cu index 284f67902330..771dc3c2833f 100644 --- a/clang/test/dpct/pytorch_api_type/pytoch_api_test2.cu +++ b/clang/test/dpct/pytorch_api_type/pytoch_api_test2.cu @@ -6,12 +6,26 @@ #define AT_CUDA_CHECK(stmt) (stmt) -namespace at { +namespace c10 { +using DeviceIndex = int8_t; namespace cuda { -cudaStream_t getCurrentCUDAStream() { - return nullptr; // Return a dummy stream +class CUDAStream { +public: + CUDAStream() {} + cudaStream_t stream() { return 0; } + operator cudaStream_t() const { + return stream(); + } + cudaStream_t stream() const; +}; +CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1) { + return CUDAStream(); } } // namespace cuda +} // namespace c10 + +namespace at { +using namespace c10; } // namespace at __global__ void kernel() {} @@ -21,15 +35,19 @@ int main() { dim3 blockSize(8, 8, 1); void *args[] = {nullptr}; - // CHECK: [&](){ - // CHECK-NEXT: &static_cast(c10::xpu::getCurrentXPUStream())->parallel_for( + // CHECK:([&](){ + // CHECK-NEXT: ((sycl::queue*)(c10::xpu::getCurrentXPUStream()))->parallel_for( // CHECK-NEXT: sycl::nd_range<3>(gridSize * blockSize, blockSize), // CHECK-NEXT: [=](sycl::nd_item<3> item_ct1) { // CHECK-NEXT: kernel(); // CHECK-NEXT: }); // CHECK-NEXT: return 0; - // CHECK-NEXT:}(); + // CHECK-NEXT:}()); AT_CUDA_CHECK(cudaLaunchKernel((const void *)kernel, gridSize, blockSize, args, 0, at::cuda::getCurrentCUDAStream())); - + at::DeviceIndex d = 1; + // CHECK: c10::xpu::getCurrentXPUStream(d); + at::cuda::getCurrentCUDAStream(d); + // CHECK: dpct::queue_ptr s = &static_cast(c10::xpu::getCurrentXPUStream(). 
queue()); + cudaStream_t s = at::cuda::getCurrentCUDAStream().stream(); return 0; } diff --git a/clang/test/dpct/two_analysis_scopes/app/test.cu b/clang/test/dpct/two_analysis_scopes/app/test.cu index 247013fddf31..0819f27962e5 100644 --- a/clang/test/dpct/two_analysis_scopes/app/test.cu +++ b/clang/test/dpct/two_analysis_scopes/app/test.cu @@ -1,15 +1,16 @@ // RUN: dpct --format-range=none --out-root %T/out %s --analysis-scope-path %S --analysis-scope-path %S/../deps --cuda-include-path="%cuda-path/include" --extra-arg="-I%S/../deps" // RUN: FileCheck --match-full-lines --input-file %T/out/test.dp.cpp %s +// RUN: FileCheck --match-full-lines --input-file %T/out/test.dp.hpp %S/test.cuh // RUN: echo "// empty" > %T/out/dep.h // RUN: %if build_lit %{icpx -c -fsycl %T/out/test.dp.cpp -o %T/out/test.dp.o -I%T/out %} // CHECK: #include // CHECK-NEXT: #include -// CHECK-NEXT: #include +// CHECK-NEXT: #include "test.dp.hpp" // CHECK-NEXT: #include +#include "test.cuh" #include #include -#include void foo(cublasHandle_t handle, const half *a, const half *b, half *c, int n, half *alpha, half *beta) { diff --git a/clang/test/dpct/two_analysis_scopes/app/test.cuh b/clang/test/dpct/two_analysis_scopes/app/test.cuh new file mode 100644 index 000000000000..f90a1831f970 --- /dev/null +++ b/clang/test/dpct/two_analysis_scopes/app/test.cuh @@ -0,0 +1,5 @@ +// CHECK: #include +// CHECK-NEXT: #include +#include + +void foo_bar(cublasHandle_t h); diff --git a/clang/tools/dpct/DpctOptRules/pytorch_api.yaml b/clang/tools/dpct/DpctOptRules/pytorch_api.yaml index 169e96229d8b..a363f955fa5e 100644 --- a/clang/tools/dpct/DpctOptRules/pytorch_api.yaml +++ b/clang/tools/dpct/DpctOptRules/pytorch_api.yaml @@ -40,29 +40,42 @@ Priority: Takeover In: get_in_order_queue Out: static_cast(c10::xpu::getCurrentXPUStream()) + Includes: ["c10/xpu/XPUStream.h"] -- Rule: rule_process_is_cuda - Kind: PatternRewriter +- Rule: rule_torch_Tensor + Kind: Class Priority: Takeover - In: is_cuda - Out: is_xpu + In: torch::Tensor + Out: torch::Tensor + Methods: + - In: is_cuda + Out: $method_base is_xpu() - Rule: rule_getCurrentCUDAStream - Kind: PatternRewriter + Kind: API Priority: Takeover - In: at::cuda::getCurrentCUDAStream() - Out: | - &static_cast(c10::xpu::getCurrentXPUStream()) + In: at::cuda::getCurrentCUDAStream + Out: c10::xpu::getCurrentXPUStream($1) + Includes: ["c10/xpu/XPUStream.h"] + +- Rule: rule_CUDAStream + Kind: Class + Priority: Takeover + In: c10::cuda::CUDAStream + Out: c10::xpu::XPUStream + Methods: + - In: stream + Out: \&static_cast($method_base queue()) - Rule: rule_remove_AT_CUDA_CHECK - Kind: PatternRewriter + Kind: Macro Priority: Takeover - In: AT_CUDA_CHECK(${args}); - Out: ${args}; + In: AT_CUDA_CHECK + Out: | - Rule: rule_CUDAContext_h Kind: Header Priority: Takeover In: ATen/cuda/CUDAContext.h - Out: c10/xpu/XPUStream.h + Out: ATen/xpu/XPUContext.h Includes: [] From cfe128c8956173ac89dc5486b0f4ceb73d38d1f5 Mon Sep 17 00:00:00 2001 From: "Jiang, Zhiwei" Date: Tue, 17 Dec 2024 16:47:17 +0800 Subject: [PATCH 4/4] [SYCLomatic] Update helper function dpct::blas_gemm::experimental::matmul() to support batch mode. 
(#2571) Signed-off-by: Jiang, Zhiwei --- .../dpct-rt/include/dpct/blas_gemm_utils.hpp | 642 ++++++++++-------- 1 file changed, 344 insertions(+), 298 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/blas_gemm_utils.hpp b/clang/runtime/dpct-rt/include/dpct/blas_gemm_utils.hpp index d33c8f50b616..0e5ed9341911 100644 --- a/clang/runtime/dpct-rt/include/dpct/blas_gemm_utils.hpp +++ b/clang/runtime/dpct-rt/include/dpct/blas_gemm_utils.hpp @@ -550,7 +550,7 @@ template struct scale_d_impl { template struct set_buffer_impl { void operator()(::dnnl::memory *dnnl_memory, const void *ptr) { - auto buf = get_buffer(ptr); + auto buf = get_buffer(ptr); ::dnnl::sycl_interop::set_buffer(*dnnl_memory, buf); } }; @@ -774,331 +774,377 @@ inline sycl::event matmul(descriptor_ptr handle, matmul_desc_ptr compute_desc, "&& b_type==float && c_type==float."); } - // For non-col_major matrix, convert it to col_major. - const void *new_a = a; - const void *new_b = b; - const void *new_c = c; - void *new_d = d; - bool new_b_allocated = false; - bool new_c_allocated = false; - bool new_d_allocated = false; - size_t new_lda = a_desc->_ld, new_ldb = b_desc->_ld, new_ldc = c_desc->_ld, - new_ldd = d_desc->_ld; - std::vector transform_events; - - if (a_desc->_order != order_t::col) - new_lda = a_desc->_rows; - size_t size_of_element = - dpct::detail::library_data_size[static_cast( - a_desc->_type)] / - 8; - new_a = ::dpct::cs::malloc(size_of_element * a_desc->_cols * new_lda, *q_ptr); - sycl::event e_init; - if (a_desc->_order != order_t::col) - e_init = detail::type_dispatch( - a_desc->_type, q_ptr, a_desc->_rows, a_desc->_cols, a_desc->_ld, - a_desc->_order, (const std::int8_t *)a, new_lda, order_t::col, - (std::int8_t *)new_a, std::vector{}); - else - e_init = ::dpct::cs::memcpy(*q_ptr, (void *)new_a, a, - size_of_element * a_desc->_cols * new_lda, - ::dpct::cs::memcpy_direction::device_to_device); - - // alpha = alpha * scale_a * scale_b - sycl::event e_scale_new_a = detail::scale_new_a( - q_ptr, m, k, (void *)new_a, a_type, alpha, scale_type, vector_alpha, - device_alpha, compute_desc->_a_scale_pointer, - compute_desc->_b_scale_pointer, {e_init}); - - transform_events.push_back(e_scale_new_a); - - if (b_desc->_order != order_t::col) { - new_ldb = b_desc->_rows; - size_t size_of_element = - dpct::detail::library_data_size[static_cast( - b_desc->_type)] / - 8; - new_b = - ::dpct::cs::malloc(size_of_element * b_desc->_cols * new_ldb, *q_ptr); - new_b_allocated = true; - sycl::event e = detail::type_dispatch( - b_desc->_type, q_ptr, b_desc->_rows, b_desc->_cols, b_desc->_ld, - b_desc->_order, b, new_ldb, order_t::col, const_cast(new_b), - std::vector{}); - transform_events.push_back(e); + if (a_desc->_batch_count != b_desc->_batch_count || + a_desc->_batch_count != c_desc->_batch_count || + a_desc->_batch_count != d_desc->_batch_count) { + throw std::runtime_error("dpct::blas_gemm::experimental::matmul() does not " + "support different batch count."); } - if (!beta_is_zero && c_desc->_order != order_t::col) { - new_ldc = c_desc->_rows; - size_t size_of_element = - dpct::detail::library_data_size[static_cast( - c_desc->_type)] / - 8; - new_c = - ::dpct::cs::malloc(size_of_element * c_desc->_cols * new_ldc, *q_ptr); - new_c_allocated = true; - sycl::event e = detail::type_dispatch( - c_desc->_type, q_ptr, c_desc->_rows, c_desc->_cols, c_desc->_ld, - c_desc->_order, c, new_ldc, order_t::col, const_cast(new_c), - std::vector{}); - transform_events.push_back(e); +#ifdef DPCT_USM_LEVEL_NONE + if 
(a_desc->_batch_count != 1) { + throw std::runtime_error( + "dpct::blas_gemm::experimental::matmul() only supports " + "batch mode with USM enabled."); } +#endif - if (d_desc->_order != order_t::col) { - new_ldd = d_desc->_rows; - size_t size_of_element = - dpct::detail::library_data_size[static_cast( - d_desc->_type)] / - 8; - new_d = - ::dpct::cs::malloc(size_of_element * d_desc->_cols * new_ldd, *q_ptr); - new_d_allocated = true; - } + size_t a_elm_size = dpct::detail::library_data_size[static_cast( + a_desc->_type)] / + 8; + size_t b_elm_size = dpct::detail::library_data_size[static_cast( + b_desc->_type)] / + 8; + size_t c_elm_size = dpct::detail::library_data_size[static_cast( + c_desc->_type)] / + 8; + size_t d_elm_size = dpct::detail::library_data_size[static_cast( + d_desc->_type)] / + 8; + + static const auto matmul_single = + [](descriptor_ptr handle, matmul_desc_ptr compute_desc, const void *alpha, + const void *a, matrix_layout_ptr a_desc, const void *b, + matrix_layout_ptr b_desc, const void *beta, const void *c, + matrix_layout_ptr c_desc, void *d, matrix_layout_ptr d_desc, + ::dpct::cs::queue_ptr q_ptr, const size_t m, const size_t n, + const size_t k, const library_data_t a_type, + const library_data_t b_type, const library_data_t c_type, + const library_data_t d_type, const library_data_t scale_type, + bool vector_alpha, bool device_alpha, bool beta_is_zero, + size_t a_elm_size, size_t b_elm_size, size_t c_elm_size, + size_t d_elm_size) -> sycl::event { + // For non-col_major matrix, convert it to col_major. + const void *new_a = a; + const void *new_b = b; + const void *new_c = c; + void *new_d = d; + bool new_b_allocated = false; + bool new_c_allocated = false; + bool new_d_allocated = false; + size_t new_lda = a_desc->_ld, new_ldb = b_desc->_ld, new_ldc = c_desc->_ld, + new_ldd = d_desc->_ld; + std::vector transform_events; + + if (a_desc->_order != order_t::col) + new_lda = a_desc->_rows; + new_a = ::dpct::cs::malloc(a_elm_size * a_desc->_cols * new_lda, *q_ptr); + sycl::event e_init; + if (a_desc->_order != order_t::col) + e_init = detail::type_dispatch( + a_desc->_type, q_ptr, a_desc->_rows, a_desc->_cols, a_desc->_ld, + a_desc->_order, (const std::int8_t *)a, new_lda, order_t::col, + (std::int8_t *)new_a, std::vector{}); + else + e_init = ::dpct::cs::memcpy( + *q_ptr, (void *)new_a, a, a_elm_size * a_desc->_cols * new_lda, + ::dpct::cs::memcpy_direction::device_to_device); + + // alpha = alpha * scale_a * scale_b + sycl::event e_scale_new_a = detail::scale_new_a( + q_ptr, m, k, (void *)new_a, a_type, alpha, scale_type, vector_alpha, + device_alpha, compute_desc->_a_scale_pointer, + compute_desc->_b_scale_pointer, {e_init}); + + transform_events.push_back(e_scale_new_a); + + if (b_desc->_order != order_t::col) { + new_ldb = b_desc->_rows; + new_b = ::dpct::cs::malloc(b_elm_size * b_desc->_cols * new_ldb, *q_ptr); + new_b_allocated = true; + sycl::event e = detail::type_dispatch( + b_desc->_type, q_ptr, b_desc->_rows, b_desc->_cols, b_desc->_ld, + b_desc->_order, b, new_ldb, order_t::col, const_cast(new_b), + std::vector{}); + transform_events.push_back(e); + } - // start to call oneDNN matmul primitive - // a,d are col_major, b is row_major - const ::dnnl::memory::dim M = m; - const ::dnnl::memory::dim N = n; - const ::dnnl::memory::dim K = k; - - ::dnnl::memory::dims src_dims = {M, K}; - ::dnnl::memory::dims weights_dims = {K, N}; - ::dnnl::memory::dims bias_dims = {M, N}; - ::dnnl::memory::dims dst_dims = {M, N}; - - const ::dnnl::memory::dims src_strides = - 
compute_desc->_trans_a == oneapi::mkl::transpose::nontrans - ? ::dnnl::memory::dims{1, static_cast(new_lda)} - : ::dnnl::memory::dims{static_cast(new_lda), 1}; - const ::dnnl::memory::dims weights_strides = - compute_desc->_trans_b == oneapi::mkl::transpose::nontrans - ? ::dnnl::memory::dims{1, static_cast(new_ldb)} - : ::dnnl::memory::dims{static_cast(new_ldb), 1}; - const ::dnnl::memory::dims bias_strides = - ::dnnl::memory::dims{1, static_cast(new_ldc)}; - const ::dnnl::memory::dims dst_strides = - ::dnnl::memory::dims{1, static_cast(new_ldd)}; - - auto src_md = ::dnnl::memory::desc( - src_dims, dpct::dnnl::memory_desc_ext::to_dnnl_data_type(a_type), - src_strides); - auto weights_md = ::dnnl::memory::desc( - weights_dims, dpct::dnnl::memory_desc_ext::to_dnnl_data_type(b_type), - weights_strides); - auto bias_md = ::dnnl::memory::desc( - bias_dims, dpct::dnnl::memory_desc_ext::to_dnnl_data_type(c_type), - bias_strides); - auto dst_md = ::dnnl::memory::desc( - dst_dims, dpct::dnnl::memory_desc_ext::to_dnnl_data_type(d_type), - dst_strides); - - auto *src_mem = - new ::dnnl::memory(src_md, handle->get_engine(), DNNL_MEMORY_NONE); - auto *weights_mem = - new ::dnnl::memory(weights_md, handle->get_engine(), DNNL_MEMORY_NONE); - auto *bias_mem = - new ::dnnl::memory(bias_md, handle->get_engine(), DNNL_MEMORY_NONE); - auto *dst_mem = - new ::dnnl::memory(dst_md, handle->get_engine(), DNNL_MEMORY_NONE); + if (!beta_is_zero && c_desc->_order != order_t::col) { + new_ldc = c_desc->_rows; + new_c = ::dpct::cs::malloc(c_elm_size * c_desc->_cols * new_ldc, *q_ptr); + new_c_allocated = true; + sycl::event e = detail::type_dispatch( + c_desc->_type, q_ptr, c_desc->_rows, c_desc->_cols, c_desc->_ld, + c_desc->_order, c, new_ldc, order_t::col, const_cast(new_c), + std::vector{}); + transform_events.push_back(e); + } + + if (d_desc->_order != order_t::col) { + new_ldd = d_desc->_rows; + new_d = ::dpct::cs::malloc(d_elm_size * d_desc->_cols * new_ldd, *q_ptr); + new_d_allocated = true; + } + + // start to call oneDNN matmul primitive + // a,d are col_major, b is row_major + const ::dnnl::memory::dim M = m; + const ::dnnl::memory::dim N = n; + const ::dnnl::memory::dim K = k; + + ::dnnl::memory::dims src_dims = {M, K}; + ::dnnl::memory::dims weights_dims = {K, N}; + ::dnnl::memory::dims bias_dims = {M, N}; + ::dnnl::memory::dims dst_dims = {M, N}; + + const ::dnnl::memory::dims src_strides = + compute_desc->_trans_a == oneapi::mkl::transpose::nontrans + ? ::dnnl::memory::dims{1, static_cast(new_lda)} + : ::dnnl::memory::dims{static_cast(new_lda), 1}; + const ::dnnl::memory::dims weights_strides = + compute_desc->_trans_b == oneapi::mkl::transpose::nontrans + ? 
::dnnl::memory::dims{1, static_cast(new_ldb)} + : ::dnnl::memory::dims{static_cast(new_ldb), 1}; + const ::dnnl::memory::dims bias_strides = + ::dnnl::memory::dims{1, static_cast(new_ldc)}; + const ::dnnl::memory::dims dst_strides = + ::dnnl::memory::dims{1, static_cast(new_ldd)}; + + auto src_md = ::dnnl::memory::desc( + src_dims, dpct::dnnl::memory_desc_ext::to_dnnl_data_type(a_type), + src_strides); + auto weights_md = ::dnnl::memory::desc( + weights_dims, dpct::dnnl::memory_desc_ext::to_dnnl_data_type(b_type), + weights_strides); + auto bias_md = ::dnnl::memory::desc( + bias_dims, dpct::dnnl::memory_desc_ext::to_dnnl_data_type(c_type), + bias_strides); + auto dst_md = ::dnnl::memory::desc( + dst_dims, dpct::dnnl::memory_desc_ext::to_dnnl_data_type(d_type), + dst_strides); + + auto *src_mem = + new ::dnnl::memory(src_md, handle->get_engine(), DNNL_MEMORY_NONE); + auto *weights_mem = + new ::dnnl::memory(weights_md, handle->get_engine(), DNNL_MEMORY_NONE); + auto *bias_mem = + new ::dnnl::memory(bias_md, handle->get_engine(), DNNL_MEMORY_NONE); + auto *dst_mem = + new ::dnnl::memory(dst_md, handle->get_engine(), DNNL_MEMORY_NONE); #ifdef DPCT_USM_LEVEL_NONE - detail::type_dispatch(a_type, src_mem, new_a); - detail::type_dispatch(b_type, weights_mem, new_b); - if (!beta_is_zero) - detail::type_dispatch(c_type, bias_mem, new_c); - detail::type_dispatch(d_type, dst_mem, new_d); + detail::type_dispatch(a_type, src_mem, new_a); + detail::type_dispatch(b_type, weights_mem, new_b); + if (!beta_is_zero) + detail::type_dispatch(c_type, bias_mem, new_c); + detail::type_dispatch(d_type, dst_mem, new_d); #else - src_mem->set_data_handle(const_cast(new_a)); - weights_mem->set_data_handle(const_cast(new_b)); - if (!beta_is_zero) - bias_mem->set_data_handle(const_cast(new_c)); - dst_mem->set_data_handle(new_d); + src_mem->set_data_handle(const_cast(new_a)); + weights_mem->set_data_handle(const_cast(new_b)); + if (!beta_is_zero) + bias_mem->set_data_handle(const_cast(new_c)); + dst_mem->set_data_handle(new_d); #endif - std::unordered_map matmul_args; - matmul_args.insert({DNNL_ARG_SRC, *src_mem}); - matmul_args.insert({DNNL_ARG_WEIGHTS, *weights_mem}); - matmul_args.insert({DNNL_ARG_DST, *dst_mem}); - ::dnnl::primitive_attr matmul_attr; - - ::dnnl::post_ops matmul_ops; - if (!beta_is_zero) { - matmul_ops.append_binary(::dnnl::algorithm::binary_add, bias_md); - matmul_args.insert( - {DNNL_ARG_ATTR_MULTIPLE_POST_OP(matmul_ops.len() - 1) | DNNL_ARG_SRC_1, - *bias_mem}); - } + std::unordered_map matmul_args; + matmul_args.insert({DNNL_ARG_SRC, *src_mem}); + matmul_args.insert({DNNL_ARG_WEIGHTS, *weights_mem}); + matmul_args.insert({DNNL_ARG_DST, *dst_mem}); + ::dnnl::primitive_attr matmul_attr; + + ::dnnl::post_ops matmul_ops; + if (!beta_is_zero) { + matmul_ops.append_binary(::dnnl::algorithm::binary_add, bias_md); + matmul_args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(matmul_ops.len() - 1) | + DNNL_ARG_SRC_1, + *bias_mem}); + } - ::dnnl::memory *po_bias_mem = nullptr; - auto po_bias_md = - ::dnnl::memory::desc(::dnnl::memory::dims{M, 1}, - dpct::dnnl::memory_desc_ext::to_dnnl_data_type( - compute_desc->_bias_data_type), - ::dnnl::memory::dims{1, M}); - if (compute_desc->_epilogue == epilogue_t::bias || - compute_desc->_epilogue == epilogue_t::gelu_bias || - compute_desc->_epilogue == epilogue_t::gelu_aux_bias) { - po_bias_mem = - new ::dnnl::memory(po_bias_md, handle->get_engine(), DNNL_MEMORY_NONE); + ::dnnl::memory *po_bias_mem = nullptr; + auto po_bias_md = + ::dnnl::memory::desc(::dnnl::memory::dims{M, 
1}, + dpct::dnnl::memory_desc_ext::to_dnnl_data_type( + compute_desc->_bias_data_type), + ::dnnl::memory::dims{1, M}); + if (compute_desc->_epilogue == epilogue_t::bias || + compute_desc->_epilogue == epilogue_t::gelu_bias || + compute_desc->_epilogue == epilogue_t::gelu_aux_bias) { + po_bias_mem = new ::dnnl::memory(po_bias_md, handle->get_engine(), + DNNL_MEMORY_NONE); #ifdef DPCT_USM_LEVEL_NONE - detail::type_dispatch( - compute_desc->_bias_data_type, po_bias_mem, - compute_desc->_bias_pointer); + detail::type_dispatch( + compute_desc->_bias_data_type, po_bias_mem, + compute_desc->_bias_pointer); #else - po_bias_mem->set_data_handle(compute_desc->_bias_pointer); + po_bias_mem->set_data_handle(compute_desc->_bias_pointer); #endif - } + } - ::dnnl::memory *po_aux_mem = nullptr; - auto po_aux_md = ::dnnl::memory::desc( - ::dnnl::memory::dims{M, N}, - dpct::dnnl::memory_desc_ext::to_dnnl_data_type( - compute_desc->_epilogue_aux_data_type), - ::dnnl::memory::dims{1, - static_cast(compute_desc->_epilogue_aux_ld)}); - if (compute_desc->_epilogue == epilogue_t::dgelu) { - po_aux_mem = - new ::dnnl::memory(po_aux_md, handle->get_engine(), DNNL_MEMORY_NONE); + ::dnnl::memory *po_aux_mem = nullptr; + auto po_aux_md = ::dnnl::memory::desc( + ::dnnl::memory::dims{M, N}, + dpct::dnnl::memory_desc_ext::to_dnnl_data_type( + compute_desc->_epilogue_aux_data_type), + ::dnnl::memory::dims{ + 1, static_cast(compute_desc->_epilogue_aux_ld)}); + if (compute_desc->_epilogue == epilogue_t::dgelu) { + po_aux_mem = + new ::dnnl::memory(po_aux_md, handle->get_engine(), DNNL_MEMORY_NONE); #ifdef DPCT_USM_LEVEL_NONE - detail::type_dispatch( - compute_desc->_epilogue_aux_data_type, po_aux_mem, - compute_desc->_epilogue_aux_pointer); + detail::type_dispatch( + compute_desc->_epilogue_aux_data_type, po_aux_mem, + compute_desc->_epilogue_aux_pointer); #else - po_aux_mem->set_data_handle(compute_desc->_epilogue_aux_pointer); + po_aux_mem->set_data_handle(compute_desc->_epilogue_aux_pointer); #endif - } + } - switch (compute_desc->_epilogue) { - case epilogue_t::relu: - matmul_ops.append_eltwise(::dnnl::algorithm::eltwise_relu, 0.f, 0.f); - break; - case epilogue_t::bias: - case epilogue_t::gelu_bias: { - matmul_ops.append_binary(::dnnl::algorithm::binary_add, po_bias_md); - matmul_args.insert( - {DNNL_ARG_ATTR_MULTIPLE_POST_OP(matmul_ops.len() - 1) | DNNL_ARG_SRC_1, - *po_bias_mem}); - if (compute_desc->_epilogue == epilogue_t::gelu_bias) + switch (compute_desc->_epilogue) { + case epilogue_t::relu: + matmul_ops.append_eltwise(::dnnl::algorithm::eltwise_relu, 0.f, 0.f); + break; + case epilogue_t::bias: + case epilogue_t::gelu_bias: { + matmul_ops.append_binary(::dnnl::algorithm::binary_add, po_bias_md); + matmul_args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(matmul_ops.len() - 1) | + DNNL_ARG_SRC_1, + *po_bias_mem}); + if (compute_desc->_epilogue == epilogue_t::gelu_bias) + matmul_ops.append_eltwise(::dnnl::algorithm::eltwise_gelu_tanh, 0.f, + 0.f); + break; + } + case epilogue_t::gelu: matmul_ops.append_eltwise(::dnnl::algorithm::eltwise_gelu_tanh, 0.f, 0.f); - break; - } - case epilogue_t::gelu: - matmul_ops.append_eltwise(::dnnl::algorithm::eltwise_gelu_tanh, 0.f, 0.f); - break; - default: - break; - } + break; + default: + break; + } - matmul_attr.set_post_ops(matmul_ops); - - auto matmul_pd = ::dnnl::matmul::primitive_desc( - handle->get_engine(), src_md, weights_md, dst_md, matmul_attr); - auto matmul_prim = ::dnnl::matmul(matmul_pd); - sycl::event matmul_prim_event = ::dnnl::sycl_interop::execute( - matmul_prim, 
handle->get_engine_stream(), matmul_args, transform_events); - - // post-op implemented by separate primitives - sycl::event post_op_prim_event; - if (compute_desc->_epilogue == epilogue_t::gelu_aux || - compute_desc->_epilogue == epilogue_t::gelu_aux_bias) { - sycl::event prev_event = matmul_prim_event; - if (compute_desc->_epilogue == epilogue_t::gelu_aux_bias) { - auto po_bias_pd = ::dnnl::binary::primitive_desc( - handle->get_engine(), ::dnnl::algorithm::binary_add, dst_md, - po_bias_md, dst_md); - auto po_bias_prim = ::dnnl::binary(po_bias_pd); - std::unordered_map po_bias_args; - po_bias_args.insert({DNNL_ARG_SRC_0, *dst_mem}); - po_bias_args.insert({DNNL_ARG_SRC_1, *po_bias_mem}); - po_bias_args.insert({DNNL_ARG_DST, *dst_mem}); - sycl::event prev_event = ::dnnl::sycl_interop::execute( - po_bias_prim, handle->get_engine_stream(), po_bias_args, - {matmul_prim_event}); + matmul_attr.set_post_ops(matmul_ops); + + auto matmul_pd = ::dnnl::matmul::primitive_desc( + handle->get_engine(), src_md, weights_md, dst_md, matmul_attr); + auto matmul_prim = ::dnnl::matmul(matmul_pd); + sycl::event matmul_prim_event = + ::dnnl::sycl_interop::execute(matmul_prim, handle->get_engine_stream(), + matmul_args, transform_events); + + // post-op implemented by separate primitives + sycl::event post_op_prim_event; + if (compute_desc->_epilogue == epilogue_t::gelu_aux || + compute_desc->_epilogue == epilogue_t::gelu_aux_bias) { + sycl::event prev_event = matmul_prim_event; + if (compute_desc->_epilogue == epilogue_t::gelu_aux_bias) { + auto po_bias_pd = ::dnnl::binary::primitive_desc( + handle->get_engine(), ::dnnl::algorithm::binary_add, dst_md, + po_bias_md, dst_md); + auto po_bias_prim = ::dnnl::binary(po_bias_pd); + std::unordered_map po_bias_args; + po_bias_args.insert({DNNL_ARG_SRC_0, *dst_mem}); + po_bias_args.insert({DNNL_ARG_SRC_1, *po_bias_mem}); + po_bias_args.insert({DNNL_ARG_DST, *dst_mem}); + sycl::event prev_event = ::dnnl::sycl_interop::execute( + po_bias_prim, handle->get_engine_stream(), po_bias_args, + {matmul_prim_event}); + } + size_t size_of_element = + dpct::detail::library_data_size[static_cast( + d_desc->_type)] / + 8; + sycl::event copy_e = dpct::blas::matrix_mem_copy_async( + compute_desc->_epilogue_aux_pointer, new_d, + compute_desc->_epilogue_aux_ld, d_desc->_ld, m, n, size_of_element, + ::dpct::cs::memcpy_direction::automatic, *q_ptr, {prev_event}); + + auto gelu_pd = ::dnnl::eltwise_forward::primitive_desc( + handle->get_engine(), ::dnnl::prop_kind::forward_training, + ::dnnl::algorithm::eltwise_gelu_tanh, dst_md, dst_md); + auto gelu_prim = ::dnnl::eltwise_forward(gelu_pd); + std::unordered_map gelu_args; + gelu_args.insert({DNNL_ARG_SRC, *dst_mem}); + gelu_args.insert({DNNL_ARG_DST, *dst_mem}); + post_op_prim_event = ::dnnl::sycl_interop::execute( + gelu_prim, handle->get_engine_stream(), gelu_args, {copy_e}); + } else if (compute_desc->_epilogue == epilogue_t::dgelu) { + auto gelu_pd = ::dnnl::eltwise_forward::primitive_desc( + handle->get_engine(), ::dnnl::prop_kind::forward_training, + ::dnnl::algorithm::eltwise_gelu_tanh, po_aux_md, po_aux_md); + auto dgelu_pd = ::dnnl::eltwise_backward::primitive_desc( + handle->get_engine(), ::dnnl::algorithm::eltwise_gelu_tanh, dst_md, + dst_md, po_aux_md, gelu_pd); + auto dgelu_prim = ::dnnl::eltwise_backward(dgelu_pd); + std::unordered_map dgelu_args; + dgelu_args.insert({DNNL_ARG_SRC, *po_aux_mem}); + dgelu_args.insert({DNNL_ARG_DIFF_DST, *dst_mem}); + dgelu_args.insert({DNNL_ARG_DIFF_SRC, *dst_mem}); + post_op_prim_event = + 
::dnnl::sycl_interop::execute(dgelu_prim, handle->get_engine_stream(), + dgelu_args, {matmul_prim_event}); } - size_t size_of_element = - dpct::detail::library_data_size[static_cast( - d_desc->_type)] / - 8; - sycl::event copy_e = dpct::blas::matrix_mem_copy_async( - compute_desc->_epilogue_aux_pointer, new_d, - compute_desc->_epilogue_aux_ld, d_desc->_ld, m, n, size_of_element, - ::dpct::cs::memcpy_direction::automatic, *q_ptr, {prev_event}); - - auto gelu_pd = ::dnnl::eltwise_forward::primitive_desc( - handle->get_engine(), ::dnnl::prop_kind::forward_training, - ::dnnl::algorithm::eltwise_gelu_tanh, dst_md, dst_md); - auto gelu_prim = ::dnnl::eltwise_forward(gelu_pd); - std::unordered_map gelu_args; - gelu_args.insert({DNNL_ARG_SRC, *dst_mem}); - gelu_args.insert({DNNL_ARG_DST, *dst_mem}); - post_op_prim_event = ::dnnl::sycl_interop::execute( - gelu_prim, handle->get_engine_stream(), gelu_args, {copy_e}); - } else if (compute_desc->_epilogue == epilogue_t::dgelu) { - auto gelu_pd = ::dnnl::eltwise_forward::primitive_desc( - handle->get_engine(), ::dnnl::prop_kind::forward_training, - ::dnnl::algorithm::eltwise_gelu_tanh, po_aux_md, po_aux_md); - auto dgelu_pd = ::dnnl::eltwise_backward::primitive_desc( - handle->get_engine(), ::dnnl::algorithm::eltwise_gelu_tanh, dst_md, - dst_md, po_aux_md, gelu_pd); - auto dgelu_prim = ::dnnl::eltwise_backward(dgelu_pd); - std::unordered_map dgelu_args; - dgelu_args.insert({DNNL_ARG_SRC, *po_aux_mem}); - dgelu_args.insert({DNNL_ARG_DIFF_DST, *dst_mem}); - dgelu_args.insert({DNNL_ARG_DIFF_SRC, *dst_mem}); - post_op_prim_event = - ::dnnl::sycl_interop::execute(dgelu_prim, handle->get_engine_stream(), - dgelu_args, {matmul_prim_event}); - } - // end of calling oneDNN + // end of calling oneDNN - sycl::event absmax_d_event; - if (auto absmax_ptr = compute_desc->_absmax_d_pointer) { - absmax_d_event = detail::type_dispatch( - d_desc->_type, absmax_ptr, new_d, new_ldd, d_desc->_rows, d_desc->_cols, - q_ptr, std::vector{matmul_prim_event, post_op_prim_event}); - } + sycl::event absmax_d_event; + if (auto absmax_ptr = compute_desc->_absmax_d_pointer) { + absmax_d_event = detail::type_dispatch( + d_desc->_type, absmax_ptr, new_d, new_ldd, d_desc->_rows, + d_desc->_cols, q_ptr, + std::vector{matmul_prim_event, post_op_prim_event}); + } - sycl::event scale_d_event; - if (auto d_scale_ptr = compute_desc->_d_scale_pointer) { - scale_d_event = detail::type_dispatch( - d_desc->_type, d_scale_ptr, new_d, new_ldd, d_desc->_rows, - d_desc->_cols, q_ptr, compute_desc->_scale_type, - std::vector{matmul_prim_event, absmax_d_event, - post_op_prim_event}); - } + sycl::event scale_d_event; + if (auto d_scale_ptr = compute_desc->_d_scale_pointer) { + scale_d_event = detail::type_dispatch( + d_desc->_type, d_scale_ptr, new_d, new_ldd, d_desc->_rows, + d_desc->_cols, q_ptr, compute_desc->_scale_type, + std::vector{matmul_prim_event, absmax_d_event, + post_op_prim_event}); + } - sycl::event transform_d_event; - if (d_desc->_order != order_t::col) { - detail::type_dispatch( - d_desc->_type, q_ptr, d_desc->_rows, d_desc->_cols, new_ldd, - order_t::col, new_d, d_desc->_ld, d_desc->_order, d, - std::vector{matmul_prim_event, absmax_d_event, - post_op_prim_event}); - } + sycl::event transform_d_event; + if (d_desc->_order != order_t::col) { + detail::type_dispatch( + d_desc->_type, q_ptr, d_desc->_rows, d_desc->_cols, new_ldd, + order_t::col, new_d, d_desc->_ld, d_desc->_order, d, + std::vector{matmul_prim_event, absmax_d_event, + post_op_prim_event}); + } - sycl::event free_event = 
q_ptr->submit([&](sycl::handler &cgh) { - cgh.depends_on({transform_d_event, matmul_prim_event, absmax_d_event, - post_op_prim_event}); - cgh.host_task([=] { - delete src_mem; - delete weights_mem; - delete bias_mem; - delete dst_mem; - if (po_bias_mem) - delete po_bias_mem; - if (po_aux_mem) - delete po_aux_mem; - ::dpct::cs::free((void *)new_a, *q_ptr); - if (new_b_allocated) - ::dpct::cs::free((void *)new_b, *q_ptr); - if (new_c_allocated) - ::dpct::cs::free((void *)new_c, *q_ptr); - if (new_d_allocated) - ::dpct::cs::free((void *)new_d, *q_ptr); + sycl::event free_event = q_ptr->submit([&](sycl::handler &cgh) { + cgh.depends_on({transform_d_event, matmul_prim_event, absmax_d_event, + post_op_prim_event}); + cgh.host_task([=] { + delete src_mem; + delete weights_mem; + delete bias_mem; + delete dst_mem; + if (po_bias_mem) + delete po_bias_mem; + if (po_aux_mem) + delete po_aux_mem; + ::dpct::cs::free((void *)new_a, *q_ptr); + if (new_b_allocated) + ::dpct::cs::free((void *)new_b, *q_ptr); + if (new_c_allocated) + ::dpct::cs::free((void *)new_c, *q_ptr); + if (new_d_allocated) + ::dpct::cs::free((void *)new_d, *q_ptr); + }); }); - }); - return free_event; + return free_event; + }; + + std::vector events; + const std::byte *offsetted_a = static_cast(a); + const std::byte *offsetted_b = static_cast(b); + const std::byte *offsetted_c = static_cast(c); + std::byte *offsetted_d = static_cast(d); + for (std::uint64_t i = 0; i < a_desc->_batch_count; i++) { + sycl::event e = matmul_single( + handle, compute_desc, alpha, offsetted_a, a_desc, offsetted_b, b_desc, + beta, offsetted_c, c_desc, offsetted_d, d_desc, q_ptr, m, n, k, a_type, + b_type, c_type, d_type, scale_type, vector_alpha, device_alpha, + beta_is_zero, a_elm_size, b_elm_size, c_elm_size, d_elm_size); + events.push_back(e); + + offsetted_a += a_elm_size * a_desc->_strided_batch_offset; + offsetted_b += b_elm_size * b_desc->_strided_batch_offset; + offsetted_c += c_elm_size * c_desc->_strided_batch_offset; + offsetted_d += d_elm_size * d_desc->_strided_batch_offset; + } + + return q_ptr->single_task(events, [] {}); } class transform_desc_t {