diff --git a/docs/reference/index.rst b/docs/reference/index.rst
index c2b74eabee..60baec4073 100644
--- a/docs/reference/index.rst
+++ b/docs/reference/index.rst
@@ -39,3 +39,4 @@ The MIOpen API library is structured as follows:
 * :doc:`ReLU <../doxygen/html/group___re_l_u>` (experimental)
 * :doc:`Kthvalue <../doxygen/html/group__kthvalue>` (experimental)
 * :doc:`GLU <../doxygen/html/group__glu>` (experimental)
+* :doc:`SmoothL1Loss <../doxygen/html/group___smooth_l1_loss>` (experimental)
diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt
index 60d6fe6ce6..61687bf96d 100644
--- a/driver/CMakeLists.txt
+++ b/driver/CMakeLists.txt
@@ -57,6 +57,7 @@ add_executable(MIOpenDriver
     dm_reducecalculation.cpp
     dm_rnn.cpp
     dm_rope.cpp
+    dm_smooth_l1loss.cpp
     dm_softmarginloss.cpp
     dm_softmax.cpp
     dm_t5layernorm.cpp
diff --git a/driver/dm_smooth_l1loss.cpp b/driver/dm_smooth_l1loss.cpp
new file mode 100644
index 0000000000..d700d4b9e8
--- /dev/null
+++ b/driver/dm_smooth_l1loss.cpp
@@ -0,0 +1,41 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "registry_driver_maker.hpp"
+#include "smooth_l1loss_driver.hpp"
+
+static Driver* makeDriver(const std::string& base_arg)
+{
+    if(base_arg == "smoothl1loss")
+        return new SmoothL1LossDriver<float, float>();
+    if(base_arg == "smoothl1lossfp16")
+        return new SmoothL1LossDriver<float16, float>();
+    if(base_arg == "smoothl1lossbfp16")
+        return new SmoothL1LossDriver<bfloat16, float>();
+    return nullptr;
+}
+
+REGISTER_DRIVER_MAKER(makeDriver);
diff --git a/driver/driver.hpp b/driver/driver.hpp
index d77d5d02d2..e4f7356211 100644
--- a/driver/driver.hpp
+++ b/driver/driver.hpp
@@ -314,7 +314,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz)
         "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, "
         "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], rope[bfp16|fp16], "
         "prelu[bfp16|fp16], kthvalue[bfp16|fp16], glu[bfp16|fp16], softmarginloss[bfp16|fp16], "
-        "multimarginloss[bfp16|fp16]\n");
+        "multimarginloss[bfp16|fp16], smoothl1loss[bfp16|fp16]\n");
     exit(0); // NOLINT (concurrency-mt-unsafe)
 }
 
@@ -352,6 +352,7 @@ inline std::string ParseBaseArg(int argc, char* argv[])
        arg != "kthvaluebfp16" && arg != "glu" && arg != "glufp16" && arg != "glubfp16" &&
        arg != "softmarginloss" && arg != "softmarginlossfp16" && arg != "softmarginlossbfp16" &&
        arg != "multimarginloss" && arg != "multimarginlossfp16" && arg != "multimarginlossbfp16" &&
+       arg != "smoothl1loss" && arg != "smoothl1lossfp16" && arg != "smoothl1lossbfp16" &&
        arg != "--version")
     {
         printf("FAILED: Invalid Base Input Argument\n");
diff --git a/driver/smooth_l1loss_driver.hpp b/driver/smooth_l1loss_driver.hpp
new file mode 100644
index 0000000000..bd3b35584c
--- /dev/null
+++ b/driver/smooth_l1loss_driver.hpp
@@ -0,0 +1,618 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include "InputFlags.hpp"
+#include "driver.hpp"
+#include "miopen/errors.hpp"
+#include "tensor_driver.hpp"
+#include "timer.hpp"
+
+#include <../test/ford.hpp>
+#include <../test/tensor_holder.hpp>
+#include <../test/verify.hpp>
+
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+#include <miopen/tensor_view_utils.hpp>
+
+#ifndef MLO_SMOOTH_L1LOSSMHOST_H_
+#define MLO_SMOOTH_L1LOSSMHOST_H_
+
+template <typename Tgpu, typename Tcheck>
+int32_t mloSmoothL1LossForwardRunHost(const miopenTensorDescriptor_t iDesc,
+                                      const miopenTensorDescriptor_t tDesc,
+                                      const miopenTensorDescriptor_t oDesc,
+                                      const Tgpu* input,
+                                      const Tgpu* target,
+                                      Tcheck* outputhost,
+                                      const float beta,
+                                      const miopenLossReductionMode_t reduction)
+{
+    // Treat contiguous tensors as non-contiguous tensors (for consistency)
+    auto I_tv = get_inner_expanded_tv<5>(miopen::deref(iDesc));
+    auto T_tv = get_inner_expanded_tv<5>(miopen::deref(tDesc));
+    auto O_tv = get_inner_expanded_tv<5>(miopen::deref(oDesc));
+
+    auto size       = miopen::deref(iDesc).GetElementSize();
+    double loss_sum = 0.0;
+
+    ford(size)([&](size_t i) {
+        const auto tensor_layout = tensor_layout_t<5>(I_tv, i);
+        const uint64_t Iidx      = I_tv.get_tensor_view_idx(tensor_layout);
+        const uint64_t Tidx      = T_tv.get_tensor_view_idx(tensor_layout);
+
+        // Smooth L1 point loss: quadratic while |input - target| < beta, linear outside.
+        auto diff = abs(input[Iidx] - target[Tidx]);
+        auto loss = (diff < beta ? 0.5f * diff * diff / beta : diff - 0.5f * beta);
+
+        if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
+            outputhost[O_tv.get_tensor_view_idx(tensor_layout)] = static_cast<Tcheck>(loss);
+        else
+            loss_sum += loss;
+    });
+    if(reduction == MIOPEN_LOSS_REDUCTION_MEAN)
+        loss_sum /= size;
+    if(reduction != MIOPEN_LOSS_REDUCTION_NONE)
+        outputhost[0] = static_cast<Tcheck>(loss_sum);
+
+    return miopenStatusSuccess;
+}
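+
+// Host reference for the backward pass below. With d = input - target, the input gradient is
+// d / beta * dO while |d| < beta and sign(d) * dO otherwise; the target gradient is its
+// negation. For the reduced modes only dO[0] is read; note that the 1/size factor of the
+// mean mode is applied in the GPU kernel (MIOpenSmoothL1Loss.cpp) but not here.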
+template <typename Tgpu, typename Tcheck>
+int32_t mloSmoothL1LossBackwardRunHost(const miopenTensorDescriptor_t iDesc,
+                                       const miopenTensorDescriptor_t tDesc,
+                                       const miopenTensorDescriptor_t dODesc,
+                                       const miopenTensorDescriptor_t diDesc,
+                                       const miopenTensorDescriptor_t dtDesc,
+                                       const Tgpu* input,
+                                       const Tgpu* target,
+                                       const Tgpu* dO,
+                                       Tcheck* dI,
+                                       Tcheck* dT,
+                                       const float beta,
+                                       const miopenLossReductionMode_t reduction)
+{
+    // Treat contiguous tensors as non-contiguous tensors (for consistency)
+    auto I_tv  = get_inner_expanded_tv<5>(miopen::deref(iDesc));
+    auto T_tv  = get_inner_expanded_tv<5>(miopen::deref(tDesc));
+    auto dI_tv = get_inner_expanded_tv<5>(miopen::deref(diDesc));
+    auto dT_tv = get_inner_expanded_tv<5>(miopen::deref(dtDesc));
+    auto dO_tv = get_inner_expanded_tv<5>(miopen::deref(dODesc));
+
+    auto size = miopen::deref(iDesc).GetElementSize();
+
+    par_ford(size)([&](size_t i) {
+        const auto tensor_layout = tensor_layout_t<5>(I_tv, i);
+        const uint64_t Iidx      = I_tv.get_tensor_view_idx(tensor_layout);
+        const uint64_t Tidx      = T_tv.get_tensor_view_idx(tensor_layout);
+
+        float sub  = input[Iidx] - target[Tidx];
+        float grad = 0.0f;
+
+        if(fabs(sub) < beta)
+            grad = sub / beta *
+                   dO[reduction == MIOPEN_LOSS_REDUCTION_NONE
+                          ? dO_tv.get_tensor_view_idx(tensor_layout)
+                          : 0];
+        else
+            grad = (sub >= 0 ? 1.0f : -1.0f) * dO[reduction == MIOPEN_LOSS_REDUCTION_NONE
+                                                      ? dO_tv.get_tensor_view_idx(tensor_layout)
+                                                      : 0];
+
+        if(dI)
+            dI[dI_tv.get_tensor_view_idx(tensor_layout)] = static_cast<Tcheck>(grad);
+        if(dT)
+            dT[dT_tv.get_tensor_view_idx(tensor_layout)] = static_cast<Tcheck>(-grad);
+    });
+
+    return miopenStatusSuccess;
+}
+#endif
+
+inline std::vector<int> GetStrides(std::vector<int> lengths, int contiguous)
+{
+    if(contiguous != 0 && contiguous != 1)
+        std::cerr << "Error Tensor Contiguous should be 0 or 1" << std::endl;
+    if(contiguous == 0)
+        std::swap(lengths.front(), lengths.back());
+    std::vector<int> strides(lengths.size());
+    strides.back() = 1;
+    for(int i = lengths.size() - 2; i >= 0; --i)
+        strides[i] = strides[i + 1] * lengths[i + 1];
+    if(contiguous == 0)
+        std::swap(strides.front(), strides.back());
+    return strides;
+}
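+
+// Worked example (illustrative shapes, not from the original change): for lengths {2, 3, 4},
+// contiguous == 1 yields the packed row-major strides {12, 4, 1}. With contiguous == 0 the
+// first and last lengths are swapped before the stride computation and the first and last
+// strides are swapped back afterwards, yielding {1, 2, 6}: a valid but non-contiguous layout
+// that exercises the strided code paths.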
+template <typename Tgpu, typename Tref>
+class SmoothL1LossDriver : public Driver
+{
+public:
+    SmoothL1LossDriver() : Driver()
+    {
+        miopenCreateTensorDescriptor(&inputDesc);
+        miopenCreateTensorDescriptor(&targetDesc);
+        miopenCreateTensorDescriptor(&outputDesc);
+        miopenCreateTensorDescriptor(&diDesc);
+        miopenCreateTensorDescriptor(&dtDesc);
+        miopenCreateTensorDescriptor(&doDesc);
+
+        data_type = miopen_type<Tgpu>{};
+    }
+
+    int AddCmdLineArgs() override;
+    int ParseCmdLineArgs(int argc, char* argv[]) override;
+    InputFlags& GetInputFlags() override { return inflags; }
+
+    int GetandSetData() override;
+
+    int AllocateBuffersAndCopy() override;
+
+    int RunForwardGPU() override;
+    int RunForwardCPU();
+
+    int RunBackwardGPU() override;
+    int RunBackwardCPU();
+
+    Tref GetTolerance();
+    int VerifyBackward() override;
+    int VerifyForward() override;
+    ~SmoothL1LossDriver() override
+    {
+        miopenDestroyTensorDescriptor(inputDesc);
+        miopenDestroyTensorDescriptor(targetDesc);
+        miopenDestroyTensorDescriptor(outputDesc);
+        miopenDestroyTensorDescriptor(diDesc);
+        miopenDestroyTensorDescriptor(dtDesc);
+        miopenDestroyTensorDescriptor(doDesc);
+    }
+
+private:
+    InputFlags inflags;
+
+    int forw;
+
+    miopenTensorDescriptor_t inputDesc;
+    miopenTensorDescriptor_t targetDesc;
+    miopenTensorDescriptor_t outputDesc;
+    miopenTensorDescriptor_t diDesc;
+    miopenTensorDescriptor_t dtDesc;
+    miopenTensorDescriptor_t doDesc;
+
+    std::unique_ptr<GPUMem> in_dev;
+    std::unique_ptr<GPUMem> tar_dev;
+    std::unique_ptr<GPUMem> out_dev;
+    std::unique_ptr<GPUMem> workspace_dev;
+    std::unique_ptr<GPUMem> dI_dev;
+    std::unique_ptr<GPUMem> dT_dev;
+    std::unique_ptr<GPUMem> dO_dev;
+
+    std::vector<Tgpu> in;
+    std::vector<Tgpu> tar;
+    std::vector<Tgpu> out;
+    std::vector<Tgpu> workspace;
+    std::vector<Tgpu> dI;
+    std::vector<Tgpu> dT;
+    std::vector<Tgpu> dO;
+
+    std::vector<Tref> outhost;
+    std::vector<Tref> dIhost;
+    std::vector<Tref> dThost;
+
+    size_t ws_sizeInBytes;
+
+    float beta;
+    miopenLossReductionMode_t reduction_mode;
+};
+
+template <typename Tgpu, typename Tref>
+int SmoothL1LossDriver<Tgpu, Tref>::ParseCmdLineArgs(int argc, char* argv[])
+{
+    inflags.Parse(argc, argv);
+
+    auto reduction = inflags.GetValueStr("Reduction");
+    if(reduction != "none" && reduction != "mean" && reduction != "sum")
+        return miopenStatusInvalidValue;
+    if(reduction == "none")
+        reduction_mode = MIOPEN_LOSS_REDUCTION_NONE;
+    else if(reduction == "mean")
+        reduction_mode = MIOPEN_LOSS_REDUCTION_MEAN;
+    else if(reduction == "sum")
+        reduction_mode = MIOPEN_LOSS_REDUCTION_SUM;
+
+    beta = inflags.GetValueInt("Beta");
+
+    if(inflags.GetValueInt("time") == 1)
+    {
+        miopenEnableProfiling(GetHandle(), true);
+    }
+
+    forw = inflags.GetValueInt("forw");
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int SmoothL1LossDriver<Tgpu, Tref>::GetandSetData()
+{
+    auto length      = inflags.GetValueTensor("input").lengths;
+    auto in_strides  = GetStrides(length, 1);
+    auto tar_strides = GetStrides(length, inflags.GetValueInt("Contiguous"));
+
+    if(SetTensorNd(inputDesc, length, in_strides, data_type) != miopenStatusSuccess)
+        MIOPEN_THROW("Error parsing input tensor: " + inflags.GetValueStr("input") + ".");
+    if(SetTensorNd(targetDesc, length, tar_strides, data_type) != miopenStatusSuccess)
+        MIOPEN_THROW("Error parsing target tensor");
+
+    if(reduction_mode == MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        if(SetTensorNd(outputDesc, length, in_strides, data_type) != miopenStatusSuccess)
+            MIOPEN_THROW("Error parsing output tensor");
+    }
+    else
+    {
+        std::vector<int> out_lens = {1};
+        if(SetTensorNd(outputDesc, out_lens, data_type) != miopenStatusSuccess)
+            MIOPEN_THROW("Error parsing output tensor");
+    }
+
+    if(SetTensorNd(diDesc, length, in_strides, data_type) != miopenStatusSuccess)
+        MIOPEN_THROW("Error parsing input gradient tensor");
+    if(SetTensorNd(dtDesc, length, tar_strides, data_type) != miopenStatusSuccess)
+        MIOPEN_THROW("Error parsing target gradient tensor");
+
+    if(reduction_mode == MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        if(SetTensorNd(doDesc, length, in_strides, data_type) != miopenStatusSuccess)
+            MIOPEN_THROW("Error parsing output gradient tensor");
+    }
+    else
+    {
+        std::vector<int> out_lens = {1};
+        if(SetTensorNd(doDesc, out_lens, data_type) != miopenStatusSuccess)
+            MIOPEN_THROW("Error parsing output gradient tensor");
+    }
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int SmoothL1LossDriver<Tgpu, Tref>::AddCmdLineArgs()
+{
+    inflags.AddInputFlag("forw", 'F', "1", "Run only Forward SmoothL1Loss (Default=1)", "int");
+    inflags.AddInputFlag("input",
+                         'D',
+                         "256x4x1x1x8723",
+                         "Input tensor descriptor (Default=256x4x1x1x8723)",
+                         "tensor");
+    inflags.AddInputFlag("Contiguous",
+                         'C',
+                         "1",
+                         "Is input tensor contiguous? (Default=1 for contiguous tensor)",
+                         "int");
+    inflags.AddInputFlag("Reduction",
+                         'R',
+                         "none",
+                         "Specifies the reduction to apply to the output ('none'|'mean'|'sum') "
+                         "(Default=none to indicate no reduction)",
+                         "string");
+    inflags.AddInputFlag("Beta",
+                         'B',
+                         "1",
+                         "Specifies the threshold at which to change between L1 and L2 loss. The "
+                         "value must be non-negative (Default=1)",
+                         "int");
+    inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int");
+    inflags.AddInputFlag("verify", 'V', "0", "Verify Each Layer (Default=0)", "int");
+    inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int");
+    inflags.AddInputFlag(
+        "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int");
+
+    return miopenStatusSuccess;
+}
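+
+// Illustrative invocation (binary name and path depend on the local build); the flags above
+// map to, e.g.:
+//     MIOpenDriver smoothl1lossfp16 -D 16x4x1x1x64 -C 0 -R mean -B 1 -F 1 -t 1 -V 1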
The " + "value must be non-negative (Default=1)", + "int"); + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "0", "Verify Each Layer (Default=0)", "int"); + inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int SmoothL1LossDriver::AllocateBuffersAndCopy() +{ + size_t in_sz = GetTensorSize(inputDesc); + size_t tar_sz = GetTensorSize(targetDesc); + size_t out_sz = GetTensorSize(outputDesc); + + miopenGetSmoothL1LossForwardWorkspaceSize( + GetHandle(), inputDesc, outputDesc, reduction_mode, &ws_sizeInBytes); + if(ws_sizeInBytes == static_cast(-1)) + return miopenStatusAllocFailed; + size_t ws_sz = ws_sizeInBytes / sizeof(Tgpu); + + uint32_t ctx = 0; + + in_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); + tar_dev = std::unique_ptr(new GPUMem(ctx, tar_sz, sizeof(Tgpu))); + out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); + workspace_dev = std::unique_ptr(new GPUMem(ctx, ws_sizeInBytes, sizeof(std::byte))); + dI_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); + dT_dev = std::unique_ptr(new GPUMem(ctx, tar_sz, sizeof(Tgpu))); + dO_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); + + in = std::vector(in_sz, static_cast(0)); + tar = std::vector(tar_sz, static_cast(0)); + out = std::vector(out_sz, static_cast(0)); + workspace = std::vector(ws_sz, static_cast(0)); + dI = std::vector(in_sz, static_cast(0)); + dT = std::vector(tar_sz, static_cast(0)); + dO = std::vector(out_sz, static_cast(0)); + + outhost = std::vector(out_sz, static_cast(0)); + dIhost = std::vector(in_sz, static_cast(0)); + dThost = std::vector(tar_sz, static_cast(0)); + + for(int i = 0; i < in_sz; i++) + { + in[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(0.2)); + } + + for(int i = 0; i < tar_sz; i++) + { + tar[i] = prng::gen_A_to_B(static_cast(0.01), static_cast(0.21)); + } + + fill(out.begin(), out.end(), static_cast(0)); + + fill(dO.begin(), dO.end(), static_cast(0.5)); + + if(in_dev->ToGPU(GetStream(), in.data()) != 0) + { + std::cerr << "Error copying (in) to GPU, size: " << in_dev->GetSize() << std::endl; + return miopenStatusInternalError; + } + + if(tar_dev->ToGPU(GetStream(), tar.data()) != 0) + { + std::cerr << "Error copying (tar) to GPU, size: " << tar_dev->GetSize() << std::endl; + return miopenStatusInternalError; + } + + if(dO_dev->ToGPU(GetStream(), dO.data()) != 0) + { + std::cerr << "Error copying (out grad) to GPU, size: " << dO_dev->GetSize() << std::endl; + return miopenStatusInternalError; + } + + return miopenStatusSuccess; +} + +template +int SmoothL1LossDriver::RunForwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenSmoothL1LossForward(GetHandle(), + workspace_dev->GetMem(), + ws_sizeInBytes, + inputDesc, + in_dev->GetMem(), + targetDesc, + tar_dev->GetMem(), + outputDesc, + out_dev->GetMem(), + beta, + reduction_mode); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Forward SmoothL1Loss Elapsed: " << t.gettime_ms() / iter + << " 
ms\n"; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Forward SmoothL1Loss Elapsed: " << kernel_average_time + << " ms\n"; + } + + if(out_dev->FromGPU(GetStream(), out.data()) != 0) + { + std::cerr << "Error copying (out_dev) from GPU, size: " << out_dev->GetSize() << std::endl; + return miopenStatusInternalError; + } + + return miopenStatusSuccess; +} + +template +int SmoothL1LossDriver::RunForwardCPU() +{ + auto status = mloSmoothL1LossForwardRunHost(inputDesc, + targetDesc, + outputDesc, + in.data(), + tar.data(), + outhost.data(), + beta, + reduction_mode); + + return status; +} + +template +int SmoothL1LossDriver::RunBackwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopen::deref(GetHandle()).ResetKernelTime(); + miopenSmoothL1LossBackward(GetHandle(), + inputDesc, + in_dev->GetMem(), + targetDesc, + tar_dev->GetMem(), + doDesc, + dO_dev->GetMem(), + diDesc, + dI_dev->GetMem(), + dtDesc, + dT_dev->GetMem(), + beta, + reduction_mode); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Backward SmoothL1Loss Elapsed: " << t.gettime_ms() / iter + << " ms\n"; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Backward SmoothL1Loss Elapsed: " << kernel_average_time + << " ms\n"; + } + + if(dI_dev->FromGPU(GetStream(), dI.data()) != 0) + std::cerr << "Error copying (dI_dev) from GPU, size: " << dI_dev->GetSize() << std::endl; + if(dT_dev->FromGPU(GetStream(), dT.data()) != 0) + std::cerr << "Error copying (dT_dev) from GPU, size: " << dT_dev->GetSize() << std::endl; + + return miopenStatusSuccess; +} + +template +int SmoothL1LossDriver::RunBackwardCPU() +{ + auto status = mloSmoothL1LossBackwardRunHost(inputDesc, + targetDesc, + doDesc, + diDesc, + dtDesc, + in.data(), + tar.data(), + dO.data(), + dIhost.data(), + dThost.data(), + beta, + reduction_mode); + + return status; +} + +template +Tref SmoothL1LossDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+template <typename Tgpu, typename Tref>
+Tref SmoothL1LossDriver<Tgpu, Tref>::GetTolerance()
+{
+    // Computation error of fp16 is ~2^13 (=8192) bigger than
+    // the one of fp32 because mantissa is shorter by 13 bits.
+    auto tolerance = std::is_same<Tgpu, float>::value ? 1.5e-6 : 8.2e-3;
+
+    // bf16 mantissa has 7 bits, by 3 bits shorter than fp16.
+    if(std::is_same<Tgpu, bfloat16>::value)
+        tolerance *= 8.0;
+    return tolerance;
+}
+
+template <typename Tgpu, typename Tref>
+int SmoothL1LossDriver<Tgpu, Tref>::VerifyForward()
+{
+    RunForwardCPU();
+    const Tref tolerance = GetTolerance();
+    auto error           = miopen::rms_range(outhost, out);
+
+    if(!std::isfinite(error) || error > tolerance)
+    {
+        std::cout << "Forward SmoothL1Loss FAILED: " << error << " > " << tolerance << std::endl;
+        return EC_VerifyFwd;
+    }
+    else
+    {
+        std::cout << "Forward SmoothL1Loss Verifies OK on CPU reference (" << error << " < "
+                  << tolerance << ')' << std::endl;
+    }
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int SmoothL1LossDriver<Tgpu, Tref>::VerifyBackward()
+{
+    RunBackwardCPU();
+    const Tref tolerance = GetTolerance();
+    auto error_dI        = miopen::rms_range(dIhost, dI);
+    auto error_dT        = miopen::rms_range(dThost, dT);
+
+    if(!std::isfinite(error_dI) || error_dI > tolerance)
+    {
+        std::cout << "Backward SmoothL1Loss Input Gradient FAILED: " << error_dI << " > "
+                  << tolerance << std::endl;
+        return EC_VerifyBwd;
+    }
+    else
+    {
+        std::cout << "Backward SmoothL1Loss Input Gradient Verifies OK on CPU reference ("
+                  << error_dI << " < " << tolerance << ')' << std::endl;
+    }
+
+    if(!std::isfinite(error_dT) || error_dT > tolerance)
+    {
+        std::cout << "Backward SmoothL1Loss Target Gradient FAILED: " << error_dT << " > "
+                  << tolerance << std::endl;
+        return EC_VerifyBwd;
+    }
+    else
+    {
+        std::cout << "Backward SmoothL1Loss Target Gradient Verifies OK on CPU reference ("
+                  << error_dT << " < " << tolerance << ')' << std::endl;
+    }
+
+    return miopenStatusSuccess;
+}
diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h
index 67652ab832..b623e4f61e 100644
--- a/include/miopen/miopen.h
+++ b/include/miopen/miopen.h
@@ -7999,7 +7999,6 @@ MIOPEN_EXPORT miopenStatus_t miopenPReLUBackward(miopenHandle_t handle,
 #endif // MIOPEN_BETA_API
 
 #ifdef MIOPEN_BETA_API
-
 /*! @ingroup LossFunction
  * @enum miopenLossReductionMode_t
  * Reduction mode for loss function
@@ -8011,8 +8010,10 @@ typedef enum
     MIOPEN_LOSS_REDUCTION_MEAN = 2, /*!< output tensor elements are summed up and divided with
                                        total number of elements to get mean value */
 } miopenLossReductionMode_t;
+#endif // MIOPEN_BETA_API
 
 // SoftMarginLoss APIs
+#ifdef MIOPEN_BETA_API
 /** @addtogroup LossFunction
  *
  * @{
  */
@@ -8092,10 +8093,10 @@ MIOPEN_EXPORT miopenStatus_t miopenSoftMarginLossBackward(miopenHandle_t handle,
 
 /** @} */
 // CLOSEOUT LossFunction DOXYGEN GROUP
-#endif
+#endif // MIOPEN_BETA_API
 
-#ifdef MIOPEN_BETA_API
 // MultiMarginLoss APIs
+#ifdef MIOPEN_BETA_API
 /** @addtogroup LossFunction
  *
  * @{
  */
@@ -8176,6 +8177,91 @@ MIOPEN_EXPORT miopenStatus_t miopenMultiMarginLossForward(miopenHandle_t handle,
 // CLOSEOUT LossFunction DOXYGEN GROUP
 #endif // MIOPEN_BETA_API
 
+// SmoothL1Loss APIs
+#ifdef MIOPEN_BETA_API
+/** @addtogroup LossFunction
+ *
+ * @{
+ */
+
+/*! @brief Helper function to query the minimum workspace size required by the SmoothL1Loss call
+ *
+ * @param [in] handle MIOpen handle
+ * @param [in] inputDesc Tensor descriptor for input tensor
+ * @param [in] outputDesc Tensor descriptor for output tensor
+ * @param [in] reduction Reduction mode (none, sum, mean)
+ * @param [out] sizeInBytes Pointer that receives the minimum workspace size
+ * @return miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t
+miopenGetSmoothL1LossForwardWorkspaceSize(miopenHandle_t handle,
+                                          miopenTensorDescriptor_t inputDesc,
+                                          miopenTensorDescriptor_t outputDesc,
+                                          miopenLossReductionMode_t reduction,
+                                          size_t* sizeInBytes);
+
+/*! @brief Execute a SmoothL1Loss forward layer
+ *
+ * @param [in] handle MIOpen handle
+ * @param [in] workspace Address of the allocated workspace data
+ * @param [in] workspaceSizeInBytes Size in bytes of the allocated workspace data
+ * @param [in] inputDesc Tensor descriptor for input tensor
+ * @param [in] input Data tensor input
+ * @param [in] targetDesc Tensor descriptor for target tensor
+ * @param [in] target Data tensor target
+ * @param [in] outputDesc Tensor descriptor for output tensor
+ * @param [out] output Data tensor output
+ * @param [in] beta Threshold at which to change between L1 and L2 loss
+ * @param [in] reduction Reduction mode (none, sum, mean)
+ * @return miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenSmoothL1LossForward(miopenHandle_t handle,
+                                                       void* workspace,
+                                                       size_t workspaceSizeInBytes,
+                                                       miopenTensorDescriptor_t inputDesc,
+                                                       const void* input,
+                                                       miopenTensorDescriptor_t targetDesc,
+                                                       const void* target,
+                                                       miopenTensorDescriptor_t outputDesc,
+                                                       void* output,
+                                                       float beta,
+                                                       miopenLossReductionMode_t reduction);
+
+/*! @brief Execute the backward SmoothL1Loss
+ *
+ * @param [in] handle MIOpen handle
+ * @param [in] inputDesc Tensor descriptor for input tensor
+ * @param [in] input Data tensor input
+ * @param [in] targetDesc Tensor descriptor for target tensor
+ * @param [in] target Data tensor target
+ * @param [in] doutputDesc Tensor descriptor for output gradient
+ * @param [in] doutput Gradient of output
+ * @param [in] dinputDesc Tensor descriptor for input gradient
+ * @param [out] dinput Gradient of input
+ * @param [in] dtargetDesc Tensor descriptor for target gradient
+ * @param [out] dtarget Gradient of target
+ * @param [in] beta Threshold at which to change between L1 and L2 loss
+ * @param [in] reduction Reduction mode (none, sum, mean)
+ * @return miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenSmoothL1LossBackward(miopenHandle_t handle,
+                                                        miopenTensorDescriptor_t inputDesc,
+                                                        const void* input,
+                                                        miopenTensorDescriptor_t targetDesc,
+                                                        const void* target,
+                                                        miopenTensorDescriptor_t doutputDesc,
+                                                        const void* doutput,
+                                                        miopenTensorDescriptor_t dinputDesc,
+                                                        void* dinput,
+                                                        miopenTensorDescriptor_t dtargetDesc,
+                                                        void* dtarget,
+                                                        float beta,
+                                                        miopenLossReductionMode_t reduction);
+
+/** @} */
+// CLOSEOUT LossFunction DOXYGEN GROUP
+#endif // MIOPEN_BETA_API
+
 #ifdef __cplusplus
 }
 #endif
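A minimal host-side call sequence for the new API, sketched under the assumption that handle,
the descriptors and the input/target/output device buffers were created as in the driver above:

    size_t ws_bytes = 0;
    miopenGetSmoothL1LossForwardWorkspaceSize(
        handle, inputDesc, outputDesc, MIOPEN_LOSS_REDUCTION_MEAN, &ws_bytes);

    void* workspace = nullptr;
    hipMalloc(&workspace, ws_bytes); // any device allocator works here

    miopenSmoothL1LossForward(handle,
                              workspace,
                              ws_bytes,
                              inputDesc,
                              input, // device pointers
                              targetDesc,
                              target,
                              outputDesc,
                              output,
                              1.0f, // beta
                              MIOPEN_LOSS_REDUCTION_MEAN);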
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 70cd246873..1e0d2e8296 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -197,6 +197,8 @@ set( MIOpen_Source
     rope_api.cpp
     rope/problem_description.cpp
     scalar.cpp
+    smoothl1loss/problem_description.cpp
+    smoothl1loss_api.cpp
     softmarginloss/problem_description.cpp
     softmarginloss_api.cpp
     softmax.cpp
@@ -331,6 +333,8 @@ set( MIOpen_Source
     solver/reduce/forward_sum.cpp
     solver/rope/backward_rope.cpp
     solver/rope/forward_rope.cpp
+    solver/smoothl1loss/forward_smoothl1loss.cpp
+    solver/smoothl1loss/backward_smoothl1loss.cpp
    solver/softmarginloss/backward_softmarginloss.cpp
    solver/softmarginloss/forward_softmarginloss.cpp
    solver/softmax/attn_softmax.cpp
@@ -469,6 +473,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
     kernels/Conv_Winograd_v30_3_1_gfx11_fp32_f3x2_stride1.inc
     kernels/Conv_Winograd_v30_3_1_gfx11_fp32_f3x2_stride2.inc
     kernels/Conv_Winograd_v30_3_1_metadata.inc
+    kernels/MIOpenLossReductionMode.hpp
     kernels/MIOpenReduceCalculation.hpp
     kernels/MIOpenReduceExtreme.hpp
     kernels/bfloat16_dev.hpp
@@ -503,7 +508,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
     kernels/xform_kd_cov2.inc
kernels/xform_metadata.inc ) - + set(MIOPEN_KERNELS ${STATIC_COMPOSABLE_KERNEL_SOURCE} ${COMPOSABLE_KERNEL_SOURCE} @@ -551,6 +556,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenReduceExtreme.cpp kernels/MIOpenReduceSum.cpp kernels/MIOpenRoPE.cpp + kernels/MIOpenSmoothL1Loss.cpp kernels/MIOpenSoftMarginLoss.cpp kernels/MIOpenSoftmax.cl kernels/MIOpenSoftmaxAttn.cpp @@ -701,6 +707,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN reducecalculation.cpp reduceextreme.cpp rope.cpp + smoothl1loss.cpp softmarginloss.cpp transformers_adam_w.cpp ${PROJECT_BINARY_DIR}/db_path.cpp diff --git a/src/include/miopen/smoothl1loss.hpp b/src/include/miopen/smoothl1loss.hpp new file mode 100644 index 0000000000..673a56001b --- /dev/null +++ b/src/include/miopen/smoothl1loss.hpp @@ -0,0 +1,68 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/common.hpp>
+#include <miopen/miopen.h>
+
+namespace miopen {
+
+struct Handle;
+struct TensorDescriptor;
+
+MIOPEN_INTERNALS_EXPORT size_t
+GetSmoothL1LossForwardWorkspaceSize(Handle& handle,
+                                    const TensorDescriptor& iDesc,
+                                    const TensorDescriptor& oDesc,
+                                    miopenLossReductionMode_t reduction);
+
+MIOPEN_INTERNALS_EXPORT miopenStatus_t SmoothL1LossForward(Handle& handle,
+                                                           Data_t workspace,
+                                                           size_t workspaceSizeInBytes,
+                                                           const TensorDescriptor& iDesc,
+                                                           ConstData_t i,
+                                                           const TensorDescriptor& tDesc,
+                                                           ConstData_t t,
+                                                           const TensorDescriptor& oDesc,
+                                                           Data_t o,
+                                                           float beta,
+                                                           miopenLossReductionMode_t reduction);
+
+MIOPEN_INTERNALS_EXPORT miopenStatus_t SmoothL1LossBackward(Handle& handle,
+                                                            const TensorDescriptor& iDesc,
+                                                            ConstData_t i,
+                                                            const TensorDescriptor& tDesc,
+                                                            ConstData_t t,
+                                                            const TensorDescriptor& dODesc,
+                                                            ConstData_t dO,
+                                                            const TensorDescriptor& dIDesc,
+                                                            Data_t dI,
+                                                            const TensorDescriptor& dTDesc,
+                                                            Data_t dT,
+                                                            float beta,
+                                                            miopenLossReductionMode_t reduction);
+
+} // namespace miopen
diff --git a/src/include/miopen/smoothl1loss/invoke_params.hpp b/src/include/miopen/smoothl1loss/invoke_params.hpp
new file mode 100644
index 0000000000..b0b3c37467
--- /dev/null
+++ b/src/include/miopen/smoothl1loss/invoke_params.hpp
@@ -0,0 +1,76 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/invoke_params.hpp>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+namespace smoothl1loss {
+
+struct BaseInvokeParams : public miopen::InvokeParams
+{
+    const TensorDescriptor* iDesc = nullptr;
+    const TensorDescriptor* tDesc = nullptr;
+
+    ConstData_t i = nullptr;
+    ConstData_t t = nullptr;
+
+    float beta = 0;
+
+    Data_t workspace           = nullptr;
+    std::size_t workspace_size = 0;
+
+    std::size_t GetWorkspaceSize() const { return workspace_size; }
+    Data_t GetWorkspace() const { return workspace; }
+};
+
+struct FwdInvokeParams : public BaseInvokeParams
+{
+    FwdInvokeParams() = default;
+
+    const TensorDescriptor* oDesc = nullptr;
+
+    Data_t o = nullptr;
+};
+
+struct BwdInvokeParams : public BaseInvokeParams
+{
+    BwdInvokeParams() = default;
+
+    const TensorDescriptor* dIDesc = nullptr;
+    const TensorDescriptor* dTDesc = nullptr;
+    const TensorDescriptor* dODesc = nullptr;
+
+    Data_t dI      = nullptr;
+    Data_t dT      = nullptr;
+    ConstData_t dO = nullptr;
+};
+
+} // namespace smoothl1loss
+
+} // namespace miopen
diff --git a/src/include/miopen/smoothl1loss/problem_description.hpp b/src/include/miopen/smoothl1loss/problem_description.hpp
new file mode 100644
index 0000000000..f14f5d23bf
--- /dev/null
+++ b/src/include/miopen/smoothl1loss/problem_description.hpp
@@ -0,0 +1,185 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/errors.hpp>
+#include <miopen/miopen.h>
+#include <miopen/problem_description_base.hpp>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+struct NetworkConfig;
+
+namespace smoothl1loss {
+
+struct ForwardProblemDescription : ProblemDescriptionBase
+{
+    ForwardProblemDescription(const TensorDescriptor& iDesc_,
+                              const TensorDescriptor& tDesc_,
+                              const TensorDescriptor& oDesc_,
+                              const miopenLossReductionMode_t reduction_)
+        : iDesc(iDesc_), tDesc(tDesc_), oDesc(oDesc_), reduction(reduction_)
+    {
+        IsSameType();
+        IsSameLength();
+    }
+
+    const TensorDescriptor& GetIDesc() const { return iDesc; }
+    const TensorDescriptor& GetTDesc() const { return tDesc; }
+    const TensorDescriptor& GetODesc() const { return oDesc; }
+    miopenLossReductionMode_t GetReduction() const { return reduction; }
+
+    bool IsSameType() const
+    {
+        if(iDesc.GetType() != tDesc.GetType())
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "SmoothL1Loss: Input and Target tensor types do not match.");
+        if(iDesc.GetType() != oDesc.GetType())
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "SmoothL1Loss: Input and Output tensor types do not match.");
+        return true;
+    }
+
+    bool IsSameLength() const
+    {
+        if(iDesc.GetLengths() != tDesc.GetLengths())
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "SmoothL1Loss: Input and Target tensor dimension lengths do not match.");
+        if(reduction == MIOPEN_LOSS_REDUCTION_NONE && iDesc.GetLengths() != oDesc.GetLengths())
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "SmoothL1Loss: Without reduction, Input and Output tensor dimension "
+                         "lengths must be equal.");
+        if(reduction != MIOPEN_LOSS_REDUCTION_NONE && oDesc.GetElementSize() != 1)
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "SmoothL1Loss: With reduction, the Output tensor must contain exactly "
+                         "one element.");
+        return true;
+    }
+
+    NetworkConfig MakeNetworkConfig() const override;
+
+private:
+    TensorDescriptor iDesc;
+    TensorDescriptor tDesc;
+    TensorDescriptor oDesc;
+    miopenLossReductionMode_t reduction;
+
+    NetworkConfig MakeForwardNetworkConfig() const;
+};
+
+struct BackwardProblemDescription : ProblemDescriptionBase
+{
+    BackwardProblemDescription(const TensorDescriptor& iDesc_,
+                               const TensorDescriptor& tDesc_,
+                               const TensorDescriptor& dODesc_,
+                               const TensorDescriptor& dIDesc_,
+                               const TensorDescriptor& dTDesc_,
+                               const miopenLossReductionMode_t reduction_)
+        : iDesc(iDesc_),
+          tDesc(tDesc_),
+          dODesc(dODesc_),
+          dIDesc(dIDesc_),
+          dTDesc(dTDesc_),
+          reduction(reduction_)
+    {
+        IsSameType();
+        IsSameLength();
+    }
+
+    const TensorDescriptor& GetIDesc() const { return iDesc; }
+    const TensorDescriptor& GetTDesc() const { return tDesc; }
+    const TensorDescriptor& GetDODesc() const { return dODesc; }
+    const TensorDescriptor& GetDIDesc() const { return dIDesc; }
+    const TensorDescriptor& GetDTDesc() const { return dTDesc; }
+    miopenLossReductionMode_t GetReduction() const { return reduction; }
+
+    bool IsSameType() const
+    {
+        if(iDesc.GetType() != tDesc.GetType())
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "SmoothL1Loss: Input and Target tensor types do not match.");
+        if(iDesc.GetType() != dIDesc.GetType())
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "SmoothL1Loss: Input and its Gradient tensor types do not match.");
+        if(tDesc.GetType() != dTDesc.GetType())
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "SmoothL1Loss: Target and its Gradient tensor types do not match.");
+        if(iDesc.GetType() != dODesc.GetType())
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "SmoothL1Loss: Input and Output Gradient tensor types do not match.");
+        return true;
+    }
+
+    bool IsSameLength()
const + { + if(iDesc.GetLengths() != tDesc.GetLengths()) + MIOPEN_THROW(miopenStatusBadParm, + "SmoothL1Loss: Input and Target tensor dimension lengths do not match."); + if(iDesc.GetLengths() != dIDesc.GetLengths()) + MIOPEN_THROW( + miopenStatusBadParm, + "SmoothL1Loss: Input and its Gradient tensor dimension lengths do not match."); + if(tDesc.GetLengths() != dTDesc.GetLengths()) + MIOPEN_THROW( + miopenStatusBadParm, + "SmoothL1Loss: Target and its Gradient tensor dimension lengths do not match."); + if(reduction == MIOPEN_LOSS_REDUCTION_NONE && iDesc.GetLengths() != dODesc.GetLengths()) + MIOPEN_THROW( + miopenStatusBadParm, + "SmoothL1Loss: Without reduction, Input and Output Gradient tensor dimension " + "lengths should be equal."); + if(reduction != MIOPEN_LOSS_REDUCTION_NONE && dODesc.GetElementSize() != 1) + MIOPEN_THROW(miopenStatusBadParm, + "SmoothL1Loss: When reduction, Output Gradient tensor dimension lengths " + "must be (1)."); + return true; + } + + bool IsAllContiguous() const + { + if(!iDesc.IsContiguous() || !tDesc.IsContiguous() || !dIDesc.IsContiguous() || + !dTDesc.IsContiguous() || !dODesc.IsContiguous()) + return false; + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +private: + TensorDescriptor iDesc; + TensorDescriptor tDesc; + TensorDescriptor dODesc; + TensorDescriptor dIDesc; + TensorDescriptor dTDesc; + miopenLossReductionMode_t reduction; + + NetworkConfig MakeBackwardNetworkConfig() const; +}; + +} // namespace smoothl1loss + +} // namespace miopen diff --git a/src/include/miopen/smoothl1loss/solvers.hpp b/src/include/miopen/smoothl1loss/solvers.hpp new file mode 100644 index 0000000000..8bdec836b3 --- /dev/null +++ b/src/include/miopen/smoothl1loss/solvers.hpp @@ -0,0 +1,75 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/smoothl1loss/problem_description.hpp>
+#include <miopen/solver.hpp>
+
+namespace miopen {
+
+namespace solver {
+
+namespace smoothl1loss {
+
+using SmoothL1LossForwardSolverBase =
+    NonTunableSolverBase<ExecutionContext, miopen::smoothl1loss::ForwardProblemDescription>;
+
+struct SmoothL1LossForward final : SmoothL1LossForwardSolverBase
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<SmoothL1LossForward>(); }
+
+    bool
+    IsApplicable(const ExecutionContext& context,
+                 const miopen::smoothl1loss::ForwardProblemDescription& problem) const override;
+    ConvSolution
+    GetSolution(const ExecutionContext& context,
+                const miopen::smoothl1loss::ForwardProblemDescription& problem) const override;
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::smoothl1loss::ForwardProblemDescription& problem) const override;
+    bool MayNeedWorkspace() const override { return true; }
+};
+
+using SmoothL1LossBackwardSolverBase =
+    NonTunableSolverBase<ExecutionContext, miopen::smoothl1loss::BackwardProblemDescription>;
+
+struct SmoothL1LossBackward final : SmoothL1LossBackwardSolverBase
+{
+    const std::string& SolverDbId() const override
+    {
+        return GetSolverDbId<SmoothL1LossBackward>();
+    }
+
+    bool
+    IsApplicable(const ExecutionContext& context,
+                 const miopen::smoothl1loss::BackwardProblemDescription& problem) const override;
+    ConvSolution
+    GetSolution(const ExecutionContext& context,
+                const miopen::smoothl1loss::BackwardProblemDescription& problem) const override;
+};
+
+} // namespace smoothl1loss
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp
index 76a13b051c..8b6c7d6878 100644
--- a/src/include/miopen/solver_id.hpp
+++ b/src/include/miopen/solver_id.hpp
@@ -63,8 +63,7 @@ enum class Primitive
     RoPE,
     ReLU,
     Kthvalue,
-    SoftMarginLoss,
-    MultiMarginLoss
+    Loss
 };
 
 struct MIOPEN_INTERNALS_EXPORT Id
diff --git a/src/kernels/MIOpenLossReductionMode.hpp b/src/kernels/MIOpenLossReductionMode.hpp
new file mode 100644
index 0000000000..f8c62b41a6
--- /dev/null
+++ b/src/kernels/MIOpenLossReductionMode.hpp
@@ -0,0 +1,42 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GUARD_KERNELS_MIOPEN_LOSS_REDUCTION_MODE_HPP
+#define GUARD_KERNELS_MIOPEN_LOSS_REDUCTION_MODE_HPP
+
+enum class LossReductionMode_t
+{
+    NONE = 0,
+    SUM,
+    MEAN,
+};
+
+#ifndef __HIP_DEVICE_COMPILE__
+static_assert(MIOPEN_LOSS_REDUCTION_NONE == static_cast<int>(LossReductionMode_t::NONE));
+static_assert(MIOPEN_LOSS_REDUCTION_SUM == static_cast<int>(LossReductionMode_t::SUM));
+static_assert(MIOPEN_LOSS_REDUCTION_MEAN == static_cast<int>(LossReductionMode_t::MEAN));
+#endif
+
+#endif // GUARD_KERNELS_MIOPEN_LOSS_REDUCTION_MODE_HPP
diff --git a/src/kernels/MIOpenMultiMarginLoss.cpp b/src/kernels/MIOpenMultiMarginLoss.cpp
index 2443f7863a..eccdbf8f70 100644
--- a/src/kernels/MIOpenMultiMarginLoss.cpp
+++ b/src/kernels/MIOpenMultiMarginLoss.cpp
@@ -30,8 +30,9 @@
 
 #include "float_types.h"
 #include "tensor_view.hpp"
+#include "MIOpenLossReductionMode.hpp"
 
-template <typename DTYPE, uint64_t REDUCTION_T>
+template <typename DTYPE, LossReductionMode_t REDUCTION_T>
 __device__ void multimarginlossforward2d(const DTYPE* __restrict__ I,
                                          const uint64_t* __restrict__ T,
                                          const DTYPE* __restrict__ W,
@@ -76,9 +77,11 @@ __device__ void multimarginlossforward2d(const DTYPE* __restrict__ I,
     loss /= C;
     switch(REDUCTION_T)
     {
-    case 0: static_cast<DTYPE*>(O)[O_tv.get_tensor_view_idx({n})] = CVT_ACCUM2FLOAT(loss); break;
-    case 1: static_cast<FLOAT_ACCUM*>(O)[n] = loss; break;
-    case 2: static_cast<FLOAT_ACCUM*>(O)[n] = loss / N; break;
+    case LossReductionMode_t::NONE:
+        static_cast<DTYPE*>(O)[O_tv.get_tensor_view_idx({n})] = CVT_ACCUM2FLOAT(loss);
+        break;
+    case LossReductionMode_t::SUM: static_cast<FLOAT_ACCUM*>(O)[n] = loss; break;
+    case LossReductionMode_t::MEAN: static_cast<FLOAT_ACCUM*>(O)[n] = loss / N; break;
     default: break;
     }
 }
@@ -95,5 +98,6 @@ extern "C" __global__ void MultiMarginLossForward2d(const FLOAT* __restrict__ I,
                                                     tensor_view_t<1> O_tv)
 {
     // instantiate the kernel
-    multimarginlossforward2d<FLOAT, REDUCTION_TYPE>(I, T, W, O, p, margin, I_tv, T_tv, W_tv, O_tv);
+    multimarginlossforward2d<FLOAT, static_cast<LossReductionMode_t>(REDUCTION_TYPE)>(
+        I, T, W, O, p, margin, I_tv, T_tv, W_tv, O_tv);
 }
diff --git a/src/kernels/MIOpenSmoothL1Loss.cpp b/src/kernels/MIOpenSmoothL1Loss.cpp
new file mode 100644
index 0000000000..07cc394fd8
--- /dev/null
+++ b/src/kernels/MIOpenSmoothL1Loss.cpp
@@ -0,0 +1,141 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#endif
+
+#include "float_types.h"
+#include "tensor_view.hpp"
+#include "MIOpenLossReductionMode.hpp"
+
+template <typename TIO, LossReductionMode_t REDUCTION_T>
+__device__ void SmoothL1LossForward(const TIO* I,
+                                    const TIO* T,
+                                    void* O,
+                                    const float beta,
+                                    const uint64_t size,
+                                    tensor_view_t<5> I_tv,
+                                    tensor_view_t<5> T_tv,
+                                    tensor_view_t<5> O_tv)
+{
+    const uint64_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    tensor_layout_t<5> tensor_layout(I_tv, gid);
+    if(tensor_layout.layout[0] >= I_tv.size[0])
+        return;
+
+    FLOAT_ACCUM i    = CVT_FLOAT2ACCUM(I[I_tv.get_tensor_view_idx(tensor_layout)]);
+    FLOAT_ACCUM t    = CVT_FLOAT2ACCUM(T[T_tv.get_tensor_view_idx(tensor_layout)]);
+    FLOAT_ACCUM diff = abs(i - t);
+    FLOAT_ACCUM loss = diff < beta ? 0.5f * diff * diff / beta : diff - 0.5f * beta;
+
+    switch(REDUCTION_T)
+    {
+    case LossReductionMode_t::NONE:
+        static_cast<TIO*>(O)[O_tv.get_tensor_view_idx(tensor_layout)] = CVT_ACCUM2FLOAT(loss);
+        break;
+    case LossReductionMode_t::SUM: static_cast<FLOAT_ACCUM*>(O)[gid] = loss; break;
+    case LossReductionMode_t::MEAN: static_cast<FLOAT_ACCUM*>(O)[gid] = loss / size; break;
+    default: break;
+    }
+}
+
+extern "C" __global__ void SmoothL1LossForward(const FLOAT* __restrict__ I,
+                                               const FLOAT* __restrict__ T,
+                                               void* __restrict__ O,
+                                               const float beta,
+                                               const uint64_t size,
+                                               tensor_view_t<5> I_tv,
+                                               tensor_view_t<5> T_tv,
+                                               tensor_view_t<5> O_tv)
+{
+    // instantiate the kernel
+    SmoothL1LossForward<FLOAT, static_cast<LossReductionMode_t>(REDUCTION_TYPE)>(
+        I, T, O, beta, size, I_tv, T_tv, O_tv);
+}
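+
+// For the SUM and MEAN modes the forward kernel above writes per-element partial losses
+// (FLOAT_ACCUM) into the workspace buffer O; a follow-up reduction kernel accumulates them
+// into the final scalar, mirroring the scheme documented in MIOpenSoftMarginLoss.cpp. The
+// backward kernel below instead broadcasts the single output-gradient value, folding the
+// 1/size factor of the MEAN mode into it directly.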
+template <typename TIO, LossReductionMode_t REDUCTION_T>
+__device__ void SmoothL1LossBackward(const TIO* I,
+                                     const TIO* T,
+                                     const TIO* dO,
+                                     TIO* dI,
+                                     TIO* dT,
+                                     float beta,
+                                     const uint64_t size,
+                                     tensor_view_t<5> I_tv,
+                                     tensor_view_t<5> T_tv,
+                                     tensor_view_t<5> dO_tv,
+                                     tensor_view_t<5> dI_tv,
+                                     tensor_view_t<5> dT_tv)
+{
+    size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+    tensor_layout_t<5> tensor_layout(I_tv, gid);
+    if(tensor_layout.layout[0] >= I_tv.size[0])
+        return;
+
+    FLOAT_ACCUM o_grad;
+    switch(REDUCTION_T)
+    {
+    case LossReductionMode_t::NONE:
+        o_grad = CVT_FLOAT2ACCUM(dO[dO_tv.get_tensor_view_idx(tensor_layout)]);
+        break;
+    case LossReductionMode_t::SUM: o_grad = CVT_FLOAT2ACCUM(dO[0]); break;
+    case LossReductionMode_t::MEAN: o_grad = CVT_FLOAT2ACCUM(dO[0]) / size; break;
+    default: break;
+    }
+
+    FLOAT_ACCUM i   = CVT_FLOAT2ACCUM(I[I_tv.get_tensor_view_idx(tensor_layout)]);
+    FLOAT_ACCUM t   = CVT_FLOAT2ACCUM(T[T_tv.get_tensor_view_idx(tensor_layout)]);
+    FLOAT_ACCUM sub = i - t;
+    FLOAT_ACCUM grad;
+    if(fabs(sub) < beta)
+        grad = sub / beta * o_grad;
+    else
+        grad = (sub >= 0 ? 1.0f : -1.0f) * o_grad;
+
+    if(dI)
+        dI[dI_tv.get_tensor_view_idx(tensor_layout)] = CVT_ACCUM2FLOAT(grad);
+    if(dT)
+        dT[dT_tv.get_tensor_view_idx(tensor_layout)] = CVT_ACCUM2FLOAT(-grad);
+}
+
+extern "C" __global__ void SmoothL1LossBackward(const FLOAT* __restrict__ I,
+                                                const FLOAT* __restrict__ T,
+                                                const FLOAT* __restrict__ dO,
+                                                FLOAT* __restrict__ dI,
+                                                FLOAT* __restrict__ dT,
+                                                float beta,
+                                                const uint64_t size,
+                                                tensor_view_t<5> I_tv,
+                                                tensor_view_t<5> T_tv,
+                                                tensor_view_t<5> dO_tv,
+                                                tensor_view_t<5> dI_tv,
+                                                tensor_view_t<5> dT_tv)
+{
+    SmoothL1LossBackward<FLOAT, static_cast<LossReductionMode_t>(REDUCTION_TYPE)>(
+        I, T, dO, dI, dT, beta, size, I_tv, T_tv, dO_tv, dI_tv, dT_tv);
+}
diff --git a/src/kernels/MIOpenSoftMarginLoss.cpp b/src/kernels/MIOpenSoftMarginLoss.cpp
index 6a8beaad61..4d249724ce 100644
--- a/src/kernels/MIOpenSoftMarginLoss.cpp
+++ b/src/kernels/MIOpenSoftMarginLoss.cpp
@@ -30,8 +30,9 @@
 
 #include "float_types.h"
 #include "tensor_view.hpp"
+#include "MIOpenLossReductionMode.hpp"
 
-template <typename DTYPE, uint64_t REDUCTION_T>
+template <typename DTYPE, LossReductionMode_t REDUCTION_T>
 __device__ void softmarginlossforward5d(const DTYPE* __restrict__ I,
                                         const DTYPE* __restrict__ T,
                                         void* __restrict__ O,
@@ -52,13 +53,15 @@ __device__ void softmarginlossforward5d(const DTYPE* __restrict__ I,
     switch(REDUCTION_T)
     {
     // If reduction = None, O is DTYPE*
-    case 0: static_cast<DTYPE*>(O)[O_tv.get_tensor_view_idx(idx)] = CVT_ACCUM2FLOAT(loss); break;
+    case LossReductionMode_t::NONE:
+        static_cast<DTYPE*>(O)[O_tv.get_tensor_view_idx(idx)] = CVT_ACCUM2FLOAT(loss);
+        break;
     // If reduction = Sum, O is FLOAT_ACCUM* and then all elements will be sum up in the next
     // kernel
-    case 1: static_cast<FLOAT_ACCUM*>(O)[gid] = loss; break;
+    case LossReductionMode_t::SUM: static_cast<FLOAT_ACCUM*>(O)[gid] = loss; break;
     // If reduction = Mean, same as Sum but O will be divided by num_elem, then the next kernel sum
     // up will return mean of all elements
-    case 2: static_cast<FLOAT_ACCUM*>(O)[gid] = loss / num_elem; break;
+    case LossReductionMode_t::MEAN: static_cast<FLOAT_ACCUM*>(O)[gid] = loss / num_elem; break;
     default: break;
     }
 }
@@ -72,10 +75,11 @@ extern "C" __global__ void SoftMarginLossForward5d(const FLOAT* __restrict__ I,
                                                    tensor_view_t<5> O_tv)
 {
     // instantiate the kernel
-    softmarginlossforward5d<FLOAT, REDUCTION_TYPE>(I, T, O, num_elem, I_tv, T_tv, O_tv);
+    softmarginlossforward5d<FLOAT, static_cast<LossReductionMode_t>(REDUCTION_TYPE)>(
+        I, T, O, num_elem, I_tv, T_tv, O_tv);
 }
 
-template <typename DTYPE, uint64_t REDUCTION_T>
+template <typename DTYPE, LossReductionMode_t REDUCTION_T>
 __device__ void softmarginlossbackward5d(const DTYPE* __restrict__ I,
                                          const DTYPE* __restrict__ T,
                                          const DTYPE* __restrict__ dO,
@@ -98,9 +102,13 @@ __device__ void softmarginlossbackward5d(const DTYPE* __restrict__ I,
     FLOAT_ACCUM loss = -t / (exp(i * t) + 1) * dO_accum;
     switch(REDUCTION_T)
     {
-    case 0:
-    case 1: dI[dI_tv.get_tensor_view_idx(idx)] = CVT_ACCUM2FLOAT(loss); break;
-    case 2: dI[dI_tv.get_tensor_view_idx(idx)] = CVT_ACCUM2FLOAT(loss / num_elem); break;
+    case LossReductionMode_t::NONE:
+    case LossReductionMode_t::SUM:
+        dI[dI_tv.get_tensor_view_idx(idx)] = CVT_ACCUM2FLOAT(loss);
+        break;
+    case LossReductionMode_t::MEAN:
+        dI[dI_tv.get_tensor_view_idx(idx)] = CVT_ACCUM2FLOAT(loss / num_elem);
+        break;
     default: break;
     }
 }
@@ -116,6 +124,6 @@ extern "C" __global__ void SoftMarginLossBackward5d(const FLOAT* __restrict__ I,
                                                     tensor_view_t<5> dI_tv)
 {
     // instantiate the kernel
-    softmarginlossbackward5d<FLOAT, REDUCTION_TYPE>(
+    softmarginlossbackward5d<FLOAT, static_cast<LossReductionMode_t>(REDUCTION_TYPE)>(
         I, T, dO, dI, num_elem, I_tv, T_tv, dO_tv, dI_tv);
 }
diff --git a/src/smoothl1loss.cpp b/src/smoothl1loss.cpp
new file mode 100644
index 0000000000..dd6bdeb67b
--- /dev/null
+++ b/src/smoothl1loss.cpp
@@ -0,0 +1,130 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/smoothl1loss.hpp>
+#include <miopen/smoothl1loss/invoke_params.hpp>
+#include <miopen/smoothl1loss/solvers.hpp>
+#include <miopen/common.hpp>
+#include <miopen/datatype.hpp>
+#include <miopen/find_solution.hpp>
+#include <miopen/handle.hpp>
+#include <miopen/names.hpp>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+size_t GetSmoothL1LossForwardWorkspaceSize(Handle& handle,
+                                           const TensorDescriptor& iDesc,
+                                           const TensorDescriptor& oDesc,
+                                           const miopenLossReductionMode_t reduction)
+{
+    auto ctx = ExecutionContext{&handle};
+    // The target descriptor is not part of the query; the input descriptor stands in for it.
+    const auto problem = smoothl1loss::ForwardProblemDescription{iDesc, iDesc, oDesc, reduction};
+
+    const auto solvers = solver::SolverContainer<solver::smoothl1loss::SmoothL1LossForward>{};
+
+    auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem);
+    return pair_size_vector.empty() ? static_cast<size_t>(-1) : pair_size_vector.front().second;
+}
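+
+// Note: the size_t(-1) sentinel above signals "no applicable solver"; callers such as the
+// driver treat it as a failure rather than as a zero-byte workspace.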
+
+miopenStatus_t SmoothL1LossForward(Handle& handle,
+                                   Data_t workspace,
+                                   size_t workspaceSizeInBytes,
+                                   const TensorDescriptor& iDesc,
+                                   ConstData_t i,
+                                   const TensorDescriptor& tDesc,
+                                   ConstData_t t,
+                                   const TensorDescriptor& oDesc,
+                                   Data_t o,
+                                   float beta,
+                                   const miopenLossReductionMode_t reduction)
+{
+    const auto problem = smoothl1loss::ForwardProblemDescription{iDesc, tDesc, oDesc, reduction};
+
+    const auto invoke_params = [&]() {
+        auto tmp           = smoothl1loss::FwdInvokeParams{};
+        tmp.type           = InvokeType::Run;
+        tmp.iDesc          = &iDesc;
+        tmp.tDesc          = &tDesc;
+        tmp.oDesc          = &oDesc;
+        tmp.i              = i;
+        tmp.t              = t;
+        tmp.o              = o;
+        tmp.beta           = beta;
+        tmp.workspace      = workspace;
+        tmp.workspace_size = workspaceSizeInBytes;
+        return tmp;
+    }();
+
+    const auto algo    = AlgorithmName{"SmoothL1LossForward"};
+    const auto solvers = solver::SolverContainer<solver::smoothl1loss::SmoothL1LossForward>{};
+    solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
+
+    return miopenStatusSuccess;
+}
+
+miopenStatus_t SmoothL1LossBackward(Handle& handle,
+                                    const TensorDescriptor& iDesc,
+                                    ConstData_t i,
+                                    const TensorDescriptor& tDesc,
+                                    ConstData_t t,
+                                    const TensorDescriptor& dODesc,
+                                    ConstData_t dO,
+                                    const TensorDescriptor& dIDesc,
+                                    Data_t dI,
+                                    const TensorDescriptor& dTDesc,
+                                    Data_t dT,
+                                    float beta,
+                                    const miopenLossReductionMode_t reduction)
+{
+    const auto problem =
+        smoothl1loss::BackwardProblemDescription{iDesc, tDesc, dODesc, dIDesc, dTDesc, reduction};
+
+    const auto invoke_params = [&]() {
+        auto tmp   = smoothl1loss::BwdInvokeParams{};
+        tmp.type   = InvokeType::Run;
+        tmp.iDesc  = &iDesc;
+        tmp.tDesc  = &tDesc;
+        tmp.dODesc = &dODesc;
+        tmp.dIDesc = &dIDesc;
+        tmp.dTDesc = &dTDesc;
+        tmp.i      = i;
+        tmp.t      = t;
+        tmp.dO     = dO;
+        tmp.dI     = dI;
+        tmp.dT     = dT;
+        tmp.beta   = beta;
+        return tmp;
+    }();
+
+    const auto algo    = AlgorithmName{"SmoothL1LossBackward"};
+    const auto solvers = solver::SolverContainer<solver::smoothl1loss::SmoothL1LossBackward>{};
+    solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
+
+    return miopenStatusSuccess;
+}
+
+} // namespace miopen
diff --git a/src/smoothl1loss/problem_description.cpp b/src/smoothl1loss/problem_description.cpp
new file mode 100644
index 0000000000..f75aa53070
--- /dev/null
+++ b/src/smoothl1loss/problem_description.cpp
@@ -0,0 +1,68 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
diff --git a/src/smoothl1loss/problem_description.cpp b/src/smoothl1loss/problem_description.cpp
new file mode 100644
index 0000000000..f75aa53070
--- /dev/null
+++ b/src/smoothl1loss/problem_description.cpp
@@ -0,0 +1,68 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/smoothl1loss/problem_description.hpp>
+#include <miopen/names.hpp>
+
+#include <sstream>
+
+namespace miopen {
+
+namespace smoothl1loss {
+
+NetworkConfig ForwardProblemDescription::MakeNetworkConfig() const
+{
+    auto dtype = iDesc.GetType();
+    auto size  = iDesc.GetElementSize();
+
+    std::ostringstream ss;
+
+    ss << "smoothl1loss_fwd";
+    ss << "dtype" << dtype;
+    ss << "size" << size;
+    ss << "reduce" << reduction;
+
+    return NetworkConfig{ss.str()};
+}
+
+NetworkConfig BackwardProblemDescription::MakeNetworkConfig() const
+{
+    auto dtype = iDesc.GetType();
+    auto size  = iDesc.GetElementSize();
+
+    std::ostringstream ss;
+
+    ss << "smoothl1loss_bwd";
+    ss << "dtype" << dtype;
+    ss << "size" << size;
+    ss << "reduce" << reduction;
+
+    return NetworkConfig{ss.str()};
+}
+
+} // namespace smoothl1loss
+
+} // namespace miopen
diff --git a/src/smoothl1loss_api.cpp b/src/smoothl1loss_api.cpp
new file mode 100644
index 0000000000..850f5c188d
--- /dev/null
+++ b/src/smoothl1loss_api.cpp
@@ -0,0 +1,181 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/smooth_l1loss.hpp>
+#include <miopen/errors.hpp>
+#include <miopen/handle.hpp>
+#include <miopen/logger.hpp>
+#include <miopen/tensor.hpp>
+#include <miopen/tensor_ops.hpp>
+
+inline std::ostream& operator<<(std::ostream& os, const std::vector<std::size_t>& v)
+{
+    os << '{';
+    for(int i = 0; i < v.size(); ++i)
+    {
+        if(i != 0)
+            os << ',';
+        os << v[i];
+    }
+    os << '}';
+    return os;
+}
+
+static void LogCmdSmoothL1Loss(const miopenTensorDescriptor_t inputDesc,
+                               const miopenTensorDescriptor_t targetDesc,
+                               const float beta,
+                               const miopenLossReductionMode_t reduction,
+                               bool is_fwd)
+{
+    if(miopen::IsLoggingCmd())
+    {
+        std::stringstream ss;
+        auto dtype = miopen::deref(inputDesc).GetType();
+        if(dtype == miopenHalf)
+        {
+            ss << "smoothl1lossfp16";
+        }
+        else if(dtype == miopenFloat)
+        {
+            ss << "smoothl1lossfp32";
+        }
+        else if(dtype == miopenBFloat16)
+        {
+            ss << "smoothl1lossbfp16";
+        }
+
+        MIOPEN_LOG_FUNCTION(inputDesc, targetDesc);
+        ss << " -n " << miopen::deref(inputDesc).GetLengths()[0];
+        ss << " -T " << miopen::deref(inputDesc).GetLengths();
+        ss << " -Si " << miopen::deref(inputDesc).GetStrides();
+        ss << " -St " << miopen::deref(targetDesc).GetStrides();
+        ss << " -F " << ((is_fwd) ? "1" : "2") << " -b " << beta << " -r " << reduction;
+
+        MIOPEN_LOG_DRIVER_CMD(ss.str());
+    }
+}
"1" : "2") << " -b " << beta << " -r " << reduction; + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} + +extern "C" miopenStatus_t +miopenGetSmoothL1LossForwardWorkspaceSize(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + const miopenLossReductionMode_t reduction, + size_t* sizeInBytes) +{ + + MIOPEN_LOG_FUNCTION(handle, inputDesc, outputDesc, reduction, sizeInBytes); + + return miopen::try_([&] { + miopen::deref(sizeInBytes) = miopen::GetSmoothL1LossForwardWorkspaceSize( + miopen::deref(handle), miopen::deref(inputDesc), miopen::deref(outputDesc), reduction); + }); +} + +extern "C" miopenStatus_t miopenSmoothL1LossForward(miopenHandle_t handle, + void* workspace, + size_t workspaceSizeInBytes, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t targetDesc, + const void* target, + const miopenTensorDescriptor_t outputDesc, + void* output, + const float beta, + const miopenLossReductionMode_t reduction) +{ + MIOPEN_LOG_FUNCTION(handle, + workspace, + workspaceSizeInBytes, + inputDesc, + input, + targetDesc, + target, + outputDesc, + output, + beta, + reduction); + + LogCmdSmoothL1Loss(inputDesc, targetDesc, beta, reduction, true); + return miopen::try_([&] { + miopen::SmoothL1LossForward(miopen::deref(handle), + DataCast(workspace), + workspaceSizeInBytes, + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(targetDesc), + DataCast(target), + miopen::deref(outputDesc), + DataCast(output), + beta, + reduction); + }); +} + +extern "C" miopenStatus_t miopenSmoothL1LossBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t targetDesc, + const void* target, + const miopenTensorDescriptor_t doutputDesc, + const void* doutput, + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t dtargetDesc, + void* dtarget, + const float beta, + const miopenLossReductionMode_t reduction) +{ + MIOPEN_LOG_FUNCTION(handle, + inputDesc, + input, + targetDesc, + target, + doutputDesc, + doutput, + dinputDesc, + dinput, + dtargetDesc, + dtarget, + beta, + reduction); + + LogCmdSmoothL1Loss(inputDesc, targetDesc, beta, reduction, false); + return miopen::try_([&] { + miopen::SmoothL1LossBackward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(targetDesc), + DataCast(target), + miopen::deref(doutputDesc), + DataCast(doutput), + miopen::deref(dinputDesc), + DataCast(dinput), + miopen::deref(dtargetDesc), + DataCast(dtarget), + beta, + reduction); + }); +} diff --git a/src/solver.cpp b/src/solver.cpp index 856f04a8cf..1db9068d6a 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -701,20 +702,18 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Activation, glu::GLUForward{}.SolverDbId()); Register(registry, ++id, Primitive::Activation, glu::GLUBackward{}.SolverDbId()); - Register(registry, - ++id, - Primitive::SoftMarginLoss, - softmarginloss::SoftMarginLossForward{}.SolverDbId()); - Register(registry, - ++id, - Primitive::SoftMarginLoss, - softmarginloss::SoftMarginLossBackward{}.SolverDbId()); - Register(registry, - ++id, - Primitive::MultiMarginLoss, - multimarginloss::MultiMarginLossForward{}.SolverDbId()); + Register(registry, ++id, Primitive::Loss, softmarginloss::SoftMarginLossForward{}.SolverDbId()); + Register( + 
diff --git a/src/solver.cpp b/src/solver.cpp
index 856f04a8cf..1db9068d6a 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -40,6 +40,7 @@
 #include
 #include
 #include
+#include <miopen/smoothl1loss/solvers.hpp>
 #include
 #include
 #include
@@ -701,20 +702,18 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
     Register(registry, ++id, Primitive::Activation, glu::GLUForward{}.SolverDbId());
     Register(registry, ++id, Primitive::Activation, glu::GLUBackward{}.SolverDbId());
 
-    Register(registry,
-             ++id,
-             Primitive::SoftMarginLoss,
-             softmarginloss::SoftMarginLossForward{}.SolverDbId());
-    Register(registry,
-             ++id,
-             Primitive::SoftMarginLoss,
-             softmarginloss::SoftMarginLossBackward{}.SolverDbId());
-    Register(registry,
-             ++id,
-             Primitive::MultiMarginLoss,
-             multimarginloss::MultiMarginLossForward{}.SolverDbId());
+    Register(registry, ++id, Primitive::Loss, softmarginloss::SoftMarginLossForward{}.SolverDbId());
+    Register(
+        registry, ++id, Primitive::Loss, softmarginloss::SoftMarginLossBackward{}.SolverDbId());
+
+    Register(
+        registry, ++id, Primitive::Loss, multimarginloss::MultiMarginLossForward{}.SolverDbId());
 
     Register(registry, ++id, Primitive::Mha, mha::MhaCKFlashAttentionV2Forward{}.SolverDbId());
+
+    Register(registry, ++id, Primitive::Loss, smoothl1loss::SmoothL1LossForward{}.SolverDbId());
+    Register(registry, ++id, Primitive::Loss, smoothl1loss::SmoothL1LossBackward{}.SolverDbId());
+
     // IMPORTANT: New solvers should be added to the end of the function, and don't leave a white
     // space between this comment and the newly registered solver(s)!
 }
diff --git a/src/solver/smoothl1loss/backward_smoothl1loss.cpp b/src/solver/smoothl1loss/backward_smoothl1loss.cpp
new file mode 100644
index 0000000000..f48c787190
--- /dev/null
+++ b/src/solver/smoothl1loss/backward_smoothl1loss.cpp
@@ -0,0 +1,147 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/errors.hpp"
+#include <miopen/datatype.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/miopen.h>
+#include <miopen/smooth_l1loss.hpp>
+#include <miopen/smoothl1loss/invoke_params.hpp>
+#include <miopen/smoothl1loss/problem_description.hpp>
+#include <miopen/smoothl1loss/solvers.hpp>
+#include <miopen/tensor_view_utils.hpp>
+
+#define LOCAL_SIZE_NONCONTIGUOUS_BWD 256
+
+#define VIEW_DIMS 5
+
+namespace miopen {
+
+namespace solver {
+
+const auto make_hip_kernel = [](std::vector<size_t> localsize,
+                                std::vector<size_t> gridsize,
+                                std::string kernel_file,
+                                std::string kernel_name,
+                                KernelBuildParameters build_params) {
+    while(localsize.size() < 3)
+        localsize.push_back(1);
+    while(gridsize.size() < 3)
+        gridsize.push_back(1);
+    for(std::size_t i = 0; i < localsize.size(); ++i)
+        gridsize[i] = AlignUp(gridsize[i], localsize[i]);
+    return KernelInfo{
+        build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name};
+};
+
+namespace smoothl1loss {
+
+namespace {
+bool IsImprovementOverROCm(const ExecutionContext& /*context*/,
+                           const miopen::smoothl1loss::BackwardProblemDescription& problem)
+{
+    if(problem.IsAllContiguous())
+        return false;
+    return true;
+}
+} // namespace
+
+bool SmoothL1LossBackward::IsApplicable(
+    const ExecutionContext& context,
+    const miopen::smoothl1loss::BackwardProblemDescription& problem) const
+{
+    if(!(problem.GetIDesc().GetType() == miopenFloat ||
+         problem.GetIDesc().GetType() == miopenHalf ||
+         problem.GetIDesc().GetType() == miopenBFloat16))
+        return false;
+    if(!IsImprovementOverROCm(context, problem))
+        return false;
+    if(problem.GetIDesc().GetNumDims() > VIEW_DIMS)
+        return false;
+    return true;
+}
+
+ConvSolution SmoothL1LossBackward::GetSolution(
+    const ExecutionContext& /*context*/,
+    const miopen::smoothl1loss::BackwardProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto dtype        = problem.GetDIDesc().GetType();
+    auto input_dtype  = miopen::GetDataType(problem.GetIDesc().GetType());
+    auto output_dtype = miopen::GetDataType(problem.GetDODesc().GetType());
+    auto size         = problem.GetIDesc().GetElementSize();
+
+    const auto build_params = KernelBuildParameters{
+        {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+        {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+        {"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
+        {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+        {"REDUCTION_TYPE", static_cast<int>(problem.GetReduction())},
+        {"VIEW_DIMS", VIEW_DIMS},
+    };
+
+    result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_NONCONTIGUOUS_BWD},
+                                                          {size},
+                                                          "MIOpenSmoothL1Loss.cpp",
+                                                          "SmoothL1LossBackward",
+                                                          build_params));
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::smoothl1loss::BwdInvokeParams>();
+
+            auto i_tv  = get_inner_expanded_tv<VIEW_DIMS>(deref(params.iDesc));
+            auto t_tv  = get_inner_expanded_tv<VIEW_DIMS>(deref(params.tDesc));
+            auto dO_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.dODesc));
+            auto dI_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.dIDesc));
+            auto dT_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.dTDesc));
+
+            handle_.ResetKernelTime();
+            kernel(params.i,
+                   params.t,
+                   params.dO,
+                   params.dI,
+                   params.dT,
+                   params.beta,
+                   deref(params.iDesc).GetElementSize(),
+                   i_tv,
+                   t_tv,
+                   dO_tv,
+                   dI_tv,
+                   dT_tv);
+        };
+    };
+
+    return result;
+}
+
+} // namespace smoothl1loss
+
+} // namespace solver
+
+} // namespace miopen
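The kernel launched by this solver computes the standard Smooth L1 gradient. A hedged host-side sketch of the per-element math it corresponds to (function name and scalar types are illustrative; `n` is the element count, used only when `mean` reduction folds the `1/N` factor into the gradient):

    #include <cmath>

    // For d = x - y, dLoss/dx is clamp(d / beta, -1, 1) scaled by the
    // upstream gradient dO; the target gradient is its negation.
    inline void smooth_l1_grad(
        float x, float y, float dO, float beta, size_t n, bool mean, float& dx, float& dy)
    {
        const float d = x - y;
        float g       = std::fabs(d) < beta ? d / beta : (d >= 0 ? 1.0f : -1.0f);
        g *= dO;
        if(mean)
            g /= static_cast<float>(n);
        dx = g;
        dy = -g;
    }

This also explains the `IsImprovementOverROCm` gate above: for fully contiguous tensors the existing paths are already adequate, so this solver only claims the strided cases.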
diff --git a/src/solver/smoothl1loss/forward_smoothl1loss.cpp b/src/solver/smoothl1loss/forward_smoothl1loss.cpp
new file mode 100644
index 0000000000..fc68c0c1fb
--- /dev/null
+++ b/src/solver/smoothl1loss/forward_smoothl1loss.cpp
@@ -0,0 +1,250 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/buffer_info.hpp>
+#include <miopen/datatype.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/miopen.h>
+#include <miopen/smooth_l1loss.hpp>
+#include <miopen/smoothl1loss/invoke_params.hpp>
+#include <miopen/smoothl1loss/problem_description.hpp>
+#include <miopen/smoothl1loss/solvers.hpp>
+#include <miopen/tensor_view_utils.hpp>
+
+#define LOCAL_SIZE_NONCONTIGUOUS_FWD 256
+#define LOCAL_SIZE_REDUCE 256
+
+#define VIEW_DIMS 5
+
+namespace miopen {
+
+namespace solver {
+
+namespace smoothl1loss {
+
+namespace {
+const auto make_hip_kernel = [](std::vector<size_t> localsize,
+                                std::vector<size_t> gridsize,
+                                std::string kernel_file,
+                                std::string kernel_name,
+                                KernelBuildParameters build_params) {
+    while(localsize.size() < 3)
+        localsize.push_back(1);
+    while(gridsize.size() < 3)
+        gridsize.push_back(1);
+    for(std::size_t i = 0; i < localsize.size(); ++i)
+        gridsize[i] = AlignUp(gridsize[i], localsize[i]);
+    return KernelInfo{
+        build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name};
+};
+} // namespace
+
+bool SmoothL1LossForward::IsApplicable(
+    const ExecutionContext& /*context*/,
+    const miopen::smoothl1loss::ForwardProblemDescription& problem) const
+{
+    if(!(problem.GetIDesc().GetType() == miopenFloat ||
+         problem.GetIDesc().GetType() == miopenHalf ||
+         problem.GetIDesc().GetType() == miopenBFloat16))
+        return false;
+    if(problem.GetIDesc().GetNumDims() > VIEW_DIMS)
+        return false;
+    return true;
+}
+
+ConvSolution SmoothL1LossForward::GetSolution(
+    const ExecutionContext& /*context*/,
+    const miopen::smoothl1loss::ForwardProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto dtype = problem.GetODesc().GetType();
+    auto size  = problem.GetIDesc().GetElementSize();
+
+    /* Phase 1: Calc loss for each element. */
+    {
+        const auto build_params =
+            KernelBuildParameters{{"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+                                  {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+                                  {"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
+                                  {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+                                  {"REDUCTION_TYPE", static_cast<int>(problem.GetReduction())},
+                                  {"VIEW_DIMS", VIEW_DIMS}};
+        result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_NONCONTIGUOUS_FWD},
+                                                              {size},
+                                                              "MIOpenSmoothL1Loss.cpp",
+                                                              "SmoothL1LossForward",
+                                                              build_params));
+    }
+
+    if(problem.GetReduction() != MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        // If reduction != NONE, we need a second stage that computes the mean/sum of
+        // the element-wise results produced by the first kernel above.
+        /* Phase 2: Reduce */
+        auto _size              = size;
+        const auto build_params = KernelBuildParameters{
+            {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+            {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+            {"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
+            {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+            {"REDUCE_SIZE", LOCAL_SIZE_REDUCE},
+        };
+        /* Reduce FLOAT_ACCUM -> FLOAT_ACCUM */
+        while(_size > LOCAL_SIZE_REDUCE)
+        {
+            result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_REDUCE},
+                                                                 {_size},
+                                                                 "MIOpenReduceSum.cpp",
+                                                                 "ReduceSumFLOATACCUM",
+                                                                 build_params));
+            _size = (_size + LOCAL_SIZE_REDUCE - 1) / LOCAL_SIZE_REDUCE;
+        }
+        // Last kernel reduce: FLOAT_ACCUM -> FLOAT
+        result.construction_params.push_back(make_hip_kernel(
+            {LOCAL_SIZE_REDUCE}, {_size}, "MIOpenReduceSum.cpp", "ReduceSum", build_params));
+    }
+
+    if(problem.GetReduction() == MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        // Reduction = None -> invoke 1 kernel
+        result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+            return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+                decltype(auto) kernel = handle_.Run(kernels.front());
+                decltype(auto) params = raw_params.CastTo<miopen::smoothl1loss::FwdInvokeParams>();
+
+                auto i_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.iDesc));
+                auto t_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.tDesc));
+                auto o_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.oDesc));
+
+                kernel(params.i,
+                       params.t,
+                       params.o,
+                       params.beta,
+                       deref(params.iDesc).GetElementSize(),
+                       i_tv,
+                       t_tv,
+                       o_tv);
+            };
+        };
+    }
+    else
+    {
+        // Reduction != None -> invoke 2 or more kernels
+        result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+            return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+                decltype(auto) params = raw_params.CastTo<miopen::smoothl1loss::FwdInvokeParams>();
+                auto i_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.iDesc));
+                auto t_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.tDesc));
+                auto o_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.oDesc));
+
+                float elapsed = 0.0f;
+                HipEventPtr start, stop;
+
+                const bool profiling = handle_.IsProfilingEnabled();
+                if(profiling)
+                {
+                    handle_.EnableProfiling(false);
+                    start = miopen::make_hip_event();
+                    stop  = miopen::make_hip_event();
+                    hipEventRecord(start.get(), handle_.GetStream());
+                }
+
+                int kernelCnt = 0;
+
+                /* Phase 1: Calc loss for each element. */
+                {
+                    decltype(auto) kernel = handle_.Run(kernels[kernelCnt++]);
+                    kernel(params.i,
+                           params.t,
+                           params.workspace,
+                           params.beta,
+                           params.iDesc->GetElementSize(),
+                           i_tv,
+                           t_tv,
+                           o_tv);
+                }
+
+                /* Phase 2: Reduce */
+                {
+                    auto size      = deref(params.iDesc).GetElementSize();
+                    auto data_size = get_data_size(miopenFloat);
+                    auto wt        = MultiBufferWorkspaceTraits{size * data_size,
+                                                         (size + LOCAL_SIZE_REDUCE - 1) /
+                                                             LOCAL_SIZE_REDUCE * data_size};
+                    auto work_a = params.workspace;
+                    auto work_b = static_cast<Data_t>(static_cast<char*>(params.workspace) +
+                                                      wt.GetOffset(1));
+                    while(size > LOCAL_SIZE_REDUCE)
+                    {
+                        auto kernel = handle_.Run(kernels[kernelCnt++]);
+                        kernel(work_a, work_b, size);
+                        size = (size + LOCAL_SIZE_REDUCE - 1) / LOCAL_SIZE_REDUCE;
+                        std::swap(work_a, work_b);
+                    }
+                    handle_.Run(kernels[kernelCnt++])(work_a, params.o, size, o_tv);
+                }
+
+                if(profiling)
+                {
+                    hipEventRecord(stop.get(), handle_.GetStream());
+                    hipEventSynchronize(stop.get());
+                    hipEventElapsedTime(&elapsed, start.get(), stop.get());
+
+                    // Clean up
+                    hipEventDestroy(start.get());
+                    hipEventDestroy(stop.get());
+                    handle_.ResetKernelTime();
+                    handle_.AccumKernelTime(elapsed);
+
+                    handle_.EnableProfiling(true);
+                }
+            };
+        };
+    }
+
+    return result;
+}
+
+std::size_t SmoothL1LossForward::GetWorkspaceSize(
+    const ExecutionContext& /*context*/,
+    const miopen::smoothl1loss::ForwardProblemDescription& problem) const
+{
+    if(problem.GetReduction() == MIOPEN_LOSS_REDUCTION_NONE)
+        return 0;
+
+    auto size      = problem.GetIDesc().GetElementSize();
+    auto data_size = get_data_size(miopenFloat);
+    return MultiBufferWorkspaceTraits{
+        size * data_size, (size + LOCAL_SIZE_REDUCE - 1) / LOCAL_SIZE_REDUCE * data_size}
+        .GetSize();
+}
+
+} // namespace smoothl1loss
+
+} // namespace solver
+
+} // namespace miopen
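To make the workspace math above concrete, a worked estimate under the constants stated in this file (LOCAL_SIZE_REDUCE = 256, fp32 accumulation); the helper name is illustrative, and any inter-buffer alignment added by MultiBufferWorkspaceTraits is ignored here:

    #include <cstddef>

    // One fp32 slot per element for the phase-1 results, plus
    // ceil(size / 256) slots for the first round of partial sums.
    // For size = 100000: 100000*4 + 391*4 bytes (before alignment).
    size_t smooth_l1_fwd_workspace_estimate(size_t size)
    {
        const size_t reduce  = 256;           // LOCAL_SIZE_REDUCE
        const size_t elem    = sizeof(float); // FLOAT_ACCUM assumed fp32
        const size_t partial = (size + reduce - 1) / reduce;
        return size * elem + partial * elem;
    }

The invoker then ping-pongs between the two regions, halving the active size by a factor of 256 per launch until a single kernel can finish the reduction.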
diff --git a/test/cpu_multimarginloss.hpp b/test/cpu_multimarginloss.hpp
index a8b19ba0f2..407d7311b1 100644
--- a/test/cpu_multimarginloss.hpp
+++ b/test/cpu_multimarginloss.hpp
@@ -45,13 +45,15 @@ void cpu_multimarginloss_forward(const tensor<T>& input,
     auto O_tv = miopen::get_inner_expanded_tv<1>(ref_output.desc);
 
     auto N = I_tv.size[0], C = I_tv.size[1];
-    double sum = 0;
-    for(size_t n = 0; n < N; n++)
-    {
+    std::vector<double> buffer;
+    if(reduction_mode != MIOPEN_LOSS_REDUCTION_NONE)
+        buffer.assign(N, 0);
+
+    par_ford(N)([&](size_t n) {
         double loss = 0;
         uint64_t y  = target[T_tv.get_tensor_view_idx({n})];
         if(y >= C)
-            continue;
+            return;
         for(size_t c = 0; c < C; c++)
         {
             if(y == c)
@@ -69,14 +71,13 @@ void cpu_multimarginloss_forward(const tensor<T>& input,
         if(reduction_mode == MIOPEN_LOSS_REDUCTION_NONE)
             ref_output[O_tv.get_tensor_view_idx({n})] = loss;
         else
-            sum += loss;
-    }
+            buffer[n] = loss;
+    });
+
+    auto sum = std::accumulate(buffer.begin(), buffer.end(), 0.0);
+
     if(reduction_mode == MIOPEN_LOSS_REDUCTION_MEAN)
-    {
-        ref_output[0] = static_cast<T>(sum / N);
-    }
-    else if(reduction_mode == MIOPEN_LOSS_REDUCTION_SUM)
-    {
+        sum /= N;
+    if(reduction_mode != MIOPEN_LOSS_REDUCTION_NONE)
         ref_output[0] = static_cast<T>(sum);
-    }
 }
diff --git a/test/cpu_smoothl1loss.hpp b/test/cpu_smoothl1loss.hpp
new file mode 100644
index 0000000000..bdbf6a0487
--- /dev/null
+++ b/test/cpu_smoothl1loss.hpp
@@ -0,0 +1,112 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include "tensor_holder.hpp"
+#include <miopen/tensor_view_utils.hpp>
+#include <numeric>
+
+template <class T>
+void cpu_smoothl1loss_forward(const tensor<T> input,
+                              const tensor<T> target,
+                              tensor<T>& ref_output,
+                              const float beta,
+                              const miopenLossReductionMode_t reduction)
+{
+    // Treat contiguous tensors as non-contiguous tensors (for consistency)
+    auto I_tv = get_inner_expanded_tv<5>(input.desc);
+    auto T_tv = get_inner_expanded_tv<5>(target.desc);
+    auto O_tv = get_inner_expanded_tv<5>(ref_output.desc);
+
+    auto size = input.desc.GetElementSize();
+
+    std::vector<double> buffer;
+    if(reduction != MIOPEN_LOSS_REDUCTION_NONE)
+        buffer.assign(size, 0);
+
+    par_ford(size)([&](size_t i) {
+        const auto tensor_layout = tensor_layout_t<5>(I_tv, i);
+        const uint64_t Iidx      = I_tv.get_tensor_view_idx(tensor_layout);
+        const uint64_t Tidx      = T_tv.get_tensor_view_idx(tensor_layout);
+        float diff = fabs(static_cast<float>(input[Iidx] - target[Tidx]));
+        float loss = (diff < beta ? 0.5f * diff * diff / beta : diff - 0.5f * beta);
+        if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
+            ref_output[O_tv.get_tensor_view_idx(tensor_layout)] = static_cast<T>(loss);
+        else
+            buffer[i] = loss;
+    });
+
+    auto loss_sum = std::accumulate(buffer.begin(), buffer.end(), 0.0);
+
+    if(reduction == MIOPEN_LOSS_REDUCTION_MEAN)
+        loss_sum /= size;
+    if(reduction != MIOPEN_LOSS_REDUCTION_NONE)
+        ref_output[0] = static_cast<T>(loss_sum);
+}
+
+template <class T>
+void cpu_smoothl1loss_backward(tensor<T> input,
+                               tensor<T> target,
+                               tensor<T> dO,
+                               tensor<T>& ref_dI,
+                               tensor<T>& ref_dT,
+                               float beta,
+                               miopenLossReductionMode_t reduction)
+{
+    // Treat contiguous tensors as non-contiguous tensors (for consistency)
+    auto I_tv  = get_inner_expanded_tv<5>(input.desc);
+    auto T_tv  = get_inner_expanded_tv<5>(target.desc);
+    auto dO_tv = get_inner_expanded_tv<5>(dO.desc);
+    auto dI_tv = get_inner_expanded_tv<5>(ref_dI.desc);
+    auto dT_tv = get_inner_expanded_tv<5>(ref_dT.desc);
+
+    auto size = input.desc.GetElementSize();
+
+    par_ford(size)([&](size_t i) {
+        const auto tensor_layout = tensor_layout_t<5>(I_tv, i);
+        const uint64_t Iidx      = I_tv.get_tensor_view_idx(tensor_layout);
+        const uint64_t Tidx      = T_tv.get_tensor_view_idx(tensor_layout);
+
+        T sub  = input[Iidx] - target[Tidx];
+        T grad = static_cast<T>(0.0f);
+
+        if(fabs(static_cast<float>(sub)) < beta)
+            grad = sub / beta *
+                   dO[reduction == MIOPEN_LOSS_REDUCTION_NONE
+                          ? dO_tv.get_tensor_view_idx(tensor_layout)
+                          : 0];
+        else
+            grad = (sub >= 0 ? 1.0f : -1.0f) * dO[reduction == MIOPEN_LOSS_REDUCTION_NONE
+                                                      ? dO_tv.get_tensor_view_idx(tensor_layout)
+                                                      : 0];
+
+        if(reduction == MIOPEN_LOSS_REDUCTION_MEAN)
+            grad = grad / size;
+
+        ref_dI[dI_tv.get_tensor_view_idx(tensor_layout)] = grad;
+        ref_dT[dT_tv.get_tensor_view_idx(tensor_layout)] = -grad;
+    });
+}
diff --git a/test/cpu_softmarginloss.hpp b/test/cpu_softmarginloss.hpp
index 6fe5803970..6eefe59245 100644
--- a/test/cpu_softmarginloss.hpp
+++ b/test/cpu_softmarginloss.hpp
@@ -28,6 +28,7 @@
 #include "miopen/miopen.h"
 #include "tensor_holder.hpp"
 #include
+#include <numeric>
 
 template <class T>
 void cpu_softmarginloss_forward(const tensor<T>& input,
@@ -40,23 +41,26 @@ void cpu_softmarginloss_forward(const tensor<T>& input,
     auto t_tv = miopen::get_inner_expanded_tv<5>(target.desc);
     auto o_tv = miopen::get_inner_expanded_tv<5>(ref_output.desc);
 
-    double sum_loss = 0;
-    for(size_t gid = 0; gid < input_numel; gid++)
-    {
+    std::vector<double> buffer;
+    if(reduction_mode != MIOPEN_LOSS_REDUCTION_NONE)
+        buffer.assign(input_numel, 0);
+
+    par_ford(input_numel)([&](size_t gid) {
         tensor_layout_t<5> idx(i_tv, gid);
-        // Convert to double for better precision
         double i = input[i_tv.get_tensor_view_idx(idx)];
         double t = target[t_tv.get_tensor_view_idx(idx)];
         if(reduction_mode == MIOPEN_LOSS_REDUCTION_NONE)
             ref_output[o_tv.get_tensor_view_idx(idx)] = log1p(exp(-i * t));
         else
-            sum_loss += log1p(exp(-i * t));
-    };
+            buffer[gid] = log1p(exp(-i * t));
+    });
+
+    auto sum_loss = std::accumulate(buffer.begin(), buffer.end(), 0.0);
 
     if(reduction_mode == MIOPEN_LOSS_REDUCTION_MEAN)
-        ref_output[0] = sum_loss / input_numel;
-    else if(reduction_mode == MIOPEN_LOSS_REDUCTION_SUM)
-        ref_output[0] = sum_loss;
+        sum_loss /= input_numel;
+    if(reduction_mode != MIOPEN_LOSS_REDUCTION_NONE)
+        ref_output[0] = static_cast<T>(sum_loss);
 }
 
 template <class T>
diff --git a/test/gtest/smoothl1loss.cpp b/test/gtest/smoothl1loss.cpp
new file mode 100644
index 0000000000..93f1ed2648
--- /dev/null
+++ b/test/gtest/smoothl1loss.cpp
@@ -0,0 +1,153 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "smoothl1loss.hpp"
+#include <miopen/miopen.h>
+
+namespace smoothl1loss {
+
+struct GPU_SmoothL1LossForward_FP32 : SmoothL1LossTestForward<float>
+{
+};
+
+struct GPU_SmoothL1LossForward_FP16 : SmoothL1LossTestForward<half_float::half>
+{
+};
+
+struct GPU_SmoothL1LossForward_BFP16 : SmoothL1LossTestForward<bfloat16>
+{
+};
+
+struct GPU_SmoothL1LossBackward_FP32 : SmoothL1LossTestBackward<float>
+{
+};
+
+struct GPU_SmoothL1LossBackward_FP16 : SmoothL1LossTestBackward<half_float::half>
+{
+};
+
+struct GPU_SmoothL1LossBackward_BFP16 : SmoothL1LossTestBackward<bfloat16>
+{
+};
+
+} // namespace smoothl1loss
+using namespace smoothl1loss;
+
+TEST_P(GPU_SmoothL1LossForward_FP32, Test)
+{
+    RunTest();
+    Verify();
+};
+
+TEST_P(GPU_SmoothL1LossForward_FP16, Test)
+{
+    RunTest();
+    Verify();
+};
+
+TEST_P(GPU_SmoothL1LossForward_BFP16, Test)
+{
+    RunTest();
+    Verify();
+};
+
+TEST_P(GPU_SmoothL1LossBackward_FP32, Test)
+{
+    RunTest();
+    Verify();
+};
+
+TEST_P(GPU_SmoothL1LossBackward_FP16, Test)
+{
+    RunTest();
+    Verify();
+};
+
+TEST_P(GPU_SmoothL1LossBackward_BFP16, Test)
+{
+    RunTest();
+    Verify();
+};
+
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_SmoothL1LossForward_FP32,
+                         testing::ValuesIn(SmoothL1LossSmokeTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_SmoothL1LossForward_FP16,
+                         testing::ValuesIn(SmoothL1LossSmokeTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_SmoothL1LossForward_BFP16,
+                         testing::ValuesIn(SmoothL1LossSmokeTestConfigs()));
+
+INSTANTIATE_TEST_SUITE_P(Perf,
+                         GPU_SmoothL1LossForward_FP32,
+                         testing::ValuesIn(SmoothL1LossPerfTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Perf,
+                         GPU_SmoothL1LossForward_FP16,
+                         testing::ValuesIn(SmoothL1LossPerfTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Perf,
+                         GPU_SmoothL1LossForward_BFP16,
+                         testing::ValuesIn(SmoothL1LossPerfTestConfigs()));
+
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SmoothL1LossForward_FP32,
+                         testing::ValuesIn(SmoothL1LossFullTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SmoothL1LossForward_FP16,
+                         testing::ValuesIn(SmoothL1LossFullTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SmoothL1LossForward_BFP16,
+                         testing::ValuesIn(SmoothL1LossFullTestConfigs()));
+
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_SmoothL1LossBackward_FP32,
+                         testing::ValuesIn(SmoothL1LossSmokeTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_SmoothL1LossBackward_FP16,
+                         testing::ValuesIn(SmoothL1LossSmokeTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_SmoothL1LossBackward_BFP16,
+                         testing::ValuesIn(SmoothL1LossSmokeTestConfigs()));
+
+INSTANTIATE_TEST_SUITE_P(Perf,
+                         GPU_SmoothL1LossBackward_FP32,
+                         testing::ValuesIn(SmoothL1LossPerfTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Perf,
+                         GPU_SmoothL1LossBackward_FP16,
+                         testing::ValuesIn(SmoothL1LossPerfTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Perf,
+                         GPU_SmoothL1LossBackward_BFP16,
+                         testing::ValuesIn(SmoothL1LossPerfTestConfigs()));
+
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SmoothL1LossBackward_FP32,
+                         testing::ValuesIn(SmoothL1LossFullTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SmoothL1LossBackward_FP16,
+                         testing::ValuesIn(SmoothL1LossFullTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SmoothL1LossBackward_BFP16,
+                         testing::ValuesIn(SmoothL1LossFullTestConfigs()));
diff --git a/test/gtest/smoothl1loss.hpp b/test/gtest/smoothl1loss.hpp
new file mode 100644
index 0000000000..f6a566c671
--- /dev/null
+++ b/test/gtest/smoothl1loss.hpp
@@ -0,0 +1,356 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "cpu_smoothl1loss.hpp"
+#include "get_handle.hpp"
+#include "random.hpp"
+#include "tensor_holder.hpp"
+#include "verify.hpp"
+#include <gtest/gtest.h>
+#include <miopen/miopen.h>
+#include <miopen/smooth_l1loss.hpp>
+
+inline std::ostream& operator<<(std::ostream& os, const std::vector<size_t>& v)
+{
+    os << '{';
+    for(int i = 0; i < v.size(); ++i)
+    {
+        if(i != 0)
+            os << ',';
+        os << v[i];
+    }
+    os << '}';
+    return os;
+}
+
+struct SmoothL1LossTestCase
+{
+    std::vector<size_t> lengths;
+    float beta;
+    miopenLossReductionMode_t reduction;
+    bool contiguous;
+
+    friend std::ostream& operator<<(std::ostream& os, const SmoothL1LossTestCase& tc)
+    {
+        return os << " Lengths:" << tc.lengths << " Beta:" << tc.beta
+                  << " Reduction:" << tc.reduction
+                  << " Contiguous:" << (tc.contiguous ? "True" : "False");
+    }
+};
"True" : "False"); + } +}; + +inline std::vector +SmoothL1LossTestConfigs(const std::vector>& SizeList) +{ + std::vector tcs; + auto all_mode = { + MIOPEN_LOSS_REDUCTION_NONE, MIOPEN_LOSS_REDUCTION_SUM, MIOPEN_LOSS_REDUCTION_MEAN}; + for(auto reduction : all_mode) + for(auto contiguous : {true, false}) + for(const auto& lengths : SizeList) + tcs.push_back({lengths, 1, reduction, contiguous}); + return tcs; +} + +inline std::vector SmoothL1LossSmokeTestConfigs() +{ + return SmoothL1LossTestConfigs({ + {1, 1, 1}, + {4, 7, 5}, + {1, 2, 3, 4}, + {1, 1, 1, 257}, + {34, 4, 5}, + {15, 4, 5}, + {5, 13, 17, 11}, + {2, 10, 128, 128}, + }); +} + +inline std::vector SmoothL1LossPerfTestConfigs() +{ + return SmoothL1LossTestConfigs({{256, 4, 8723}}); +} + +inline std::vector SmoothL1LossFullTestConfigs() +{ + std::vector tcs; + + auto smoke_test = SmoothL1LossSmokeTestConfigs(); + auto perf_test = SmoothL1LossPerfTestConfigs(); + + tcs.reserve(smoke_test.size() + perf_test.size()); + for(const auto& test : smoke_test) + tcs.push_back(test); + for(const auto& test : perf_test) + tcs.push_back(test); + + return tcs; +} + +inline std::vector GetStrides(std::vector lengths, bool contiguous) +{ + if(!contiguous) + std::swap(lengths.front(), lengths.back()); + std::vector strides(lengths.size()); + strides.back() = 1; + for(int i = lengths.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * lengths[i + 1]; + if(!contiguous) + std::swap(strides.front(), strides.back()); + return strides; +} + +template +struct SmoothL1LossTestForward : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + smoothl1loss_config = GetParam(); + auto gen_value1 = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 1); }; + auto gen_value2 = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 2); }; + + beta = smoothl1loss_config.beta; + reduction = smoothl1loss_config.reduction; + auto lengths = smoothl1loss_config.lengths; + auto contiguous = smoothl1loss_config.contiguous; + + auto in_strides = GetStrides(lengths, true); + input = tensor{lengths, in_strides}.generate(gen_value1); + + auto tar_strides = GetStrides(lengths, contiguous); + target = tensor{lengths, tar_strides}.generate(gen_value2); + + auto out_lengths = + (reduction == MIOPEN_LOSS_REDUCTION_NONE ? 
+
+template <class T>
+struct SmoothL1LossTestForward : public ::testing::TestWithParam<SmoothL1LossTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle       = get_handle();
+        smoothl1loss_config = GetParam();
+        auto gen_value1 = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 1); };
+        auto gen_value2 = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 2); };
+
+        beta            = smoothl1loss_config.beta;
+        reduction       = smoothl1loss_config.reduction;
+        auto lengths    = smoothl1loss_config.lengths;
+        auto contiguous = smoothl1loss_config.contiguous;
+
+        auto in_strides = GetStrides(lengths, true);
+        input           = tensor<T>{lengths, in_strides}.generate(gen_value1);
+
+        auto tar_strides = GetStrides(lengths, contiguous);
+        target           = tensor<T>{lengths, tar_strides}.generate(gen_value2);
+
+        auto out_lengths =
+            (reduction == MIOPEN_LOSS_REDUCTION_NONE ? lengths : std::vector<size_t>{1});
+        auto out_strides = GetStrides(out_lengths, true);
+
+        output = tensor<T>{out_lengths, out_strides};
+        std::fill(output.begin(), output.end(), std::numeric_limits<T>::quiet_NaN());
+
+        ref_output = tensor<T>{out_lengths, out_strides};
+        std::fill(ref_output.begin(), ref_output.end(), std::numeric_limits<T>::quiet_NaN());
+
+        ws_sizeInBytes =
+            miopen::GetSmoothL1LossForwardWorkspaceSize(handle, input.desc, output.desc, reduction);
+        if(ws_sizeInBytes == static_cast<size_t>(-1))
+            GTEST_SKIP();
+
+        if(ws_sizeInBytes != 0)
+        {
+            std::vector<size_t> workspace_dims;
+            workspace_dims.push_back(ws_sizeInBytes / sizeof(float));
+
+            workspace = tensor<float>{workspace_dims};
+            std::fill(workspace.begin(), workspace.end(), 0.0f);
+
+            workspace_dev = handle.Write(workspace.data);
+        }
+
+        input_dev  = handle.Write(input.data);
+        target_dev = handle.Write(target.data);
+        output_dev = handle.Write(output.data);
+    }
+
+    void RunTest()
+    {
+        auto&& handle = get_handle();
+
+        miopenStatus_t status;
+
+        cpu_smoothl1loss_forward(input, target, ref_output, beta, reduction);
+        status = miopen::SmoothL1LossForward(handle,
+                                             workspace_dev.get(),
+                                             ws_sizeInBytes,
+                                             input.desc,
+                                             input_dev.get(),
+                                             target.desc,
+                                             target_dev.get(),
+                                             output.desc,
+                                             output_dev.get(),
+                                             beta,
+                                             reduction);
+        ASSERT_EQ(status, miopenStatusSuccess);
+
+        workspace.data = handle.Read<float>(workspace_dev, workspace.data.size());
+        output.data    = handle.Read<T>(output_dev, output.data.size());
+    }
+
+    void Verify()
+    {
+        // Computation error of fp16 is ~2^13 (=8192) bigger than
+        // the one of fp32 because mantissa is shorter by 13 bits.
+        double tolerance = std::is_same<T, float>::value ? 1.5e-6 : 8.2e-3;
+
+        // bf16 mantissa has 7 bits, by 3 bits shorter than fp16.
+        if(std::is_same<T, bfloat16>::value)
+            tolerance *= 8.0;
+
+        auto error = miopen::rms_range(ref_output, output);
+
+        ASSERT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output));
+        EXPECT_LT(error, tolerance)
+            << "Error output beyond tolerance Error: " << error << ", Tolerance: " << tolerance;
+    }
+
+    SmoothL1LossTestCase smoothl1loss_config;
+
+    tensor<T> input;
+    tensor<T> target;
+    tensor<T> output;
+    tensor<float> workspace;
+
+    tensor<T> ref_output;
+
+    miopen::Allocator::ManageDataPtr input_dev;
+    miopen::Allocator::ManageDataPtr target_dev;
+    miopen::Allocator::ManageDataPtr output_dev;
+    miopen::Allocator::ManageDataPtr workspace_dev;
+
+    size_t ws_sizeInBytes;
+
+    float beta;
+    miopenLossReductionMode_t reduction;
+};
+
+template <class T>
+struct SmoothL1LossTestBackward : public ::testing::TestWithParam<SmoothL1LossTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle       = get_handle();
+        smoothl1loss_config = GetParam();
+        auto gen_value1 = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
+        auto gen_value2 = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 101); };
+
+        beta            = smoothl1loss_config.beta;
+        reduction       = smoothl1loss_config.reduction;
+        auto lengths    = smoothl1loss_config.lengths;
+        auto contiguous = smoothl1loss_config.contiguous;
+
+        auto in_strides = GetStrides(lengths, true);
+        input           = tensor<T>{lengths, in_strides}.generate(gen_value1);
+
+        auto tar_strides = GetStrides(lengths, contiguous);
+        target           = tensor<T>{lengths, tar_strides}.generate(gen_value2);
+
+        auto out_lengths =
+            (reduction == MIOPEN_LOSS_REDUCTION_NONE ? lengths : std::vector<size_t>{1});
+        auto out_strides = GetStrides(out_lengths, true);
+
+        dO = tensor<T>{out_lengths, out_strides};
+        std::fill(dO.begin(), dO.end(), 0);
+
+        dI = tensor<T>{lengths, in_strides};
+        std::fill(dI.begin(), dI.end(), std::numeric_limits<T>::quiet_NaN());
+        dT = tensor<T>{lengths, tar_strides};
+        std::fill(dT.begin(), dT.end(), std::numeric_limits<T>::quiet_NaN());
+
+        ref_dI = tensor<T>{lengths, in_strides};
+        std::fill(ref_dI.begin(), ref_dI.end(), std::numeric_limits<T>::quiet_NaN());
+        ref_dT = tensor<T>{lengths, tar_strides};
+        std::fill(ref_dT.begin(), ref_dT.end(), std::numeric_limits<T>::quiet_NaN());
+
+        if(input.desc.IsContiguous() && target.desc.IsContiguous() && dO.desc.IsContiguous() &&
+           dI.desc.IsContiguous() && dT.desc.IsContiguous())
+            GTEST_SKIP();
+
+        input_dev  = handle.Write(input.data);
+        target_dev = handle.Write(target.data);
+        dO_dev     = handle.Write(dO.data);
+        dI_dev     = handle.Write(dI.data);
+        dT_dev     = handle.Write(dT.data);
+    }
+
+    void RunTest()
+    {
+        auto&& handle = get_handle();
+
+        miopenStatus_t status;
+
+        cpu_smoothl1loss_backward(input, target, dO, ref_dI, ref_dT, beta, reduction);
+        status = miopen::SmoothL1LossBackward(handle,
+                                              input.desc,
+                                              input_dev.get(),
+                                              target.desc,
+                                              target_dev.get(),
+                                              dO.desc,
+                                              dO_dev.get(),
+                                              dI.desc,
+                                              dI_dev.get(),
+                                              dT.desc,
+                                              dT_dev.get(),
+                                              beta,
+                                              reduction);
+
+        EXPECT_EQ(status, miopenStatusSuccess);
+
+        dI.data = handle.Read<T>(dI_dev, dI.data.size());
+        dT.data = handle.Read<T>(dT_dev, dT.data.size());
+    }
+
+    void Verify()
+    {
+        // Computation error of fp16 is ~2^13 (=8192) bigger than
+        // the one of fp32 because mantissa is shorter by 13 bits.
+        double tolerance = std::is_same<T, float>::value ? 1.5e-6 : 8.2e-3;
+
+        // bf16 mantissa has 7 bits, by 3 bits shorter than fp16.
+        if(std::is_same<T, bfloat16>::value)
+            tolerance *= 8.0;
+
+        auto error_dI = miopen::rms_range(ref_dI, dI);
+        auto error_dT = miopen::rms_range(ref_dT, dT);
+
+        ASSERT_EQ(miopen::range_distance(ref_dI), miopen::range_distance(dI));
+        ASSERT_EQ(miopen::range_distance(ref_dT), miopen::range_distance(dT));
+        EXPECT_LT(error_dI, tolerance)
+            << "Error Input Gradient beyond tolerance Error: " << error_dI
+            << ", Tolerance: " << tolerance;
+        EXPECT_LT(error_dT, tolerance)
+            << "Error Target Gradient beyond tolerance Error: " << error_dT
+            << ", Tolerance: " << tolerance;
+    }
+
+    SmoothL1LossTestCase smoothl1loss_config;
+
+    tensor<T> input;
+    tensor<T> target;
+    tensor<T> dO;
+    tensor<T> dI;
+    tensor<T> dT;
+
+    tensor<T> ref_dI;
+    tensor<T> ref_dT;
+
+    miopen::Allocator::ManageDataPtr input_dev;
+    miopen::Allocator::ManageDataPtr target_dev;
+    miopen::Allocator::ManageDataPtr dO_dev;
+    miopen::Allocator::ManageDataPtr dI_dev;
+    miopen::Allocator::ManageDataPtr dT_dev;
+
+    float beta;
+    miopenLossReductionMode_t reduction;
+};