diff --git a/docs/reference/index.rst b/docs/reference/index.rst
index c2b74eabee..60baec4073 100644
--- a/docs/reference/index.rst
+++ b/docs/reference/index.rst
@@ -39,3 +39,4 @@ The MIOpen API library is structured as follows:
 * :doc:`ReLU <../doxygen/html/group___re_l_u>` (experimental)
 * :doc:`Kthvalue <../doxygen/html/group__kthvalue>` (experimental)
 * :doc:`GLU <../doxygen/html/group__glu>` (experimental)
+* :doc:`SmoothL1Loss <../doxygen/html/group___smooth_l1_loss>` (experimental)
diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt
index 60d6fe6ce6..61687bf96d 100644
--- a/driver/CMakeLists.txt
+++ b/driver/CMakeLists.txt
@@ -57,6 +57,7 @@ add_executable(MIOpenDriver
     dm_reducecalculation.cpp
     dm_rnn.cpp
     dm_rope.cpp
+    dm_smooth_l1loss.cpp
     dm_softmarginloss.cpp
     dm_softmax.cpp
     dm_t5layernorm.cpp
diff --git a/driver/dm_smooth_l1loss.cpp b/driver/dm_smooth_l1loss.cpp
new file mode 100644
index 0000000000..d700d4b9e8
--- /dev/null
+++ b/driver/dm_smooth_l1loss.cpp
@@ -0,0 +1,41 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "registry_driver_maker.hpp"
+#include "smooth_l1loss_driver.hpp"
+
+static Driver* makeDriver(const std::string& base_arg)
+{
+    if(base_arg == "smoothl1loss")
+        return new SmoothL1LossDriver<float, float>();
+    if(base_arg == "smoothl1lossfp16")
+        return new SmoothL1LossDriver<float16, float>();
+    if(base_arg == "smoothl1lossbfp16")
+        return new SmoothL1LossDriver<bfloat16, float>();
+    return nullptr;
+}
+
+REGISTER_DRIVER_MAKER(makeDriver);
diff --git a/driver/driver.hpp b/driver/driver.hpp
index d77d5d02d2..e4f7356211 100644
--- a/driver/driver.hpp
+++ b/driver/driver.hpp
@@ -314,7 +314,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz)
         "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, "
         "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], rope[bfp16|fp16], "
         "prelu[bfp16|fp16], kthvalue[bfp16|fp16], glu[bfp16|fp16], softmarginloss[bfp16|fp16], "
-        "multimarginloss[bfp16|fp16]\n");
+        "multimarginloss[bfp16|fp16], smoothl1loss[bfp16|fp16]\n");
     exit(0); // NOLINT (concurrency-mt-unsafe)
 }
 
@@ -352,6 +352,7 @@ inline std::string ParseBaseArg(int argc, char* argv[])
        arg != "kthvaluebfp16" && arg != "glu" && arg != "glufp16" && arg != "glubfp16" &&
        arg != "softmarginloss" && arg != "softmarginlossfp16" && arg != "softmarginlossbfp16" &&
        arg != "multimarginloss" && arg != "multimarginlossfp16" && arg != "multimarginlossbfp16" &&
+       arg != "smoothl1loss" && arg != "smoothl1lossfp16" && arg != "smoothl1lossbfp16" &&
        arg != "--version")
     {
         printf("FAILED: Invalid Base Input Argument\n");
diff --git a/driver/smooth_l1loss_driver.hpp b/driver/smooth_l1loss_driver.hpp
new file mode 100644
index 0000000000..bd3b35584c
--- /dev/null
+++ b/driver/smooth_l1loss_driver.hpp
@@ -0,0 +1,618 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include "InputFlags.hpp"
+#include "driver.hpp"
+#include "miopen/errors.hpp"
+#include "tensor_driver.hpp"
+#include "timer.hpp"
+
+#include <../test/ford.hpp>
+#include <../test/tensor_holder.hpp>
+#include <../test/verify.hpp>
+
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+#include <miopen/tensor_view_utils.hpp>
+
+#ifndef MLO_SMOOTH_L1LOSSMHOST_H_
+#define MLO_SMOOTH_L1LOSSMHOST_H_
+
+template <typename Tgpu, typename Tcheck>
+int32_t mloSmoothL1LossForwardRunHost(const miopenTensorDescriptor_t iDesc,
+                                      const miopenTensorDescriptor_t tDesc,
+                                      const miopenTensorDescriptor_t oDesc,
+                                      const Tgpu* input,
+                                      const Tgpu* target,
+                                      Tcheck* outputhost,
+                                      const float beta,
+                                      const miopenLossReductionMode_t reduction)
+{
+    // Treat contiguous tensors as non-contiguous tensors (for consistency)
+    auto I_tv = get_inner_expanded_tv<5>(miopen::deref(iDesc));
+    auto T_tv = get_inner_expanded_tv<5>(miopen::deref(tDesc));
+    auto O_tv = get_inner_expanded_tv<5>(miopen::deref(oDesc));
+
+    auto size       = miopen::deref(iDesc).GetElementSize();
+    double loss_sum = 0.0;
+
+    ford(size)([&](size_t i) {
+        const auto tensor_layout = tensor_layout_t<5>(I_tv, i);
+        const uint64_t Iidx      = I_tv.get_tensor_view_idx(tensor_layout);
+        const uint64_t Tidx      = T_tv.get_tensor_view_idx(tensor_layout);
+
+        // Smooth L1 point loss: quadratic while |input - target| < beta, linear outside.
+        auto diff = abs(input[Iidx] - target[Tidx]);
+        auto loss = (diff < beta ? 0.5f * diff * diff / beta : diff - 0.5f * beta);
+
+        if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
+            outputhost[O_tv.get_tensor_view_idx(tensor_layout)] = static_cast<Tcheck>(loss);
+        else
+            loss_sum += loss;
+    });
+    if(reduction == MIOPEN_LOSS_REDUCTION_MEAN)
+        loss_sum /= size;
+    if(reduction != MIOPEN_LOSS_REDUCTION_NONE)
+        outputhost[0] = static_cast<Tcheck>(loss_sum);
+
+    return miopenStatusSuccess;
+}
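+
+// Host reference for the backward pass below. With d = input - target, the input gradient is
+// d / beta * dO while |d| < beta and sign(d) * dO otherwise; the target gradient is its
+// negation. For the reduced modes only dO[0] is read; note that the 1/size factor of the
+// mean mode is applied in the GPU kernel (MIOpenSmoothL1Loss.cpp) but not here.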
+template <typename Tgpu, typename Tcheck>
+int32_t mloSmoothL1LossBackwardRunHost(const miopenTensorDescriptor_t iDesc,
+                                       const miopenTensorDescriptor_t tDesc,
+                                       const miopenTensorDescriptor_t dODesc,
+                                       const miopenTensorDescriptor_t diDesc,
+                                       const miopenTensorDescriptor_t dtDesc,
+                                       const Tgpu* input,
+                                       const Tgpu* target,
+                                       const Tgpu* dO,
+                                       Tcheck* dI,
+                                       Tcheck* dT,
+                                       const float beta,
+                                       const miopenLossReductionMode_t reduction)
+{
+    // Treat contiguous tensors as non-contiguous tensors (for consistency)
+    auto I_tv  = get_inner_expanded_tv<5>(miopen::deref(iDesc));
+    auto T_tv  = get_inner_expanded_tv<5>(miopen::deref(tDesc));
+    auto dI_tv = get_inner_expanded_tv<5>(miopen::deref(diDesc));
+    auto dT_tv = get_inner_expanded_tv<5>(miopen::deref(dtDesc));
+    auto dO_tv = get_inner_expanded_tv<5>(miopen::deref(dODesc));
+
+    auto size = miopen::deref(iDesc).GetElementSize();
+
+    par_ford(size)([&](size_t i) {
+        const auto tensor_layout = tensor_layout_t<5>(I_tv, i);
+        const uint64_t Iidx      = I_tv.get_tensor_view_idx(tensor_layout);
+        const uint64_t Tidx      = T_tv.get_tensor_view_idx(tensor_layout);
+
+        float sub  = input[Iidx] - target[Tidx];
+        float grad = 0.0f;
+
+        if(fabs(sub) < beta)
+            grad = sub / beta *
+                   dO[reduction == MIOPEN_LOSS_REDUCTION_NONE
+                          ? dO_tv.get_tensor_view_idx(tensor_layout)
+                          : 0];
+        else
+            grad = (sub >= 0 ? 1.0f : -1.0f) * dO[reduction == MIOPEN_LOSS_REDUCTION_NONE
+                                                      ? dO_tv.get_tensor_view_idx(tensor_layout)
+                                                      : 0];
+
+        if(dI)
+            dI[dI_tv.get_tensor_view_idx(tensor_layout)] = static_cast<Tcheck>(grad);
+        if(dT)
+            dT[dT_tv.get_tensor_view_idx(tensor_layout)] = static_cast<Tcheck>(-grad);
+    });
+
+    return miopenStatusSuccess;
+}
+#endif
+
+inline std::vector<int> GetStrides(std::vector<int> lengths, int contiguous)
+{
+    if(contiguous != 0 && contiguous != 1)
+        std::cerr << "Error Tensor Contiguous should be 0 or 1" << std::endl;
+    if(contiguous == 0)
+        std::swap(lengths.front(), lengths.back());
+    std::vector<int> strides(lengths.size());
+    strides.back() = 1;
+    for(int i = lengths.size() - 2; i >= 0; --i)
+        strides[i] = strides[i + 1] * lengths[i + 1];
+    if(contiguous == 0)
+        std::swap(strides.front(), strides.back());
+    return strides;
+}
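+
+// Worked example (illustrative shapes, not from the original change): for lengths {2, 3, 4},
+// contiguous == 1 yields the packed row-major strides {12, 4, 1}. With contiguous == 0 the
+// first and last lengths are swapped before the stride computation and the first and last
+// strides are swapped back afterwards, yielding {1, 2, 6}: a valid but non-contiguous layout
+// that exercises the strided code paths.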
+template <typename Tgpu, typename Tref>
+class SmoothL1LossDriver : public Driver
+{
+public:
+    SmoothL1LossDriver() : Driver()
+    {
+        miopenCreateTensorDescriptor(&inputDesc);
+        miopenCreateTensorDescriptor(&targetDesc);
+        miopenCreateTensorDescriptor(&outputDesc);
+        miopenCreateTensorDescriptor(&diDesc);
+        miopenCreateTensorDescriptor(&dtDesc);
+        miopenCreateTensorDescriptor(&doDesc);
+
+        data_type = miopen_type<Tgpu>{};
+    }
+
+    int AddCmdLineArgs() override;
+    int ParseCmdLineArgs(int argc, char* argv[]) override;
+    InputFlags& GetInputFlags() override { return inflags; }
+
+    int GetandSetData() override;
+
+    int AllocateBuffersAndCopy() override;
+
+    int RunForwardGPU() override;
+    int RunForwardCPU();
+
+    int RunBackwardGPU() override;
+    int RunBackwardCPU();
+
+    Tref GetTolerance();
+    int VerifyBackward() override;
+    int VerifyForward() override;
+    ~SmoothL1LossDriver() override
+    {
+        miopenDestroyTensorDescriptor(inputDesc);
+        miopenDestroyTensorDescriptor(targetDesc);
+        miopenDestroyTensorDescriptor(outputDesc);
+        miopenDestroyTensorDescriptor(diDesc);
+        miopenDestroyTensorDescriptor(dtDesc);
+        miopenDestroyTensorDescriptor(doDesc);
+    }
+
+private:
+    InputFlags inflags;
+
+    int forw;
+
+    miopenTensorDescriptor_t inputDesc;
+    miopenTensorDescriptor_t targetDesc;
+    miopenTensorDescriptor_t outputDesc;
+    miopenTensorDescriptor_t diDesc;
+    miopenTensorDescriptor_t dtDesc;
+    miopenTensorDescriptor_t doDesc;
+
+    std::unique_ptr<GPUMem> in_dev;
+    std::unique_ptr<GPUMem> tar_dev;
+    std::unique_ptr<GPUMem> out_dev;
+    std::unique_ptr<GPUMem> workspace_dev;
+    std::unique_ptr<GPUMem> dI_dev;
+    std::unique_ptr<GPUMem> dT_dev;
+    std::unique_ptr<GPUMem> dO_dev;
+
+    std::vector<Tgpu> in;
+    std::vector<Tgpu> tar;
+    std::vector<Tgpu> out;
+    std::vector<Tgpu> workspace;
+    std::vector<Tgpu> dI;
+    std::vector<Tgpu> dT;
+    std::vector<Tgpu> dO;
+
+    std::vector<Tref> outhost;
+    std::vector<Tref> dIhost;
+    std::vector<Tref> dThost;
+
+    size_t ws_sizeInBytes;
+
+    float beta;
+    miopenLossReductionMode_t reduction_mode;
+};
+
+template <typename Tgpu, typename Tref>
+int SmoothL1LossDriver<Tgpu, Tref>::ParseCmdLineArgs(int argc, char* argv[])
+{
+    inflags.Parse(argc, argv);
+
+    auto reduction = inflags.GetValueStr("Reduction");
+    if(reduction != "none" && reduction != "mean" && reduction != "sum")
+        return miopenStatusInvalidValue;
+    if(reduction == "none")
+        reduction_mode = MIOPEN_LOSS_REDUCTION_NONE;
+    else if(reduction == "mean")
+        reduction_mode = MIOPEN_LOSS_REDUCTION_MEAN;
+    else if(reduction == "sum")
+        reduction_mode = MIOPEN_LOSS_REDUCTION_SUM;
+
+    beta = inflags.GetValueInt("Beta");
+
+    if(inflags.GetValueInt("time") == 1)
+    {
+        miopenEnableProfiling(GetHandle(), true);
+    }
+
+    forw = inflags.GetValueInt("forw");
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int SmoothL1LossDriver<Tgpu, Tref>::GetandSetData()
+{
+    auto length      = inflags.GetValueTensor("input").lengths;
+    auto in_strides  = GetStrides(length, 1);
+    auto tar_strides = GetStrides(length, inflags.GetValueInt("Contiguous"));
+
+    if(SetTensorNd(inputDesc, length, in_strides, data_type) != miopenStatusSuccess)
+        MIOPEN_THROW("Error parsing input tensor: " + inflags.GetValueStr("input") + ".");
+    if(SetTensorNd(targetDesc, length, tar_strides, data_type) != miopenStatusSuccess)
+        MIOPEN_THROW("Error parsing target tensor");
+
+    if(reduction_mode == MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        if(SetTensorNd(outputDesc, length, in_strides, data_type) != miopenStatusSuccess)
+            MIOPEN_THROW("Error parsing output tensor");
+    }
+    else
+    {
+        std::vector<int> out_lens = {1};
+        if(SetTensorNd(outputDesc, out_lens, data_type) != miopenStatusSuccess)
+            MIOPEN_THROW("Error parsing output tensor");
+    }
+
+    if(SetTensorNd(diDesc, length, in_strides, data_type) != miopenStatusSuccess)
+        MIOPEN_THROW("Error parsing input gradient tensor");
+    if(SetTensorNd(dtDesc, length, tar_strides, data_type) != miopenStatusSuccess)
+        MIOPEN_THROW("Error parsing target gradient tensor");
+
+    if(reduction_mode == MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        if(SetTensorNd(doDesc, length, in_strides, data_type) != miopenStatusSuccess)
+            MIOPEN_THROW("Error parsing output gradient tensor");
+    }
+    else
+    {
+        std::vector<int> out_lens = {1};
+        if(SetTensorNd(doDesc, out_lens, data_type) != miopenStatusSuccess)
+            MIOPEN_THROW("Error parsing output gradient tensor");
+    }
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int SmoothL1LossDriver<Tgpu, Tref>::AddCmdLineArgs()
+{
+    inflags.AddInputFlag("forw", 'F', "1", "Run only Forward SmoothL1Loss (Default=1)", "int");
+    inflags.AddInputFlag("input",
+                         'D',
+                         "256x4x1x1x8723",
+                         "Input tensor descriptor (Default=256x4x1x1x8723)",
+                         "tensor");
+    inflags.AddInputFlag("Contiguous",
+                         'C',
+                         "1",
+                         "Is input tensor contiguous? (Default=1 for contiguous tensor)",
+                         "int");
+    inflags.AddInputFlag("Reduction",
+                         'R',
+                         "none",
+                         "Specifies the reduction to apply to the output ('none'|'mean'|'sum') "
+                         "(Default=none to indicate no reduction)",
+                         "string");
+    inflags.AddInputFlag("Beta",
+                         'B',
+                         "1",
+                         "Specifies the threshold at which to change between L1 and L2 loss. The "
+                         "value must be non-negative (Default=1)",
+                         "int");
+    inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int");
+    inflags.AddInputFlag("verify", 'V', "0", "Verify Each Layer (Default=0)", "int");
+    inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int");
+    inflags.AddInputFlag(
+        "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int");
+
+    return miopenStatusSuccess;
+}
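+
+// Illustrative invocation (binary name and path depend on the local build); the flags above
+// map to, e.g.:
+//     MIOpenDriver smoothl1lossfp16 -D 16x4x1x1x64 -C 0 -R mean -B 1 -F 1 -t 1 -V 1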
The " + "value must be non-negative (Default=1)", + "int"); + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "0", "Verify Each Layer (Default=0)", "int"); + inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int SmoothL1LossDriver::AllocateBuffersAndCopy() +{ + size_t in_sz = GetTensorSize(inputDesc); + size_t tar_sz = GetTensorSize(targetDesc); + size_t out_sz = GetTensorSize(outputDesc); + + miopenGetSmoothL1LossForwardWorkspaceSize( + GetHandle(), inputDesc, outputDesc, reduction_mode, &ws_sizeInBytes); + if(ws_sizeInBytes == static_cast(-1)) + return miopenStatusAllocFailed; + size_t ws_sz = ws_sizeInBytes / sizeof(Tgpu); + + uint32_t ctx = 0; + + in_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); + tar_dev = std::unique_ptr(new GPUMem(ctx, tar_sz, sizeof(Tgpu))); + out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); + workspace_dev = std::unique_ptr(new GPUMem(ctx, ws_sizeInBytes, sizeof(std::byte))); + dI_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); + dT_dev = std::unique_ptr(new GPUMem(ctx, tar_sz, sizeof(Tgpu))); + dO_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); + + in = std::vector(in_sz, static_cast(0)); + tar = std::vector(tar_sz, static_cast(0)); + out = std::vector(out_sz, static_cast(0)); + workspace = std::vector(ws_sz, static_cast(0)); + dI = std::vector(in_sz, static_cast(0)); + dT = std::vector(tar_sz, static_cast(0)); + dO = std::vector(out_sz, static_cast(0)); + + outhost = std::vector(out_sz, static_cast(0)); + dIhost = std::vector(in_sz, static_cast(0)); + dThost = std::vector(tar_sz, static_cast(0)); + + for(int i = 0; i < in_sz; i++) + { + in[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(0.2)); + } + + for(int i = 0; i < tar_sz; i++) + { + tar[i] = prng::gen_A_to_B(static_cast(0.01), static_cast(0.21)); + } + + fill(out.begin(), out.end(), static_cast(0)); + + fill(dO.begin(), dO.end(), static_cast(0.5)); + + if(in_dev->ToGPU(GetStream(), in.data()) != 0) + { + std::cerr << "Error copying (in) to GPU, size: " << in_dev->GetSize() << std::endl; + return miopenStatusInternalError; + } + + if(tar_dev->ToGPU(GetStream(), tar.data()) != 0) + { + std::cerr << "Error copying (tar) to GPU, size: " << tar_dev->GetSize() << std::endl; + return miopenStatusInternalError; + } + + if(dO_dev->ToGPU(GetStream(), dO.data()) != 0) + { + std::cerr << "Error copying (out grad) to GPU, size: " << dO_dev->GetSize() << std::endl; + return miopenStatusInternalError; + } + + return miopenStatusSuccess; +} + +template +int SmoothL1LossDriver::RunForwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenSmoothL1LossForward(GetHandle(), + workspace_dev->GetMem(), + ws_sizeInBytes, + inputDesc, + in_dev->GetMem(), + targetDesc, + tar_dev->GetMem(), + outputDesc, + out_dev->GetMem(), + beta, + reduction_mode); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Forward SmoothL1Loss Elapsed: " << t.gettime_ms() / iter + << " 
ms\n"; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Forward SmoothL1Loss Elapsed: " << kernel_average_time + << " ms\n"; + } + + if(out_dev->FromGPU(GetStream(), out.data()) != 0) + { + std::cerr << "Error copying (out_dev) from GPU, size: " << out_dev->GetSize() << std::endl; + return miopenStatusInternalError; + } + + return miopenStatusSuccess; +} + +template +int SmoothL1LossDriver::RunForwardCPU() +{ + auto status = mloSmoothL1LossForwardRunHost(inputDesc, + targetDesc, + outputDesc, + in.data(), + tar.data(), + outhost.data(), + beta, + reduction_mode); + + return status; +} + +template +int SmoothL1LossDriver::RunBackwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopen::deref(GetHandle()).ResetKernelTime(); + miopenSmoothL1LossBackward(GetHandle(), + inputDesc, + in_dev->GetMem(), + targetDesc, + tar_dev->GetMem(), + doDesc, + dO_dev->GetMem(), + diDesc, + dI_dev->GetMem(), + dtDesc, + dT_dev->GetMem(), + beta, + reduction_mode); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Backward SmoothL1Loss Elapsed: " << t.gettime_ms() / iter + << " ms\n"; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Backward SmoothL1Loss Elapsed: " << kernel_average_time + << " ms\n"; + } + + if(dI_dev->FromGPU(GetStream(), dI.data()) != 0) + std::cerr << "Error copying (dI_dev) from GPU, size: " << dI_dev->GetSize() << std::endl; + if(dT_dev->FromGPU(GetStream(), dT.data()) != 0) + std::cerr << "Error copying (dT_dev) from GPU, size: " << dT_dev->GetSize() << std::endl; + + return miopenStatusSuccess; +} + +template +int SmoothL1LossDriver::RunBackwardCPU() +{ + auto status = mloSmoothL1LossBackwardRunHost(inputDesc, + targetDesc, + doDesc, + diDesc, + dtDesc, + in.data(), + tar.data(), + dO.data(), + dIhost.data(), + dThost.data(), + beta, + reduction_mode); + + return status; +} + +template +Tref SmoothL1LossDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+template <typename Tgpu, typename Tref>
+Tref SmoothL1LossDriver<Tgpu, Tref>::GetTolerance()
+{
+    // Computation error of fp16 is ~2^13 (=8192) bigger than
+    // the one of fp32 because mantissa is shorter by 13 bits.
+    auto tolerance = std::is_same<Tgpu, float>::value ? 1.5e-6 : 8.2e-3;
+
+    // bf16 mantissa has 7 bits, by 3 bits shorter than fp16.
+    if(std::is_same<Tgpu, bfloat16>::value)
+        tolerance *= 8.0;
+    return tolerance;
+}
+
+template <typename Tgpu, typename Tref>
+int SmoothL1LossDriver<Tgpu, Tref>::VerifyForward()
+{
+    RunForwardCPU();
+    const Tref tolerance = GetTolerance();
+    auto error           = miopen::rms_range(outhost, out);
+
+    if(!std::isfinite(error) || error > tolerance)
+    {
+        std::cout << "Forward SmoothL1Loss FAILED: " << error << " > " << tolerance << std::endl;
+        return EC_VerifyFwd;
+    }
+    else
+    {
+        std::cout << "Forward SmoothL1Loss Verifies OK on CPU reference (" << error << " < "
+                  << tolerance << ')' << std::endl;
+    }
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int SmoothL1LossDriver<Tgpu, Tref>::VerifyBackward()
+{
+    RunBackwardCPU();
+    const Tref tolerance = GetTolerance();
+    auto error_dI        = miopen::rms_range(dIhost, dI);
+    auto error_dT        = miopen::rms_range(dThost, dT);
+
+    if(!std::isfinite(error_dI) || error_dI > tolerance)
+    {
+        std::cout << "Backward SmoothL1Loss Input Gradient FAILED: " << error_dI << " > "
+                  << tolerance << std::endl;
+        return EC_VerifyBwd;
+    }
+    else
+    {
+        std::cout << "Backward SmoothL1Loss Input Gradient Verifies OK on CPU reference ("
+                  << error_dI << " < " << tolerance << ')' << std::endl;
+    }
+
+    if(!std::isfinite(error_dT) || error_dT > tolerance)
+    {
+        std::cout << "Backward SmoothL1Loss Target Gradient FAILED: " << error_dT << " > "
+                  << tolerance << std::endl;
+        return EC_VerifyBwd;
+    }
+    else
+    {
+        std::cout << "Backward SmoothL1Loss Target Gradient Verifies OK on CPU reference ("
+                  << error_dT << " < " << tolerance << ')' << std::endl;
+    }
+
+    return miopenStatusSuccess;
+}
diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h
index 67652ab832..b623e4f61e 100644
--- a/include/miopen/miopen.h
+++ b/include/miopen/miopen.h
@@ -7999,7 +7999,6 @@ MIOPEN_EXPORT miopenStatus_t miopenPReLUBackward(miopenHandle_t handle,
 #endif // MIOPEN_BETA_API
 
 #ifdef MIOPEN_BETA_API
-
 /*! @ingroup LossFunction
  * @enum miopenLossReductionMode_t
  * Reduction mode for loss function
@@ -8011,8 +8010,10 @@ typedef enum
     MIOPEN_LOSS_REDUCTION_MEAN = 2, /*!< output tensor elements are summed up and divided with
                                        total number of elements to get mean value */
 } miopenLossReductionMode_t;
+#endif // MIOPEN_BETA_API
 
 // SoftMarginLoss APIs
+#ifdef MIOPEN_BETA_API
 /** @addtogroup LossFunction
  *
  * @{
  */
@@ -8092,10 +8093,10 @@ MIOPEN_EXPORT miopenStatus_t miopenSoftMarginLossBackward(miopenHandle_t handle,
 
 /** @} */
 // CLOSEOUT LossFunction DOXYGEN GROUP
-#endif
+#endif // MIOPEN_BETA_API
 
-#ifdef MIOPEN_BETA_API
 // MultiMarginLoss APIs
+#ifdef MIOPEN_BETA_API
 /** @addtogroup LossFunction
  *
  * @{
  */
@@ -8176,6 +8177,91 @@ MIOPEN_EXPORT miopenStatus_t miopenMultiMarginLossForward(miopenHandle_t handle,
 // CLOSEOUT LossFunction DOXYGEN GROUP
 #endif // MIOPEN_BETA_API
 
+// SmoothL1Loss APIs
+#ifdef MIOPEN_BETA_API
+/** @addtogroup LossFunction
+ *
+ * @{
+ */
+
+/*! @brief Helper function to query the minimum workspace size required by the SmoothL1Loss call
+ *
+ * @param [in] handle MIOpen handle
+ * @param [in] inputDesc Tensor descriptor for input tensor
+ * @param [in] outputDesc Tensor descriptor for output tensor
+ * @param [in] reduction Reduction mode (none, sum, mean)
+ * @param [out] sizeInBytes Pointer that receives the minimum workspace size
+ * @return miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t
+miopenGetSmoothL1LossForwardWorkspaceSize(miopenHandle_t handle,
+                                          miopenTensorDescriptor_t inputDesc,
+                                          miopenTensorDescriptor_t outputDesc,
+                                          miopenLossReductionMode_t reduction,
+                                          size_t* sizeInBytes);
+
+/*! @brief Execute a SmoothL1Loss forward layer
+ *
+ * @param [in] handle MIOpen handle
+ * @param [in] workspace Address of the allocated workspace data
+ * @param [in] workspaceSizeInBytes Size in bytes of the allocated workspace data
+ * @param [in] inputDesc Tensor descriptor for input tensor
+ * @param [in] input Data tensor input
+ * @param [in] targetDesc Tensor descriptor for target tensor
+ * @param [in] target Data tensor target
+ * @param [in] outputDesc Tensor descriptor for output tensor
+ * @param [out] output Data tensor output
+ * @param [in] beta Threshold at which to change between L1 and L2 loss
+ * @param [in] reduction Reduction mode (none, sum, mean)
+ * @return miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenSmoothL1LossForward(miopenHandle_t handle,
+                                                       void* workspace,
+                                                       size_t workspaceSizeInBytes,
+                                                       miopenTensorDescriptor_t inputDesc,
+                                                       const void* input,
+                                                       miopenTensorDescriptor_t targetDesc,
+                                                       const void* target,
+                                                       miopenTensorDescriptor_t outputDesc,
+                                                       void* output,
+                                                       float beta,
+                                                       miopenLossReductionMode_t reduction);
+
+/*! @brief Execute the backward SmoothL1Loss
+ *
+ * @param [in] handle MIOpen handle
+ * @param [in] inputDesc Tensor descriptor for input tensor
+ * @param [in] input Data tensor input
+ * @param [in] targetDesc Tensor descriptor for target tensor
+ * @param [in] target Data tensor target
+ * @param [in] doutputDesc Tensor descriptor for output gradient
+ * @param [in] doutput Gradient of output
+ * @param [in] dinputDesc Tensor descriptor for input gradient
+ * @param [out] dinput Gradient of input
+ * @param [in] dtargetDesc Tensor descriptor for target gradient
+ * @param [out] dtarget Gradient of target
+ * @param [in] beta Threshold at which to change between L1 and L2 loss
+ * @param [in] reduction Reduction mode (none, sum, mean)
+ * @return miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenSmoothL1LossBackward(miopenHandle_t handle,
+                                                        miopenTensorDescriptor_t inputDesc,
+                                                        const void* input,
+                                                        miopenTensorDescriptor_t targetDesc,
+                                                        const void* target,
+                                                        miopenTensorDescriptor_t doutputDesc,
+                                                        const void* doutput,
+                                                        miopenTensorDescriptor_t dinputDesc,
+                                                        void* dinput,
+                                                        miopenTensorDescriptor_t dtargetDesc,
+                                                        void* dtarget,
+                                                        float beta,
+                                                        miopenLossReductionMode_t reduction);
+
+/** @} */
+// CLOSEOUT LossFunction DOXYGEN GROUP
+#endif // MIOPEN_BETA_API
+
 #ifdef __cplusplus
 }
 #endif
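A minimal host-side call sequence for the new API, sketched under the assumption that handle,
the descriptors and the input/target/output device buffers were created as in the driver above:

    size_t ws_bytes = 0;
    miopenGetSmoothL1LossForwardWorkspaceSize(
        handle, inputDesc, outputDesc, MIOPEN_LOSS_REDUCTION_MEAN, &ws_bytes);

    void* workspace = nullptr;
    hipMalloc(&workspace, ws_bytes); // any device allocator works here

    miopenSmoothL1LossForward(handle,
                              workspace,
                              ws_bytes,
                              inputDesc,
                              input, // device pointers
                              targetDesc,
                              target,
                              outputDesc,
                              output,
                              1.0f, // beta
                              MIOPEN_LOSS_REDUCTION_MEAN);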
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 70cd246873..1e0d2e8296 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -197,6 +197,8 @@ set( MIOpen_Source
     rope_api.cpp
     rope/problem_description.cpp
     scalar.cpp
+    smoothl1loss/problem_description.cpp
+    smoothl1loss_api.cpp
     softmarginloss/problem_description.cpp
     softmarginloss_api.cpp
     softmax.cpp
@@ -331,6 +333,8 @@ set( MIOpen_Source
     solver/reduce/forward_sum.cpp
     solver/rope/backward_rope.cpp
     solver/rope/forward_rope.cpp
+    solver/smoothl1loss/forward_smoothl1loss.cpp
+    solver/smoothl1loss/backward_smoothl1loss.cpp
    solver/softmarginloss/backward_softmarginloss.cpp
    solver/softmarginloss/forward_softmarginloss.cpp
    solver/softmax/attn_softmax.cpp
@@ -469,6 +473,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
     kernels/Conv_Winograd_v30_3_1_gfx11_fp32_f3x2_stride1.inc
     kernels/Conv_Winograd_v30_3_1_gfx11_fp32_f3x2_stride2.inc
     kernels/Conv_Winograd_v30_3_1_metadata.inc
+    kernels/MIOpenLossReductionMode.hpp
     kernels/MIOpenReduceCalculation.hpp
     kernels/MIOpenReduceExtreme.hpp
     kernels/bfloat16_dev.hpp
@@ -503,7 +508,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
     kernels/xform_kd_cov2.inc
kernels/xform_metadata.inc ) - + set(MIOPEN_KERNELS ${STATIC_COMPOSABLE_KERNEL_SOURCE} ${COMPOSABLE_KERNEL_SOURCE} @@ -551,6 +556,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenReduceExtreme.cpp kernels/MIOpenReduceSum.cpp kernels/MIOpenRoPE.cpp + kernels/MIOpenSmoothL1Loss.cpp kernels/MIOpenSoftMarginLoss.cpp kernels/MIOpenSoftmax.cl kernels/MIOpenSoftmaxAttn.cpp @@ -701,6 +707,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN reducecalculation.cpp reduceextreme.cpp rope.cpp + smoothl1loss.cpp softmarginloss.cpp transformers_adam_w.cpp ${PROJECT_BINARY_DIR}/db_path.cpp diff --git a/src/include/miopen/smoothl1loss.hpp b/src/include/miopen/smoothl1loss.hpp new file mode 100644 index 0000000000..673a56001b --- /dev/null +++ b/src/include/miopen/smoothl1loss.hpp @@ -0,0 +1,68 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/common.hpp>
+#include <miopen/miopen.h>
+
+namespace miopen {
+
+struct Handle;
+struct TensorDescriptor;
+
+MIOPEN_INTERNALS_EXPORT size_t
+GetSmoothL1LossForwardWorkspaceSize(Handle& handle,
+                                    const TensorDescriptor& iDesc,
+                                    const TensorDescriptor& oDesc,
+                                    miopenLossReductionMode_t reduction);
+
+MIOPEN_INTERNALS_EXPORT miopenStatus_t SmoothL1LossForward(Handle& handle,
+                                                           Data_t workspace,
+                                                           size_t workspaceSizeInBytes,
+                                                           const TensorDescriptor& iDesc,
+                                                           ConstData_t i,
+                                                           const TensorDescriptor& tDesc,
+                                                           ConstData_t t,
+                                                           const TensorDescriptor& oDesc,
+                                                           Data_t o,
+                                                           float beta,
+                                                           miopenLossReductionMode_t reduction);
+
+MIOPEN_INTERNALS_EXPORT miopenStatus_t SmoothL1LossBackward(Handle& handle,
+                                                            const TensorDescriptor& iDesc,
+                                                            ConstData_t i,
+                                                            const TensorDescriptor& tDesc,
+                                                            ConstData_t t,
+                                                            const TensorDescriptor& dODesc,
+                                                            ConstData_t dO,
+                                                            const TensorDescriptor& dIDesc,
+                                                            Data_t dI,
+                                                            const TensorDescriptor& dTDesc,
+                                                            Data_t dT,
+                                                            float beta,
+                                                            miopenLossReductionMode_t reduction);
+
+} // namespace miopen
diff --git a/src/include/miopen/smoothl1loss/invoke_params.hpp b/src/include/miopen/smoothl1loss/invoke_params.hpp
new file mode 100644
index 0000000000..b0b3c37467
--- /dev/null
+++ b/src/include/miopen/smoothl1loss/invoke_params.hpp
@@ -0,0 +1,76 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/invoke_params.hpp>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+namespace smoothl1loss {
+
+struct BaseInvokeParams : public miopen::InvokeParams
+{
+    const TensorDescriptor* iDesc = nullptr;
+    const TensorDescriptor* tDesc = nullptr;
+
+    ConstData_t i = nullptr;
+    ConstData_t t = nullptr;
+
+    float beta = 0;
+
+    Data_t workspace           = nullptr;
+    std::size_t workspace_size = 0;
+
+    std::size_t GetWorkspaceSize() const { return workspace_size; }
+    Data_t GetWorkspace() const { return workspace; }
+};
+
+struct FwdInvokeParams : public BaseInvokeParams
+{
+    FwdInvokeParams() = default;
+
+    const TensorDescriptor* oDesc = nullptr;
+
+    Data_t o = nullptr;
+};
+
+struct BwdInvokeParams : public BaseInvokeParams
+{
+    BwdInvokeParams() = default;
+
+    const TensorDescriptor* dIDesc = nullptr;
+    const TensorDescriptor* dTDesc = nullptr;
+    const TensorDescriptor* dODesc = nullptr;
+
+    Data_t dI      = nullptr;
+    Data_t dT      = nullptr;
+    ConstData_t dO = nullptr;
+};
+
+} // namespace smoothl1loss
+
+} // namespace miopen
diff --git a/src/include/miopen/smoothl1loss/problem_description.hpp b/src/include/miopen/smoothl1loss/problem_description.hpp
new file mode 100644
index 0000000000..f14f5d23bf
--- /dev/null
+++ b/src/include/miopen/smoothl1loss/problem_description.hpp
@@ -0,0 +1,185 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/errors.hpp>
+#include <miopen/miopen.h>
+#include <miopen/problem_description_base.hpp>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+struct NetworkConfig;
+
+namespace smoothl1loss {
+
+struct ForwardProblemDescription : ProblemDescriptionBase
+{
+    ForwardProblemDescription(const TensorDescriptor& iDesc_,
+                              const TensorDescriptor& tDesc_,
+                              const TensorDescriptor& oDesc_,
+                              const miopenLossReductionMode_t reduction_)
+        : iDesc(iDesc_), tDesc(tDesc_), oDesc(oDesc_), reduction(reduction_)
+    {
+        IsSameType();
+        IsSameLength();
+    }
+
+    const TensorDescriptor& GetIDesc() const { return iDesc; }
+    const TensorDescriptor& GetTDesc() const { return tDesc; }
+    const TensorDescriptor& GetODesc() const { return oDesc; }
+    miopenLossReductionMode_t GetReduction() const { return reduction; }
+
+    bool IsSameType() const
+    {
+        if(iDesc.GetType() != tDesc.GetType())
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "SmoothL1Loss: Input and Target tensor types do not match.");
+        if(iDesc.GetType() != oDesc.GetType())
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "SmoothL1Loss: Input and Output tensor types do not match.");
+        return true;
+    }
+
+    bool IsSameLength() const
+    {
+        if(iDesc.GetLengths() != tDesc.GetLengths())
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "SmoothL1Loss: Input and Target tensor dimension lengths do not match.");
+        if(reduction == MIOPEN_LOSS_REDUCTION_NONE && iDesc.GetLengths() != oDesc.GetLengths())
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "SmoothL1Loss: Without reduction, Input and Output tensor dimension "
+                         "lengths must be equal.");
+        if(reduction != MIOPEN_LOSS_REDUCTION_NONE && oDesc.GetElementSize() != 1)
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "SmoothL1Loss: With reduction, the Output tensor must contain exactly "
+                         "one element.");
+        return true;
+    }
+
+    NetworkConfig MakeNetworkConfig() const override;
+
+private:
+    TensorDescriptor iDesc;
+    TensorDescriptor tDesc;
+    TensorDescriptor oDesc;
+    miopenLossReductionMode_t reduction;
+
+    NetworkConfig MakeForwardNetworkConfig() const;
+};
+
+struct BackwardProblemDescription : ProblemDescriptionBase
+{
+    BackwardProblemDescription(const TensorDescriptor& iDesc_,
+                               const TensorDescriptor& tDesc_,
+                               const TensorDescriptor& dODesc_,
+                               const TensorDescriptor& dIDesc_,
+                               const TensorDescriptor& dTDesc_,
+                               const miopenLossReductionMode_t reduction_)
+        : iDesc(iDesc_),
+          tDesc(tDesc_),
+          dODesc(dODesc_),
+          dIDesc(dIDesc_),
+          dTDesc(dTDesc_),
+          reduction(reduction_)
+    {
+        IsSameType();
+        IsSameLength();
+    }
+
+    const TensorDescriptor& GetIDesc() const { return iDesc; }
+    const TensorDescriptor& GetTDesc() const { return tDesc; }
+    const TensorDescriptor& GetDODesc() const { return dODesc; }
+    const TensorDescriptor& GetDIDesc() const { return dIDesc; }
+    const TensorDescriptor& GetDTDesc() const { return dTDesc; }
+    miopenLossReductionMode_t GetReduction() const { return reduction; }
+
+    bool IsSameType() const
+    {
+        if(iDesc.GetType() != tDesc.GetType())
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "SmoothL1Loss: Input and Target tensor types do not match.");
+        if(iDesc.GetType() != dIDesc.GetType())
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "SmoothL1Loss: Input and its Gradient tensor types do not match.");
+        if(tDesc.GetType() != dTDesc.GetType())
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "SmoothL1Loss: Target and its Gradient tensor types do not match.");
+        if(iDesc.GetType() != dODesc.GetType())
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "SmoothL1Loss: Input and Output Gradient tensor types do not match.");
+        return true;
+    }
+
+    bool IsSameLength()
const + { + if(iDesc.GetLengths() != tDesc.GetLengths()) + MIOPEN_THROW(miopenStatusBadParm, + "SmoothL1Loss: Input and Target tensor dimension lengths do not match."); + if(iDesc.GetLengths() != dIDesc.GetLengths()) + MIOPEN_THROW( + miopenStatusBadParm, + "SmoothL1Loss: Input and its Gradient tensor dimension lengths do not match."); + if(tDesc.GetLengths() != dTDesc.GetLengths()) + MIOPEN_THROW( + miopenStatusBadParm, + "SmoothL1Loss: Target and its Gradient tensor dimension lengths do not match."); + if(reduction == MIOPEN_LOSS_REDUCTION_NONE && iDesc.GetLengths() != dODesc.GetLengths()) + MIOPEN_THROW( + miopenStatusBadParm, + "SmoothL1Loss: Without reduction, Input and Output Gradient tensor dimension " + "lengths should be equal."); + if(reduction != MIOPEN_LOSS_REDUCTION_NONE && dODesc.GetElementSize() != 1) + MIOPEN_THROW(miopenStatusBadParm, + "SmoothL1Loss: When reduction, Output Gradient tensor dimension lengths " + "must be (1)."); + return true; + } + + bool IsAllContiguous() const + { + if(!iDesc.IsContiguous() || !tDesc.IsContiguous() || !dIDesc.IsContiguous() || + !dTDesc.IsContiguous() || !dODesc.IsContiguous()) + return false; + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +private: + TensorDescriptor iDesc; + TensorDescriptor tDesc; + TensorDescriptor dODesc; + TensorDescriptor dIDesc; + TensorDescriptor dTDesc; + miopenLossReductionMode_t reduction; + + NetworkConfig MakeBackwardNetworkConfig() const; +}; + +} // namespace smoothl1loss + +} // namespace miopen diff --git a/src/include/miopen/smoothl1loss/solvers.hpp b/src/include/miopen/smoothl1loss/solvers.hpp new file mode 100644 index 0000000000..8bdec836b3 --- /dev/null +++ b/src/include/miopen/smoothl1loss/solvers.hpp @@ -0,0 +1,75 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/smoothl1loss/problem_description.hpp>
+#include <miopen/solver.hpp>
+
+namespace miopen {
+
+namespace solver {
+
+namespace smoothl1loss {
+
+using SmoothL1LossForwardSolverBase =
+    NonTunableSolverBase<ExecutionContext, miopen::smoothl1loss::ForwardProblemDescription>;
+
+struct SmoothL1LossForward final : SmoothL1LossForwardSolverBase
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<SmoothL1LossForward>(); }
+
+    bool
+    IsApplicable(const ExecutionContext& context,
+                 const miopen::smoothl1loss::ForwardProblemDescription& problem) const override;
+    ConvSolution
+    GetSolution(const ExecutionContext& context,
+                const miopen::smoothl1loss::ForwardProblemDescription& problem) const override;
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::smoothl1loss::ForwardProblemDescription& problem) const override;
+    bool MayNeedWorkspace() const override { return true; }
+};
+
+using SmoothL1LossBackwardSolverBase =
+    NonTunableSolverBase<ExecutionContext, miopen::smoothl1loss::BackwardProblemDescription>;
+
+struct SmoothL1LossBackward final : SmoothL1LossBackwardSolverBase
+{
+    const std::string& SolverDbId() const override
+    {
+        return GetSolverDbId<SmoothL1LossBackward>();
+    }
+
+    bool
+    IsApplicable(const ExecutionContext& context,
+                 const miopen::smoothl1loss::BackwardProblemDescription& problem) const override;
+    ConvSolution
+    GetSolution(const ExecutionContext& context,
+                const miopen::smoothl1loss::BackwardProblemDescription& problem) const override;
+};
+
+} // namespace smoothl1loss
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp
index 76a13b051c..8b6c7d6878 100644
--- a/src/include/miopen/solver_id.hpp
+++ b/src/include/miopen/solver_id.hpp
@@ -63,8 +63,7 @@ enum class Primitive
     RoPE,
     ReLU,
     Kthvalue,
-    SoftMarginLoss,
-    MultiMarginLoss
+    Loss
 };
 
 struct MIOPEN_INTERNALS_EXPORT Id
diff --git a/src/kernels/MIOpenLossReductionMode.hpp b/src/kernels/MIOpenLossReductionMode.hpp
new file mode 100644
index 0000000000..f8c62b41a6
--- /dev/null
+++ b/src/kernels/MIOpenLossReductionMode.hpp
@@ -0,0 +1,42 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GUARD_KERNELS_MIOPEN_LOSS_REDUCTION_MODE_HPP
+#define GUARD_KERNELS_MIOPEN_LOSS_REDUCTION_MODE_HPP
+
+enum class LossReductionMode_t
+{
+    NONE = 0,
+    SUM,
+    MEAN,
+};
+
+#ifndef __HIP_DEVICE_COMPILE__
+static_assert(MIOPEN_LOSS_REDUCTION_NONE == static_cast<int>(LossReductionMode_t::NONE));
+static_assert(MIOPEN_LOSS_REDUCTION_SUM == static_cast<int>(LossReductionMode_t::SUM));
+static_assert(MIOPEN_LOSS_REDUCTION_MEAN == static_cast<int>(LossReductionMode_t::MEAN));
+#endif
+
+#endif // GUARD_KERNELS_MIOPEN_LOSS_REDUCTION_MODE_HPP
diff --git a/src/kernels/MIOpenMultiMarginLoss.cpp b/src/kernels/MIOpenMultiMarginLoss.cpp
index 2443f7863a..eccdbf8f70 100644
--- a/src/kernels/MIOpenMultiMarginLoss.cpp
+++ b/src/kernels/MIOpenMultiMarginLoss.cpp
@@ -30,8 +30,9 @@
 
 #include "float_types.h"
 #include "tensor_view.hpp"
+#include "MIOpenLossReductionMode.hpp"
 
-template <typename DTYPE, uint64_t REDUCTION_T>
+template <typename DTYPE, LossReductionMode_t REDUCTION_T>
 __device__ void multimarginlossforward2d(const DTYPE* __restrict__ I,
                                          const uint64_t* __restrict__ T,
                                          const DTYPE* __restrict__ W,
@@ -76,9 +77,11 @@ __device__ void multimarginlossforward2d(const DTYPE* __restrict__ I,
     loss /= C;
     switch(REDUCTION_T)
     {
-    case 0: static_cast<DTYPE*>(O)[O_tv.get_tensor_view_idx({n})] = CVT_ACCUM2FLOAT(loss); break;
-    case 1: static_cast<FLOAT_ACCUM*>(O)[n] = loss; break;
-    case 2: static_cast<FLOAT_ACCUM*>(O)[n] = loss / N; break;
+    case LossReductionMode_t::NONE:
+        static_cast<DTYPE*>(O)[O_tv.get_tensor_view_idx({n})] = CVT_ACCUM2FLOAT(loss);
+        break;
+    case LossReductionMode_t::SUM: static_cast<FLOAT_ACCUM*>(O)[n] = loss; break;
+    case LossReductionMode_t::MEAN: static_cast<FLOAT_ACCUM*>(O)[n] = loss / N; break;
     default: break;
     }
 }
@@ -95,5 +98,6 @@ extern "C" __global__ void MultiMarginLossForward2d(const FLOAT* __restrict__ I,
                                                     tensor_view_t<1> O_tv)
 {
     // instantiate the kernel
-    multimarginlossforward2d<FLOAT, REDUCTION_TYPE>(I, T, W, O, p, margin, I_tv, T_tv, W_tv, O_tv);
+    multimarginlossforward2d<FLOAT, static_cast<LossReductionMode_t>(REDUCTION_TYPE)>(
+        I, T, W, O, p, margin, I_tv, T_tv, W_tv, O_tv);
 }
diff --git a/src/kernels/MIOpenSmoothL1Loss.cpp b/src/kernels/MIOpenSmoothL1Loss.cpp
new file mode 100644
index 0000000000..07cc394fd8
--- /dev/null
+++ b/src/kernels/MIOpenSmoothL1Loss.cpp
@@ -0,0 +1,141 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#endif
+
+#include "float_types.h"
+#include "tensor_view.hpp"
+#include "MIOpenLossReductionMode.hpp"
+
+template <typename TIO, LossReductionMode_t REDUCTION_T>
+__device__ void SmoothL1LossForward(const TIO* I,
+                                    const TIO* T,
+                                    void* O,
+                                    const float beta,
+                                    const uint64_t size,
+                                    tensor_view_t<5> I_tv,
+                                    tensor_view_t<5> T_tv,
+                                    tensor_view_t<5> O_tv)
+{
+    const uint64_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    tensor_layout_t<5> tensor_layout(I_tv, gid);
+    if(tensor_layout.layout[0] >= I_tv.size[0])
+        return;
+
+    FLOAT_ACCUM i    = CVT_FLOAT2ACCUM(I[I_tv.get_tensor_view_idx(tensor_layout)]);
+    FLOAT_ACCUM t    = CVT_FLOAT2ACCUM(T[T_tv.get_tensor_view_idx(tensor_layout)]);
+    FLOAT_ACCUM diff = abs(i - t);
+    FLOAT_ACCUM loss = diff < beta ? 0.5f * diff * diff / beta : diff - 0.5f * beta;
+
+    switch(REDUCTION_T)
+    {
+    case LossReductionMode_t::NONE:
+        static_cast<TIO*>(O)[O_tv.get_tensor_view_idx(tensor_layout)] = CVT_ACCUM2FLOAT(loss);
+        break;
+    case LossReductionMode_t::SUM: static_cast<FLOAT_ACCUM*>(O)[gid] = loss; break;
+    case LossReductionMode_t::MEAN: static_cast<FLOAT_ACCUM*>(O)[gid] = loss / size; break;
+    default: break;
+    }
+}
+
+extern "C" __global__ void SmoothL1LossForward(const FLOAT* __restrict__ I,
+                                               const FLOAT* __restrict__ T,
+                                               void* __restrict__ O,
+                                               const float beta,
+                                               const uint64_t size,
+                                               tensor_view_t<5> I_tv,
+                                               tensor_view_t<5> T_tv,
+                                               tensor_view_t<5> O_tv)
+{
+    // instantiate the kernel
+    SmoothL1LossForward<FLOAT, static_cast<LossReductionMode_t>(REDUCTION_TYPE)>(
+        I, T, O, beta, size, I_tv, T_tv, O_tv);
+}
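+
+// For the SUM and MEAN modes the forward kernel above writes per-element partial losses
+// (FLOAT_ACCUM) into the workspace buffer O; a follow-up reduction kernel accumulates them
+// into the final scalar, mirroring the scheme documented in MIOpenSoftMarginLoss.cpp. The
+// backward kernel below instead broadcasts the single output-gradient value, folding the
+// 1/size factor of the MEAN mode into it directly.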
+template <typename TIO, LossReductionMode_t REDUCTION_T>
+__device__ void SmoothL1LossBackward(const TIO* I,
+                                     const TIO* T,
+                                     const TIO* dO,
+                                     TIO* dI,
+                                     TIO* dT,
+                                     float beta,
+                                     const uint64_t size,
+                                     tensor_view_t<5> I_tv,
+                                     tensor_view_t<5> T_tv,
+                                     tensor_view_t<5> dO_tv,
+                                     tensor_view_t<5> dI_tv,
+                                     tensor_view_t<5> dT_tv)
+{
+    size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+    tensor_layout_t<5> tensor_layout(I_tv, gid);
+    if(tensor_layout.layout[0] >= I_tv.size[0])
+        return;
+
+    FLOAT_ACCUM o_grad;
+    switch(REDUCTION_T)
+    {
+    case LossReductionMode_t::NONE:
+        o_grad = CVT_FLOAT2ACCUM(dO[dO_tv.get_tensor_view_idx(tensor_layout)]);
+        break;
+    case LossReductionMode_t::SUM: o_grad = CVT_FLOAT2ACCUM(dO[0]); break;
+    case LossReductionMode_t::MEAN: o_grad = CVT_FLOAT2ACCUM(dO[0]) / size; break;
+    default: break;
+    }
+
+    FLOAT_ACCUM i   = CVT_FLOAT2ACCUM(I[I_tv.get_tensor_view_idx(tensor_layout)]);
+    FLOAT_ACCUM t   = CVT_FLOAT2ACCUM(T[T_tv.get_tensor_view_idx(tensor_layout)]);
+    FLOAT_ACCUM sub = i - t;
+    FLOAT_ACCUM grad;
+    if(fabs(sub) < beta)
+        grad = sub / beta * o_grad;
+    else
+        grad = (sub >= 0 ? 1.0f : -1.0f) * o_grad;
+
+    if(dI)
+        dI[dI_tv.get_tensor_view_idx(tensor_layout)] = CVT_ACCUM2FLOAT(grad);
+    if(dT)
+        dT[dT_tv.get_tensor_view_idx(tensor_layout)] = CVT_ACCUM2FLOAT(-grad);
+}
+
+extern "C" __global__ void SmoothL1LossBackward(const FLOAT* __restrict__ I,
+                                                const FLOAT* __restrict__ T,
+                                                const FLOAT* __restrict__ dO,
+                                                FLOAT* __restrict__ dI,
+                                                FLOAT* __restrict__ dT,
+                                                float beta,
+                                                const uint64_t size,
+                                                tensor_view_t<5> I_tv,
+                                                tensor_view_t<5> T_tv,
+                                                tensor_view_t<5> dO_tv,
+                                                tensor_view_t<5> dI_tv,
+                                                tensor_view_t<5> dT_tv)
+{
+    SmoothL1LossBackward<FLOAT, static_cast<LossReductionMode_t>(REDUCTION_TYPE)>(
+        I, T, dO, dI, dT, beta, size, I_tv, T_tv, dO_tv, dI_tv, dT_tv);
+}
diff --git a/src/kernels/MIOpenSoftMarginLoss.cpp b/src/kernels/MIOpenSoftMarginLoss.cpp
index 6a8beaad61..4d249724ce 100644
--- a/src/kernels/MIOpenSoftMarginLoss.cpp
+++ b/src/kernels/MIOpenSoftMarginLoss.cpp
@@ -30,8 +30,9 @@
 
 #include "float_types.h"
 #include "tensor_view.hpp"
+#include "MIOpenLossReductionMode.hpp"
 
-template <typename DTYPE, uint64_t REDUCTION_T>
+template <typename DTYPE, LossReductionMode_t REDUCTION_T>
 __device__ void softmarginlossforward5d(const DTYPE* __restrict__ I,
                                         const DTYPE* __restrict__ T,
                                         void* __restrict__ O,
@@ -52,13 +53,15 @@ __device__ void softmarginlossforward5d(const DTYPE* __restrict__ I,
     switch(REDUCTION_T)
     {
     // If reduction = None, O is DTYPE*
-    case 0: static_cast<DTYPE*>(O)[O_tv.get_tensor_view_idx(idx)] = CVT_ACCUM2FLOAT(loss); break;
+    case LossReductionMode_t::NONE:
+        static_cast<DTYPE*>(O)[O_tv.get_tensor_view_idx(idx)] = CVT_ACCUM2FLOAT(loss);
+        break;
     // If reduction = Sum, O is FLOAT_ACCUM* and then all elements will be sum up in the next
     // kernel
-    case 1: static_cast<FLOAT_ACCUM*>(O)[gid] = loss; break;
+    case LossReductionMode_t::SUM: static_cast<FLOAT_ACCUM*>(O)[gid] = loss; break;
     // If reduction = Mean, same as Sum but O will be divided by num_elem, then the next kernel sum
     // up will return mean of all elements
-    case 2: static_cast<FLOAT_ACCUM*>(O)[gid] = loss / num_elem; break;
+    case LossReductionMode_t::MEAN: static_cast<FLOAT_ACCUM*>(O)[gid] = loss / num_elem; break;
     default: break;
     }
 }
@@ -72,10 +75,11 @@ extern "C" __global__ void SoftMarginLossForward5d(const FLOAT* __restrict__ I,
                                                    tensor_view_t<5> O_tv)
 {
     // instantiate the kernel
-    softmarginlossforward5d<FLOAT, REDUCTION_TYPE>(I, T, O, num_elem, I_tv, T_tv, O_tv);
+    softmarginlossforward5d<FLOAT, static_cast<LossReductionMode_t>(REDUCTION_TYPE)>(
+        I, T, O, num_elem, I_tv, T_tv, O_tv);
 }
 
-template <typename DTYPE, uint64_t REDUCTION_T>
+template <typename DTYPE, LossReductionMode_t REDUCTION_T>
 __device__ void softmarginlossbackward5d(const DTYPE* __restrict__ I,
                                          const DTYPE* __restrict__ T,
                                          const DTYPE* __restrict__ dO,
@@ -98,9 +102,13 @@ __device__ void softmarginlossbackward5d(const DTYPE* __restrict__ I,
     FLOAT_ACCUM loss = -t / (exp(i * t) + 1) * dO_accum;
     switch(REDUCTION_T)
     {
-    case 0:
-    case 1: dI[dI_tv.get_tensor_view_idx(idx)] = CVT_ACCUM2FLOAT(loss); break;
-    case 2: dI[dI_tv.get_tensor_view_idx(idx)] = CVT_ACCUM2FLOAT(loss / num_elem); break;
+    case LossReductionMode_t::NONE:
+    case LossReductionMode_t::SUM:
+        dI[dI_tv.get_tensor_view_idx(idx)] = CVT_ACCUM2FLOAT(loss);
+        break;
+    case LossReductionMode_t::MEAN:
+        dI[dI_tv.get_tensor_view_idx(idx)] = CVT_ACCUM2FLOAT(loss / num_elem);
+        break;
     default: break;
     }
 }
@@ -116,6 +124,6 @@ extern "C" __global__ void SoftMarginLossBackward5d(const FLOAT* __restrict__ I,
                                                     tensor_view_t<5> dI_tv)
 {
     // instantiate the kernel
-    softmarginlossbackward5d<FLOAT, REDUCTION_TYPE>(
+    softmarginlossbackward5d<FLOAT, static_cast<LossReductionMode_t>(REDUCTION_TYPE)>(
         I, T, dO, dI, num_elem, I_tv, T_tv, dO_tv, dI_tv);
 }
diff --git a/src/smoothl1loss.cpp b/src/smoothl1loss.cpp
new file mode 100644
index 0000000000..dd6bdeb67b
--- /dev/null
+++ b/src/smoothl1loss.cpp
@@ -0,0 +1,130 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/smoothl1loss.hpp>
+#include <miopen/smoothl1loss/invoke_params.hpp>
+#include <miopen/smoothl1loss/solvers.hpp>
+#include <miopen/common.hpp>
+#include <miopen/datatype.hpp>
+#include <miopen/find_solution.hpp>
+#include <miopen/handle.hpp>
+#include <miopen/names.hpp>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+size_t GetSmoothL1LossForwardWorkspaceSize(Handle& handle,
+                                           const TensorDescriptor& iDesc,
+                                           const TensorDescriptor& oDesc,
+                                           const miopenLossReductionMode_t reduction)
+{
+    auto ctx = ExecutionContext{&handle};
+    // The target descriptor is not part of the query; the input descriptor stands in for it.
+    const auto problem = smoothl1loss::ForwardProblemDescription{iDesc, iDesc, oDesc, reduction};
+
+    const auto solvers = solver::SolverContainer<solver::smoothl1loss::SmoothL1LossForward>{};
+
+    auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem);
+    return pair_size_vector.empty() ? static_cast<size_t>(-1) : pair_size_vector.front().second;
+}
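+
+// Note: the size_t(-1) sentinel above signals "no applicable solver"; callers such as the
+// driver treat it as a failure rather than as a zero-byte workspace.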
+
+miopenStatus_t SmoothL1LossForward(Handle& handle,
+                                   Data_t workspace,
+                                   size_t workspaceSizeInBytes,
+                                   const TensorDescriptor& iDesc,
+                                   ConstData_t i,
+                                   const TensorDescriptor& tDesc,
+                                   ConstData_t t,
+                                   const TensorDescriptor& oDesc,
+                                   Data_t o,
+                                   float beta,
+                                   const miopenLossReductionMode_t reduction)
+{
+    const auto problem = smoothl1loss::ForwardProblemDescription{iDesc, tDesc, oDesc, reduction};
+
+    const auto invoke_params = [&]() {
+        auto tmp           = smoothl1loss::FwdInvokeParams{};
+        tmp.type           = InvokeType::Run;
+        tmp.iDesc          = &iDesc;
+        tmp.tDesc          = &tDesc;
+        tmp.oDesc          = &oDesc;
+        tmp.i              = i;
+        tmp.t              = t;
+        tmp.o              = o;
+        tmp.beta           = beta;
+        tmp.workspace      = workspace;
+        tmp.workspace_size = workspaceSizeInBytes;
+        return tmp;
+    }();
+
+    const auto algo    = AlgorithmName{"SmoothL1LossForward"};
+    const auto solvers = solver::SolverContainer<solver::smoothl1loss::SmoothL1LossForward>{};
+    solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
+
+    return miopenStatusSuccess;
+}
+
+miopenStatus_t SmoothL1LossBackward(Handle& handle,
+                                    const TensorDescriptor& iDesc,
+                                    ConstData_t i,
+                                    const TensorDescriptor& tDesc,
+                                    ConstData_t t,
+                                    const TensorDescriptor& dODesc,
+                                    ConstData_t dO,
+                                    const TensorDescriptor& dIDesc,
+                                    Data_t dI,
+                                    const TensorDescriptor& dTDesc,
+                                    Data_t dT,
+                                    float beta,
+                                    const miopenLossReductionMode_t reduction)
+{
+    const auto problem =
+        smoothl1loss::BackwardProblemDescription{iDesc, tDesc, dODesc, dIDesc, dTDesc, reduction};
+
+    const auto invoke_params = [&]() {
+        auto tmp   = smoothl1loss::BwdInvokeParams{};
+        tmp.type   = InvokeType::Run;
+        tmp.iDesc  = &iDesc;
+        tmp.tDesc  = &tDesc;
+        tmp.dODesc = &dODesc;
+        tmp.dIDesc = &dIDesc;
+        tmp.dTDesc = &dTDesc;
+        tmp.i      = i;
+        tmp.t      = t;
+        tmp.dO     = dO;
+        tmp.dI     = dI;
+        tmp.dT     = dT;
+        tmp.beta   = beta;
+        return tmp;
+    }();
+
+    const auto algo    = AlgorithmName{"SmoothL1LossBackward"};
+    const auto solvers = solver::SolverContainer<solver::smoothl1loss::SmoothL1LossBackward>{};
+    solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
+
+    return miopenStatusSuccess;
+}
+
+} // namespace miopen
diff --git a/src/smoothl1loss/problem_description.cpp b/src/smoothl1loss/problem_description.cpp
new file mode 100644
index 0000000000..f75aa53070
--- /dev/null
+++ b/src/smoothl1loss/problem_description.cpp
@@ -0,0 +1,68 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
diff --git a/src/smoothl1loss/problem_description.cpp b/src/smoothl1loss/problem_description.cpp
new file mode 100644
index 0000000000..f75aa53070
--- /dev/null
+++ b/src/smoothl1loss/problem_description.cpp
@@ -0,0 +1,68 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/smoothl1loss/problem_description.hpp>
+#include <miopen/names.hpp>
+
+#include <sstream>
+
+namespace miopen {
+
+namespace smoothl1loss {
+
+NetworkConfig ForwardProblemDescription::MakeNetworkConfig() const
+{
+    auto dtype = iDesc.GetType();
+    auto size  = iDesc.GetElementSize();
+
+    std::ostringstream ss;
+
+    ss << "smoothl1loss_fwd";
+    ss << "dtype" << dtype;
+    ss << "size" << size;
+    ss << "reduce" << reduction;
+
+    return NetworkConfig{ss.str()};
+}
+
+NetworkConfig BackwardProblemDescription::MakeNetworkConfig() const
+{
+    auto dtype = iDesc.GetType();
+    auto size  = iDesc.GetElementSize();
+
+    std::ostringstream ss;
+
+    ss << "smoothl1loss_bwd";
+    ss << "dtype" << dtype;
+    ss << "size" << size;
+    ss << "reduce" << reduction;
+
+    return NetworkConfig{ss.str()};
+}
+
+} // namespace smoothl1loss
+
+} // namespace miopen
diff --git a/src/smoothl1loss_api.cpp b/src/smoothl1loss_api.cpp
new file mode 100644
index 0000000000..850f5c188d
--- /dev/null
+++ b/src/smoothl1loss_api.cpp
@@ -0,0 +1,181 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/smooth_l1loss.hpp>
+#include <miopen/errors.hpp>
+#include <miopen/handle.hpp>
+#include <miopen/logger.hpp>
+#include <miopen/tensor.hpp>
+#include <miopen/tensor_ops.hpp>
+
+inline std::ostream& operator<<(std::ostream& os, const std::vector<std::size_t>& v)
+{
+    os << '{';
+    for(int i = 0; i < v.size(); ++i)
+    {
+        if(i != 0)
+            os << ',';
+        os << v[i];
+    }
+    os << '}';
+    return os;
+}
+
+static void LogCmdSmoothL1Loss(const miopenTensorDescriptor_t inputDesc,
+                               const miopenTensorDescriptor_t targetDesc,
+                               const float beta,
+                               const miopenLossReductionMode_t reduction,
+                               bool is_fwd)
+{
+    if(miopen::IsLoggingCmd())
+    {
+        std::stringstream ss;
+        auto dtype = miopen::deref(inputDesc).GetType();
+        if(dtype == miopenHalf)
+        {
+            ss << "smoothl1lossfp16";
+        }
+        else if(dtype == miopenFloat)
+        {
+            ss << "smoothl1lossfp32";
+        }
+        else if(dtype == miopenBFloat16)
+        {
+            ss << "smoothl1lossbfp16";
+        }
+
+        MIOPEN_LOG_FUNCTION(inputDesc, targetDesc);
+        ss << " -n " << miopen::deref(inputDesc).GetLengths()[0];
+        ss << " -T " << miopen::deref(inputDesc).GetLengths();
+        ss << " -Si " << miopen::deref(inputDesc).GetStrides();
+        ss << " -St " << miopen::deref(targetDesc).GetStrides();
+        ss << " -F " << ((is_fwd) ? "1" : "2") << " -b " << beta << " -r " << reduction;
+
+        MIOPEN_LOG_DRIVER_CMD(ss.str());
+    }
+}
"1" : "2") << " -b " << beta << " -r " << reduction; + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} + +extern "C" miopenStatus_t +miopenGetSmoothL1LossForwardWorkspaceSize(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + const miopenLossReductionMode_t reduction, + size_t* sizeInBytes) +{ + + MIOPEN_LOG_FUNCTION(handle, inputDesc, outputDesc, reduction, sizeInBytes); + + return miopen::try_([&] { + miopen::deref(sizeInBytes) = miopen::GetSmoothL1LossForwardWorkspaceSize( + miopen::deref(handle), miopen::deref(inputDesc), miopen::deref(outputDesc), reduction); + }); +} + +extern "C" miopenStatus_t miopenSmoothL1LossForward(miopenHandle_t handle, + void* workspace, + size_t workspaceSizeInBytes, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t targetDesc, + const void* target, + const miopenTensorDescriptor_t outputDesc, + void* output, + const float beta, + const miopenLossReductionMode_t reduction) +{ + MIOPEN_LOG_FUNCTION(handle, + workspace, + workspaceSizeInBytes, + inputDesc, + input, + targetDesc, + target, + outputDesc, + output, + beta, + reduction); + + LogCmdSmoothL1Loss(inputDesc, targetDesc, beta, reduction, true); + return miopen::try_([&] { + miopen::SmoothL1LossForward(miopen::deref(handle), + DataCast(workspace), + workspaceSizeInBytes, + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(targetDesc), + DataCast(target), + miopen::deref(outputDesc), + DataCast(output), + beta, + reduction); + }); +} + +extern "C" miopenStatus_t miopenSmoothL1LossBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t targetDesc, + const void* target, + const miopenTensorDescriptor_t doutputDesc, + const void* doutput, + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t dtargetDesc, + void* dtarget, + const float beta, + const miopenLossReductionMode_t reduction) +{ + MIOPEN_LOG_FUNCTION(handle, + inputDesc, + input, + targetDesc, + target, + doutputDesc, + doutput, + dinputDesc, + dinput, + dtargetDesc, + dtarget, + beta, + reduction); + + LogCmdSmoothL1Loss(inputDesc, targetDesc, beta, reduction, false); + return miopen::try_([&] { + miopen::SmoothL1LossBackward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(targetDesc), + DataCast(target), + miopen::deref(doutputDesc), + DataCast(doutput), + miopen::deref(dinputDesc), + DataCast(dinput), + miopen::deref(dtargetDesc), + DataCast(dtarget), + beta, + reduction); + }); +} diff --git a/src/solver.cpp b/src/solver.cpp index 856f04a8cf..1db9068d6a 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -701,20 +702,18 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Activation, glu::GLUForward{}.SolverDbId()); Register(registry, ++id, Primitive::Activation, glu::GLUBackward{}.SolverDbId()); - Register(registry, - ++id, - Primitive::SoftMarginLoss, - softmarginloss::SoftMarginLossForward{}.SolverDbId()); - Register(registry, - ++id, - Primitive::SoftMarginLoss, - softmarginloss::SoftMarginLossBackward{}.SolverDbId()); - Register(registry, - ++id, - Primitive::MultiMarginLoss, - multimarginloss::MultiMarginLossForward{}.SolverDbId()); + Register(registry, ++id, Primitive::Loss, softmarginloss::SoftMarginLossForward{}.SolverDbId()); + Register( + 
diff --git a/src/solver.cpp b/src/solver.cpp
index 856f04a8cf..1db9068d6a 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -40,6 +40,7 @@
 #include
 #include
 #include
+#include <miopen/smoothl1loss/solvers.hpp>
 #include
 #include
 #include
@@ -701,20 +702,18 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
     Register(registry, ++id, Primitive::Activation, glu::GLUForward{}.SolverDbId());
     Register(registry, ++id, Primitive::Activation, glu::GLUBackward{}.SolverDbId());
 
-    Register(registry,
-             ++id,
-             Primitive::SoftMarginLoss,
-             softmarginloss::SoftMarginLossForward{}.SolverDbId());
-    Register(registry,
-             ++id,
-             Primitive::SoftMarginLoss,
-             softmarginloss::SoftMarginLossBackward{}.SolverDbId());
-    Register(registry,
-             ++id,
-             Primitive::MultiMarginLoss,
-             multimarginloss::MultiMarginLossForward{}.SolverDbId());
+    Register(registry, ++id, Primitive::Loss, softmarginloss::SoftMarginLossForward{}.SolverDbId());
+    Register(
+        registry, ++id, Primitive::Loss, softmarginloss::SoftMarginLossBackward{}.SolverDbId());
+
+    Register(
+        registry, ++id, Primitive::Loss, multimarginloss::MultiMarginLossForward{}.SolverDbId());
 
     Register(registry, ++id, Primitive::Mha, mha::MhaCKFlashAttentionV2Forward{}.SolverDbId());
+
+    Register(registry, ++id, Primitive::Loss, smoothl1loss::SmoothL1LossForward{}.SolverDbId());
+    Register(registry, ++id, Primitive::Loss, smoothl1loss::SmoothL1LossBackward{}.SolverDbId());
+
     // IMPORTANT: New solvers should be added to the end of the function, and don't leave a white
     // space between this comment and the newly registered solver(s)!
 }
diff --git a/src/solver/smoothl1loss/backward_smoothl1loss.cpp b/src/solver/smoothl1loss/backward_smoothl1loss.cpp
new file mode 100644
index 0000000000..f48c787190
--- /dev/null
+++ b/src/solver/smoothl1loss/backward_smoothl1loss.cpp
@@ -0,0 +1,147 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/errors.hpp"
+#include <miopen/datatype.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/miopen.h>
+#include <miopen/smooth_l1loss.hpp>
+#include <miopen/smoothl1loss/invoke_params.hpp>
+#include <miopen/smoothl1loss/problem_description.hpp>
+#include <miopen/smoothl1loss/solvers.hpp>
+#include <miopen/tensor_view_utils.hpp>
+
+#define LOCAL_SIZE_NONCONTIGUOUS_BWD 256
+
+#define VIEW_DIMS 5
+
+namespace miopen {
+
+namespace solver {
+
+const auto make_hip_kernel = [](std::vector<size_t> localsize,
+                                std::vector<size_t> gridsize,
+                                std::string kernel_file,
+                                std::string kernel_name,
+                                KernelBuildParameters build_params) {
+    while(localsize.size() < 3)
+        localsize.push_back(1);
+    while(gridsize.size() < 3)
+        gridsize.push_back(1);
+    for(std::size_t i = 0; i < localsize.size(); ++i)
+        gridsize[i] = AlignUp(gridsize[i], localsize[i]);
+    return KernelInfo{
+        build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name};
+};
+
+namespace smoothl1loss {
+
+namespace {
+bool IsImprovementOverROCm(const ExecutionContext& /*context*/,
+                           const miopen::smoothl1loss::BackwardProblemDescription& problem)
+{
+    if(problem.IsAllContiguous())
+        return false;
+    return true;
+}
+} // namespace
+
+bool SmoothL1LossBackward::IsApplicable(
+    const ExecutionContext& context,
+    const miopen::smoothl1loss::BackwardProblemDescription& problem) const
+{
+    if(!(problem.GetIDesc().GetType() == miopenFloat ||
+         problem.GetIDesc().GetType() == miopenHalf ||
+         problem.GetIDesc().GetType() == miopenBFloat16))
+        return false;
+    if(!IsImprovementOverROCm(context, problem))
+        return false;
+    if(problem.GetIDesc().GetNumDims() > VIEW_DIMS)
+        return false;
+    return true;
+}
+
+ConvSolution SmoothL1LossBackward::GetSolution(
+    const ExecutionContext& /*context*/,
+    const miopen::smoothl1loss::BackwardProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto dtype        = problem.GetDIDesc().GetType();
+    auto input_dtype  = miopen::GetDataType(problem.GetIDesc().GetType());
+    auto output_dtype = miopen::GetDataType(problem.GetDODesc().GetType());
+    auto size         = problem.GetIDesc().GetElementSize();
+
+    const auto build_params = KernelBuildParameters{
+        {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+        {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+        {"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
+        {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+        {"REDUCTION_TYPE", static_cast<int>(problem.GetReduction())},
+        {"VIEW_DIMS", VIEW_DIMS},
+    };
+
+    result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_NONCONTIGUOUS_BWD},
+                                                          {size},
+                                                          "MIOpenSmoothL1Loss.cpp",
+                                                          "SmoothL1LossBackward",
+                                                          build_params));
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::smoothl1loss::BwdInvokeParams>();
+
+            auto i_tv  = get_inner_expanded_tv<VIEW_DIMS>(deref(params.iDesc));
+            auto t_tv  = get_inner_expanded_tv<VIEW_DIMS>(deref(params.tDesc));
+            auto dO_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.dODesc));
+            auto dI_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.dIDesc));
+            auto dT_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.dTDesc));
+
+            handle_.ResetKernelTime();
+            kernel(params.i,
+                   params.t,
+                   params.dO,
+                   params.dI,
+                   params.dT,
+                   params.beta,
+                   deref(params.iDesc).GetElementSize(),
+                   i_tv,
+                   t_tv,
+                   dO_tv,
+                   dI_tv,
+                   dT_tv);
+        };
+    };
+
+    return result;
+}
+
+} // namespace smoothl1loss
+
+} // namespace solver
+
+} // namespace miopen
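The kernel launched by this solver computes the standard Smooth L1 gradient. A hedged host-side sketch of the per-element math it corresponds to (function name and scalar types are illustrative; `n` is the element count, used only when `mean` reduction folds the `1/N` factor into the gradient):

    #include <cmath>

    // For d = x - y, dLoss/dx is clamp(d / beta, -1, 1) scaled by the
    // upstream gradient dO; the target gradient is its negation.
    inline void smooth_l1_grad(
        float x, float y, float dO, float beta, size_t n, bool mean, float& dx, float& dy)
    {
        const float d = x - y;
        float g       = std::fabs(d) < beta ? d / beta : (d >= 0 ? 1.0f : -1.0f);
        g *= dO;
        if(mean)
            g /= static_cast<float>(n);
        dx = g;
        dy = -g;
    }

This also explains the `IsImprovementOverROCm` gate above: for fully contiguous tensors the existing paths are already adequate, so this solver only claims the strided cases.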
diff --git a/src/solver/smoothl1loss/forward_smoothl1loss.cpp b/src/solver/smoothl1loss/forward_smoothl1loss.cpp
new file mode 100644
index 0000000000..fc68c0c1fb
--- /dev/null
+++ b/src/solver/smoothl1loss/forward_smoothl1loss.cpp
@@ -0,0 +1,250 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/buffer_info.hpp>
+#include <miopen/datatype.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/miopen.h>
+#include <miopen/smooth_l1loss.hpp>
+#include <miopen/smoothl1loss/invoke_params.hpp>
+#include <miopen/smoothl1loss/problem_description.hpp>
+#include <miopen/smoothl1loss/solvers.hpp>
+#include <miopen/tensor_view_utils.hpp>
+
+#define LOCAL_SIZE_NONCONTIGUOUS_FWD 256
+#define LOCAL_SIZE_REDUCE 256
+
+#define VIEW_DIMS 5
+
+namespace miopen {
+
+namespace solver {
+
+namespace smoothl1loss {
+
+namespace {
+const auto make_hip_kernel = [](std::vector<size_t> localsize,
+                                std::vector<size_t> gridsize,
+                                std::string kernel_file,
+                                std::string kernel_name,
+                                KernelBuildParameters build_params) {
+    while(localsize.size() < 3)
+        localsize.push_back(1);
+    while(gridsize.size() < 3)
+        gridsize.push_back(1);
+    for(std::size_t i = 0; i < localsize.size(); ++i)
+        gridsize[i] = AlignUp(gridsize[i], localsize[i]);
+    return KernelInfo{
+        build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name};
+};
+} // namespace
+
+bool SmoothL1LossForward::IsApplicable(
+    const ExecutionContext& /*context*/,
+    const miopen::smoothl1loss::ForwardProblemDescription& problem) const
+{
+    if(!(problem.GetIDesc().GetType() == miopenFloat ||
+         problem.GetIDesc().GetType() == miopenHalf ||
+         problem.GetIDesc().GetType() == miopenBFloat16))
+        return false;
+    if(problem.GetIDesc().GetNumDims() > VIEW_DIMS)
+        return false;
+    return true;
+}
+
+ConvSolution SmoothL1LossForward::GetSolution(
+    const ExecutionContext& /*context*/,
+    const miopen::smoothl1loss::ForwardProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto dtype = problem.GetODesc().GetType();
+    auto size  = problem.GetIDesc().GetElementSize();
+
+    /* Phase 1: Calc loss for each element. */
+    {
+        const auto build_params =
+            KernelBuildParameters{{"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+                                  {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+                                  {"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
+                                  {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+                                  {"REDUCTION_TYPE", static_cast<int>(problem.GetReduction())},
+                                  {"VIEW_DIMS", VIEW_DIMS}};
+        result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_NONCONTIGUOUS_FWD},
+                                                              {size},
+                                                              "MIOpenSmoothL1Loss.cpp",
+                                                              "SmoothL1LossForward",
+                                                              build_params));
+    }
+
+    if(problem.GetReduction() != MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        // If reduction != NONE, we need a second stage that computes the mean/sum of
+        // the element-wise results produced by the first kernel above.
+        /* Phase 2: Reduce */
+        auto _size              = size;
+        const auto build_params = KernelBuildParameters{
+            {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+            {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+            {"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
+            {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+            {"REDUCE_SIZE", LOCAL_SIZE_REDUCE},
+        };
+        /* Reduce FLOAT_ACCUM -> FLOAT_ACCUM */
+        while(_size > LOCAL_SIZE_REDUCE)
+        {
+            result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_REDUCE},
+                                                                 {_size},
+                                                                 "MIOpenReduceSum.cpp",
+                                                                 "ReduceSumFLOATACCUM",
+                                                                 build_params));
+            _size = (_size + LOCAL_SIZE_REDUCE - 1) / LOCAL_SIZE_REDUCE;
+        }
+        // Last kernel reduce: FLOAT_ACCUM -> FLOAT
+        result.construction_params.push_back(make_hip_kernel(
+            {LOCAL_SIZE_REDUCE}, {_size}, "MIOpenReduceSum.cpp", "ReduceSum", build_params));
+    }
+
+    if(problem.GetReduction() == MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        // Reduction = None -> invoke 1 kernel
+        result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+            return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+                decltype(auto) kernel = handle_.Run(kernels.front());
+                decltype(auto) params = raw_params.CastTo<miopen::smoothl1loss::FwdInvokeParams>();
+
+                auto i_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.iDesc));
+                auto t_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.tDesc));
+                auto o_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.oDesc));
+
+                kernel(params.i,
+                       params.t,
+                       params.o,
+                       params.beta,
+                       deref(params.iDesc).GetElementSize(),
+                       i_tv,
+                       t_tv,
+                       o_tv);
+            };
+        };
+    }
+    else
+    {
+        // Reduction != None -> invoke 2 or more kernels
+        result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+            return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+                decltype(auto) params = raw_params.CastTo<miopen::smoothl1loss::FwdInvokeParams>();
+                auto i_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.iDesc));
+                auto t_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.tDesc));
+                auto o_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.oDesc));
+
+                float elapsed = 0.0f;
+                HipEventPtr start, stop;
+
+                const bool profiling = handle_.IsProfilingEnabled();
+                if(profiling)
+                {
+                    handle_.EnableProfiling(false);
+                    start = miopen::make_hip_event();
+                    stop  = miopen::make_hip_event();
+                    hipEventRecord(start.get(), handle_.GetStream());
+                }
+
+                int kernelCnt = 0;
+
+                /* Phase 1: Calc loss for each element. */
+                {
+                    decltype(auto) kernel = handle_.Run(kernels[kernelCnt++]);
+                    kernel(params.i,
+                           params.t,
+                           params.workspace,
+                           params.beta,
+                           params.iDesc->GetElementSize(),
+                           i_tv,
+                           t_tv,
+                           o_tv);
+                }
+
+                /* Phase 2: Reduce */
+                {
+                    auto size      = deref(params.iDesc).GetElementSize();
+                    auto data_size = get_data_size(miopenFloat);
+                    auto wt        = MultiBufferWorkspaceTraits{size * data_size,
+                                                         (size + LOCAL_SIZE_REDUCE - 1) /
+                                                             LOCAL_SIZE_REDUCE * data_size};
+                    auto work_a = params.workspace;
+                    auto work_b = static_cast<Data_t>(static_cast<char*>(params.workspace) +
+                                                      wt.GetOffset(1));
+                    while(size > LOCAL_SIZE_REDUCE)
+                    {
+                        auto kernel = handle_.Run(kernels[kernelCnt++]);
+                        kernel(work_a, work_b, size);
+                        size = (size + LOCAL_SIZE_REDUCE - 1) / LOCAL_SIZE_REDUCE;
+                        std::swap(work_a, work_b);
+                    }
+                    handle_.Run(kernels[kernelCnt++])(work_a, params.o, size, o_tv);
+                }
+
+                if(profiling)
+                {
+                    hipEventRecord(stop.get(), handle_.GetStream());
+                    hipEventSynchronize(stop.get());
+                    hipEventElapsedTime(&elapsed, start.get(), stop.get());
+
+                    // Clean up
+                    hipEventDestroy(start.get());
+                    hipEventDestroy(stop.get());
+                    handle_.ResetKernelTime();
+                    handle_.AccumKernelTime(elapsed);
+
+                    handle_.EnableProfiling(true);
+                }
+            };
+        };
+    }
+
+    return result;
+}
+
+std::size_t SmoothL1LossForward::GetWorkspaceSize(
+    const ExecutionContext& /*context*/,
+    const miopen::smoothl1loss::ForwardProblemDescription& problem) const
+{
+    if(problem.GetReduction() == MIOPEN_LOSS_REDUCTION_NONE)
+        return 0;
+
+    auto size      = problem.GetIDesc().GetElementSize();
+    auto data_size = get_data_size(miopenFloat);
+    return MultiBufferWorkspaceTraits{
+        size * data_size, (size + LOCAL_SIZE_REDUCE - 1) / LOCAL_SIZE_REDUCE * data_size}
+        .GetSize();
+}
+
+} // namespace smoothl1loss
+
+} // namespace solver
+
+} // namespace miopen
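To make the workspace math above concrete, a worked estimate under the constants stated in this file (LOCAL_SIZE_REDUCE = 256, fp32 accumulation); the helper name is illustrative, and any inter-buffer alignment added by MultiBufferWorkspaceTraits is ignored here:

    #include <cstddef>

    // One fp32 slot per element for the phase-1 results, plus
    // ceil(size / 256) slots for the first round of partial sums.
    // For size = 100000: 100000*4 + 391*4 bytes (before alignment).
    size_t smooth_l1_fwd_workspace_estimate(size_t size)
    {
        const size_t reduce  = 256;           // LOCAL_SIZE_REDUCE
        const size_t elem    = sizeof(float); // FLOAT_ACCUM assumed fp32
        const size_t partial = (size + reduce - 1) / reduce;
        return size * elem + partial * elem;
    }

The invoker then ping-pongs between the two regions, halving the active size by a factor of 256 per launch until a single kernel can finish the reduction.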
diff --git a/test/cpu_multimarginloss.hpp b/test/cpu_multimarginloss.hpp
index a8b19ba0f2..407d7311b1 100644
--- a/test/cpu_multimarginloss.hpp
+++ b/test/cpu_multimarginloss.hpp
@@ -45,13 +45,15 @@ void cpu_multimarginloss_forward(const tensor<T>& input,
     auto O_tv = miopen::get_inner_expanded_tv<1>(ref_output.desc);
 
     auto N = I_tv.size[0], C = I_tv.size[1];
-    double sum = 0;
-    for(size_t n = 0; n < N; n++)
-    {
+    std::vector<double> buffer;
+    if(reduction_mode != MIOPEN_LOSS_REDUCTION_NONE)
+        buffer.assign(N, 0);
+
+    par_ford(N)([&](size_t n) {
         double loss = 0;
         uint64_t y  = target[T_tv.get_tensor_view_idx({n})];
         if(y >= C)
-            continue;
+            return;
         for(size_t c = 0; c < C; c++)
         {
             if(y == c)
@@ -69,14 +71,13 @@ void cpu_multimarginloss_forward(const tensor<T>& input,
         if(reduction_mode == MIOPEN_LOSS_REDUCTION_NONE)
             ref_output[O_tv.get_tensor_view_idx({n})] = loss;
         else
-            sum += loss;
-    }
+            buffer[n] = loss;
+    });
+
+    auto sum = std::accumulate(buffer.begin(), buffer.end(), 0.0);
+
     if(reduction_mode == MIOPEN_LOSS_REDUCTION_MEAN)
-    {
-        ref_output[0] = static_cast<T>(sum / N);
-    }
-    else if(reduction_mode == MIOPEN_LOSS_REDUCTION_SUM)
-    {
+        sum /= N;
+    if(reduction_mode != MIOPEN_LOSS_REDUCTION_NONE)
         ref_output[0] = static_cast<T>(sum);
-    }
 }
diff --git a/test/cpu_smoothl1loss.hpp b/test/cpu_smoothl1loss.hpp
new file mode 100644
index 0000000000..bdbf6a0487
--- /dev/null
+++ b/test/cpu_smoothl1loss.hpp
@@ -0,0 +1,112 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include "tensor_holder.hpp"
+#include <miopen/tensor_view_utils.hpp>
+#include <numeric>
+
+template <class T>
+void cpu_smoothl1loss_forward(const tensor<T> input,
+                              const tensor<T> target,
+                              tensor<T>& ref_output,
+                              const float beta,
+                              const miopenLossReductionMode_t reduction)
+{
+    // Treat contiguous tensors as non-contiguous tensors (for consistency)
+    auto I_tv = get_inner_expanded_tv<5>(input.desc);
+    auto T_tv = get_inner_expanded_tv<5>(target.desc);
+    auto O_tv = get_inner_expanded_tv<5>(ref_output.desc);
+
+    auto size = input.desc.GetElementSize();
+
+    std::vector<double> buffer;
+    if(reduction != MIOPEN_LOSS_REDUCTION_NONE)
+        buffer.assign(size, 0);
+
+    par_ford(size)([&](size_t i) {
+        const auto tensor_layout = tensor_layout_t<5>(I_tv, i);
+        const uint64_t Iidx      = I_tv.get_tensor_view_idx(tensor_layout);
+        const uint64_t Tidx      = T_tv.get_tensor_view_idx(tensor_layout);
+        float diff = fabs(static_cast<float>(input[Iidx] - target[Tidx]));
+        float loss = (diff < beta ? 0.5f * diff * diff / beta : diff - 0.5f * beta);
+        if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
+            ref_output[O_tv.get_tensor_view_idx(tensor_layout)] = static_cast<T>(loss);
+        else
+            buffer[i] = loss;
+    });
+
+    auto loss_sum = std::accumulate(buffer.begin(), buffer.end(), 0.0);
+
+    if(reduction == MIOPEN_LOSS_REDUCTION_MEAN)
+        loss_sum /= size;
+    if(reduction != MIOPEN_LOSS_REDUCTION_NONE)
+        ref_output[0] = static_cast<T>(loss_sum);
+}
+
+template <class T>
+void cpu_smoothl1loss_backward(tensor<T> input,
+                               tensor<T> target,
+                               tensor<T> dO,
+                               tensor<T>& ref_dI,
+                               tensor<T>& ref_dT,
+                               float beta,
+                               miopenLossReductionMode_t reduction)
+{
+    // Treat contiguous tensors as non-contiguous tensors (for consistency)
+    auto I_tv  = get_inner_expanded_tv<5>(input.desc);
+    auto T_tv  = get_inner_expanded_tv<5>(target.desc);
+    auto dO_tv = get_inner_expanded_tv<5>(dO.desc);
+    auto dI_tv = get_inner_expanded_tv<5>(ref_dI.desc);
+    auto dT_tv = get_inner_expanded_tv<5>(ref_dT.desc);
+
+    auto size = input.desc.GetElementSize();
+
+    par_ford(size)([&](size_t i) {
+        const auto tensor_layout = tensor_layout_t<5>(I_tv, i);
+        const uint64_t Iidx      = I_tv.get_tensor_view_idx(tensor_layout);
+        const uint64_t Tidx      = T_tv.get_tensor_view_idx(tensor_layout);
+
+        T sub  = input[Iidx] - target[Tidx];
+        T grad = static_cast<T>(0.0f);
+
+        if(fabs(static_cast<float>(sub)) < beta)
+            grad = sub / beta *
+                   dO[reduction == MIOPEN_LOSS_REDUCTION_NONE
+                          ? dO_tv.get_tensor_view_idx(tensor_layout)
+                          : 0];
+        else
+            grad = (sub >= 0 ? 1.0f : -1.0f) * dO[reduction == MIOPEN_LOSS_REDUCTION_NONE
+                                                      ? dO_tv.get_tensor_view_idx(tensor_layout)
+                                                      : 0];
+
+        if(reduction == MIOPEN_LOSS_REDUCTION_MEAN)
+            grad = grad / size;
+
+        ref_dI[dI_tv.get_tensor_view_idx(tensor_layout)] = grad;
+        ref_dT[dT_tv.get_tensor_view_idx(tensor_layout)] = -grad;
+    });
+}
diff --git a/test/cpu_softmarginloss.hpp b/test/cpu_softmarginloss.hpp
index 6fe5803970..6eefe59245 100644
--- a/test/cpu_softmarginloss.hpp
+++ b/test/cpu_softmarginloss.hpp
@@ -28,6 +28,7 @@
 #include "miopen/miopen.h"
 #include "tensor_holder.hpp"
 #include
+#include <numeric>
 
 template <class T>
 void cpu_softmarginloss_forward(const tensor<T>& input,
@@ -40,23 +41,26 @@ void cpu_softmarginloss_forward(const tensor<T>& input,
     auto t_tv = miopen::get_inner_expanded_tv<5>(target.desc);
     auto o_tv = miopen::get_inner_expanded_tv<5>(ref_output.desc);
 
-    double sum_loss = 0;
-    for(size_t gid = 0; gid < input_numel; gid++)
-    {
+    std::vector<double> buffer;
+    if(reduction_mode != MIOPEN_LOSS_REDUCTION_NONE)
+        buffer.assign(input_numel, 0);
+
+    par_ford(input_numel)([&](size_t gid) {
         tensor_layout_t<5> idx(i_tv, gid);
-        // Convert to double for better precision
         double i = input[i_tv.get_tensor_view_idx(idx)];
         double t = target[t_tv.get_tensor_view_idx(idx)];
         if(reduction_mode == MIOPEN_LOSS_REDUCTION_NONE)
             ref_output[o_tv.get_tensor_view_idx(idx)] = log1p(exp(-i * t));
         else
-            sum_loss += log1p(exp(-i * t));
-    };
+            buffer[gid] = log1p(exp(-i * t));
+    });
+
+    auto sum_loss = std::accumulate(buffer.begin(), buffer.end(), 0.0);
 
     if(reduction_mode == MIOPEN_LOSS_REDUCTION_MEAN)
-        ref_output[0] = sum_loss / input_numel;
-    else if(reduction_mode == MIOPEN_LOSS_REDUCTION_SUM)
-        ref_output[0] = sum_loss;
+        sum_loss /= input_numel;
+    if(reduction_mode != MIOPEN_LOSS_REDUCTION_NONE)
+        ref_output[0] = static_cast<T>(sum_loss);
 }
 
 template <class T>
diff --git a/test/gtest/smoothl1loss.cpp b/test/gtest/smoothl1loss.cpp
new file mode 100644
index 0000000000..93f1ed2648
--- /dev/null
+++ b/test/gtest/smoothl1loss.cpp
@@ -0,0 +1,153 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "smoothl1loss.hpp"
+#include <miopen/miopen.h>
+
+namespace smoothl1loss {
+
+struct GPU_SmoothL1LossForward_FP32 : SmoothL1LossTestForward<float>
+{
+};
+
+struct GPU_SmoothL1LossForward_FP16 : SmoothL1LossTestForward<half_float::half>
+{
+};
+
+struct GPU_SmoothL1LossForward_BFP16 : SmoothL1LossTestForward<bfloat16>
+{
+};
+
+struct GPU_SmoothL1LossBackward_FP32 : SmoothL1LossTestBackward<float>
+{
+};
+
+struct GPU_SmoothL1LossBackward_FP16 : SmoothL1LossTestBackward<half_float::half>
+{
+};
+
+struct GPU_SmoothL1LossBackward_BFP16 : SmoothL1LossTestBackward<bfloat16>
+{
+};
+
+} // namespace smoothl1loss
+using namespace smoothl1loss;
+
+TEST_P(GPU_SmoothL1LossForward_FP32, Test)
+{
+    RunTest();
+    Verify();
+};
+
+TEST_P(GPU_SmoothL1LossForward_FP16, Test)
+{
+    RunTest();
+    Verify();
+};
+
+TEST_P(GPU_SmoothL1LossForward_BFP16, Test)
+{
+    RunTest();
+    Verify();
+};
+
+TEST_P(GPU_SmoothL1LossBackward_FP32, Test)
+{
+    RunTest();
+    Verify();
+};
+
+TEST_P(GPU_SmoothL1LossBackward_FP16, Test)
+{
+    RunTest();
+    Verify();
+};
+
+TEST_P(GPU_SmoothL1LossBackward_BFP16, Test)
+{
+    RunTest();
+    Verify();
+};
+
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_SmoothL1LossForward_FP32,
+                         testing::ValuesIn(SmoothL1LossSmokeTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_SmoothL1LossForward_FP16,
+                         testing::ValuesIn(SmoothL1LossSmokeTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_SmoothL1LossForward_BFP16,
+                         testing::ValuesIn(SmoothL1LossSmokeTestConfigs()));
+
+INSTANTIATE_TEST_SUITE_P(Perf,
+                         GPU_SmoothL1LossForward_FP32,
+                         testing::ValuesIn(SmoothL1LossPerfTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Perf,
+                         GPU_SmoothL1LossForward_FP16,
+                         testing::ValuesIn(SmoothL1LossPerfTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Perf,
+                         GPU_SmoothL1LossForward_BFP16,
+                         testing::ValuesIn(SmoothL1LossPerfTestConfigs()));
+
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SmoothL1LossForward_FP32,
+                         testing::ValuesIn(SmoothL1LossFullTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SmoothL1LossForward_FP16,
+                         testing::ValuesIn(SmoothL1LossFullTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SmoothL1LossForward_BFP16,
+                         testing::ValuesIn(SmoothL1LossFullTestConfigs()));
+
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_SmoothL1LossBackward_FP32,
+                         testing::ValuesIn(SmoothL1LossSmokeTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_SmoothL1LossBackward_FP16,
+                         testing::ValuesIn(SmoothL1LossSmokeTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_SmoothL1LossBackward_BFP16,
+                         testing::ValuesIn(SmoothL1LossSmokeTestConfigs()));
+
+INSTANTIATE_TEST_SUITE_P(Perf,
+                         GPU_SmoothL1LossBackward_FP32,
+                         testing::ValuesIn(SmoothL1LossPerfTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Perf,
+                         GPU_SmoothL1LossBackward_FP16,
+                         testing::ValuesIn(SmoothL1LossPerfTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Perf,
+                         GPU_SmoothL1LossBackward_BFP16,
+                         testing::ValuesIn(SmoothL1LossPerfTestConfigs()));
+
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SmoothL1LossBackward_FP32,
+                         testing::ValuesIn(SmoothL1LossFullTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SmoothL1LossBackward_FP16,
+                         testing::ValuesIn(SmoothL1LossFullTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SmoothL1LossBackward_BFP16,
+                         testing::ValuesIn(SmoothL1LossFullTestConfigs()));
diff --git a/test/gtest/smoothl1loss.hpp b/test/gtest/smoothl1loss.hpp
new file mode 100644
index 0000000000..f6a566c671
--- /dev/null
+++ b/test/gtest/smoothl1loss.hpp
@@ -0,0 +1,356 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "cpu_smoothl1loss.hpp"
+#include "get_handle.hpp"
+#include "random.hpp"
+#include "tensor_holder.hpp"
+#include "verify.hpp"
+#include <gtest/gtest.h>
+#include <miopen/miopen.h>
+#include <miopen/smooth_l1loss.hpp>
+
+inline std::ostream& operator<<(std::ostream& os, const std::vector<size_t>& v)
+{
+    os << '{';
+    for(int i = 0; i < v.size(); ++i)
+    {
+        if(i != 0)
+            os << ',';
+        os << v[i];
+    }
+    os << '}';
+    return os;
+}
+
+struct SmoothL1LossTestCase
+{
+    std::vector<size_t> lengths;
+    float beta;
+    miopenLossReductionMode_t reduction;
+    bool contiguous;
+
+    friend std::ostream& operator<<(std::ostream& os, const SmoothL1LossTestCase& tc)
+    {
+        return os << " Lengths:" << tc.lengths << " Beta:" << tc.beta
+                  << " Reduction:" << tc.reduction
+                  << " Contiguous:" << (tc.contiguous ? "True" : "False");
+    }
+};
"True" : "False"); + } +}; + +inline std::vector +SmoothL1LossTestConfigs(const std::vector>& SizeList) +{ + std::vector tcs; + auto all_mode = { + MIOPEN_LOSS_REDUCTION_NONE, MIOPEN_LOSS_REDUCTION_SUM, MIOPEN_LOSS_REDUCTION_MEAN}; + for(auto reduction : all_mode) + for(auto contiguous : {true, false}) + for(const auto& lengths : SizeList) + tcs.push_back({lengths, 1, reduction, contiguous}); + return tcs; +} + +inline std::vector SmoothL1LossSmokeTestConfigs() +{ + return SmoothL1LossTestConfigs({ + {1, 1, 1}, + {4, 7, 5}, + {1, 2, 3, 4}, + {1, 1, 1, 257}, + {34, 4, 5}, + {15, 4, 5}, + {5, 13, 17, 11}, + {2, 10, 128, 128}, + }); +} + +inline std::vector SmoothL1LossPerfTestConfigs() +{ + return SmoothL1LossTestConfigs({{256, 4, 8723}}); +} + +inline std::vector SmoothL1LossFullTestConfigs() +{ + std::vector tcs; + + auto smoke_test = SmoothL1LossSmokeTestConfigs(); + auto perf_test = SmoothL1LossPerfTestConfigs(); + + tcs.reserve(smoke_test.size() + perf_test.size()); + for(const auto& test : smoke_test) + tcs.push_back(test); + for(const auto& test : perf_test) + tcs.push_back(test); + + return tcs; +} + +inline std::vector GetStrides(std::vector lengths, bool contiguous) +{ + if(!contiguous) + std::swap(lengths.front(), lengths.back()); + std::vector strides(lengths.size()); + strides.back() = 1; + for(int i = lengths.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * lengths[i + 1]; + if(!contiguous) + std::swap(strides.front(), strides.back()); + return strides; +} + +template +struct SmoothL1LossTestForward : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + smoothl1loss_config = GetParam(); + auto gen_value1 = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 1); }; + auto gen_value2 = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 2); }; + + beta = smoothl1loss_config.beta; + reduction = smoothl1loss_config.reduction; + auto lengths = smoothl1loss_config.lengths; + auto contiguous = smoothl1loss_config.contiguous; + + auto in_strides = GetStrides(lengths, true); + input = tensor{lengths, in_strides}.generate(gen_value1); + + auto tar_strides = GetStrides(lengths, contiguous); + target = tensor{lengths, tar_strides}.generate(gen_value2); + + auto out_lengths = + (reduction == MIOPEN_LOSS_REDUCTION_NONE ? 
+
+template <class T>
+struct SmoothL1LossTestForward : public ::testing::TestWithParam<SmoothL1LossTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle       = get_handle();
+        smoothl1loss_config = GetParam();
+        auto gen_value1 = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 1); };
+        auto gen_value2 = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 2); };
+
+        beta            = smoothl1loss_config.beta;
+        reduction       = smoothl1loss_config.reduction;
+        auto lengths    = smoothl1loss_config.lengths;
+        auto contiguous = smoothl1loss_config.contiguous;
+
+        auto in_strides = GetStrides(lengths, true);
+        input           = tensor<T>{lengths, in_strides}.generate(gen_value1);
+
+        auto tar_strides = GetStrides(lengths, contiguous);
+        target           = tensor<T>{lengths, tar_strides}.generate(gen_value2);
+
+        auto out_lengths =
+            (reduction == MIOPEN_LOSS_REDUCTION_NONE ? lengths : std::vector<size_t>{1});
+        auto out_strides = GetStrides(out_lengths, true);
+
+        output = tensor<T>{out_lengths, out_strides};
+        std::fill(output.begin(), output.end(), std::numeric_limits<T>::quiet_NaN());
+
+        ref_output = tensor<T>{out_lengths, out_strides};
+        std::fill(ref_output.begin(), ref_output.end(), std::numeric_limits<T>::quiet_NaN());
+
+        ws_sizeInBytes =
+            miopen::GetSmoothL1LossForwardWorkspaceSize(handle, input.desc, output.desc, reduction);
+        if(ws_sizeInBytes == static_cast<size_t>(-1))
+            GTEST_SKIP();
+
+        if(ws_sizeInBytes != 0)
+        {
+            std::vector<size_t> workspace_dims;
+            workspace_dims.push_back(ws_sizeInBytes / sizeof(float));
+
+            workspace = tensor<float>{workspace_dims};
+            std::fill(workspace.begin(), workspace.end(), 0.0f);
+
+            workspace_dev = handle.Write(workspace.data);
+        }
+
+        input_dev  = handle.Write(input.data);
+        target_dev = handle.Write(target.data);
+        output_dev = handle.Write(output.data);
+    }
+
+    void RunTest()
+    {
+        auto&& handle = get_handle();
+
+        miopenStatus_t status;
+
+        cpu_smoothl1loss_forward(input, target, ref_output, beta, reduction);
+        status = miopen::SmoothL1LossForward(handle,
+                                             workspace_dev.get(),
+                                             ws_sizeInBytes,
+                                             input.desc,
+                                             input_dev.get(),
+                                             target.desc,
+                                             target_dev.get(),
+                                             output.desc,
+                                             output_dev.get(),
+                                             beta,
+                                             reduction);
+        ASSERT_EQ(status, miopenStatusSuccess);
+
+        workspace.data = handle.Read<float>(workspace_dev, workspace.data.size());
+        output.data    = handle.Read<T>(output_dev, output.data.size());
+    }
+
+    void Verify()
+    {
+        // Computation error of fp16 is ~2^13 (=8192) bigger than
+        // the one of fp32 because mantissa is shorter by 13 bits.
+        double tolerance = std::is_same<T, float>::value ? 1.5e-6 : 8.2e-3;
+
+        // bf16 mantissa has 7 bits, by 3 bits shorter than fp16.
+        if(std::is_same<T, bfloat16>::value)
+            tolerance *= 8.0;
+
+        auto error = miopen::rms_range(ref_output, output);
+
+        ASSERT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output));
+        EXPECT_LT(error, tolerance)
+            << "Error output beyond tolerance Error: " << error << ", Tolerance: " << tolerance;
+    }
+
+    SmoothL1LossTestCase smoothl1loss_config;
+
+    tensor<T> input;
+    tensor<T> target;
+    tensor<T> output;
+    tensor<float> workspace;
+
+    tensor<T> ref_output;
+
+    miopen::Allocator::ManageDataPtr input_dev;
+    miopen::Allocator::ManageDataPtr target_dev;
+    miopen::Allocator::ManageDataPtr output_dev;
+    miopen::Allocator::ManageDataPtr workspace_dev;
+
+    size_t ws_sizeInBytes;
+
+    float beta;
+    miopenLossReductionMode_t reduction;
+};
+
+template <class T>
+struct SmoothL1LossTestBackward : public ::testing::TestWithParam<SmoothL1LossTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle       = get_handle();
+        smoothl1loss_config = GetParam();
+        auto gen_value1 = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
+        auto gen_value2 = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 101); };
+
+        beta            = smoothl1loss_config.beta;
+        reduction       = smoothl1loss_config.reduction;
+        auto lengths    = smoothl1loss_config.lengths;
+        auto contiguous = smoothl1loss_config.contiguous;
+
+        auto in_strides = GetStrides(lengths, true);
+        input           = tensor<T>{lengths, in_strides}.generate(gen_value1);
+
+        auto tar_strides = GetStrides(lengths, contiguous);
+        target           = tensor<T>{lengths, tar_strides}.generate(gen_value2);
+
+        auto out_lengths =
+            (reduction == MIOPEN_LOSS_REDUCTION_NONE ? lengths : std::vector<size_t>{1});
+        auto out_strides = GetStrides(out_lengths, true);
+
+        dO = tensor<T>{out_lengths, out_strides};
+        std::fill(dO.begin(), dO.end(), 0);
+
+        dI = tensor<T>{lengths, in_strides};
+        std::fill(dI.begin(), dI.end(), std::numeric_limits<T>::quiet_NaN());
+        dT = tensor<T>{lengths, tar_strides};
+        std::fill(dT.begin(), dT.end(), std::numeric_limits<T>::quiet_NaN());
+
+        ref_dI = tensor<T>{lengths, in_strides};
+        std::fill(ref_dI.begin(), ref_dI.end(), std::numeric_limits<T>::quiet_NaN());
+        ref_dT = tensor<T>{lengths, tar_strides};
+        std::fill(ref_dT.begin(), ref_dT.end(), std::numeric_limits<T>::quiet_NaN());
+
+        if(input.desc.IsContiguous() && target.desc.IsContiguous() && dO.desc.IsContiguous() &&
+           dI.desc.IsContiguous() && dT.desc.IsContiguous())
+            GTEST_SKIP();
+
+        input_dev  = handle.Write(input.data);
+        target_dev = handle.Write(target.data);
+        dO_dev     = handle.Write(dO.data);
+        dI_dev     = handle.Write(dI.data);
+        dT_dev     = handle.Write(dT.data);
+    }
+
+    void RunTest()
+    {
+        auto&& handle = get_handle();
+
+        miopenStatus_t status;
+
+        cpu_smoothl1loss_backward(input, target, dO, ref_dI, ref_dT, beta, reduction);
+        status = miopen::SmoothL1LossBackward(handle,
+                                              input.desc,
+                                              input_dev.get(),
+                                              target.desc,
+                                              target_dev.get(),
+                                              dO.desc,
+                                              dO_dev.get(),
+                                              dI.desc,
+                                              dI_dev.get(),
+                                              dT.desc,
+                                              dT_dev.get(),
+                                              beta,
+                                              reduction);
+
+        EXPECT_EQ(status, miopenStatusSuccess);
+
+        dI.data = handle.Read<T>(dI_dev, dI.data.size());
+        dT.data = handle.Read<T>(dT_dev, dT.data.size());
+    }
+
+    void Verify()
+    {
+        // Computation error of fp16 is ~2^13 (=8192) bigger than
+        // the one of fp32 because mantissa is shorter by 13 bits.
+        double tolerance = std::is_same<T, float>::value ? 1.5e-6 : 8.2e-3;
+
+        // bf16 mantissa has 7 bits, by 3 bits shorter than fp16.
+        if(std::is_same<T, bfloat16>::value)
+            tolerance *= 8.0;
+
+        auto error_dI = miopen::rms_range(ref_dI, dI);
+        auto error_dT = miopen::rms_range(ref_dT, dT);
+
+        ASSERT_EQ(miopen::range_distance(ref_dI), miopen::range_distance(dI));
+        ASSERT_EQ(miopen::range_distance(ref_dT), miopen::range_distance(dT));
+        EXPECT_LT(error_dI, tolerance)
+            << "Error Input Gradient beyond tolerance Error: " << error_dI
+            << ", Tolerance: " << tolerance;
+        EXPECT_LT(error_dT, tolerance)
+            << "Error Target Gradient beyond tolerance Error: " << error_dT
+            << ", Tolerance: " << tolerance;
+    }
+
+    SmoothL1LossTestCase smoothl1loss_config;
+
+    tensor<T> input;
+    tensor<T> target;
+    tensor<T> dO;
+    tensor<T> dI;
+    tensor<T> dT;
+
+    tensor<T> ref_dI;
+    tensor<T> ref_dT;
+
+    miopen::Allocator::ManageDataPtr input_dev;
+    miopen::Allocator::ManageDataPtr target_dev;
+    miopen::Allocator::ManageDataPtr dO_dev;
+    miopen::Allocator::ManageDataPtr dI_dev;
+    miopen::Allocator::ManageDataPtr dT_dev;
+
+    float beta;
+    miopenLossReductionMode_t reduction;
+};