migration of cuda::barrier #2578

Open
jinz2014 opened this issue Dec 20, 2024 · 1 comment

I tried the Intel(R) DPC++ Compatibility Tool version 2024.2.0 (codebase 55a3f03, clang version 19.0.0) on the following code:

#include <cuda/barrier>
#include <cooperative_groups.h>

__device__ void compute(float* data, int curr_iteration);

__global__ void split_arrive_wait(int iteration_count, float *data) {
    using barrier = cuda::barrier<cuda::thread_scope_block>;
    __shared__  barrier bar;
    auto block = cooperative_groups::this_thread_block();

    if (block.thread_rank() == 0) {
        init(&bar, block.size()); // Initialize the barrier with expected arrival count
    }
    block.sync();

    for (int curr_iter = 0; curr_iter < iteration_count; ++curr_iter) {
        /* code before arrive */
        barrier::arrival_token token = bar.arrive(); /* this thread arrives. Arrival does not block a thread */
        compute(data, curr_iter);
        bar.wait(std::move(token)); /* wait for all threads participating in the barrier to complete bar.arrive() */
        /* code after wait */
    }
}

The migrated code is:

#include <sycl/sycl.hpp>
#include <dpct/dpct.hpp>

void compute(float* data, int curr_iteration);

/*
DPCT1125:2: The type "barrier" defined in function "split_arrive_wait" is used
as the parameter type in all functions in the call path from the corresponding
sycl::handler::parallel_for() to the current function. You may need to adjust
the definition location of the type.
*/
void split_arrive_wait(int iteration_count, float *data,
                       const sycl::nd_item<3> &item_ct1, barrier &bar) {
    using barrier = cuda::barrier<cuda::thread_scope_block>;

    auto block = item_ct1.get_group();

    if (item_ct1.get_local_linear_id() == 0) {
        init(&bar,
             item_ct1.get_group()
                 .get_local_linear_range()); // Initialize the barrier with
                                             // expected arrival count
    }
    /*
    DPCT1065:0: Consider replacing sycl::nd_item::barrier() with
    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
    performance if there is no access to global memory.
    */
    item_ct1.barrier();

    for (int curr_iter = 0; curr_iter < iteration_count; ++curr_iter) {
        /* code before arrive */
        barrier::arrival_token token = bar.arrive(); /* this thread arrives. Arrival does not block a thread */
        compute(data, curr_iter);
        bar.wait(std::move(token)); /* wait for all threads participating in the barrier to complete bar.arrive() */
        /* code after wait */
    }
}
tomflinda commented Dec 26, 2024

@jinz2014
For the issue that the parameter "barrier &bar" is inserted into the function split_arrive_wait() in the migrated code, you can specify the option "--use-experimental-features=local-memory-kernel-scope-allocation" during migration to fix it (see the example invocation below).

For the issue that "cuda::barrier<cuda::thread_scope_block>" is not migrated: the corresponding mapping, sycl_ext_oneapi_barrier, is still at the proposal stage (https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/proposed/sycl_ext_oneapi_barrier.asciidoc). You can file a query issue at https://github.com/intel/llvm/issues to ask about the ETA for this feature; once the mapping is available, we plan to implement it.
