diff --git a/test cases/cuda/17 separate compilation linking/b.cu b/test cases/cuda/17 separate compilation linking/b.cu
new file mode 100644
index 000000000000..33ff561e21c4
--- /dev/null
+++ b/test cases/cuda/17 separate compilation linking/b.cu
@@ -0,0 +1,8 @@
+#include "b.h"
+
+// Device-global array declared in b.h and read back by the host in main.cu.
+__device__ int g[N];
+
+// Increments the calling thread's element of g. g is indexed by threadIdx.x
+// without a bounds check, so callers must run with blockDim.x <= N.
+__device__ void bar(void) { g[threadIdx.x]++; }
diff --git a/test cases/cuda/17 separate compilation linking/b.h b/test cases/cuda/17 separate compilation linking/b.h
new file mode 100644
index 000000000000..d8a0efcd9ca4
--- /dev/null
+++ b/test cases/cuda/17 separate compilation linking/b.h
@@ -0,0 +1,13 @@
+#ifndef B_H
+#define B_H
+
+// Length of g; main.cu also uses it as the block size of its launch.
+#define N 8
+
+// Defined in b.cu.
+extern __device__ int g[N];
+
+// Defined in b.cu, called from the foo kernel in main.cu.
+extern __device__ void bar(void);
+
+#endif // B_H
diff --git a/test cases/cuda/17 separate compilation linking/main.cu b/test cases/cuda/17 separate compilation linking/main.cu
new file mode 100644
index 000000000000..b07d01bda018
--- /dev/null
+++ b/test cases/cuda/17 separate compilation linking/main.cu
@@ -0,0 +1,60 @@
+#include <stdio.h>
+
+#include "b.h"
+
+// Stores the reversed thread indices in g, then calls bar() from b.cu.
+// a and g hold N elements and are indexed by threadIdx.x without a bounds
+// check, so the kernel must be launched with blockDim.x <= N.
+__global__ void foo(void)
+{
+    __shared__ int a[N];
+    a[threadIdx.x] = threadIdx.x;
+
+    // Threads read elements of a written by other threads below.
+    __syncthreads();
+
+    g[threadIdx.x] = a[blockDim.x - threadIdx.x - 1];
+
+    bar();
+}
+
+// Runs foo on one block of N threads, copies g to the host and checks its sum.
+// Returns 0 only when the sum is correct so that the meson test can fail.
+int main(void)
+{
+    // g ends up as N, N-1, ..., 1: reversed thread index plus bar()'s increment.
+    const int expected = N * (N + 1) / 2;
+    int i;
+    int *dg, hg[N];
+    int sum = 0;
+    cudaError_t err;
+
+    foo<<<1, N>>>();
+
+    // A launch has no return value; launch failures are reported here.
+    err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("couldn't launch the kernel: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+
+    if (cudaGetSymbolAddress((void**)&dg, g)) {
+        printf("couldn't get the symbol addr\n");
+        return 1;
+    }
+    if (cudaMemcpy(hg, dg, N * sizeof(int), cudaMemcpyDeviceToHost)) {
+        printf("couldn't memcpy\n");
+        return 1;
+    }
+
+    for (i = 0; i < N; i++) {
+        sum += hg[i];
+    }
+    if (sum != expected) {
+        printf("FAILED (%d)\n", sum);
+        return 1;
+    }
+
+    printf("PASSED\n");
+    return 0;
+}
diff --git a/test cases/cuda/17 separate compilation linking/meson.build b/test cases/cuda/17 separate compilation linking/meson.build
new file mode 100644
index 000000000000..ee86123eb40d
--- /dev/null
+++ b/test cases/cuda/17 separate compilation linking/meson.build
@@ -0,0 +1,19 @@
+# example here is inspired by Nvidia's blog post:
+# https://developer.nvidia.com/blog/separate-compilation-linking-cuda-device-code/
+# code:
+# https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#examples
+
+project('device linking', ['cpp', 'cuda'], version : '1.0.0')
+
+nvcc = meson.get_compiler('cuda')
+cuda = import('unstable-cuda')
+
+arch_flags = cuda.nvcc_arch_flags(nvcc.version(), 'Auto', detected : ['8.0'])
+
+message('NVCC version: ' + nvcc.version())
+message('NVCC flags: ' + ' '.join(arch_flags))
+
+# test device linking with -dc (which is equivalent to `--relocatable-device-code true`)
+lib = static_library('devicefuncs', ['b.cu'], cuda_args : ['-dc'] + arch_flags)
+exe = executable('app', 'main.cu', cuda_args : ['-dc'] + arch_flags, link_with : lib, link_args : arch_flags)
+test('cudatest', exe)