diff --git a/CMakeLists.txt b/CMakeLists.txt index cd530e1139..b7034a32fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -233,6 +233,15 @@ if(Legion_USE_CUDA) endif() endif() +#------------------------------------------------------------------------------# +# Accelerator configuration +#------------------------------------------------------------------------------# +option(Legion_USE_ACCELERATOR "Enable support for accelerator" OFF) +if (Legion_USE_ACCELERATOR) + message("Including Accelerator in Realm...") + set(REALM_USE_ACCELERATOR ON) +endif() + #------------------------------------------------------------------------------# # Kokkos configuration #------------------------------------------------------------------------------# diff --git a/cmake/accelerator-deps.cmake b/cmake/accelerator-deps.cmake new file mode 100644 index 0000000000..e734e8d450 --- /dev/null +++ b/cmake/accelerator-deps.cmake @@ -0,0 +1,12 @@ +set(hls_dep $ENV{HLS_CONFIG} CACHE STRING "set hls config .cmake path for module") + +get_filename_component(hls_dir ${hls_dep} DIRECTORY) +get_filename_component(hls_module ${hls_dep} NAME_WE) + +list(APPEND CMAKE_MODULE_PATH ${hls_dir}) + +include(${hls_module}) +link_directories(${XRT_LIB_DIR}) +target_link_libraries(RealmRuntime PRIVATE ${hls_module}) +install(TARGETS ${hls_module} EXPORT LegionTargets) +install(TARGETS miniglog EXPORT LegionTargets) diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index 79f0add07e..1f692f050f 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -139,6 +139,13 @@ if(REALM_USE_MPI) ) endif() +if (REALM_USE_ACCELERATOR) + list(APPEND REALM_SRC + realm/accelerator/accelerator_module.h + realm/accelerator/accelerator_module.cc + ) +endif() + list(APPEND REALM_SRC realm.h realm/activemsg.h realm/activemsg.cc @@ -206,6 +213,14 @@ endforeach() find_package(Threads REQUIRED) add_library(RealmRuntime ${REALM_SRC}) + +if(Legion_USE_ACCELERATOR) + if (REALM_USE_ACCELERATOR) + 
include(accelerator-deps) + add_definitions(-DREALM_USE_ACCELERATOR) + endif() +endif() + target_compile_options(RealmRuntime PRIVATE ${CXX_BUILD_WARNING_FLAGS}) if(COMPILER_SUPPORTS_DEFCHECK) # use the cxx_defcheck wrapper to make sure realm_defines.h is included diff --git a/runtime/legion/runtime.cc b/runtime/legion/runtime.cc index 18fedc41fd..37c3828fd9 100644 --- a/runtime/legion/runtime.cc +++ b/runtime/legion/runtime.cc @@ -11769,6 +11769,11 @@ namespace Legion { LegionSpy::log_processor_kind(kind, "Python"); break; } + case Processor::ACCEL_PROC: + { + LegionSpy::log_processor_kind(kind, "Accelerator"); + break; + } default: assert(false); // unknown processor kind } @@ -22843,6 +22848,7 @@ namespace Legion { ((local_util_procs.empty() || config.replay_on_cpus) && ((it->first.kind() == Processor::LOC_PROC) || (it->first.kind() == Processor::TOC_PROC) || + (it->first.kind() == Processor::ACCEL_PROC) || (it->first.kind() == Processor::IO_PROC)))) { registered_events.insert(RtEvent( diff --git a/runtime/mappers/default_mapper.cc b/runtime/mappers/default_mapper.cc index d50513211c..1e1a070e6c 100644 --- a/runtime/mappers/default_mapper.cc +++ b/runtime/mappers/default_mapper.cc @@ -55,14 +55,14 @@ namespace Legion { node_id(local.address_space()), machine(m), mapper_name((name == NULL) ? create_default_name(local) : own ? 
name : strdup(name)), - next_local_gpu(0), next_local_cpu(0), next_local_io(0), + next_local_gpu(0), next_local_cpu(0), next_local_io(0), next_local_accel(0), next_local_procset(0), next_local_omp(0), next_local_py(0), - next_global_gpu(Processor::NO_PROC), + next_global_gpu(Processor::NO_PROC), next_global_accel(Processor::NO_PROC), next_global_cpu(Processor::NO_PROC), next_global_io(Processor::NO_PROC), next_global_procset(Processor::NO_PROC), next_global_omp(Processor::NO_PROC), next_global_py(Processor::NO_PROC), global_gpu_query(NULL), global_cpu_query(NULL), global_io_query(NULL), - global_procset_query(NULL), global_omp_query(NULL), + global_procset_query(NULL), global_omp_query(NULL), global_accel_query(NULL), global_py_query(NULL), max_steals_per_theft(STATIC_MAX_PERMITTED_STEALS), max_steal_count(STATIC_MAX_STEAL_COUNT), @@ -151,6 +151,11 @@ namespace Legion { local_omps.push_back(*it); break; } + case Processor::ACCEL_PROC: + { + local_accels.push_back(*it); + break; + } default: // ignore anything else break; } @@ -211,6 +216,14 @@ namespace Legion { remote_omps[node] = *it; break; } + case Processor::ACCEL_PROC: + { + if (node >= remote_accels.size()) + remote_accels.resize(node+1, Processor::NO_PROC); + if (!remote_accels[node].exists()) + remote_accels[node] = *it; + break; + } default: // ignore anything else break; } @@ -392,6 +405,8 @@ namespace Legion { return default_get_next_local_omp(); case Processor::PY_PROC: return default_get_next_local_py(); + case Processor::ACCEL_PROC: + return default_get_next_local_accel(); default: // make warnings go away break; } @@ -421,6 +436,8 @@ namespace Legion { return default_get_next_local_omp(); case Processor::PY_PROC: return default_get_next_local_py(); + case Processor::ACCEL_PROC: + return default_get_next_local_accel(); default: // make warnings go away break; } @@ -446,6 +463,8 @@ namespace Legion { return default_get_next_global_omp(); case Processor::PY_PROC: return default_get_next_global_py(); + 
case Processor::ACCEL_PROC: + return default_get_next_global_accel(); default: // make warnings go away break; } @@ -468,6 +487,8 @@ namespace Legion { return default_get_next_local_omp(); case Processor::PY_PROC: return default_get_next_local_py(); + case Processor::ACCEL_PROC: + return default_get_next_local_accel(); default: // make warnings go away break; } @@ -552,6 +573,37 @@ namespace Legion { return result; } + //-------------------------------------------------------------------------- + Processor DefaultMapper::default_get_next_local_accel(void) + //-------------------------------------------------------------------------- + { + Processor result = local_accels[next_local_accel++]; + if (next_local_accel == local_accels.size()) + next_local_accel = 0; + return result; + } + + //-------------------------------------------------------------------------- + Processor DefaultMapper::default_get_next_global_accel(void) + //------------------------------------------------------------------------- + { + if (total_nodes == 1) + return default_get_next_local_accel(); + if (!next_global_accel.exists()) + { + global_accel_query = new Machine::ProcessorQuery(machine); + global_accel_query->only_kind(Processor::ACCEL_PROC); + next_global_accel = global_accel_query->first(); + } + Processor result = next_global_accel; + next_global_accel = global_accel_query->next(result); + if (!next_global_accel.exists()) + { + delete global_accel_query; + global_accel_query = NULL; + } + return result; + } //-------------------------------------------------------------------------- Processor DefaultMapper::default_get_next_local_io(void) //-------------------------------------------------------------------------- @@ -753,6 +805,13 @@ namespace Legion { continue; break; } + case Processor::ACCEL_PROC: + { + kindString += "ACCEL_PROC "; + if (local_accels.empty()) + continue; + break; + } case Processor::LOC_PROC: { kindString += "LOC_PROC "; @@ -914,10 +973,11 @@ namespace Legion { 
//-------------------------------------------------------------------------- { // Default mapper is ignorant about task IDs so just do whatever: - // 1) GPU > OMP > procset > cpu > IO > Python (default) - // 2) OMP > procset > cpu > IO > Python > GPU (with PREFER_CPU_VARIANT) + // 1) GPU > OMP > procset > cpu > IO > Python > Accel (default) + // 2) OMP > procset > cpu > IO > Python > GPU > Accel (with PREFER_CPU_VARIANT) // It is up to the caller to filter out processor kinds that aren't // suitable for a given task + bool prefer_cpu = ((task.tag & PREFER_CPU_VARIANT) != 0); if ((local_gpus.size() > 0) && !prefer_cpu) ranking.push_back(Processor::TOC_PROC); @@ -928,6 +988,10 @@ namespace Legion { if (local_pys.size() > 0) ranking.push_back(Processor::PY_PROC); if ((local_gpus.size() > 0) && prefer_cpu) ranking.push_back(Processor::TOC_PROC); + + if (local_accels.size() > 0) { + ranking.push_back(Processor::ACCEL_PROC); + } } //-------------------------------------------------------------------------- @@ -1024,6 +1088,23 @@ namespace Legion { } break; } + case Processor::ACCEL_PROC: + { + if (task.index_domain.get_volume() > local_accels.size()) + { + if (!global_memory.exists()) + { + log_mapper.error("Default mapper failure. 
No memory found " + "for accelerator task %s (ID %lld) which is visible " + "for all points in the index space.", + task.get_task_name(), task.get_unique_id()); + assert(false); + } + else + target_memory = global_memory; + } + break; + } case Processor::LOC_PROC: { if (task.index_domain.get_volume() > local_cpus.size()) @@ -1125,6 +1206,7 @@ switch (task.target_proc.kind()) { case Processor::LOC_PROC: + case Processor::ACCEL_PROC: // use cpu memory case Processor::IO_PROC: case Processor::PROC_SET: case Processor::OMP_PROC: @@ -1252,6 +1334,12 @@ input, output, gpu_slices_cache); break; } + case Processor::ACCEL_PROC: + { + default_slice_task(task, local_accels, remote_accels, + input, output, cpu_slices_cache); + break; + } case Processor::IO_PROC: { default_slice_task(task, local_ios, remote_ios, @@ -1701,6 +1788,15 @@ target_procs.push_back(task.target_proc); break; } + case Processor::ACCEL_PROC: + { + if (!task.must_epoch_task) + target_procs.insert(target_procs.end(), + local_accels.begin(), local_accels.end()); + else + target_procs.push_back(task.target_proc); + break; + } case Processor::LOC_PROC: { // Put any of our local cpus on here @@ -3124,6 +3220,11 @@ *result = local_gpus.size(); break; } + case DEFAULT_TUNABLE_LOCAL_ACCELS: + { + *result = local_accels.size(); + break; + } case DEFAULT_TUNABLE_LOCAL_CPUS: { *result = local_cpus.size(); break; } @@ -3418,6 +3519,15 @@ } break; } + case Processor::ACCEL_PROC: + { + if (local_accels.empty()) + { + ++it; + continue; + } + break; + } case Processor::OMP_PROC: { if (local_omps.empty()) diff --git a/runtime/mappers/default_mapper.h b/runtime/mappers/default_mapper.h index 55d3bd79b1..9c04644b06 100644 --- a/runtime/mappers/default_mapper.h +++ b/runtime/mappers/default_mapper.h @@ -51,7 +51,8 @@ namespace Legion { DEFAULT_TUNABLE_GLOBAL_IOS = 8, DEFAULT_TUNABLE_GLOBAL_OMPS = 9, DEFAULT_TUNABLE_GLOBAL_PYS = 10, - DEFAULT_TUNABLE_LAST = 
11, // this one must always be last and unused + DEFAULT_TUNABLE_LOCAL_ACCELS = 11, + DEFAULT_TUNABLE_LAST = 12, // this one must always be last and unused }; enum MappingKind { TASK_MAPPING, @@ -375,6 +376,8 @@ Processor default_get_next_global_cpu(void); Processor default_get_next_local_gpu(void); Processor default_get_next_global_gpu(void); + Processor default_get_next_local_accel(void); + Processor default_get_next_global_accel(void); Processor default_get_next_local_io(void); Processor default_get_next_global_io(void); Processor default_get_next_local_py(void); @@ -464,12 +467,14 @@ // There are a couple of parameters from the machine description that // the default mapper uses to determine how to perform mapping. std::vector<Processor> local_gpus; + std::vector<Processor> local_accels; std::vector<Processor> local_cpus; std::vector<Processor> local_ios; std::vector<Processor> local_procsets; std::vector<Processor> local_omps; std::vector<Processor> local_pys; std::vector<Processor> remote_gpus; + std::vector<Processor> remote_accels; std::vector<Processor> remote_cpus; std::vector<Processor> remote_ios; std::vector<Processor> remote_procsets; @@ -477,11 +482,11 @@ std::vector<Processor> remote_pys; protected: // For doing round-robining of tasks onto processors - unsigned next_local_gpu, next_local_cpu, next_local_io, + unsigned next_local_gpu, next_local_cpu, next_local_io, next_local_accel, next_local_procset, next_local_omp, next_local_py; - Processor next_global_gpu, next_global_cpu, next_global_io, + Processor next_global_gpu, next_global_cpu, next_global_io, next_global_accel, next_global_procset, next_global_omp, next_global_py; - Machine::ProcessorQuery *global_gpu_query, *global_cpu_query, + Machine::ProcessorQuery *global_gpu_query, *global_cpu_query, *global_accel_query, *global_io_query, *global_procset_query, *global_omp_query, *global_py_query; protected: diff --git a/runtime/mappers/mapping_utilities.cc b/runtime/mappers/mapping_utilities.cc index c8a49e0ea5..9f13d137d2 100644 --- a/runtime/mappers/mapping_utilities.cc +++ 
b/runtime/mappers/mapping_utilities.cc @@ -1059,6 +1059,7 @@ namespace Legion { case Processor::PROC_SET: return "PROC_SET"; case Processor::OMP_PROC: return "OMP_PROC"; case Processor::PY_PROC: return "PY_PROC"; + case Processor::ACCEL_PROC: return "ACCEL_PROC"; default: assert(false); return ""; } } diff --git a/runtime/realm/accelerator/accelerator_module.cc b/runtime/realm/accelerator/accelerator_module.cc new file mode 100644 index 0000000000..728a68c290 --- /dev/null +++ b/runtime/realm/accelerator/accelerator_module.cc @@ -0,0 +1,187 @@ +#include "realm/accelerator/accelerator_module.h" + + +#include "realm/logging.h" +#include "realm/cmdline.h" +#include "realm/threads.h" +#include "realm/utils.h" + +// each task access by include header file where the namespace is declared +namespace XRTContext { + // define extern xrt_device + thread_local XRTDevice *xrt_device = 0; +} + +namespace Realm { + namespace Accelerator { + + Logger log_accel("accel"); + + AcceleratorModule::AcceleratorModule() : Module("accelerator"), cfg_num_accelerators_(0) { + } + + AcceleratorModule::~AcceleratorModule(void) {} + + Module *AcceleratorModule::create_module(RuntimeImpl *runtime, std::vector& cmdline) { + AcceleratorModule *m = new AcceleratorModule; + log_accel.info() << "use accelerator"; + Realm::CommandLineParser cp; + cp.add_option_string("-accel:fwbin", m->cfg_fwbin_path_); + cp.add_option_int("-ll:num_accelerators", m->cfg_num_accelerators_); + + bool ok = cp.parse_command_line(cmdline); + if (!ok) { + log_accel.error() << "error reading accelerator parameters"; + exit(1); + } + + for (int i = 0; i < m->cfg_num_accelerators_; i++) { + // template arguments must be known at compile time and const + // TODO: add support for non-xilinx fpgas + XRTDevice *xrt = new XRTDevice(m->cfg_fwbin_path_); + m->xrt_devices_.push_back(xrt); + } + + return m; + } + + // do any general initialization - this is called after all configuration is + // complete + void 
AcceleratorModule::initialize(RuntimeImpl *runtime) { + Module::initialize(runtime); + } + + // create any memories provided by this module (default == do nothing) + // (each new MemoryImpl should use a Memory from RuntimeImpl::next_local_memory_id) + void AcceleratorModule::create_memories(RuntimeImpl *runtime) { + Module::create_memories(runtime); + } + + // create any processors provided by the module (default == do nothing) + // (each new ProcessorImpl should use a Processor from + // RuntimeImpl::next_local_processor_id) + void AcceleratorModule::create_processors(RuntimeImpl *runtime) { + Module::create_processors(runtime); + // 1 : 1 mapping processor to device + for (int i = 0; i < xrt_devices_.size(); i++) { + Processor p = runtime->next_local_processor_id(); + AcceleratorProcessor *proc = new AcceleratorProcessor(xrt_devices_[i], p, runtime->core_reservation_set()); + procs_.push_back(proc); + runtime->add_processor(proc); + + // create mem affinities to add a proc to machine model + // create affinities between this processor and system/reg memories + // if the memory is one we created, use the kernel-reported distance + // to adjust the answer + std::vector& local_mems = runtime->nodes[Network::my_node_id].memories; + for(std::vector::iterator it2 = local_mems.begin(); + it2 != local_mems.end(); + ++it2) { + Memory::Kind kind = (*it2)->get_kind(); + if((kind != Memory::SYSTEM_MEM) && (kind != Memory::REGDMA_MEM)) + continue; + + Machine::ProcessorMemoryAffinity pma; + pma.p = p; + pma.m = (*it2)->me; + + // use the same made-up numbers as in + // runtime_impl.cc + if(kind == Memory::SYSTEM_MEM) { + pma.bandwidth = 100; // "large" + pma.latency = 5; // "small" + } else { + pma.bandwidth = 80; // "large" + pma.latency = 10; // "small" + } + + runtime->add_proc_mem_affinity(pma); + + } + + } + } + + // create any DMA channels provided by the module (default == do nothing) + void AcceleratorModule::create_dma_channels(RuntimeImpl *runtime) { + 
Module::create_dma_channels(runtime); + } + + // create any code translators provided by the module (default == do nothing) + void AcceleratorModule::create_code_translators(RuntimeImpl *runtime) { + Module::create_code_translators(runtime); + } + + // clean up any common resources created by the module - this will be called + // after all memories/processors/etc. have been shut down and destroyed + void AcceleratorModule::cleanup(void) { + for (std::vector *>::iterator it = xrt_devices_.begin(); it != xrt_devices_.end(); it++) + delete *it; + xrt_devices_.clear(); + } + + template + class AcceleratorTaskScheduler : public T { + public: + AcceleratorTaskScheduler(Processor proc, Realm::CoreReservation& core_rsrv, AcceleratorProcessor *accelerator_proc); + virtual ~AcceleratorTaskScheduler(void); + protected: + virtual bool execute_task(Task *task); + virtual void execute_internal_task(InternalTask *task); + AcceleratorProcessor *accel_proc_; + }; + + template + AcceleratorTaskScheduler::AcceleratorTaskScheduler(Processor proc, + Realm::CoreReservation& core_rsrv, + AcceleratorProcessor *accel_proc) : T(proc, core_rsrv), accel_proc_(accel_proc) { + } + + template + AcceleratorTaskScheduler::~AcceleratorTaskScheduler(void) { + } + + template + bool AcceleratorTaskScheduler::execute_task(Task *task) { + // add device to thread's xrt context + XRTContext::xrt_device = accel_proc_->xrt_device_; + bool ok = T::execute_task(task); + return ok; + } + + template + void AcceleratorTaskScheduler::execute_internal_task(InternalTask *task) { + // add device to thread's xrt context + XRTContext::xrt_device = accel_proc_->xrt_device_; + T::execute_internal_task(task); + } + + AcceleratorProcessor::AcceleratorProcessor(XRTDevice *xrt, Processor me, Realm::CoreReservationSet& crs) + : LocalTaskProcessor(me, Processor::ACCEL_PROC) + { + xrt_device_ = xrt; + + Realm::CoreReservationParameters params; + params.set_num_cores(1); + params.set_alu_usage(params.CORE_USAGE_SHARED); + 
params.set_fpu_usage(params.CORE_USAGE_SHARED); + params.set_ldst_usage(params.CORE_USAGE_SHARED); + params.set_max_stack_size(2 << 20); + std::string name = stringbuilder() << "Accel proc " << me; + core_rsrv_ = new Realm::CoreReservation(name, crs, params); + +#ifdef REALM_USE_USER_THREADS + UserThreadTaskScheduler *sched = new AcceleratorTaskScheduler(me, *core_rsrv_, this); +#else + KernelThreadTaskScheduler *sched = new AcceleratorTaskScheduler(me, *core_rsrv_, this); +#endif + set_scheduler(sched); + } + + AcceleratorProcessor::~AcceleratorProcessor(void) { + delete core_rsrv_; + } + + }; // namespace Accelerator +}; // namespace Realm + diff --git a/runtime/realm/accelerator/accelerator_module.h b/runtime/realm/accelerator/accelerator_module.h new file mode 100644 index 0000000000..a9eaccd083 --- /dev/null +++ b/runtime/realm/accelerator/accelerator_module.h @@ -0,0 +1,75 @@ +#ifndef REALM_ACCELERATOR_MODULE_H +#define REALM_ACCELERATOR_MODULE_H + +#include "hls/cpfp_conv.h" // class XRTdevice + +#include "realm/module.h" +#include "realm/proc_impl.h" +#include "realm/mem_impl.h" +#include "realm/runtime_impl.h" + +namespace Realm { + namespace Accelerator { + + class AcceleratorModule; + class AcceleratorProcessor; + + class AcceleratorModule : public Module { + protected: + AcceleratorModule(void); + + public: + virtual ~AcceleratorModule(void); + + static Module *create_module(RuntimeImpl *runtime, std::vector& cmdline); + + // do any general initialization - this is called after all configuration is + // complete + virtual void initialize(RuntimeImpl *runtime); + + // create any memories provided by this module (default == do nothing) + // (each new MemoryImpl should use a Memory from RuntimeImpl::next_local_memory_id) + virtual void create_memories(RuntimeImpl *runtime); + + // create any processors provided by the module (default == do nothing) + // (each new ProcessorImpl should use a Processor from + // RuntimeImpl::next_local_processor_id) + virtual 
void create_processors(RuntimeImpl *runtime); + + // create any DMA channels provided by the module (default == do nothing) + virtual void create_dma_channels(RuntimeImpl *runtime); + + // create any code translators provided by the module (default == do nothing) + virtual void create_code_translators(RuntimeImpl *runtime); + + // clean up any common resources created by the module - this will be called + // after all memories/processors/etc. have been shut down and destroyed + virtual void cleanup(void); + + public: + unsigned cfg_num_accelerators_; + std::string cfg_fwbin_path_; + + protected: + std::vector<AcceleratorProcessor *> procs_; + std::vector<XRTDevice *> xrt_devices_; + }; + + REGISTER_REALM_MODULE(AcceleratorModule); + + class AcceleratorProcessor : public LocalTaskProcessor { + public: + AcceleratorProcessor(XRTDevice *xrt, Processor me, Realm::CoreReservationSet& crs); + virtual ~AcceleratorProcessor(void); + + XRTDevice *xrt_device_; + + protected: + Realm::CoreReservation *core_rsrv_; + }; + + }; // namespace Accelerator +}; // namespace Realm + +#endif + diff --git a/runtime/realm/module.cc b/runtime/realm/module.cc index 856ad38eb1..aef17ce59b 100644 --- a/runtime/realm/module.cc +++ b/runtime/realm/module.cc @@ -56,6 +56,9 @@ #if defined REALM_USE_MPI #include "realm/mpi/mpi_module.h" #endif +#ifdef REALM_USE_ACCELERATOR +#include "realm/accelerator/accelerator_module.h" +#endif namespace Realm { diff --git a/runtime/realm/realm_c.h b/runtime/realm/realm_c.h index 62d7a1b544..0d908c1333 100644 --- a/runtime/realm/realm_c.h +++ b/runtime/realm/realm_c.h @@ -51,7 +51,8 @@ typedef unsigned long long realm_barrier_timestamp_t; __op__(PROC_GROUP, "Processor group") \ __op__(PROC_SET, "Set of Processors for OpenMP/Kokkos etc.") \ __op__(OMP_PROC, "OpenMP (or similar) thread pool") \ - __op__(PY_PROC, "Python interpreter") + __op__(PY_PROC, "Python interpreter") \ + __op__(ACCEL_PROC, "XRT Accelerator") typedef enum realm_processor_kind_t { #define C_ENUMS(name, desc) name, diff --git 
a/test/attach_file_mini/CMakeLists.txt b/test/attach_file_mini/CMakeLists.txt index 67ea942312..8ccf686959 100644 --- a/test/attach_file_mini/CMakeLists.txt +++ b/test/attach_file_mini/CMakeLists.txt @@ -22,6 +22,12 @@ if(NOT Legion_SOURCE_DIR) find_package(Legion REQUIRED) endif() +if (Legion_USE_ACCELERATOR) + set(XRT_LIB_DIR $ENV{XILINX_XRT}/lib) + link_directories(${XRT_LIB_DIR}) +endif() + + add_executable(attach_file_mini attach_file_mini.cc) target_link_libraries(attach_file_mini Legion::Legion) if(Legion_ENABLE_TESTING) diff --git a/test/legion_stl/CMakeLists.txt b/test/legion_stl/CMakeLists.txt index 407682b247..c43f7055e7 100644 --- a/test/legion_stl/CMakeLists.txt +++ b/test/legion_stl/CMakeLists.txt @@ -22,6 +22,12 @@ if(NOT Legion_SOURCE_DIR) find_package(Legion REQUIRED) endif() +if (Legion_USE_ACCELERATOR) + set(XRT_LIB_DIR $ENV{XILINX_XRT}/lib) + link_directories(${XRT_LIB_DIR}) +endif() + + add_executable(test_stl test_stl.cc) set_property(TARGET test_stl PROPERTY CXX_STANDARD 11) set_property(TARGET test_stl PROPERTY CXX_STANDARD_REQUIRED ON) diff --git a/test/realm/CMakeLists.txt b/test/realm/CMakeLists.txt index 1b6a4be6d8..32eb705d6f 100644 --- a/test/realm/CMakeLists.txt +++ b/test/realm/CMakeLists.txt @@ -44,11 +44,22 @@ list(APPEND REALM_TESTS coverings ) +# TODO: build memspeed properly with cuda enabled... 
if(Legion_USE_CUDA) # some tests have CUDA source files too set(CUDASRC_memspeed memspeed_gpu.cu) endif() +if (Legion_USE_ACCELERATOR) + list(APPEND REALM_TESTS + accelerator_task + ) + # environment variable XILINX_XRT + # looking for libxilinxopencl + set(XRT_LIB_DIR $ENV{XILINX_XRT}/lib) + link_directories(${XRT_LIB_DIR}) +endif() + foreach(test IN LISTS REALM_TESTS) if(CUDASRC_${test}) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Wno-deprecated-gpu-targets) diff --git a/test/realm/accelerator_task.cc b/test/realm/accelerator_task.cc new file mode 100644 index 0000000000..3b68eee5e2 --- /dev/null +++ b/test/realm/accelerator_task.cc @@ -0,0 +1,90 @@ +#include "realm.h" + +using namespace Realm; + +// execute a task on Processor::ACCEL_PROC processor + +Logger log_app("app"); + +enum { + TOP_LEVEL_TASK = Processor::TASK_ID_FIRST_AVAILABLE+0, + CHILD_TASK_ID_START +}; + +void child_task(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) +{ + log_app.print() << "child task on " << p << ": arglen=" << arglen << ", userlen=" << userlen; +} + +void top_level_task(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) +{ + log_app.print() << "top task running on " << p; + + Machine machine = Machine::get_machine(); + Processor::TaskFuncID func_id = CHILD_TASK_ID_START; + CodeDescriptor child_task_desc(child_task); + + std::set finish_events; + Event e = Processor::register_task_by_kind(Processor::ACCEL_PROC, true /*global*/, + func_id, + child_task_desc, + ProfilingRequestSet()); + + int count = 0; + + std::set all_processors; + machine.get_all_processors(all_processors); + for(std::set::const_iterator it = all_processors.begin(); + it != all_processors.end(); + it++) { + Processor pp = (*it); + + // only ACCEL_PROCs + if(pp.kind() != Processor::ACCEL_PROC) + continue; + + Event e2 = pp.spawn(func_id, &count, sizeof(count), e); + + finish_events.insert(e2); + } + + func_id++; + + Event merged = 
Event::merge_events(finish_events); + + merged.wait(); + + log_app.print() << "all done!"; +} + +int main(int argc, char **argv) +{ + Runtime rt; + + rt.init(&argc, &argv); + + // select a processor to run the top level task on + Processor p = Machine::ProcessorQuery(Machine::get_machine()) + .only_kind(Processor::ACCEL_PROC) + .first(); + assert(p.exists()); + + Event e1 = Processor::register_task_by_kind(p.kind(), + false /*!global*/, + TOP_LEVEL_TASK, + CodeDescriptor(top_level_task), + ProfilingRequestSet()); + + // collective launch of a single task - everybody gets the same finish event + Event e2 = rt.collective_spawn(p, TOP_LEVEL_TASK, 0, 0, e1); + + // request shutdown once that task is complete + rt.shutdown(e2); + + // now sleep this thread until that shutdown actually happens + rt.wait_for_shutdown(); + + return 0; +} diff --git a/test/rendering/CMakeLists.txt b/test/rendering/CMakeLists.txt index 6a9de95a96..cfa109dc34 100644 --- a/test/rendering/CMakeLists.txt +++ b/test/rendering/CMakeLists.txt @@ -22,6 +22,12 @@ if(NOT Legion_SOURCE_DIR) find_package(Legion REQUIRED) endif() +if (Legion_USE_ACCELERATOR) + set(XRT_LIB_DIR $ENV{XILINX_XRT}/lib) + link_directories(${XRT_LIB_DIR}) +endif() + + add_executable(rendering rendering.cc) target_link_libraries(rendering Legion::Legion) if(Legion_ENABLE_TESTING)