diff --git a/CMakeLists.txt b/CMakeLists.txt index cd530e1139..b7034a32fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -233,6 +233,15 @@ if(Legion_USE_CUDA) endif() endif() +#------------------------------------------------------------------------------# +# Accelerator configuration +#------------------------------------------------------------------------------# +option(Legion_USE_ACCELERATOR "Enable support for accelerator" OFF) +if (Legion_USE_ACCELERATOR) + message("Including Accelerator in Realm...") + set(REALM_USE_ACCELERATOR ON) +endif() + #------------------------------------------------------------------------------# # Kokkos configuration #------------------------------------------------------------------------------# diff --git a/cmake/accelerator-deps.cmake b/cmake/accelerator-deps.cmake new file mode 100644 index 0000000000..e734e8d450 --- /dev/null +++ b/cmake/accelerator-deps.cmake @@ -0,0 +1,12 @@ +set(hls_dep $ENV{HLS_CONFIG} CACHE STRING "set hls config .cmake path for module") + +get_filename_component(hls_dir ${hls_dep} DIRECTORY) +get_filename_component(hls_module ${hls_dep} NAME_WE) + +list(APPEND CMAKE_MODULE_PATH ${hls_dir}) + +include(${hls_module}) +link_directories(${XRT_LIB_DIR}) +target_link_libraries(RealmRuntime PRIVATE ${hls_module}) +install(TARGETS ${hls_module} EXPORT LegionTargets) +install(TARGETS miniglog EXPORT LegionTargets) diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index 79f0add07e..1f692f050f 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -139,6 +139,13 @@ if(REALM_USE_MPI) ) endif() +if (REALM_USE_ACCELERATOR) + list(APPEND REALM_SRC + realm/accelerator/accelerator_module.h + realm/accelerator/accelerator_module.cc + ) +endif() + list(APPEND REALM_SRC realm.h realm/activemsg.h realm/activemsg.cc @@ -206,6 +213,14 @@ endforeach() find_package(Threads REQUIRED) add_library(RealmRuntime ${REALM_SRC}) + +if(Legion_USE_ACCELERATOR) + if (REALM_USE_ACCELERATOR) + 
include(accelerator-deps) + add_definitions(-DREALM_USE_ACCELERATOR) + endif() +endif() + target_compile_options(RealmRuntime PRIVATE ${CXX_BUILD_WARNING_FLAGS}) if(COMPILER_SUPPORTS_DEFCHECK) # use the cxx_defcheck wrapper to make sure realm_defines.h is included diff --git a/runtime/legion/runtime.cc b/runtime/legion/runtime.cc index 18fedc41fd..37c3828fd9 100644 --- a/runtime/legion/runtime.cc +++ b/runtime/legion/runtime.cc @@ -11769,6 +11769,11 @@ namespace Legion { LegionSpy::log_processor_kind(kind, "Python"); break; } + case Processor::ACCEL_PROC: + { + LegionSpy::log_processor_kind(kind, "Accelerator"); + break; + } default: assert(false); // unknown processor kind } @@ -22843,6 +22848,7 @@ namespace Legion { ((local_util_procs.empty() || config.replay_on_cpus) && ((it->first.kind() == Processor::LOC_PROC) || (it->first.kind() == Processor::TOC_PROC) || + (it->first.kind() == Processor::ACCEL_PROC) || (it->first.kind() == Processor::IO_PROC)))) { registered_events.insert(RtEvent( diff --git a/runtime/mappers/default_mapper.cc b/runtime/mappers/default_mapper.cc index d50513211c..1e1a070e6c 100644 --- a/runtime/mappers/default_mapper.cc +++ b/runtime/mappers/default_mapper.cc @@ -55,14 +55,14 @@ namespace Legion { node_id(local.address_space()), machine(m), mapper_name((name == NULL) ? create_default_name(local) : own ? 
name : strdup(name)), - next_local_gpu(0), next_local_cpu(0), next_local_io(0), + next_local_gpu(0), next_local_cpu(0), next_local_io(0), next_local_accel(0), next_local_procset(0), next_local_omp(0), next_local_py(0), - next_global_gpu(Processor::NO_PROC), + next_global_gpu(Processor::NO_PROC), next_global_accel(Processor::NO_PROC), next_global_cpu(Processor::NO_PROC), next_global_io(Processor::NO_PROC), next_global_procset(Processor::NO_PROC), next_global_omp(Processor::NO_PROC), next_global_py(Processor::NO_PROC), global_gpu_query(NULL), global_cpu_query(NULL), global_io_query(NULL), - global_procset_query(NULL), global_omp_query(NULL), + global_procset_query(NULL), global_omp_query(NULL), global_accel_query(NULL), global_py_query(NULL), max_steals_per_theft(STATIC_MAX_PERMITTED_STEALS), max_steal_count(STATIC_MAX_STEAL_COUNT), @@ -151,6 +151,11 @@ namespace Legion { local_omps.push_back(*it); break; } + case Processor::ACCEL_PROC: + { + local_accels.push_back(*it); + break; + } default: // ignore anything else break; } @@ -211,6 +216,14 @@ namespace Legion { remote_omps[node] = *it; break; } + case Processor::ACCEL_PROC: + { + if (node >= remote_accels.size()) + remote_accels.resize(node+1, Processor::NO_PROC); + if (!remote_accels[node].exists()) + remote_accels[node] = *it; + break; + } default: // ignore anything else break; } @@ -392,6 +405,8 @@ namespace Legion { return default_get_next_local_omp(); case Processor::PY_PROC: return default_get_next_local_py(); + case Processor::ACCEL_PROC: + return default_get_next_local_accel(); default: // make warnings go away break; } @@ -421,6 +436,8 @@ namespace Legion { return default_get_next_local_omp(); case Processor::PY_PROC: return default_get_next_local_py(); + case Processor::ACCEL_PROC: + return default_get_next_local_accel(); default: // make warnings go away break; } @@ -446,6 +463,8 @@ namespace Legion { return default_get_next_global_omp(); case Processor::PY_PROC: return default_get_next_global_py(); + 
case Processor::ACCEL_PROC: + return default_get_next_global_accel(); default: // make warnings go away break; } @@ -468,6 +487,8 @@ namespace Legion { return default_get_next_local_omp(); case Processor::PY_PROC: return default_get_next_local_py(); + case Processor::ACCEL_PROC: + return default_get_next_local_accel(); default: // make warnings go away break; } @@ -552,6 +573,37 @@ namespace Legion { return result; } + //-------------------------------------------------------------------------- + Processor DefaultMapper::default_get_next_local_accel(void) + //-------------------------------------------------------------------------- + { + Processor result = local_accels[next_local_accel++]; + if (next_local_accel == local_accels.size()) + next_local_accel = 0; + return result; + } + + //-------------------------------------------------------------------------- + Processor DefaultMapper::default_get_next_global_accel(void) + //------------------------------------------------------------------------- + { + if (total_nodes == 1) + return default_get_next_local_accel(); + if (!next_global_accel.exists()) + { + global_accel_query = new Machine::ProcessorQuery(machine); + global_accel_query->only_kind(Processor::ACCEL_PROC); + next_global_accel = global_accel_query->first(); + } + Processor result = next_global_accel; + next_global_accel = global_accel_query->next(result); + if (!next_global_accel.exists()) + { + delete global_accel_query; + global_accel_query = NULL; + } + return result; + } //-------------------------------------------------------------------------- Processor DefaultMapper::default_get_next_local_io(void) //-------------------------------------------------------------------------- @@ -753,6 +805,13 @@ namespace Legion { continue; break; } + case Processor::ACCEL_PROC: + { + kindString += "ACCEL_PROC "; + if (local_accels.empty()) + continue; + break; + } case Processor::LOC_PROC: { kindString += "LOC_PROC "; @@ -914,10 +973,11 @@ namespace Legion { 
//-------------------------------------------------------------------------- { // Default mapper is ignorant about task IDs so just do whatever: - // 1) GPU > OMP > procset > cpu > IO > Python (default) - // 2) OMP > procset > cpu > IO > Python > GPU (with PREFER_CPU_VARIANT) + // 1) GPU > OMP > procset > cpu > IO > Python > Accel (default) + // 2) OMP > procset > cpu > IO > Python > GPU > Accel (with PREFER_CPU_VARIANT) // It is up to the caller to filter out processor kinds that aren't // suitable for a given task + bool prefer_cpu = ((task.tag & PREFER_CPU_VARIANT) != 0); if ((local_gpus.size() > 0) && !prefer_cpu) ranking.push_back(Processor::TOC_PROC); @@ -928,6 +988,10 @@ namespace Legion { if (local_pys.size() > 0) ranking.push_back(Processor::PY_PROC); if ((local_gpus.size() > 0) && prefer_cpu) ranking.push_back(Processor::TOC_PROC); + + if (local_accels.size() > 0) { + ranking.push_back(Processor::ACCEL_PROC); + } } //-------------------------------------------------------------------------- @@ -1024,6 +1088,23 @@ namespace Legion { } break; } + case Processor::ACCEL_PROC: + { + if (task.index_domain.get_volume() > local_accels.size()) + { + if (!global_memory.exists()) + { + log_mapper.error("Default mapper failure. 
No memory found " + "for accelerator task %s (ID %lld) which is visible " + "for all points in the index space.", + task.get_task_name(), task.get_unique_id()); + assert(false); + } + else + target_memory = global_memory; + } + break; + } case Processor::LOC_PROC: { if (task.index_domain.get_volume() > local_cpus.size()) @@ -1125,6 +1206,7 @@ switch (task.target_proc.kind()) { case Processor::LOC_PROC: + case Processor::ACCEL_PROC: // use cpu memory case Processor::IO_PROC: case Processor::PROC_SET: case Processor::OMP_PROC: @@ -1252,6 +1334,12 @@ input, output, gpu_slices_cache); break; } + case Processor::ACCEL_PROC: + { + default_slice_task(task, local_accels, remote_accels, + input, output, cpu_slices_cache); + break; + } case Processor::IO_PROC: { default_slice_task(task, local_ios, remote_ios, @@ -1701,6 +1788,15 @@ target_procs.push_back(task.target_proc); break; } + case Processor::ACCEL_PROC: + { + if (!task.must_epoch_task) + target_procs.insert(target_procs.end(), + local_accels.begin(), local_accels.end()); + else + target_procs.push_back(task.target_proc); + break; + } case Processor::LOC_PROC: { // Put any of our local cpus on here @@ -3124,6 +3220,11 @@ *result = local_gpus.size(); break; } + case DEFAULT_TUNABLE_LOCAL_ACCELS: + { + *result = local_accels.size(); + break; + } case DEFAULT_TUNABLE_LOCAL_CPUS: { *result = local_cpus.size(); break; } @@ -3418,6 +3519,15 @@ } break; } + case Processor::ACCEL_PROC: + { + if (local_accels.empty()) + { + ++it; + continue; + } + break; + } case Processor::OMP_PROC: { if (local_omps.empty()) diff --git a/runtime/mappers/default_mapper.h b/runtime/mappers/default_mapper.h index 55d3bd79b1..9c04644b06 100644 --- a/runtime/mappers/default_mapper.h +++ b/runtime/mappers/default_mapper.h @@ -51,7 +51,8 @@ namespace Legion { DEFAULT_TUNABLE_GLOBAL_IOS = 8, DEFAULT_TUNABLE_GLOBAL_OMPS = 9, DEFAULT_TUNABLE_GLOBAL_PYS = 10, - DEFAULT_TUNABLE_LAST = 
11, // this one must always be last and unused + DEFAULT_TUNABLE_LOCAL_ACCELS = 11, + DEFAULT_TUNABLE_LAST = 12, // this one must always be last and unused }; enum MappingKind { TASK_MAPPING, @@ -375,6 +376,8 @@ Processor default_get_next_global_cpu(void); Processor default_get_next_local_gpu(void); Processor default_get_next_global_gpu(void); + Processor default_get_next_local_accel(void); + Processor default_get_next_global_accel(void); Processor default_get_next_local_io(void); Processor default_get_next_global_io(void); Processor default_get_next_local_py(void); @@ -464,12 +467,14 @@ // There are a couple of parameters from the machine description that // the default mapper uses to determine how to perform mapping. std::vector<Processor> local_gpus; + std::vector<Processor> local_accels; std::vector<Processor> local_cpus; std::vector<Processor> local_ios; std::vector<Processor> local_procsets; std::vector<Processor> local_omps; std::vector<Processor> local_pys; std::vector<Processor> remote_gpus; + std::vector<Processor> remote_accels; std::vector<Processor> remote_cpus; std::vector<Processor> remote_ios; std::vector<Processor> remote_procsets; @@ -477,11 +482,11 @@ std::vector<Processor> remote_pys; protected: // For doing round-robining of tasks onto processors - unsigned next_local_gpu, next_local_cpu, next_local_io, + unsigned next_local_gpu, next_local_cpu, next_local_io, next_local_accel, next_local_procset, next_local_omp, next_local_py; - Processor next_global_gpu, next_global_cpu, next_global_io, + Processor next_global_gpu, next_global_cpu, next_global_io, next_global_accel, next_global_procset, next_global_omp, next_global_py; - Machine::ProcessorQuery *global_gpu_query, *global_cpu_query, + Machine::ProcessorQuery *global_gpu_query, *global_cpu_query, *global_accel_query, *global_io_query, *global_procset_query, *global_omp_query, *global_py_query; protected: diff --git a/runtime/mappers/mapping_utilities.cc b/runtime/mappers/mapping_utilities.cc index c8a49e0ea5..9f13d137d2 100644 --- a/runtime/mappers/mapping_utilities.cc +++ 
b/runtime/mappers/mapping_utilities.cc @@ -1059,6 +1059,7 @@ namespace Legion { case Processor::PROC_SET: return "PROC_SET"; case Processor::OMP_PROC: return "OMP_PROC"; case Processor::PY_PROC: return "PY_PROC"; + case Processor::ACCEL_PROC: return "ACCEL_PROC"; default: assert(false); return ""; } } diff --git a/runtime/realm/accelerator/accelerator_module.cc b/runtime/realm/accelerator/accelerator_module.cc new file mode 100644 index 0000000000..728a68c290 --- /dev/null +++ b/runtime/realm/accelerator/accelerator_module.cc @@ -0,0 +1,187 @@ +#include "realm/accelerator/accelerator_module.h" + + +#include "realm/logging.h" +#include "realm/cmdline.h" +#include "realm/threads.h" +#include "realm/utils.h" + +// each task access by include header file where the namespace is declared +namespace XRTContext { + // define extern xrt_device + thread_local XRTDevice *xrt_device = 0; +} + +namespace Realm { + namespace Accelerator { + + Logger log_accel("accel"); + + AcceleratorModule::AcceleratorModule() : Module("accelerator"), cfg_num_accelerators_(0) { + } + + AcceleratorModule::~AcceleratorModule(void) {} + + Module *AcceleratorModule::create_module(RuntimeImpl *runtime, std::vector& cmdline) { + AcceleratorModule *m = new AcceleratorModule; + log_accel.info() << "use accelerator"; + Realm::CommandLineParser cp; + cp.add_option_string("-accel:fwbin", m->cfg_fwbin_path_); + cp.add_option_int("-ll:num_accelerators", m->cfg_num_accelerators_); + + bool ok = cp.parse_command_line(cmdline); + if (!ok) { + log_accel.error() << "error reading accelerator parameters"; + exit(1); + } + + for (int i = 0; i < m->cfg_num_accelerators_; i++) { + // template arguments must be known at compile time and const + // TODO: add support for non-xilinx fpgas + XRTDevice *xrt = new XRTDevice(m->cfg_fwbin_path_); + m->xrt_devices_.push_back(xrt); + } + + return m; + } + + // do any general initialization - this is called after all configuration is + // complete + void 
AcceleratorModule::initialize(RuntimeImpl *runtime) { + Module::initialize(runtime); + } + + // create any memories provided by this module (default == do nothing) + // (each new MemoryImpl should use a Memory from RuntimeImpl::next_local_memory_id) + void AcceleratorModule::create_memories(RuntimeImpl *runtime) { + Module::create_memories(runtime); + } + + // create any processors provided by the module (default == do nothing) + // (each new ProcessorImpl should use a Processor from + // RuntimeImpl::next_local_processor_id) + void AcceleratorModule::create_processors(RuntimeImpl *runtime) { + Module::create_processors(runtime); + // 1 : 1 mapping processor to device + for (int i = 0; i < xrt_devices_.size(); i++) { + Processor p = runtime->next_local_processor_id(); + AcceleratorProcessor *proc = new AcceleratorProcessor(xrt_devices_[i], p, runtime->core_reservation_set()); + procs_.push_back(proc); + runtime->add_processor(proc); + + // create mem affinities to add a proc to machine model + // create affinities between this processor and system/reg memories + // if the memory is one we created, use the kernel-reported distance + // to adjust the answer + std::vector& local_mems = runtime->nodes[Network::my_node_id].memories; + for(std::vector::iterator it2 = local_mems.begin(); + it2 != local_mems.end(); + ++it2) { + Memory::Kind kind = (*it2)->get_kind(); + if((kind != Memory::SYSTEM_MEM) && (kind != Memory::REGDMA_MEM)) + continue; + + Machine::ProcessorMemoryAffinity pma; + pma.p = p; + pma.m = (*it2)->me; + + // use the same made-up numbers as in + // runtime_impl.cc + if(kind == Memory::SYSTEM_MEM) { + pma.bandwidth = 100; // "large" + pma.latency = 5; // "small" + } else { + pma.bandwidth = 80; // "large" + pma.latency = 10; // "small" + } + + runtime->add_proc_mem_affinity(pma); + + } + + } + } + + // create any DMA channels provided by the module (default == do nothing) + void AcceleratorModule::create_dma_channels(RuntimeImpl *runtime) { + 
Module::create_dma_channels(runtime); + } + + // create any code translators provided by the module (default == do nothing) + void AcceleratorModule::create_code_translators(RuntimeImpl *runtime) { + Module::create_code_translators(runtime); + } + + // clean up any common resources created by the module - this will be called + // after all memories/processors/etc. have been shut down and destroyed + void AcceleratorModule::cleanup(void) { + for (std::vector *>::iterator it = xrt_devices_.begin(); it != xrt_devices_.end(); it++) + delete *it; + xrt_devices_.clear(); + } + + template + class AcceleratorTaskScheduler : public T { + public: + AcceleratorTaskScheduler(Processor proc, Realm::CoreReservation& core_rsrv, AcceleratorProcessor *accelerator_proc); + virtual ~AcceleratorTaskScheduler(void); + protected: + virtual bool execute_task(Task *task); + virtual void execute_internal_task(InternalTask *task); + AcceleratorProcessor *accel_proc_; + }; + + template + AcceleratorTaskScheduler::AcceleratorTaskScheduler(Processor proc, + Realm::CoreReservation& core_rsrv, + AcceleratorProcessor *accel_proc) : T(proc, core_rsrv), accel_proc_(accel_proc) { + } + + template + AcceleratorTaskScheduler::~AcceleratorTaskScheduler(void) { + } + + template + bool AcceleratorTaskScheduler::execute_task(Task *task) { + // add device to thread's xrt context + XRTContext::xrt_device = accel_proc_->xrt_device_; + bool ok = T::execute_task(task); + return ok; + } + + template + void AcceleratorTaskScheduler::execute_internal_task(InternalTask *task) { + // add device to thread's xrt context + XRTContext::xrt_device = accel_proc_->xrt_device_; + T::execute_internal_task(task); + } + + AcceleratorProcessor::AcceleratorProcessor(XRTDevice *xrt, Processor me, Realm::CoreReservationSet& crs) + : LocalTaskProcessor(me, Processor::ACCEL_PROC) + { + xrt_device_ = xrt; + + Realm::CoreReservationParameters params; + params.set_num_cores(1); + params.set_alu_usage(params.CORE_USAGE_SHARED); + 
params.set_fpu_usage(params.CORE_USAGE_SHARED); + params.set_ldst_usage(params.CORE_USAGE_SHARED); + params.set_max_stack_size(2 << 20); + std::string name = stringbuilder() << "Accel proc " << me; + core_rsrv_ = new Realm::CoreReservation(name, crs, params); + +#ifdef REALM_USE_USER_THREADS + UserThreadTaskScheduler *sched = new AcceleratorTaskScheduler(me, *core_rsrv_, this); +#else + KernelThreadTaskScheduler *sched = new AcceleratorTaskScheduler(me, *core_rsrv_, this); +#endif + set_scheduler(sched); + } + + AcceleratorProcessor::~AcceleratorProcessor(void) { + delete core_rsrv_; + } + + }; // namespace Accelerator +}; // namespace Realm + diff --git a/runtime/realm/accelerator/accelerator_module.h b/runtime/realm/accelerator/accelerator_module.h new file mode 100644 index 0000000000..a9eaccd083 --- /dev/null +++ b/runtime/realm/accelerator/accelerator_module.h @@ -0,0 +1,75 @@ +#ifndef REALM_ACCELERATOR_MODULE_H +#define REALM_ACCELERATOR_MODULE_H + +#include "hls/cpfp_conv.h" // class XRTdevice + +#include "realm/module.h" +#include "realm/proc_impl.h" +#include "realm/mem_impl.h" +#include "realm/runtime_impl.h" + +namespace Realm { + namespace Accelerator { + + class AcceleratorModule; + class AcceleratorProcessor; + + class AcceleratorModule : public Module { + protected: + AcceleratorModule(void); + + public: + virtual ~AcceleratorModule(void); + + static Module *create_module(RuntimeImpl *runtime, std::vector& cmdline); + + // do any general initialization - this is called after all configuration is + // complete + virtual void initialize(RuntimeImpl *runtime); + + // create any memories provided by this module (default == do nothing) + // (each new MemoryImpl should use a Memory from RuntimeImpl::next_local_memory_id) + virtual void create_memories(RuntimeImpl *runtime); + + // create any processors provided by the module (default == do nothing) + // (each new ProcessorImpl should use a Processor from + // RuntimeImpl::next_local_processor_id) + virtual 
void create_processors(RuntimeImpl *runtime); + + // create any DMA channels provided by the module (default == do nothing) + virtual void create_dma_channels(RuntimeImpl *runtime); + + // create any code translators provided by the module (default == do nothing) + virtual void create_code_translators(RuntimeImpl *runtime); + + // clean up any common resources created by the module - this will be called + // after all memories/processors/etc. have been shut down and destroyed + virtual void cleanup(void); + + public: + unsigned cfg_num_accelerators_; + std::string cfg_fwbin_path_; + + protected: + std::vector<AcceleratorProcessor *> procs_; + std::vector<XRTDevice *> xrt_devices_; + }; + + REGISTER_REALM_MODULE(AcceleratorModule); + + class AcceleratorProcessor : public LocalTaskProcessor { + public: + AcceleratorProcessor(XRTDevice *xrt, Processor me, Realm::CoreReservationSet& crs); + virtual ~AcceleratorProcessor(void); + + XRTDevice *xrt_device_; + + protected: + Realm::CoreReservation *core_rsrv_; + }; + + }; // namespace Accelerator +}; // namespace Realm + +#endif + diff --git a/runtime/realm/module.cc b/runtime/realm/module.cc index 856ad38eb1..aef17ce59b 100644 --- a/runtime/realm/module.cc +++ b/runtime/realm/module.cc @@ -56,6 +56,9 @@ #if defined REALM_USE_MPI #include "realm/mpi/mpi_module.h" #endif +#ifdef REALM_USE_ACCELERATOR +#include "realm/accelerator/accelerator_module.h" +#endif namespace Realm { diff --git a/runtime/realm/realm_c.h b/runtime/realm/realm_c.h index 62d7a1b544..0d908c1333 100644 --- a/runtime/realm/realm_c.h +++ b/runtime/realm/realm_c.h @@ -51,7 +51,8 @@ typedef unsigned long long realm_barrier_timestamp_t; __op__(PROC_GROUP, "Processor group") \ __op__(PROC_SET, "Set of Processors for OpenMP/Kokkos etc.") \ __op__(OMP_PROC, "OpenMP (or similar) thread pool") \ - __op__(PY_PROC, "Python interpreter") + __op__(PY_PROC, "Python interpreter") \ + __op__(ACCEL_PROC, "XRT Accelerator") typedef enum realm_processor_kind_t { #define C_ENUMS(name, desc) name, diff --git 
a/test/attach_file_mini/CMakeLists.txt b/test/attach_file_mini/CMakeLists.txt index 67ea942312..8ccf686959 100644 --- a/test/attach_file_mini/CMakeLists.txt +++ b/test/attach_file_mini/CMakeLists.txt @@ -22,6 +22,12 @@ if(NOT Legion_SOURCE_DIR) find_package(Legion REQUIRED) endif() +if (Legion_USE_ACCELERATOR) + set(XRT_LIB_DIR $ENV{XILINX_XRT}/lib) + link_directories(${XRT_LIB_DIR}) +endif() + + add_executable(attach_file_mini attach_file_mini.cc) target_link_libraries(attach_file_mini Legion::Legion) if(Legion_ENABLE_TESTING) diff --git a/test/legion_stl/CMakeLists.txt b/test/legion_stl/CMakeLists.txt index 407682b247..c43f7055e7 100644 --- a/test/legion_stl/CMakeLists.txt +++ b/test/legion_stl/CMakeLists.txt @@ -22,6 +22,12 @@ if(NOT Legion_SOURCE_DIR) find_package(Legion REQUIRED) endif() +if (Legion_USE_ACCELERATOR) + set(XRT_LIB_DIR $ENV{XILINX_XRT}/lib) + link_directories(${XRT_LIB_DIR}) +endif() + + add_executable(test_stl test_stl.cc) set_property(TARGET test_stl PROPERTY CXX_STANDARD 11) set_property(TARGET test_stl PROPERTY CXX_STANDARD_REQUIRED ON) diff --git a/test/realm/CMakeLists.txt b/test/realm/CMakeLists.txt index 1b6a4be6d8..32eb705d6f 100644 --- a/test/realm/CMakeLists.txt +++ b/test/realm/CMakeLists.txt @@ -44,11 +44,22 @@ list(APPEND REALM_TESTS coverings ) +# TODO: build memspeed properly with cuda enabled... 
if(Legion_USE_CUDA) # some tests have CUDA source files too set(CUDASRC_memspeed memspeed_gpu.cu) endif() +if (Legion_USE_ACCELERATOR) + list(APPEND REALM_TESTS + accelerator_task + ) + # environment variable XILINX_XRT + # looking for libxilinxopencl + set(XRT_LIB_DIR $ENV{XILINX_XRT}/lib) + link_directories(${XRT_LIB_DIR}) +endif() + foreach(test IN LISTS REALM_TESTS) if(CUDASRC_${test}) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Wno-deprecated-gpu-targets) diff --git a/test/realm/accelerator_task.cc b/test/realm/accelerator_task.cc new file mode 100644 index 0000000000..3b68eee5e2 --- /dev/null +++ b/test/realm/accelerator_task.cc @@ -0,0 +1,90 @@ +#include "realm.h" + +using namespace Realm; + +// execute a task on Processor::ACCEL_PROC processor + +Logger log_app("app"); + +enum { + TOP_LEVEL_TASK = Processor::TASK_ID_FIRST_AVAILABLE+0, + CHILD_TASK_ID_START +}; + +void child_task(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) +{ + log_app.print() << "child task on " << p << ": arglen=" << arglen << ", userlen=" << userlen; +} + +void top_level_task(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) +{ + log_app.print() << "top task running on " << p; + + Machine machine = Machine::get_machine(); + Processor::TaskFuncID func_id = CHILD_TASK_ID_START; + CodeDescriptor child_task_desc(child_task); + + std::set finish_events; + Event e = Processor::register_task_by_kind(Processor::ACCEL_PROC, true /*global*/, + func_id, + child_task_desc, + ProfilingRequestSet()); + + int count = 0; + + std::set all_processors; + machine.get_all_processors(all_processors); + for(std::set::const_iterator it = all_processors.begin(); + it != all_processors.end(); + it++) { + Processor pp = (*it); + + // only ACCEL_PROCs + if(pp.kind() != Processor::ACCEL_PROC) + continue; + + Event e2 = pp.spawn(func_id, &count, sizeof(count), e); + + finish_events.insert(e2); + } + + func_id++; + + Event merged = 
Event::merge_events(finish_events); + + merged.wait(); + + log_app.print() << "all done!"; +} + +int main(int argc, char **argv) +{ + Runtime rt; + + rt.init(&argc, &argv); + + // select a processor to run the top level task on + Processor p = Machine::ProcessorQuery(Machine::get_machine()) + .only_kind(Processor::ACCEL_PROC) + .first(); + assert(p.exists()); + + Event e1 = Processor::register_task_by_kind(p.kind(), + false /*!global*/, + TOP_LEVEL_TASK, + CodeDescriptor(top_level_task), + ProfilingRequestSet()); + + // collective launch of a single task - everybody gets the same finish event + Event e2 = rt.collective_spawn(p, TOP_LEVEL_TASK, 0, 0, e1); + + // request shutdown once that task is complete + rt.shutdown(e2); + + // now sleep this thread until that shutdown actually happens + rt.wait_for_shutdown(); + + return 0; +} diff --git a/test/rendering/CMakeLists.txt b/test/rendering/CMakeLists.txt index 6a9de95a96..cfa109dc34 100644 --- a/test/rendering/CMakeLists.txt +++ b/test/rendering/CMakeLists.txt @@ -22,6 +22,12 @@ if(NOT Legion_SOURCE_DIR) find_package(Legion REQUIRED) endif() +if (Legion_USE_ACCELERATOR) + set(XRT_LIB_DIR $ENV{XILINX_XRT}/lib) + link_directories(${XRT_LIB_DIR}) +endif() + + add_executable(rendering rendering.cc) target_link_libraries(rendering Legion::Legion) if(Legion_ENABLE_TESTING)