Skip to content

Commit

Permalink
Update interfaces
Browse files Browse the repository at this point in the history
  • Loading branch information
chhwang committed Sep 17, 2023
1 parent e4f47dd commit 13ebacc
Show file tree
Hide file tree
Showing 23 changed files with 571 additions and 587 deletions.
134 changes: 68 additions & 66 deletions ark/executor.cc
Original file line number Diff line number Diff line change
@@ -1,115 +1,117 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include "executor.h"
#include "env.h"
#include "gpu/gpu_kernel.h"
#include "include/ark.h"
#include "include/ark_utils.h"

#include "logging.h"
#include "sched/sched.h"
#include <algorithm>
#include <string>

using namespace std;

namespace ark {

// Private implementation state for Executor (pimpl idiom): raw pointers to
// the GPU context, scheduler backend, compiled loop kernel, and launch stream.
class Executor::Impl
{
public:
GpuMgrCtx *ctx;               // GPU context; released via GpuMgr::destroy_context
BaseScheduler *sched;         // scheduling backend (Simple/Default/Kahypar)
GpuLoopKernel *glk = nullptr; // compiled loop kernel; deleted by Executor's destructor
GpuStream stream = nullptr;   // stream the loop kernel is launched on
};

// Constructor.
Executor::Executor(const int gpu_id_, int rank_, int world_size_, Model &model,
const string &name, int num_warps_per_sm_)
: gpu_id{gpu_id_}, rank{rank_},
world_size{world_size_}, impl{make_unique<Impl>()}
Executor::Impl::Impl(int rank, int world_size, Model &model,
const std::string &name, int num_warps_per_sm)
: rank_{rank}, world_size_{world_size}
{
//
GpuMgr *mgr = get_gpu_mgr(gpu_id);
const GpuInfo &ginfo = mgr->get_gpu_info();
gpu_id_ = rank_ % get_env().num_ranks_per_host;
if (get_env().scheduler == "Simple") {
this->impl->sched = new SimpleScheduler{model, gpu_id_, rank_,
world_size_, num_warps_per_sm_};
}
if (get_env().scheduler == "Default") {
this->impl->sched = new DefaultScheduler{
model, gpu_id_, rank_, world_size_, num_warps_per_sm_};
sched_.reset(static_cast<BaseScheduler *>(new SimpleScheduler{
model, gpu_id_, rank_, world_size_, num_warps_per_sm}));
} else if (get_env().scheduler == "Default") {
sched_.reset(static_cast<BaseScheduler *>(new DefaultScheduler{
model, gpu_id_, rank_, world_size_, num_warps_per_sm}));
}
#ifdef USE_KAHYPAR
if (get_env().scheduler == "Kahypar") {
this->impl->sched = new KahyparScheduler{
model, gpu_id_, rank_, world_size_, num_warps_per_sm_};
sched_.reset(static_cast<BaseScheduler *>(new KahyparScheduler{
model, gpu_id_, rank_, world_size_, num_warps_per_sm}));
}
#endif // USE_KAHYPAR

this->impl->sched->schedule();
this->impl->ctx = this->impl->sched->create_context(name);
this->impl->stream = this->impl->ctx->create_stream();
auto codes = this->impl->sched->gen_code();

this->impl->glk = new GpuLoopKernel{name,
codes,
(unsigned int)ginfo.num_sm,
(unsigned int)num_warps_per_sm_,
(unsigned int)ginfo.smem_block_total,
"",
this->impl->ctx};
const GpuInfo &ginfo = get_gpu_mgr(gpu_id_)->get_gpu_info();
sched_->schedule();
ctx_ = sched_->create_context(name);
stream_ = ctx_->create_stream();
glk_ = std::make_unique<GpuLoopKernel>(
name, sched_->gen_code(), (unsigned int)ginfo.num_sm,
(unsigned int)num_warps_per_sm, (unsigned int)ginfo.smem_block_total,
"", ctx_);
}

// Destructor.
Executor::~Executor()
Executor::Impl::~Impl()
{
if (this->impl->glk != nullptr) {
delete this->impl->glk;
}
if (this->impl->ctx != nullptr) {
GpuMgr *mgr = get_gpu_mgr(this->gpu_id);
mgr->destroy_context(this->impl->ctx);
this->impl->ctx = nullptr;
}
// TODO: pass a shared pointer of GpuMgrCtx to GpuLoopKernel
// so that we don't need to call reset() here.
glk_.reset();
get_gpu_mgr(gpu_id_)->destroy_context(ctx_);
}

// Compile the model. This must be called before `launch()`.
void Executor::compile()
void Executor::Impl::compile()
{
GpuMgr *mgr = get_gpu_mgr(gpu_id);
this->impl->glk->compile(mgr->get_gpu_info());
glk_->compile(get_gpu_mgr(gpu_id_)->get_gpu_info());
}

// Launch the model (not running yet). This must be called after `compile()`.
void Executor::launch()
void Executor::Impl::launch()
{
this->impl->glk->load();
GpuState ret = this->impl->glk->launch(this->impl->stream, false);
glk_->load();
GpuState ret = glk_->launch(stream_, false);
if (ret != 0) {
LOG(ERROR, "failed to launch this executor.");
}
}

// Run the model for `iter` iterations.
// @param iter number of loop iterations to execute; forwarded to the loop kernel.
void Executor::Impl::run(int iter)
{
glk_->run(iter);
}

// Wait for the previous run to finish.
void Executor::Impl::wait()
{
glk_->wait();
}

// Stop the model and return the elapsed time in milliseconds.
// Once this is called, `launch()` must be called again before the next run.
// @return elapsed time of the stopped run, in milliseconds.
float Executor::Impl::stop()
{
glk_->stop();
return glk_->get_elapsed_msec();
}

// Constructor. All real work (scheduler selection from the environment,
// GPU context/stream creation, loop-kernel construction) happens inside
// Impl's constructor; this merely allocates the pimpl.
Executor::Executor(int rank, int world_size, Model &model,
const std::string &name, int num_warps_per_sm)
: impl_{std::make_unique<Executor::Impl>(rank, world_size, model, name,
num_warps_per_sm)}
{
}

Executor::~Executor() = default;

void Executor::compile()
{
impl_->compile();
}

void Executor::launch()
{
impl_->launch();
}

void Executor::run(int iter)
{
this->impl->glk->run(iter);
impl_->run(iter);
}

// Wait for the previous run to finish.
void Executor::wait()
{
this->impl->glk->wait();
impl_->wait();
}

// Stop the model and return the elapsed time in milliseconds.
// Once this is called, we need to call `launch()` again to run the model again.
float Executor::stop()
{
this->impl->glk->stop();
return this->impl->glk->get_elapsed_msec();
return impl_->stop();
}

} // namespace ark
39 changes: 39 additions & 0 deletions ark/executor.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef ARK_EXECUTOR_H
#define ARK_EXECUTOR_H

#include "gpu/gpu_kernel.h"
#include "include/ark.h"
#include <memory>

namespace ark {

/// Private implementation of ark::Executor (pimpl idiom).
class Executor::Impl
{
public:
/// Construct the implementation: selects a scheduler backend based on the
/// environment, creates the GPU context and stream, and builds the GPU
/// loop kernel for the given model.
Impl(int rank, int world_size, Model &model, const std::string &name,
int num_warps_per_sm);
/// Destroy the loop kernel and the GPU context.
~Impl();

/// Compile the model. Must be called before `launch()`.
void compile();
/// Load and launch the compiled kernel (not running yet).
void launch();
/// Run the model for `iter` iterations.
void run(int iter);
/// Wait for the previous run to finish.
void wait();
/// Stop the model and return the elapsed time in milliseconds.
float stop();

private:
const int rank_;       // rank of this executor's process
const int world_size_; // total number of ranks
int gpu_id_;           // derived in the ctor from rank and num_ranks_per_host

GpuMgrCtx *ctx_;                       // GPU context; destroyed in ~Impl via GpuMgr
std::unique_ptr<BaseScheduler> sched_; // scheduling backend (Simple/Default/Kahypar)
std::unique_ptr<GpuLoopKernel> glk_;   // compiled GPU loop kernel
GpuStream stream_;                     // stream used to launch the kernel
};

} // namespace ark

#endif // ARK_EXECUTOR_H
42 changes: 22 additions & 20 deletions ark/include/ark.h
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,10 @@ class Tensor
DimType offset_bytes(DimType i0 = 0, DimType i1 = 0, DimType i2 = 0,
DimType i3 = 0) const;

/// Checks if the tensor has the actually memory allocated.
/// @return True if the tensor has the memory allocated.
bool is_alloced() const;

/// Checks if the tensor's data range is sequential in memory.
/// @return True if the tensor is sequential in memory.
bool is_sequential() const;
Expand Down Expand Up @@ -306,6 +310,9 @@ class Model
Tensor *reshape(Tensor *input, const Dims &shape, bool allowzero = false,
Tensor *output = nullptr,
const std::string &name = "reshape");
Tensor *reshape(Tensor *input, const std::initializer_list<DimType> &shape,
bool allowzero = false, Tensor *output = nullptr,
const std::string &name = "reshape");
// Reshape `input` to `shape`. If one dimension of `shape` is -1, it will be
// inferred from the `input`. If one dimension of `shape` is 0, by default
// (`allowzero` is false), that dimension is unchanged from the
Expand All @@ -314,7 +321,7 @@ class Model
// `input` should also be an empty tensor. If `allowzero` is true, `shape`
// should not include both 0 and -1 at the same time. If `shape` is an empty
// vector, `input` will be converted to a scalar.
Tensor *reshape(Tensor *input, std::initializer_list<DimType> shape,
Tensor *reshape(Tensor *input, const std::vector<DimType> &shape,
bool allowzero = false, Tensor *output = nullptr,
const std::string &name = "reshape");
// Returns an identical tensor of `input` with execution dependencies
Expand Down Expand Up @@ -478,36 +485,31 @@ class Model

class GpuBuf;

// Convenience class for executing a model.
/// Convenience class for executing a model.
class Executor
{
public:
// Constructor.
Executor(const int gpu_id_, int rank_, int world_size_, Model &model,
const std::string &name, int num_warps_per_sm_ = 16);
/// Constructor.
Executor(int rank, int world_size, Model &model, const std::string &name,
int num_warps_per_sm = 16);
~Executor();
// Compile the model. This must be called before `launch()`.
/// Compile the model. This must be called before `launch()`.
void compile();
// Launch the model (not running yet). This must be called after
// `compile()`.
/// Launch the model (not running yet). This must be called after
/// `compile()`.
void launch();
// Run the model for `iter` iterations.
/// Run the model for `iter` iterations.
void run(int iter);
// Wait for the previous run to finish.
/// Wait for the previous run to finish.
void wait();
// Stop the model and return the elapsed time in milliseconds.
// Once this is called, we need to call `launch()` again to run the model
// again.
/// Stop the model and return the elapsed time in milliseconds.
/// Once this is called, we need to call `launch()` again to run the model
/// again.
float stop();

protected:
class Impl;

private:
const int gpu_id;
const int rank;
const int world_size;
std::unique_ptr<Impl> impl;
class Impl;
std::unique_ptr<Impl> impl_;
};

} // namespace ark
Expand Down
2 changes: 1 addition & 1 deletion ark/ops/ops_identity_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ ark::unittest::State test_identity()
ark::Tensor *tns1 = model.identity(tns0);

// Create an executor
ark::Executor exe{0, 0, 1, model, "test_tensor_layout"};
ark::Executor exe{0, 1, model, "test_tensor_layout"};
exe.compile();

int num_elem = 2 * 3 * 4 * 5;
Expand Down
2 changes: 1 addition & 1 deletion ark/ops/ops_im2col_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ void test_im2col_internal(ark::DimType n, ark::DimType h, ark::DimType w,
UNITTEST_EQ(tns_y->ndims(), 3);

//
ark::Executor exe{0, 0, 1, model, "test_im2col"};
ark::Executor exe{0, 1, model, "test_im2col"};
exe.compile();

// Set data.
Expand Down
2 changes: 1 addition & 1 deletion ark/ops/ops_layernorm_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ void test_layernorm_internal(unsigned int n, unsigned int m, unsigned int k)
/* ark::Tensor *tns_y = */ model.layernorm(tns_x);

//
ark::Executor exe{0, 0, 1, model, "test_layernorm"};
ark::Executor exe{0, 1, model, "test_layernorm"};
exe.compile();

// Set data.
Expand Down
11 changes: 9 additions & 2 deletions ark/ops/ops_reshape.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,14 @@ Tensor *Model::reshape(Tensor *input, const Dims &shape, bool allowzero,
return this->impl->add_op(op)[0];
}

// Convenience overload: accepts a brace-enclosed dimension list and forwards
// to the std::vector-based reshape overload, which performs all validation.
Tensor *Model::reshape(Tensor *input,
                       const std::initializer_list<DimType> &shape,
                       bool allowzero, Tensor *output, const std::string &name)
{
    return this->reshape(input,
                         std::vector<DimType>(shape.begin(), shape.end()),
                         allowzero, output, name);
}

// Reshape `input` to `shape`. If one dimension of `shape` is -1, it will be
// inferred from the `input`. If one dimension of `shape` is 0, by default
// (`allowzero` is false), that dimension is unchanged from the corresponding
Expand All @@ -88,8 +96,7 @@ Tensor *Model::reshape(Tensor *input, const Dims &shape, bool allowzero,
// be an empty tensor. If `allowzero` is true, `shape` should not include both
// 0 and -1 at the same time. If `shape` is an empty vector, `input` will be
// converted to a scalar.
Tensor *Model::reshape(Tensor *input,
const std::initializer_list<DimType> shape,
Tensor *Model::reshape(Tensor *input, const std::vector<DimType> &shape,
bool allowzero, Tensor *output, const std::string &name)
{
if (input == nullptr) {
Expand Down
6 changes: 3 additions & 3 deletions ark/ops/ops_reshape_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ ark::unittest::State test_reshape()
ark::Tensor *tns1 = model.reshape(tns0, {5, 4, 3, 2});

// Create an executor
ark::Executor exe{0, 0, 1, model, "test_tensor_layout"};
ark::Executor exe{0, 1, model, "test_tensor_layout"};
exe.compile();

int num_elem = 2 * 3 * 4 * 5;
Expand Down Expand Up @@ -44,7 +44,7 @@ ark::unittest::State test_reshape_infer()
ark::Tensor *tns1 = model.reshape(tns0, {-1, 4, 3, 2});

// Create an executor
ark::Executor exe{0, 0, 1, model, "test_tensor_layout"};
ark::Executor exe{0, 1, model, "test_tensor_layout"};
exe.compile();

int num_elem = 2 * 3 * 4 * 5;
Expand Down Expand Up @@ -73,7 +73,7 @@ ark::unittest::State test_reshape_allowzero()
ark::Tensor *tns1 = model.reshape(tns0, {5, 3, 0, 2});

// Create an executor
ark::Executor exe{0, 0, 1, model, "test_tensor_layout"};
ark::Executor exe{0, 1, model, "test_tensor_layout"};
exe.compile();

int num_elem = 2 * 3 * 4 * 5;
Expand Down
Loading

0 comments on commit 13ebacc

Please sign in to comment.