Skip to content

Commit

Permalink
Update interfaces
Browse files Browse the repository at this point in the history
  • Loading branch information
chhwang committed Sep 17, 2023
1 parent e4f47dd commit 13ebacc
Show file tree
Hide file tree
Showing 23 changed files with 571 additions and 587 deletions.
134 changes: 68 additions & 66 deletions ark/executor.cc
Original file line number Diff line number Diff line change
@@ -1,115 +1,117 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include "executor.h"
#include "env.h"
#include "gpu/gpu_kernel.h"
#include "include/ark.h"
#include "include/ark_utils.h"

#include "logging.h"
#include "sched/sched.h"
#include <algorithm>
#include <string>

using namespace std;

namespace ark {

// Private implementation state for Executor (pimpl idiom): raw pointers to
// the GPU context, scheduler backend, compiled loop kernel, and launch stream.
class Executor::Impl
{
public:
GpuMgrCtx *ctx;               // GPU context; released via GpuMgr::destroy_context
BaseScheduler *sched;         // scheduling backend (Simple/Default/Kahypar)
GpuLoopKernel *glk = nullptr; // compiled loop kernel; deleted by Executor's destructor
GpuStream stream = nullptr;   // stream the loop kernel is launched on
};

// Constructor.
Executor::Executor(const int gpu_id_, int rank_, int world_size_, Model &model,
const string &name, int num_warps_per_sm_)
: gpu_id{gpu_id_}, rank{rank_},
world_size{world_size_}, impl{make_unique<Impl>()}
Executor::Impl::Impl(int rank, int world_size, Model &model,
const std::string &name, int num_warps_per_sm)
: rank_{rank}, world_size_{world_size}
{
//
GpuMgr *mgr = get_gpu_mgr(gpu_id);
const GpuInfo &ginfo = mgr->get_gpu_info();
gpu_id_ = rank_ % get_env().num_ranks_per_host;
if (get_env().scheduler == "Simple") {
this->impl->sched = new SimpleScheduler{model, gpu_id_, rank_,
world_size_, num_warps_per_sm_};
}
if (get_env().scheduler == "Default") {
this->impl->sched = new DefaultScheduler{
model, gpu_id_, rank_, world_size_, num_warps_per_sm_};
sched_.reset(static_cast<BaseScheduler *>(new SimpleScheduler{
model, gpu_id_, rank_, world_size_, num_warps_per_sm}));
} else if (get_env().scheduler == "Default") {
sched_.reset(static_cast<BaseScheduler *>(new DefaultScheduler{
model, gpu_id_, rank_, world_size_, num_warps_per_sm}));
}
#ifdef USE_KAHYPAR
if (get_env().scheduler == "Kahypar") {
this->impl->sched = new KahyparScheduler{
model, gpu_id_, rank_, world_size_, num_warps_per_sm_};
sched_.reset(static_cast<BaseScheduler *>(new KahyparScheduler{
model, gpu_id_, rank_, world_size_, num_warps_per_sm}));
}
#endif // USE_KAHYPAR

this->impl->sched->schedule();
this->impl->ctx = this->impl->sched->create_context(name);
this->impl->stream = this->impl->ctx->create_stream();
auto codes = this->impl->sched->gen_code();

this->impl->glk = new GpuLoopKernel{name,
codes,
(unsigned int)ginfo.num_sm,
(unsigned int)num_warps_per_sm_,
(unsigned int)ginfo.smem_block_total,
"",
this->impl->ctx};
const GpuInfo &ginfo = get_gpu_mgr(gpu_id_)->get_gpu_info();
sched_->schedule();
ctx_ = sched_->create_context(name);
stream_ = ctx_->create_stream();
glk_ = std::make_unique<GpuLoopKernel>(
name, sched_->gen_code(), (unsigned int)ginfo.num_sm,
(unsigned int)num_warps_per_sm, (unsigned int)ginfo.smem_block_total,
"", ctx_);
}

// Destructor.
Executor::~Executor()
Executor::Impl::~Impl()
{
if (this->impl->glk != nullptr) {
delete this->impl->glk;
}
if (this->impl->ctx != nullptr) {
GpuMgr *mgr = get_gpu_mgr(this->gpu_id);
mgr->destroy_context(this->impl->ctx);
this->impl->ctx = nullptr;
}
// TODO: pass a shared pointer of GpuMgrCtx to GpuLoopKernel
// so that we don't need to call reset() here.
glk_.reset();
get_gpu_mgr(gpu_id_)->destroy_context(ctx_);
}

// Compile the model. This must be called before `launch()`.
void Executor::compile()
void Executor::Impl::compile()
{
GpuMgr *mgr = get_gpu_mgr(gpu_id);
this->impl->glk->compile(mgr->get_gpu_info());
glk_->compile(get_gpu_mgr(gpu_id_)->get_gpu_info());
}

// Launch the model (not running yet). This must be called after `compile()`.
void Executor::launch()
void Executor::Impl::launch()
{
this->impl->glk->load();
GpuState ret = this->impl->glk->launch(this->impl->stream, false);
glk_->load();
GpuState ret = glk_->launch(stream_, false);
if (ret != 0) {
LOG(ERROR, "failed to launch this executor.");
}
}

// Run the model for `iter` iterations.
// @param iter number of loop iterations to execute; forwarded to the loop kernel.
void Executor::Impl::run(int iter)
{
glk_->run(iter);
}

// Wait for the previous run to finish.
void Executor::Impl::wait()
{
glk_->wait();
}

// Stop the model and return the elapsed time in milliseconds.
// Once this is called, `launch()` must be called again before the next run.
// @return elapsed time of the stopped run, in milliseconds.
float Executor::Impl::stop()
{
glk_->stop();
return glk_->get_elapsed_msec();
}

// Constructor. All real work (scheduler selection from the environment,
// GPU context/stream creation, loop-kernel construction) happens inside
// Impl's constructor; this merely allocates the pimpl.
Executor::Executor(int rank, int world_size, Model &model,
const std::string &name, int num_warps_per_sm)
: impl_{std::make_unique<Executor::Impl>(rank, world_size, model, name,
num_warps_per_sm)}
{
}

Executor::~Executor() = default;

void Executor::compile()
{
impl_->compile();
}

void Executor::launch()
{
impl_->launch();
}

void Executor::run(int iter)
{
this->impl->glk->run(iter);
impl_->run(iter);
}

// Wait for the previous run to finish.
void Executor::wait()
{
this->impl->glk->wait();
impl_->wait();
}

// Stop the model and return the elapsed time in milliseconds.
// Once this is called, we need to call `launch()` again to run the model again.
float Executor::stop()
{
this->impl->glk->stop();
return this->impl->glk->get_elapsed_msec();
return impl_->stop();
}

} // namespace ark
39 changes: 39 additions & 0 deletions ark/executor.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef ARK_EXECUTOR_H
#define ARK_EXECUTOR_H

#include "gpu/gpu_kernel.h"
#include "include/ark.h"
#include <memory>

namespace ark {

/// Private implementation of ark::Executor (pimpl idiom).
class Executor::Impl
{
public:
/// Construct the implementation: selects a scheduler backend based on the
/// environment, creates the GPU context and stream, and builds the GPU
/// loop kernel for the given model.
Impl(int rank, int world_size, Model &model, const std::string &name,
int num_warps_per_sm);
/// Destroy the loop kernel and the GPU context.
~Impl();

/// Compile the model. Must be called before `launch()`.
void compile();
/// Load and launch the compiled kernel (not running yet).
void launch();
/// Run the model for `iter` iterations.
void run(int iter);
/// Wait for the previous run to finish.
void wait();
/// Stop the model and return the elapsed time in milliseconds.
float stop();

private:
const int rank_;       // rank of this executor's process
const int world_size_; // total number of ranks
int gpu_id_;           // derived in the ctor from rank and num_ranks_per_host

GpuMgrCtx *ctx_;                       // GPU context; destroyed in ~Impl via GpuMgr
std::unique_ptr<BaseScheduler> sched_; // scheduling backend (Simple/Default/Kahypar)
std::unique_ptr<GpuLoopKernel> glk_;   // compiled GPU loop kernel
GpuStream stream_;                     // stream used to launch the kernel
};

} // namespace ark

#endif // ARK_EXECUTOR_H
42 changes: 22 additions & 20 deletions ark/include/ark.h
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,10 @@ class Tensor
DimType offset_bytes(DimType i0 = 0, DimType i1 = 0, DimType i2 = 0,
DimType i3 = 0) const;

/// Checks if the tensor has the actually memory allocated.
/// @return True if the tensor has the memory allocated.
bool is_alloced() const;

/// Checks if the tensor's data range is sequential in memory.
/// @return True if the tensor is sequential in memory.
bool is_sequential() const;
Expand Down Expand Up @@ -306,6 +310,9 @@ class Model
Tensor *reshape(Tensor *input, const Dims &shape, bool allowzero = false,
Tensor *output = nullptr,
const std::string &name = "reshape");
Tensor *reshape(Tensor *input, const std::initializer_list<DimType> &shape,
bool allowzero = false, Tensor *output = nullptr,
const std::string &name = "reshape");
// Reshape `input` to `shape`. If one dimension of `shape` is -1, it will be
// inferred from the `input`. If one dimension of `shape` is 0, by default
// (`allowzero` is false), that dimension is unchanged from the
Expand All @@ -314,7 +321,7 @@ class Model
// `input` should also be an empty tensor. If `allowzero` is true, `shape`
// should not include both 0 and -1 at the same time. If `shape` is an empty
// vector, `input` will be converted to a scalar.
Tensor *reshape(Tensor *input, std::initializer_list<DimType> shape,
Tensor *reshape(Tensor *input, const std::vector<DimType> &shape,
bool allowzero = false, Tensor *output = nullptr,
const std::string &name = "reshape");
// Returns an identical tensor of `input` with execution dependencies
Expand Down Expand Up @@ -478,36 +485,31 @@ class Model

class GpuBuf;

// Convenience class for executing a model.
/// Convenience class for executing a model.
class Executor
{
public:
// Constructor.
Executor(const int gpu_id_, int rank_, int world_size_, Model &model,
const std::string &name, int num_warps_per_sm_ = 16);
/// Constructor.
Executor(int rank, int world_size, Model &model, const std::string &name,
int num_warps_per_sm = 16);
~Executor();
// Compile the model. This must be called before `launch()`.
/// Compile the model. This must be called before `launch()`.
void compile();
// Launch the model (not running yet). This must be called after
// `compile()`.
/// Launch the model (not running yet). This must be called after
/// `compile()`.
void launch();
// Run the model for `iter` iterations.
/// Run the model for `iter` iterations.
void run(int iter);
// Wait for the previous run to finish.
/// Wait for the previous run to finish.
void wait();
// Stop the model and return the elapsed time in milliseconds.
// Once this is called, we need to call `launch()` again to run the model
// again.
/// Stop the model and return the elapsed time in milliseconds.
/// Once this is called, we need to call `launch()` again to run the model
/// again.
float stop();

protected:
class Impl;

private:
const int gpu_id;
const int rank;
const int world_size;
std::unique_ptr<Impl> impl;
class Impl;
std::unique_ptr<Impl> impl_;
};

} // namespace ark
Expand Down
2 changes: 1 addition & 1 deletion ark/ops/ops_identity_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ ark::unittest::State test_identity()
ark::Tensor *tns1 = model.identity(tns0);

// Create an executor
ark::Executor exe{0, 0, 1, model, "test_tensor_layout"};
ark::Executor exe{0, 1, model, "test_tensor_layout"};
exe.compile();

int num_elem = 2 * 3 * 4 * 5;
Expand Down
2 changes: 1 addition & 1 deletion ark/ops/ops_im2col_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ void test_im2col_internal(ark::DimType n, ark::DimType h, ark::DimType w,
UNITTEST_EQ(tns_y->ndims(), 3);

//
ark::Executor exe{0, 0, 1, model, "test_im2col"};
ark::Executor exe{0, 1, model, "test_im2col"};
exe.compile();

// Set data.
Expand Down
2 changes: 1 addition & 1 deletion ark/ops/ops_layernorm_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ void test_layernorm_internal(unsigned int n, unsigned int m, unsigned int k)
/* ark::Tensor *tns_y = */ model.layernorm(tns_x);

//
ark::Executor exe{0, 0, 1, model, "test_layernorm"};
ark::Executor exe{0, 1, model, "test_layernorm"};
exe.compile();

// Set data.
Expand Down
11 changes: 9 additions & 2 deletions ark/ops/ops_reshape.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,14 @@ Tensor *Model::reshape(Tensor *input, const Dims &shape, bool allowzero,
return this->impl->add_op(op)[0];
}

// Convenience overload: accepts a brace-enclosed dimension list and forwards
// to the std::vector-based reshape overload, which performs all validation.
Tensor *Model::reshape(Tensor *input,
                       const std::initializer_list<DimType> &shape,
                       bool allowzero, Tensor *output, const std::string &name)
{
    return this->reshape(input,
                         std::vector<DimType>(shape.begin(), shape.end()),
                         allowzero, output, name);
}

// Reshape `input` to `shape`. If one dimension of `shape` is -1, it will be
// inferred from the `input`. If one dimension of `shape` is 0, by default
// (`allowzero` is false), that dimension is unchanged from the corresponding
Expand All @@ -88,8 +96,7 @@ Tensor *Model::reshape(Tensor *input, const Dims &shape, bool allowzero,
// be an empty tensor. If `allowzero` is true, `shape` should not include both
// 0 and -1 at the same time. If `shape` is an empty vector, `input` will be
// converted to a scalar.
Tensor *Model::reshape(Tensor *input,
const std::initializer_list<DimType> shape,
Tensor *Model::reshape(Tensor *input, const std::vector<DimType> &shape,
bool allowzero, Tensor *output, const std::string &name)
{
if (input == nullptr) {
Expand Down
6 changes: 3 additions & 3 deletions ark/ops/ops_reshape_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ ark::unittest::State test_reshape()
ark::Tensor *tns1 = model.reshape(tns0, {5, 4, 3, 2});

// Create an executor
ark::Executor exe{0, 0, 1, model, "test_tensor_layout"};
ark::Executor exe{0, 1, model, "test_tensor_layout"};
exe.compile();

int num_elem = 2 * 3 * 4 * 5;
Expand Down Expand Up @@ -44,7 +44,7 @@ ark::unittest::State test_reshape_infer()
ark::Tensor *tns1 = model.reshape(tns0, {-1, 4, 3, 2});

// Create an executor
ark::Executor exe{0, 0, 1, model, "test_tensor_layout"};
ark::Executor exe{0, 1, model, "test_tensor_layout"};
exe.compile();

int num_elem = 2 * 3 * 4 * 5;
Expand Down Expand Up @@ -73,7 +73,7 @@ ark::unittest::State test_reshape_allowzero()
ark::Tensor *tns1 = model.reshape(tns0, {5, 3, 0, 2});

// Create an executor
ark::Executor exe{0, 0, 1, model, "test_tensor_layout"};
ark::Executor exe{0, 1, model, "test_tensor_layout"};
exe.compile();

int num_elem = 2 * 3 * 4 * 5;
Expand Down
Loading

0 comments on commit 13ebacc

Please sign in to comment.