Skip to content

Commit

Permalink
add herd kernel arg lowering to air-to-aie
Browse files Browse the repository at this point in the history
Add lowering of air.herd_load to npu.rtp_write
  • Loading branch information
fifield committed Jun 21, 2024
1 parent f4f758c commit 568d409
Show file tree
Hide file tree
Showing 9 changed files with 299 additions and 53 deletions.
5 changes: 3 additions & 2 deletions mlir/include/air/Conversion/AIRToAIESchedulingUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ AIE::LockOp allocateLockOp(AIE::DeviceOp aie_device, AIE::TileOp tile,
int init = 0, int id = -1);

std::stringstream
generateBufferNameInStringStream(std::string prefix, uint64_t &BufferId,
generateBufferNameInStringStream(StringRef prefix, uint64_t &BufferId,
mlir::StringAttr attr = nullptr, int x = -1,
int y = -1);

Expand Down Expand Up @@ -194,7 +194,8 @@ void simpleDMAChannelAllocation(std::vector<MemcpyBundleAsFlow> &memcpy_flows,
ShimDMAAllocator &shim_dma_alloc,
MemTileDMAAllocator &memtile_dma_alloc,
TileDMAAllocator &tile_dma_alloc);
template <typename T> int foundInVector(T item, std::vector<T> vec);
template <typename T>
int foundInVector(T item, std::vector<T> vec);
int getSCFForLoopDepth(Operation *o);
bool groupingMemcpysByLoop(std::vector<MemcpyBundleAsFlow> &memcpy_flows);

Expand Down
18 changes: 12 additions & 6 deletions mlir/lib/Conversion/AIRLoweringPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,14 +231,20 @@ class AIRHerdConversion : public ConversionPattern {
return failure();
}

{
OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPointToStart(op->getBlock());
rewriter.create<airrt::HerdLoadOp>(op->getLoc(), rewriter.getI64Type(),
herd_name_attr.getValue().str(),
/* operands */ SmallVector<Value>());
// Integer kernel operands are passed as arguments (runtime parameters) to
// the herd load op.
SmallVector<Value> args;
for (int i = operands.size() - herd.getNumKernelOperands(),
e = operands.size();
i < e; i++) {
Value o = operands[i];
if (llvm::isa<IntegerType, IndexType, FloatType>(o.getType()))
args.push_back(o);
}

rewriter.create<airrt::HerdLoadOp>(op->getLoc(), rewriter.getI64Type(),
herd_name_attr.getValue().str(), args);

SmallVector<Value, 4> deps;
for (auto &o : operands)
if (llvm::isa<airrt::EventType>(o.getType()))
Expand Down
60 changes: 60 additions & 0 deletions mlir/lib/Conversion/AIRRtToNpuPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,62 @@ struct HerdLoadToNpuPattern : public OpConversionPattern<HerdLoadOp> {
/// Lower a HerdLoadOp into NPU instructions.
///
/// The enclosing module is searched for the first HerdMetadataOp whose symbol
/// name matches this herd load; its `size_x`, `size_y`, `loc_x` and `loc_y`
/// integer attributes give the range of core coordinates to visit. For every
/// core in that range, one AIEX::NpuWriteRTPOp is emitted per scalar operand
/// that is defined by an arith.constant, followed by an AIEX::NpuWrite32Op
/// that writes 1 to the herd lock. The HerdLoadOp itself is then erased.
///
/// Returns failure() when no matching metadata is found or when the matching
/// metadata lacks any of the four attributes (a warning is emitted in the
/// latter case).
LogicalResult
matchAndRewrite(HerdLoadOp op, OpAdaptor adaptor,
                ConversionPatternRewriter &rewriter) const override {

  auto module = op->getParentOfType<ModuleOp>();

  // get the size metadata associated with this herd load
  int64_t size_x = -1;
  int64_t size_y = -1;
  int64_t loc_x = -1;
  int64_t loc_y = -1;
  module.walk([&](HerdMetadataOp metadata) {
    // return the first match by name
    if (metadata.getSymName() == op.getSymName()) {
      auto sxAttr = metadata->getAttrOfType<IntegerAttr>("size_x");
      auto syAttr = metadata->getAttrOfType<IntegerAttr>("size_y");
      auto lxAttr = metadata->getAttrOfType<IntegerAttr>("loc_x");
      auto lyAttr = metadata->getAttrOfType<IntegerAttr>("loc_y");
      if (sxAttr && syAttr && lxAttr && lyAttr) {
        size_x = sxAttr.getInt();
        size_y = syAttr.getInt();
        loc_x = lxAttr.getInt();
        loc_y = lyAttr.getInt();
      } else {
        metadata.emitWarning(
            "HerdMetadataOp missing size_x, size_y, loc_x, or loc_y");
      }
      return WalkResult::interrupt();
    }
    return WalkResult::advance();
  });
  // size_x keeps its -1 sentinel when nothing usable was found.
  if (size_x < 0)
    return failure();

  // for each herd core, emit write_rtp ops for every herd operand
  // followed by a write32 to the herd lock, setting it to 1.
  for (int phys_x = loc_x; phys_x < size_x + loc_x; phys_x++) {
    for (int phys_y = loc_y; phys_y < size_y + loc_y; phys_y++) {

      // The RTP buffer name depends only on the core coordinates, so build
      // it once per core rather than once per operand.
      std::string name = "__air_herd_rtp_" + std::to_string(phys_x) + "_" +
                         std::to_string(phys_y);

      for (int i = 0, e = op.getNumOperands(); i < e; i++) {
        Value oper = adaptor.getOperands()[i];
        if (!llvm::isa<IntegerType, IndexType, FloatType>(oper.getType()))
          continue;

        // Only operands produced by an arith.constant can be written here;
        // anything else is skipped.
        auto constOp =
            dyn_cast_if_present<arith::ConstantOp>(oper.getDefiningOp());
        if (!constOp)
          continue;

        // Compute the 32-bit word to write. The operand filter above admits
        // float types, so the constant may carry a FloatAttr; an
        // unconditional cast<IntegerAttr> would assert on it.
        uint32_t v = 0;
        auto valueAttr = constOp.getValue();
        if (auto intAttr = dyn_cast<IntegerAttr>(valueAttr)) {
          v = intAttr.getInt();
        } else if (auto floatAttr = dyn_cast<FloatAttr>(valueAttr)) {
          // Pass floats by raw bit pattern. Floats wider than 32 bits do not
          // fit in the 32-bit value written per operand, so skip them.
          llvm::APInt bits = floatAttr.getValue().bitcastToAPInt();
          if (bits.getBitWidth() > 32)
            continue;
          v = static_cast<uint32_t>(bits.getZExtValue());
        } else {
          // Not an integer or float constant; nothing to write.
          continue;
        }

        rewriter.create<AIEX::NpuWriteRTPOp>(op.getLoc(), name, phys_x,
                                             phys_y, i, v);
      }
      rewriter.create<AIEX::NpuWrite32Op>(op.getLoc(), 0x0001F000, 0x1,
                                          rewriter.getI32IntegerAttr(phys_x),
                                          rewriter.getI32IntegerAttr(phys_y));
    }
  }
  rewriter.eraseOp(op);
  return success();
}
Expand Down Expand Up @@ -1252,6 +1308,10 @@ struct AIRRtToNpuPass : public impl::AIRRtToNpuBase<AIRRtToNpuPass> {
auto chan = builder.getI32IntegerAttr(infoOp->getChannelIndex());
auto col_num = builder.getI32IntegerAttr(1);
auto row_num = builder.getI32IntegerAttr(1);
// FIXME: setting the insertion point to the end is a hack for
// RTP POC, so that the sync is after the rtp
// writes and the herd lock acquire.
// builder.setInsertionPoint(dma->getBlock()->getTerminator());
builder.setInsertionPointAfter(dma);
builder.create<AIEX::NpuSyncOp>(dma->getLoc(), col, row, dir, chan,
col_num, row_num);
Expand Down
159 changes: 118 additions & 41 deletions mlir/lib/Conversion/AIRToAIEPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
Expand Down Expand Up @@ -171,10 +172,10 @@ bool isLegalMemorySpace(air::MemcpyInterface memcpyOp, AIE::AIEArch arch) {
return false;
}

AIE::BufferOp allocateBufferOp(uint64_t &BufferId, MemRefType memrefTy,
AIE::TileOp tile,
mlir::StringAttr attr = nullptr, int x = -1,
int y = -1) {
static AIE::BufferOp allocateBufferOp(uint64_t &BufferId, MemRefType memrefTy,
AIE::TileOp tile,
mlir::StringAttr attr = nullptr,
int x = -1, int y = -1) {

OpBuilder builder(tile);
Operation *t = tile.getOperation();
Expand Down Expand Up @@ -215,16 +216,17 @@ void outlineAIECores(OpBuilder &builder, AIE::DeviceOp aie_device,
int64_t row_offset = options.row_offset;
auto col_name = xilinx::air::HerdOp::getColOffsetAttrName();
auto row_name = xilinx::air::HerdOp::getRowOffsetAttrName();
auto ctx = h->getContext();
if (auto co = h.getColOffset())
col_offset = *co;
else
h->setAttr(col_name, IntegerAttr::get(IntegerType::get(h->getContext(), 32),
col_offset));
h->setAttr(col_name,
IntegerAttr::get(IntegerType::get(ctx, 32), col_offset));
if (auto ro = h.getRowOffset())
row_offset = *ro;
else
h->setAttr(row_name, IntegerAttr::get(IntegerType::get(h->getContext(), 32),
row_offset));
h->setAttr(row_name,
IntegerAttr::get(IntegerType::get(ctx, 32), row_offset));

for (auto y = 0; y < herd_size_y; y++) {
for (auto x = 0; x < herd_size_x; x++) {
Expand All @@ -251,26 +253,45 @@ void outlineAIECores(OpBuilder &builder, AIE::DeviceOp aie_device,
->getAttrOfType<StringAttr>(SymbolTable::getSymbolAttrName())
.getValue()
.str();
core->setAttr("elf_file",
StringAttr::get(aie_device.getContext(),
herd_name + "_core_" +
std::to_string(phys_x) + "_" +
std::to_string(phys_y) + ".elf"));
core->setAttr(
"elf_file",
StringAttr::get(ctx, herd_name + "_core_" + std::to_string(phys_x) +
"_" + std::to_string(phys_y) + ".elf"));
if (auto a = h->getAttrOfType<StringAttr>("link_with"))
core->setAttr("link_with", a);
}

int64_t rtp_buffer_size = 0; // size in i32s
for (unsigned ki = 0, ke = h.getNumKernelOperands(); ki < ke; ki++) {
BlockArgument karg = h.getKernelArgument(ki);
// each one gets 32-bits in the rtp buffer
if (llvm::isa<IntegerType, IndexType, FloatType>(karg.getType()))
rtp_buffer_size++;
}
AIE::BufferOp rtp_buffer = nullptr;
if (rtp_buffer_size) {
uint64_t buffer_id = 0;
rtp_buffer = allocateBufferOp(
buffer_id,
MemRefType::get({rtp_buffer_size},builder.getI32Type()), tile,
builder.getStringAttr("__air_herd_rtp"), phys_x, phys_y);
if (!options.emit_herd_lock) {
h.emitWarning("Herd RTP buffer allocated but herd lock disabled");
h.emitWarning("Enabling herd lock for RTP buffer synchronization");
options.emit_herd_lock = true;
}
}

Value herd_lock = nullptr;
if (options.emit_herd_lock)
herd_lock = allocateLockOp(aie_device, tile, /*init=*/0, /*id=*/0);

// the buffers and locks created below need to go before the core and
// mem
builder.setInsertionPoint(core);

assert((h.getBody().getBlocks().size() == 1) &&
"Launch body can only contain one Block");

// set insertion point for anything below created on behalf of the core
builder.setInsertionPoint(core);

// generate the aie.core body
//
OpBuilder core_builder(core);
Expand Down Expand Up @@ -313,33 +334,88 @@ void outlineAIECores(OpBuilder &builder, AIE::DeviceOp aie_device,
remap.map(h.getSize()[1],
core_builder.create<arith::ConstantIndexOp>(hloc, herd_size_y));

for (unsigned i = 0; i < h.getNumKernelOperands(); i++) {
auto a = h.getKernelArgument(i);
auto memrefTy = llvm::dyn_cast<MemRefType>(a.getType());
if (options.emit_herd_lock) {
if (aie_device.getTargetModel().getTargetArch() == AIE::AIEArch::AIE1) {
core_builder.create<AIE::UseLockOp>(core_builder.getUnknownLoc(),
herd_lock,
AIE::LockAction::Acquire, 0);
} else if (aie_device.getTargetModel().getTargetArch() ==
AIE::AIEArch::AIE2) {
core_builder.create<AIE::UseLockOp>(
core_builder.getUnknownLoc(), herd_lock,
AIE::LockAction::AcquireGreaterEqual, 1);
}
}

for (unsigned ki = 0, ke = h.getNumKernelOperands(); ki < ke; ki++) {
BlockArgument karg = h.getKernelArgument(ki);

// Remap the kernel operands to the rtp buffer.
// For each kernel operand of a supported type, load the data from the
// rtp buffer and remap uses of the kernel operand to the loaded value.
if (llvm::isa<IntegerType, IndexType, FloatType>(karg.getType())) {

// load from rtp buffer
SmallVector<Value> offsets{
core_builder.create<arith::ConstantIndexOp>(hloc, ki)};
auto load = core_builder.create<memref::LoadOp>(
hloc, IntegerType::get(ctx, 32), rtp_buffer, offsets);

// truncate, extend or bitcast the value to the correct type
Value rtp = nullptr;
llvm::TypeSwitch<Type>(karg.getType())
.Case<IntegerType>([&](IntegerType ity) {
unsigned int width = ity.getWidth();
if (width < 32)
rtp = core_builder.create<arith::TruncIOp>(hloc, ity, load);
else if (width > 32)
rtp = core_builder.create<arith::ExtUIOp>(hloc, ity, load);
else
rtp = load;
})
.Case<IndexType>([&](IndexType ity) {
rtp = core_builder.create<arith::IndexCastOp>(hloc, ity, load);
})
.Case<FloatType>([&](FloatType fty) {
if (fty.getWidth() == 32) {
rtp = core_builder.create<arith::BitcastOp>(hloc, fty, load);
} else if (fty.getWidth() == 16) {
auto ity = IntegerType::get(ctx, 16);
auto tr =
core_builder.create<arith::TruncIOp>(hloc, ity, load);
rtp = core_builder.create<arith::BitcastOp>(hloc, fty, tr);
}
});

// remap the kernel operand
if (rtp)
remap.map(karg, rtp);
else
h.emitWarning("Unsupported runtime parmeter int or float type");
}

auto memrefTy = llvm::dyn_cast<MemRefType>(karg.getType());
if (!memrefTy)
continue;

OpBuilder b(aie_device);
b.setInsertionPoint(core);

if (memrefTy.getMemorySpaceAsInt() == (int)air::MemorySpace::L1) {
// fused herds sometimes have L1 memref allocation outside of herds.
// mapping them back
remap.map(a, h.getKernelOperand(i));
remap.map(karg, h.getKernelOperand(ki));
continue;
}

int which_try = 0;
std::string sym_name = "__air_herd_arg_0";
while (aie_device.lookupSymbol(sym_name))
sym_name = "__air_herd_arg_" + std::to_string(++which_try);
b.create<memref::GlobalOp>(builder.getUnknownLoc(), sym_name,
builder.getStringAttr("public"), memrefTy,
nullptr, false, nullptr);
builder.create<memref::GlobalOp>(builder.getUnknownLoc(), sym_name,
builder.getStringAttr("public"),
memrefTy, nullptr, false, nullptr);

auto m = core_builder.create<memref::GetGlobalOp>(
hloc, SmallVector<Type, 1>{a.getType()}, sym_name);
remap.map(a, m);
hloc, SmallVector<Type, 1>{karg.getType()}, sym_name);
remap.map(karg, m);
}

if (options.emit_herd_lock)
Expand Down Expand Up @@ -1017,8 +1093,7 @@ struct AllocL1BuffersPattern : public OpRewritePattern<memref::AllocOp> {
if (memrefTy.getMemorySpaceAsInt() != (int)air::MemorySpace::L1)
return failure();

rewriter.setInsertionPointAfter(tile);
auto herd = tileToHerdMap[core.getTileOp()];
auto herd = tileToHerdMap[tile];
int64_t col_offset = 0;
int64_t row_offset = 0;
if (herd) {
Expand All @@ -1033,7 +1108,6 @@ struct AllocL1BuffersPattern : public OpRewritePattern<memref::AllocOp> {
alloc->getAttrOfType<StringAttr>(SymbolTable::getSymbolAttrName()),
tile.getCol() - col_offset, tile.getRow() - row_offset);

rewriter.setInsertionPoint(alloc);
rewriter.replaceOp(alloc, buffer->getResults());
return success();
}
Expand Down Expand Up @@ -1076,7 +1150,6 @@ struct AllocL2BuffersPattern : public OpRewritePattern<memref::AllocOp> {
if (!tile)
return failure();

rewriter.setInsertionPointAfter(tile);
auto seg = alloc->getParentOfType<air::SegmentOp>();
int64_t col_offset = 0;
int64_t row_offset = 0;
Expand All @@ -1091,7 +1164,6 @@ struct AllocL2BuffersPattern : public OpRewritePattern<memref::AllocOp> {
alloc->getAttrOfType<StringAttr>(SymbolTable::getSymbolAttrName()),
tile.getCol() - col_offset, tile.getRow() - row_offset);

rewriter.setInsertionPoint(alloc);
rewriter.replaceOp(alloc, buffer->getResults());
bufferToMemtileMap[buffer] = tile;
return success();
Expand Down Expand Up @@ -2613,6 +2685,12 @@ class AIRToAIEPass : public air::impl::AIRToAIEBase<AIRToAIEPass> {
name = attr.getValue().str();

auto herd_meta = builder.create<airrt::HerdMetadataOp>(loc, name);
herd_meta->setAttr("size_x", builder.getI64IntegerAttr(herd.getNumCols()));
herd_meta->setAttr("size_y", builder.getI64IntegerAttr(herd.getNumRows()));
if (auto co = herd.getColOffset())
herd_meta->setAttr("loc_x", builder.getI64IntegerAttr(*co));
if (auto ro = herd.getRowOffset())
herd_meta->setAttr("loc_y", builder.getI64IntegerAttr(*ro));
return herd_meta;
}

Expand Down Expand Up @@ -3351,7 +3429,13 @@ class AIRToAIEPass : public air::impl::AIRToAIEBase<AIRToAIEPass> {
herds.push_back(h);
}

auto segment_name =
device->getAttrOfType<StringAttr>(SymbolTable::getSymbolAttrName())
.getValue();
auto segment_meta = getOrCreateSegmentMetadata(module_meta, segment_name);
for (auto herd : herds) {
auto herd_meta = createHerdMetadata(segment_meta, herd);

std::vector<Attribute> dma_allocations;
if (device.getTargetModel().getTargetArch() == AIE::AIEArch::AIE1) {
// AIE1 dma metadata format
Expand All @@ -3362,13 +3446,6 @@ class AIRToAIEPass : public air::impl::AIRToAIEBase<AIRToAIEPass> {
true, chan_renumber_reverse_map,
dma_allocations);

auto segment_name =
device
->getAttrOfType<StringAttr>(SymbolTable::getSymbolAttrName())
.getValue();
auto segment_meta =
getOrCreateSegmentMetadata(module_meta, segment_name);
auto herd_meta = createHerdMetadata(segment_meta, herd);
herd_meta->setAttr("dma_allocations",
ArrayAttr::get(ctx, dma_allocations));
} else if (device.getTargetModel().getTargetArch() ==
Expand Down
Loading

0 comments on commit 568d409

Please sign in to comment.