From 2daa05cbc71d27b1b800af7473de1387145f107f Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 18 May 2024 02:28:13 +0000 Subject: [PATCH] a few fixes & improvements --- ark/codegen.cpp | 3 +- ark/include/kernels/kernel_template.in | 4 +- examples/tutorial/plan.json | 228 +++++++- examples/tutorial/plan_1_larger_tile.json | 505 ++++++++++++++++ examples/tutorial/plan_2_split_k.json | 673 ++++++++++++++++++++++ examples/tutorial/plan_3_overwrite.json | 673 ++++++++++++++++++++++ third_party/mscclpp | 2 +- 7 files changed, 2054 insertions(+), 34 deletions(-) create mode 100644 examples/tutorial/plan_1_larger_tile.json create mode 100644 examples/tutorial/plan_2_split_k.json create mode 100644 examples/tutorial/plan_3_overwrite.json diff --git a/ark/codegen.cpp b/ark/codegen.cpp index e234e53f1..d2ae2d3d1 100644 --- a/ark/codegen.cpp +++ b/ark/codegen.cpp @@ -143,8 +143,7 @@ CodeGenerator::Impl::Impl(const Json &plan, } } for (auto &rg : pg["ResourceGroups"]) { - body_ss << this->resource_group(rg, plan.at("TaskInfos"), - proc_range); + body_ss << resource_group(rg, plan.at("TaskInfos"), proc_range); } unsynced.push_back(proc_range); pg_idx++; diff --git a/ark/include/kernels/kernel_template.in b/ark/include/kernels/kernel_template.in index 13e1751fe..bc842ea4a 100644 --- a/ark/include/kernels/kernel_template.in +++ b/ark/include/kernels/kernel_template.in @@ -15,6 +15,9 @@ __forceinline__ __device__ void task_seq(char *_buf) { constexpr size_t SramBytesPerWarp = SlotSramBytes / SlotNumWarps; size_t p = ((blockIdx.x + gridDim.x - ProcCurrent) % gridDim.x) / ProcStep; size_t k = threadIdx.x / SlotNumThreads; + if constexpr (ARK_WARPS_PER_BLOCK > SlotNumWarps) { + if (k >= NumSlots) return; + } size_t task_id_base = TaskBegin + p * TaskStep * TaskGranularity; for (size_t t = k; ; t += NumSlots) { size_t task_id = task_id_base + TaskStep * @@ -22,7 +25,6 @@ __forceinline__ __device__ void task_seq(char *_buf) { if (task_id >= TaskEnd) break; task(_buf, task_id, SramBytesPerWarp); } - __syncthreads(); } } diff --git a/examples/tutorial/plan.json b/examples/tutorial/plan.json index ab461db28..bcbb1d1c7 100644 --- a/examples/tutorial/plan.json +++ b/examples/tutorial/plan.json @@ -87,7 +87,7 @@ }, { "Id": 1, - "NumWarps": 1, + "NumWarps": 8, "SramBytes": 0, "Ops": [ { @@ -109,7 +109,7 @@ { "Id": 6, "DataType": "FP16", - "Buffer": {"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, "Shape": [1,512,11008], "Strides": [1,512,11008], "Offsets": [0,0,0], @@ -120,7 +120,7 @@ { "Id": 7, "DataType": "FP16", - "Buffer": {"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, "Shape": [1,512,11008], "Strides": [1,512,11008], "Offsets": [0,0,0], @@ -129,17 +129,17 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 8, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 88064 + "Tile": [128,256], + "NumTasks": 172 } } ] }, { "Id": 2, - "NumWarps": 1, + "NumWarps": 8, "SramBytes": 0, "Ops": [ { @@ -159,7 +159,7 @@ { "Id": 7, "DataType": "FP16", - "Buffer": {"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, "Shape": [1,512,11008], "Strides": [1,512,11008], "Offsets": [0,0,0], @@ -170,7 +170,7 @@ { "Id": 8, "DataType": "FP16", - "Buffer": {"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, "Shape": [1,512,11008], "Strides": [1,512,11008], "Offsets": [0,0,0], @@ -181,7 +181,7 @@ { "Id": 9, "DataType": "FP16", - "Buffer": {"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, "Shape": [1,512,11008], "Strides": [1,512,11008], "Offsets": [0,0,0], @@ -190,10 +190,10 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 8, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 88064 + "Tile": [128,256], + "NumTasks": 172 } } ] @@ -281,7 +281,7 @@ }, { "Id": 4, - "NumWarps": 1, + "NumWarps": 8, "SramBytes": 0, "Ops": [ { @@ -292,7 +292,7 @@ { "Id": 9, "DataType": "FP16", - "Buffer": {"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, "Shape": [1,512,11008], "Strides": [1,512,11008], "Offsets": [0,0,0], @@ -332,10 +332,10 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 8, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 88064 + "Tile": [128,256], + "NumTasks": 172 } } ] @@ -351,19 +351,19 @@ "IsVirtual": false, "ReadTensors": [ { - "Id": 13, + "Id": 16, "DataType": "FP16", "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, - "Shape": [1,512,11008], + "Shape": [1,512,8320], "Strides": [1,512,11008], "Offsets": [0,0,0], "Pads": [1,1,1] }, { - "Id": 2, + "Id": 17, "DataType": "FP16", "Buffer": {"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]}, - "Shape": [4096,11008], + "Shape": [4096,8320], "Strides": [4096,11008], "Offsets": [0,0], "Pads": [1,1] @@ -382,7 +382,7 @@ ], "ResultTensors": [ { - "Id": 15, + "Id": 22, "DataType": "FP16", "Buffer": {"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}, "Shape": [1,512,4096], @@ -399,7 +399,7 @@ "DIMS": [1,1] }, "ShapeMNK": { - "DIMS": [512,4096,11008] + "DIMS": [512,4096,8320] }, "StridesACDB": { "DIMS": [11008,4096,4096,11008] @@ -420,6 +420,148 @@ } } ] + }, + { + "Id": 6, + "NumWarps": 8, + "SramBytes": 147456, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_1", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 18, + "DataType": "FP16", + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,2688], + "Strides": [1,512,11008], + "Offsets": [0,0,8320], + "Pads": [1,1,1] + }, + { + "Id": 19, + "DataType": "FP16", + "Buffer": {"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [4096,2688], + "Strides": [4096,11008], + "Offsets": [0,8320], + "Pads": [1,1] + } + ], + "WriteTensors": [ + { + "Id": 20, + "DataType": "FP16", + "Buffer": {"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 21, + "DataType": "FP16", + "Buffer": {"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": { + "InputDimNC": { + "DIMS": [1,1] + }, + "OtherDimNC": { + "DIMS": [1,1] + }, + "ShapeMNK": { + "DIMS": [512,4096,2688] + }, + "StridesACDB": { + "DIMS": [11008,4096,4096,11008] + }, + "TransposeInput": { + "BOOL": false + }, + "TransposeOther": { + "BOOL": true + } + }, + "Config": { + "NumWarps": 8, + "SramBytes": 147456, + "TileShapeMNK": [128,256,64], + "TilePadMNK": [128,256,64], + "NumTasks": 64 + } + } + ] + }, + { + "Id": 7, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Add", + "Name": "add_1", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 22, + "DataType": "FP16", + "Buffer": {"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + }, + { + "Id": 21, + "DataType": "FP16", + "Buffer": {"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "WriteTensors": [ + { + "Id": 23, + "DataType": "FP16", + "Buffer": {"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 15, + "DataType": "FP16", + "Buffer": {"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": {}, + "Config": { + "NumWarps": 8, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 64 + } + } + ] } ], "ProcessorGroups": [ @@ -441,10 +583,10 @@ "ResourceGroups": [ { "ProcessorRange": [0,108], - "WarpRange": [0,1], + "WarpRange": [0,8], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":1,"TaskRange":[0,88064],"Granularity":1} + {"TaskId":1,"TaskRange":[0,172],"Granularity":1} ] } ] @@ -454,10 +596,10 @@ "ResourceGroups": [ { "ProcessorRange": [0,108], - "WarpRange": [0,1], + "WarpRange": [0,8], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":2,"TaskRange":[0,88064],"Granularity":1} + {"TaskId":2,"TaskRange":[0,172],"Granularity":1} ] } ] @@ -480,10 +622,10 @@ "ResourceGroups": [ { "ProcessorRange": [0,108], - "WarpRange": [0,1], + "WarpRange": [0,8], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":4,"TaskRange":[0,88064],"Granularity":1} + {"TaskId":4,"TaskRange":[0,172],"Granularity":1} ] } ] @@ -500,6 +642,32 @@ ] } ] + }, + { + "ProcessorRange": [64,108], + "ResourceGroups": [ + { + "ProcessorRange": [64,108], + "WarpRange": [0,8], + "SramRange": [0,147456], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,64],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,108], + "ResourceGroups": [ + { + "ProcessorRange": [0,64], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,64],"Granularity":1} + ] + } + ] } ] } diff --git a/examples/tutorial/plan_1_larger_tile.json b/examples/tutorial/plan_1_larger_tile.json new file mode 100644 index 000000000..b6de82711 --- /dev/null +++ b/examples/tutorial/plan_1_larger_tile.json @@ -0,0 +1,505 @@ +{ + "Rank": 0, + "WorldSize": 1, + "NumProcessors": 108, + "NumWarpsPerProcessor": 8, + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 8, + "SramBytes": 147456, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 0, + "DataType": "FP16", + "Buffer": {"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + }, + { + "Id": 1, + "DataType": "FP16", + "Buffer": {"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [11008,4096], + "Strides": [11008,4096], + "Offsets": [0,0], + "Pads": [1,1] + } + ], + "WriteTensors": [ + { + "Id": 4, + "DataType": "FP16", + "Buffer": {"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 5, + "DataType": "FP16", + "Buffer": {"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": { + "InputDimNC": { + "DIMS": [1,1] + }, + "OtherDimNC": { + "DIMS": [1,1] + }, + "ShapeMNK": { + "DIMS": [512,11008,4096] + }, + "StridesACDB": { + "DIMS": [4096,11008,11008,4096] + }, + "TransposeInput": { + "BOOL": false + }, + "TransposeOther": { + "BOOL": true + } + }, + "Config": { + "NumWarps": 8, + "SramBytes": 147456, + "TileShapeMNK": [128,256,64], + "TilePadMNK": [128,256,64], + "NumTasks": 172 + } + } + ] + }, + { + "Id": 1, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Sigmoid", + "Name": "sigmoid", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 5, + "DataType": "FP16", + "Buffer": {"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "WriteTensors": [ + { + "Id": 6, + "DataType": "FP16", + "Buffer": {"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 7, + "DataType": "FP16", + "Buffer": {"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": {}, + "Config": { + "NumWarps": 8, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 172 + } + } + ] + }, + { + "Id": 2, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 5, + "DataType": "FP16", + "Buffer": {"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + }, + { + "Id": 7, + "DataType": "FP16", + "Buffer": {"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "WriteTensors": [ + { + "Id": 8, + "DataType": "FP16", + "Buffer": {"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 9, + "DataType": "FP16", + "Buffer": {"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": {}, + "Config": { + "NumWarps": 8, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 172 + } + } + ] + }, + { + "Id": 3, + "NumWarps": 8, + "SramBytes": 147456, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_1", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 0, + "DataType": "FP16", + "Buffer": {"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + }, + { + "Id": 3, + "DataType": "FP16", + "Buffer": {"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [11008,4096], + "Strides": [11008,4096], + "Offsets": [0,0], + "Pads": [1,1] + } + ], + "WriteTensors": [ + { + "Id": 10, + "DataType": "FP16", + "Buffer": {"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 11, + "DataType": "FP16", + "Buffer": {"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": { + "InputDimNC": { + "DIMS": [1,1] + }, + "OtherDimNC": { + "DIMS": [1,1] + }, + "ShapeMNK": { + "DIMS": [512,11008,4096] + }, + "StridesACDB": { + "DIMS": [4096,11008,11008,4096] + }, + "TransposeInput": { + "BOOL": false + }, + "TransposeOther": { + "BOOL": true + } + }, + "Config": { + "NumWarps": 8, + "SramBytes": 147456, + "TileShapeMNK": [128,256,64], + "TilePadMNK": [128,256,64], + "NumTasks": 172 + } + } + ] + }, + { + "Id": 4, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_1", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 9, + "DataType": "FP16", + "Buffer": {"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + }, + { + "Id": 11, + "DataType": "FP16", + "Buffer": {"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "WriteTensors": [ + { + "Id": 12, + "DataType": "FP16", + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 13, + "DataType": "FP16", + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": {}, + "Config": { + "NumWarps": 8, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 172 + } + } + ] + }, + { + "Id": 5, + "NumWarps": 8, + "SramBytes": 147456, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_1", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 13, + "DataType": "FP16", + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + }, + { + "Id": 2, + "DataType": "FP16", + "Buffer": {"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [4096,11008], + "Strides": [4096,11008], + "Offsets": [0,0], + "Pads": [1,1] + } + ], + "WriteTensors": [ + { + "Id": 14, + "DataType": "FP16", + "Buffer": {"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 15, + "DataType": "FP16", + "Buffer": {"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": { + "InputDimNC": { + "DIMS": [1,1] + }, + "OtherDimNC": { + "DIMS": [1,1] + }, + "ShapeMNK": { + "DIMS": [512,4096,11008] + }, + "StridesACDB": { + "DIMS": [11008,4096,4096,11008] + }, + "TransposeInput": { + "BOOL": false + }, + "TransposeOther": { + "BOOL": true + } + }, + "Config": { + "NumWarps": 8, + "SramBytes": 147456, + "TileShapeMNK": [128,256,64], + "TilePadMNK": [128,256,64], + "NumTasks": 64 + } + } + ] + } + ], + "ProcessorGroups": [ + { + "ProcessorRange": [0,108], + "ResourceGroups": [ + { + "ProcessorRange": [0,108], + "WarpRange": [0,8], + "SramRange": [0,147456], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,172],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,108], + "ResourceGroups": [ + { + "ProcessorRange": [0,108], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,172],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,108], + "ResourceGroups": [ + { + "ProcessorRange": [0,108], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,172],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,108], + "ResourceGroups": [ + { + "ProcessorRange": [0,108], + "WarpRange": [0,8], + "SramRange": [0,147456], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,172],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,108], + "ResourceGroups": [ + { + "ProcessorRange": [0,108], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,172],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,64], + "ResourceGroups": [ + { + "ProcessorRange": [0,64], + "WarpRange": [0,8], + "SramRange": [0,147456], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,64],"Granularity":1} + ] + } + ] + } + ] +} diff --git a/examples/tutorial/plan_2_split_k.json b/examples/tutorial/plan_2_split_k.json new file mode 100644 index 000000000..11e91d37a --- /dev/null +++ b/examples/tutorial/plan_2_split_k.json @@ -0,0 +1,673 @@ +{ + "Rank": 0, + "WorldSize": 1, + "NumProcessors": 108, + "NumWarpsPerProcessor": 8, + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 8, + "SramBytes": 147456, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 0, + "DataType": "FP16", + "Buffer": {"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + }, + { + "Id": 1, + "DataType": "FP16", + "Buffer": {"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [11008,4096], + "Strides": [11008,4096], + "Offsets": [0,0], + "Pads": [1,1] + } + ], + "WriteTensors": [ + { + "Id": 4, + "DataType": "FP16", + "Buffer": {"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 5, + "DataType": "FP16", + "Buffer": {"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": { + "InputDimNC": { + "DIMS": [1,1] + }, + "OtherDimNC": { + "DIMS": [1,1] + }, + "ShapeMNK": { + "DIMS": [512,11008,4096] + }, + "StridesACDB": { + "DIMS": [4096,11008,11008,4096] + }, + "TransposeInput": { + "BOOL": false + }, + "TransposeOther": { + "BOOL": true + } + }, + "Config": { + "NumWarps": 8, + "SramBytes": 147456, + "TileShapeMNK": [128,256,64], + "TilePadMNK": [128,256,64], + "NumTasks": 172 + } + } + ] + }, + { + "Id": 1, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Sigmoid", + "Name": "sigmoid", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 5, + "DataType": "FP16", + "Buffer": {"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "WriteTensors": [ + { + "Id": 6, + "DataType": "FP16", + "Buffer": {"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 7, + "DataType": "FP16", + "Buffer": {"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": {}, + "Config": { + "NumWarps": 8, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 172 + } + } + ] + }, + { + "Id": 2, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 5, + "DataType": "FP16", + "Buffer": {"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + }, + { + "Id": 7, + "DataType": "FP16", + "Buffer": {"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "WriteTensors": [ + { + "Id": 8, + "DataType": "FP16", + "Buffer": {"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 9, + "DataType": "FP16", + "Buffer": {"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": {}, + "Config": { + "NumWarps": 8, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 172 + } + } + ] + }, + { + "Id": 3, + "NumWarps": 8, + "SramBytes": 147456, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_1", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 0, + "DataType": "FP16", + "Buffer": {"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + }, + { + "Id": 3, + "DataType": "FP16", + "Buffer": {"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [11008,4096], + "Strides": [11008,4096], + "Offsets": [0,0], + "Pads": [1,1] + } + ], + "WriteTensors": [ + { + "Id": 10, + "DataType": "FP16", + "Buffer": {"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 11, + "DataType": "FP16", + "Buffer": {"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": { + "InputDimNC": { + "DIMS": [1,1] + }, + "OtherDimNC": { + "DIMS": [1,1] + }, + "ShapeMNK": { + "DIMS": [512,11008,4096] + }, + "StridesACDB": { + "DIMS": [4096,11008,11008,4096] + }, + "TransposeInput": { + "BOOL": false + }, + "TransposeOther": { + "BOOL": true + } + }, + "Config": { + "NumWarps": 8, + "SramBytes": 147456, + "TileShapeMNK": [128,256,64], + "TilePadMNK": [128,256,64], + "NumTasks": 172 + } + } + ] + }, + { + "Id": 4, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_1", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 9, + "DataType": "FP16", + "Buffer": {"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + }, + { + "Id": 11, + "DataType": "FP16", + "Buffer": {"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "WriteTensors": [ + { + "Id": 12, + "DataType": "FP16", + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 13, + "DataType": "FP16", + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": {}, + "Config": { + "NumWarps": 8, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 172 + } + } + ] + }, + { + "Id": 5, + "NumWarps": 8, + "SramBytes": 147456, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_1", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 16, + "DataType": "FP16", + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,8320], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + }, + { + "Id": 17, + "DataType": "FP16", + "Buffer": {"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [4096,8320], + "Strides": [4096,11008], + "Offsets": [0,0], + "Pads": [1,1] + } + ], + "WriteTensors": [ + { + "Id": 14, + "DataType": "FP16", + "Buffer": {"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 22, + "DataType": "FP16", + "Buffer": {"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": { + "InputDimNC": { + "DIMS": [1,1] + }, + "OtherDimNC": { + "DIMS": [1,1] + }, + "ShapeMNK": { + "DIMS": [512,4096,8320] + }, + "StridesACDB": { + "DIMS": [11008,4096,4096,11008] + }, + "TransposeInput": { + "BOOL": false + }, + "TransposeOther": { + "BOOL": true + } + }, + "Config": { + "NumWarps": 8, + "SramBytes": 147456, + "TileShapeMNK": [128,256,64], + "TilePadMNK": [128,256,64], + "NumTasks": 64 + } + } + ] + }, + { + "Id": 6, + "NumWarps": 8, + "SramBytes": 147456, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_1", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 18, + "DataType": "FP16", + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,2688], + "Strides": [1,512,11008], + "Offsets": [0,0,8320], + "Pads": [1,1,1] + }, + { + "Id": 19, + "DataType": "FP16", + "Buffer": {"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [4096,2688], + "Strides": [4096,11008], + "Offsets": [0,8320], + "Pads": [1,1] + } + ], + "WriteTensors": [ + { + "Id": 20, + "DataType": "FP16", + "Buffer": {"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 21, + "DataType": "FP16", + "Buffer": {"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": { + "InputDimNC": { + "DIMS": [1,1] + }, + "OtherDimNC": { + "DIMS": [1,1] + }, + "ShapeMNK": { + "DIMS": [512,4096,2688] + }, + "StridesACDB": { + "DIMS": [11008,4096,4096,11008] + }, + "TransposeInput": { + "BOOL": false + }, + "TransposeOther": { + "BOOL": true + } + }, + "Config": { + "NumWarps": 8, + "SramBytes": 147456, + "TileShapeMNK": [128,256,64], + "TilePadMNK": [128,256,64], + "NumTasks": 64 + } + } + ] + }, + { + "Id": 7, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Add", + "Name": "add_1", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 22, + "DataType": "FP16", + "Buffer": {"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + }, + { + "Id": 21, + "DataType": "FP16", + "Buffer": {"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "WriteTensors": [ + { + "Id": 23, + "DataType": "FP16", + "Buffer": {"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 15, + "DataType": "FP16", + "Buffer": {"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": {}, + "Config": { + "NumWarps": 8, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 64 + } + } + ] + } + ], + "ProcessorGroups": [ + { + "ProcessorRange": [0,108], + "ResourceGroups": [ + { + "ProcessorRange": [0,108], + "WarpRange": [0,8], + "SramRange": [0,147456], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,172],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,108], + "ResourceGroups": [ + { + "ProcessorRange": [0,108], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,172],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,108], + "ResourceGroups": [ + { + "ProcessorRange": [0,108], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,172],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,108], + "ResourceGroups": [ + { + "ProcessorRange": [0,108], + "WarpRange": [0,8], + "SramRange": [0,147456], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,172],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,108], + "ResourceGroups": [ + { + "ProcessorRange": [0,108], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,172],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,64], + "ResourceGroups": [ + { + "ProcessorRange": [0,64], + "WarpRange": [0,8], + "SramRange": [0,147456], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,64],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [64,108], + "ResourceGroups": [ + { + "ProcessorRange": [64,108], + "WarpRange": [0,8], + "SramRange": [0,147456], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,64],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,108], + "ResourceGroups": [ + { + "ProcessorRange": [0,64], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,64],"Granularity":1} + ] + } + ] + } + ] +} diff --git a/examples/tutorial/plan_3_overwrite.json b/examples/tutorial/plan_3_overwrite.json new file mode 100644 index 000000000..bcbb1d1c7 --- /dev/null +++ b/examples/tutorial/plan_3_overwrite.json @@ -0,0 +1,673 @@ +{ + "Rank": 0, + "WorldSize": 1, + "NumProcessors": 108, + "NumWarpsPerProcessor": 8, + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 8, + "SramBytes": 147456, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 0, + "DataType": "FP16", + "Buffer": {"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + }, + { + "Id": 1, + "DataType": "FP16", + "Buffer": {"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [11008,4096], + "Strides": [11008,4096], + "Offsets": [0,0], + "Pads": [1,1] + } + ], + "WriteTensors": [ + { + "Id": 4, + "DataType": "FP16", + "Buffer": {"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 5, + "DataType": "FP16", + "Buffer": {"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": { + "InputDimNC": { + "DIMS": [1,1] + }, + "OtherDimNC": { + "DIMS": [1,1] + }, + "ShapeMNK": { + "DIMS": [512,11008,4096] + }, + "StridesACDB": { + "DIMS": [4096,11008,11008,4096] + }, + "TransposeInput": { + "BOOL": false + }, + "TransposeOther": { + "BOOL": true + } + }, + "Config": { + "NumWarps": 8, + "SramBytes": 147456, + "TileShapeMNK": [128,256,64], + "TilePadMNK": [128,256,64], + "NumTasks": 172 + } + } + ] + }, + { + "Id": 1, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Sigmoid", + "Name": "sigmoid", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 5, + "DataType": "FP16", + "Buffer": {"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "WriteTensors": [ + { + "Id": 6, + "DataType": "FP16", + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 7, + "DataType": "FP16", + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": {}, + "Config": { + "NumWarps": 8, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 172 + } + } + ] + }, + { + "Id": 2, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 5, + "DataType": "FP16", + "Buffer": {"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + }, + { + "Id": 7, + "DataType": "FP16", + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "WriteTensors": [ + { + "Id": 8, + "DataType": "FP16", + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 9, + "DataType": "FP16", + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": {}, + "Config": { + "NumWarps": 8, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 172 + } + } + ] + }, + { + "Id": 3, + "NumWarps": 8, + "SramBytes": 147456, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_1", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 0, + "DataType": "FP16", + "Buffer": {"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + }, + { + "Id": 3, + "DataType": "FP16", + "Buffer": {"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [11008,4096], + "Strides": [11008,4096], + "Offsets": [0,0], + "Pads": [1,1] + } + ], + "WriteTensors": [ + { + "Id": 10, + "DataType": "FP16", + "Buffer": {"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 11, + "DataType": "FP16", + "Buffer": {"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": { + "InputDimNC": { + "DIMS": [1,1] + }, + "OtherDimNC": { + "DIMS": [1,1] + }, + "ShapeMNK": { + "DIMS": [512,11008,4096] + }, + "StridesACDB": { + "DIMS": [4096,11008,11008,4096] + }, + "TransposeInput": { + "BOOL": false + }, + "TransposeOther": { + "BOOL": true + } + }, + "Config": { + "NumWarps": 8, + "SramBytes": 147456, + "TileShapeMNK": [128,256,64], + "TilePadMNK": [128,256,64], + "NumTasks": 172 + } + } + ] + }, + { + "Id": 4, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_1", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 9, + "DataType": "FP16", + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + }, + { + "Id": 11, + "DataType": "FP16", + "Buffer": {"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "WriteTensors": [ + { + "Id": 12, + "DataType": "FP16", + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 13, + "DataType": "FP16", + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,11008], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": {}, + "Config": { + "NumWarps": 8, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 172 + } + } + ] + }, + { + "Id": 5, + "NumWarps": 8, + "SramBytes": 147456, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_1", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 16, + "DataType": "FP16", + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,8320], + "Strides": [1,512,11008], + "Offsets": [0,0,0], + "Pads": [1,1,1] + }, + { + "Id": 17, + "DataType": "FP16", + "Buffer": {"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [4096,8320], + "Strides": [4096,11008], + "Offsets": [0,0], + "Pads": [1,1] + } + ], + "WriteTensors": [ + { + "Id": 14, + "DataType": "FP16", + "Buffer": {"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 22, + "DataType": "FP16", + "Buffer": {"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": { + "InputDimNC": { + "DIMS": [1,1] + }, + "OtherDimNC": { + "DIMS": [1,1] + }, + "ShapeMNK": { + "DIMS": [512,4096,8320] + }, + "StridesACDB": { + "DIMS": [11008,4096,4096,11008] + }, + "TransposeInput": { + "BOOL": false + }, + "TransposeOther": { + "BOOL": true + } + }, + "Config": { + "NumWarps": 8, + "SramBytes": 147456, + "TileShapeMNK": [128,256,64], + "TilePadMNK": [128,256,64], + "NumTasks": 64 + } + } + ] + }, + { + "Id": 6, + "NumWarps": 8, + "SramBytes": 147456, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_1", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 18, + "DataType": "FP16", + "Buffer": {"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,2688], + "Strides": [1,512,11008], + "Offsets": [0,0,8320], + "Pads": [1,1,1] + }, + { + "Id": 19, + "DataType": "FP16", + "Buffer": {"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [4096,2688], + "Strides": [4096,11008], + "Offsets": [0,8320], + "Pads": [1,1] + } + ], + "WriteTensors": [ + { + "Id": 20, + "DataType": "FP16", + "Buffer": {"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 21, + "DataType": "FP16", + "Buffer": {"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": { + "InputDimNC": { + "DIMS": [1,1] + }, + "OtherDimNC": { + "DIMS": [1,1] + }, + "ShapeMNK": { + "DIMS": [512,4096,2688] + }, + "StridesACDB": { + "DIMS": [11008,4096,4096,11008] + }, + "TransposeInput": { + "BOOL": false + }, + "TransposeOther": { + "BOOL": true + } + }, + "Config": { + "NumWarps": 8, + "SramBytes": 147456, + "TileShapeMNK": [128,256,64], + "TilePadMNK": [128,256,64], + "NumTasks": 64 + } + } + ] + }, + { + "Id": 7, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Add", + "Name": "add_1", + "IsVirtual": false, + "ReadTensors": [ + { + "Id": 22, + "DataType": "FP16", + "Buffer": {"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + }, + { + "Id": 21, + "DataType": "FP16", + "Buffer": {"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "WriteTensors": [ + { + "Id": 23, + "DataType": "FP16", + "Buffer": {"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "ResultTensors": [ + { + "Id": 15, + "DataType": "FP16", + "Buffer": {"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}, + "Shape": [1,512,4096], + "Strides": [1,512,4096], + "Offsets": [0,0,0], + "Pads": [1,1,1] + } + ], + "Args": {}, + "Config": { + "NumWarps": 8, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 64 + } + } + ] + } + ], + "ProcessorGroups": [ + { + "ProcessorRange": [0,108], + "ResourceGroups": [ + { + "ProcessorRange": [0,108], + "WarpRange": [0,8], + "SramRange": [0,147456], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,172],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,108], + "ResourceGroups": [ + { + "ProcessorRange": [0,108], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,172],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,108], + "ResourceGroups": [ + { + "ProcessorRange": [0,108], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,172],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,108], + "ResourceGroups": [ + { + "ProcessorRange": [0,108], + "WarpRange": [0,8], + "SramRange": [0,147456], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,172],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,108], + "ResourceGroups": [ + { + "ProcessorRange": [0,108], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,172],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,64], + "ResourceGroups": [ + { + "ProcessorRange": [0,64], + "WarpRange": [0,8], + "SramRange": [0,147456], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,64],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [64,108], + "ResourceGroups": [ + { + "ProcessorRange": [64,108], + "WarpRange": [0,8], + "SramRange": [0,147456], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,64],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,108], + "ResourceGroups": [ + { + "ProcessorRange": [0,64], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,64],"Granularity":1} + ] + } + ] + } + ] +} diff --git a/third_party/mscclpp b/third_party/mscclpp index 6226556ce..9c2a96060 160000 --- a/third_party/mscclpp +++ b/third_party/mscclpp @@ -1 +1 @@ -Subproject commit 6226556ce277a3e26f5de91a8bf3a36e01613f55 +Subproject commit 9c2a96060a89ab00a239934a38f64754e3d3026b