diff --git a/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942_80cu/Equality/aquavanjaram_Cijk_Ailk_Bjlk_BBS_BH_Bias_HAS_SAV_UserArgs.yaml b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942_80cu/Equality/aquavanjaram_Cijk_Ailk_Bjlk_BBS_BH_Bias_HAS_SAV_UserArgs.yaml new file mode 100644 index 0000000000..f57447eb66 --- /dev/null +++ b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942_80cu/Equality/aquavanjaram_Cijk_Ailk_Bjlk_BBS_BH_Bias_HAS_SAV_UserArgs.yaml @@ -0,0 +1,655 @@ +- {MinimumRequiredVersion: 4.33.0} +- aquavanjaram +- {Architecture: gfx942, CUCount: 80} +- [Device 0049, Device 0050] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 32 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_BBS_BH_Bias_HAS_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA1024_LBSPPB512_LPA0_LPB32_MIWT4_2_NLCA1_NLCB1_SVW4_VWA4_VWB2_WG32_8_1 + LSCA: 128 + LSCB: 64 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25600 + LdsInitCVgprs: false + LdsNumBytes: 25600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 32 + LdsPadMetadata: -1 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: false + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_Bias_HAS_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU32_GSUC0_GSUWGMRR0_K1_LBSPPA1024_LBSPPB512_LPA0_LPB32_MIWT4_2_NLCA1_NLCB1_SU0_SUM0_SUS0_SVW4_VWA4_VWB2_WG32_8_1_WGM1_WGMXCC1_WGMXCCG80 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: 80 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 32] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + tailLoopOpt: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 40 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_BBS_BH_Bias_HAS_SAV_UserArgs_MT128x256x64_MI32x32x1_SN_GRVWA8_GRVWB8_K1_LBSPPA0_LBSPPB0_LPA0_LPB0_MIWT2_4_NLCA1_NLCB1_SVW2_VWA2_VWB4_WG64_4_1 + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49152 + LdsInitCVgprs: false + LdsNumBytes: 49152 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: -1 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: false + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_Bias_HAS_SAV_UserArgs_MT128x256x64_MI32x32x1_SN_GRVWA8_GRVWB8_GSU40_GSUC0_GSUWGMRR0_K1_LBSPPA0_LBSPPB0_LPA0_LPB0_MIWT2_4_NLCA1_NLCB1_SU0_SUM0_SUS0_SVW2_VWA2_VWB4_WG64_4_1_WGM4_WGMXCC4_WGMXCCG80 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: 80 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 40] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + tailLoopOpt: true +- [2, 3, 0, 1] +- - - [128, 128, 1, 12000, 128, 128, 128, 128] + - [0, 10.77] + - - [256, 200, 1, 1200000, 256, 256, 256, 200] + - [1, 99.0] + - - [256, 256, 1, 3224984, 256, 256, 256, 256] + - [1, 129.74] +- null +- null +- DeviceEfficiency +- Equality diff --git a/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942_80cu/Equality/aquavanjaram_Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs.yaml b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942_80cu/Equality/aquavanjaram_Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs.yaml new file mode 100644 index 0000000000..cbe0db1870 --- /dev/null +++ b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942_80cu/Equality/aquavanjaram_Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs.yaml @@ -0,0 +1,3503 @@ +- {MinimumRequiredVersion: 4.33.0} +- aquavanjaram +- {Architecture: gfx942, CUCount: 80} +- [Device 0049, Device 0050] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 5 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA4096_LBSPPB256_LPA0_LPB16_LRVW8_MIWT4_14_NLCA1_SVW4_VWA4_VWB2_WG64_4_1 + LSCA: 256 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65024 + LdsInitCVgprs: false + LdsNumBytes: 65024 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 65024 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 16 + LdsPadMetadata: -1 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: false + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU5_GSUC0_GSUWGMRR0_K1_LBSPPA4096_LBSPPB256_LPA0_LPB16_LRVW8_MIWT4_14_NLCA1_SU0_SUM0_SUS0_SVW4_VWA4_VWB2_WG64_4_1_WGM1_WGMXCC1_WGMXCCG80 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: 80 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 5] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + tailLoopOpt: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 2 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA4096_LBSPPB512_LPA0_LPB16_LRVW8_MIWT4_12_NLCA1_SVW4_VWA4_VWB4_WG64_4_1 + LSCA: 256 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 58880 + LdsInitCVgprs: false + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 26112 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 58880 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 16 + LdsPadMetadata: -1 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: false + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 12] + MIWaveTileA: 4 + MIWaveTileB: 12 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU2_GSUC0_GSUWGMRR0_K1_LBSPPA4096_LBSPPB512_LPA0_LPB16_LRVW8_MIWT4_12_NLCA1_SU0_SUM0_SUS0_SVW4_VWA4_VWB4_WG64_4_1_WGM4_WGMXCC4_WGMXCCG80 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 12 + ThreadTileA: 16 + ThreadTileB: 12 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: 80 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 2] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + tailLoopOpt: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 5 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA2048_LBSPPB128_LPA0_LPB16_LRVW8_MIWT4_7_NLCA1_SVW4_VWA4_VWB1_WG32_8_1 + LSCA: 128 + LSCB: 64 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 52224 + LdsInitCVgprs: false + LdsNumBytes: 52224 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 52224 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 16 + LdsPadMetadata: -1 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: false + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 224 + MacroTileA: 128 + MacroTileB: 224 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 4 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU5_GSUC0_GSUWGMRR0_K1_LBSPPA2048_LBSPPB128_LPA0_LPB16_LRVW8_MIWT4_7_NLCA1_SU0_SUM0_SUS0_SVW4_VWA4_VWB1_WG32_8_1_WGM4_WGMXCC4_WGMXCCG80 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: 80 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 5] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + tailLoopOpt: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_CLR1_GRVWA8_GRVWB4_K1_LBSPPA2048_LBSPPB512_LPA0_LPB4_LRVW4_MIWT4_8_NLCA1_PLR1_SVW4_VWA4_VWB8_WG64_4_1 + LSCA: 256 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24704 + LdsInitCVgprs: false + LdsNumBytes: 24704 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8320 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24704 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: -1 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: true + MIArchVgpr: false + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_CLR1_GRVWA8_GRVWB4_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA2048_LBSPPB512_LPA0_LPB4_LRVW4_MIWT4_8_NLCA1_PLR1_SU0_SUM0_SUS0_SVW4_VWA4_VWB8_WG64_4_1_WGM1_WGMXCC1_WGMXCCG80 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: 80 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + tailLoopOpt: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 4 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA2048_LBSPPB512_LPA0_LPB16_LRVW8_MIWT4_2_NLCA1_SVW4_ULSGRO0_VWA4_VWB2_WSGRA1_WSGRB1_WG32_8_1 + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 + LdsInitCVgprs: false + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 16 + LdsPadMetadata: -1 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 128 + MFMA_BF16_1K: true + MIArchVgpr: false + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA2048_LBSPPB512_LPA0_LPB16_LRVW8_MIWT4_2_NLCA1_SU0_SUM0_SUS0_SVW4_ULSGRO0_VWA4_VWB2_WSGRA1_WSGRB1_WG32_8_1_WGM1_WGMXCC1_WGMXCCG80 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: 80 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + tailLoopOpt: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 4 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 2 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA2048_LBSPPB256_LPA0_LPB16_LRVW8_MIWT4_3_NLCA1_SVW4_ULSGRO0_VWA4_VWB1_WSGRA0_WSGRB0_WG32_8_1 + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 60416 + LdsInitCVgprs: false + LdsNumBytes: 60416 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 60416 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 16 + LdsPadMetadata: -1 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 128 + MFMA_BF16_1K: true + MIArchVgpr: false + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_GRVWA8_GRVWB8_GSU2_GSUC0_GSUWGMRR0_K1_LBSPPA2048_LBSPPB256_LPA0_LPB16_LRVW8_MIWT4_3_NLCA1_SU0_SUM0_SUS0_SVW4_ULSGRO0_VWA4_VWB1_WSGRA0_WSGRB0_WG32_8_1_WGM1_WGMXCC1_WGMXCCG80 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: 80 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 2] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + tailLoopOpt: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 4 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA2048_LBSPPB1024_LPA0_LPB16_LRVW8_MIWT4_8_NLCA1_SVW4_ULSGRO0_VWA4_VWB8_WSGRA0_WSGRB1_WG32_8_1 + LSCA: 128 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 + LdsInitCVgprs: false + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 16 + LdsPadMetadata: -1 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: false + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA2048_LBSPPB1024_LPA0_LPB16_LRVW8_MIWT4_8_NLCA1_SU0_SUM0_SUS0_SVW4_ULSGRO0_VWA4_VWB8_WSGRA0_WSGRB1_WG32_8_1_WGM1_WGMXCC1_WGMXCCG80 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 1 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: 80 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + tailLoopOpt: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 4 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT256x64x64_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA4096_LBSPPB512_LPA0_LPB16_LRVW8_MIWT4_4_NLCA1_SVW4_ULSGRO0_VWA4_VWB4_WSGRA0_WSGRB0_WG64_4_1 + LSCA: 256 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41472 + LdsInitCVgprs: false + LdsNumBytes: 41472 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41472 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 16 + LdsPadMetadata: -1 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: false + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT256x64x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA4096_LBSPPB512_LPA0_LPB16_LRVW8_MIWT4_4_NLCA1_SU0_SUM0_SUS0_SVW4_ULSGRO0_VWA4_VWB4_WSGRA0_WSGRB0_WG64_4_1_WGM1_WGMXCC1_WGMXCCG80 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: 80 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + tailLoopOpt: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 4 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA2048_LBSPPB256_LPA0_LPB16_LRVW8_MIWT4_3_NLCA1_SVW4_ULSGRO0_VWA4_VWB1_WSGRA1_WSGRB0_WG32_8_1 + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 60416 + LdsInitCVgprs: false + LdsNumBytes: 60416 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 60416 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 16 + LdsPadMetadata: -1 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 128 + MFMA_BF16_1K: true + MIArchVgpr: false + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA2048_LBSPPB256_LPA0_LPB16_LRVW8_MIWT4_3_NLCA1_SU0_SUM0_SUS0_SVW4_ULSGRO0_VWA4_VWB1_WSGRA1_WSGRB0_WG32_8_1_WGM4_WGMXCC4_WGMXCCG80 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: 80 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + tailLoopOpt: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 4 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT256x240x64_MI16x16x1_SN_GRVWA8_GRVWB4_K1_LBSPPA2048_LBSPPB128_LPA0_LPB4_LRVW4_MIWT4_15_NLCA1_SVW4_ULSGRO0_VWA4_VWB1_WSGRA0_WSGRB0_WG64_4_1 + LSCA: 256 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65408 + LdsInitCVgprs: false + LdsNumBytes: 65408 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32640 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 65408 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: -1 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: false + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 15] + MIWaveTileA: 4 + MIWaveTileB: 15 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 240 + MacroTileA: 256 + MacroTileB: 240 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 8 + NumLoadsB: 15 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 15 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT256x240x64_MI16x16x1_SN_GRVWA8_GRVWB4_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA2048_LBSPPB128_LPA0_LPB4_LRVW4_MIWT4_15_NLCA1_SU0_SUM0_SUS0_SVW4_ULSGRO0_VWA4_VWB1_WSGRA0_WSGRB0_WG64_4_1_WGM1_WGMXCC1_WGMXCCG80 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 15 + ThreadTileA: 16 + ThreadTileB: 15 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: 80 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + tailLoopOpt: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 4 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA4096_LBSPPB1024_LPA0_LPB16_LRVW8_MIWT4_8_NLCA1_SVW4_ULSGRO0_VWA4_VWB8_WSGRA0_WSGRB0_WG64_4_1 + LSCA: 256 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49664 + LdsInitCVgprs: false + LdsNumBytes: 49664 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 49664 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 16 + LdsPadMetadata: -1 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: false + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA4096_LBSPPB1024_LPA0_LPB16_LRVW8_MIWT4_8_NLCA1_SU0_SUM0_SUS0_SVW4_ULSGRO0_VWA4_VWB8_WSGRA0_WSGRB0_WG64_4_1_WGM1_WGMXCC1_WGMXCCG80 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: 80 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + tailLoopOpt: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 4 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT256x96x64_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA4096_LBSPPB256_LPA0_LPB16_LRVW8_MIWT4_6_NLCA1_SVW4_ULSGRO0_VWA4_VWB2_WSGRA1_WSGRB0_WG64_4_1 + LSCA: 256 + LSCB: 64 + LSPA: 2 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 46592 + LdsInitCVgprs: false + LdsNumBytes: 46592 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 13824 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 46592 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 16 + LdsPadMetadata: -1 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: false + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT256x96x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA4096_LBSPPB256_LPA0_LPB16_LRVW8_MIWT4_6_NLCA1_SU0_SUM0_SUS0_SVW4_ULSGRO0_VWA4_VWB2_WSGRA1_WSGRB0_WG64_4_1_WGM1_WGMXCC1_WGMXCCG80 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: 80 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + tailLoopOpt: false +- [2, 3, 0, 1] +- - - [128, 3584, 1, 37888] + - [2, 111.5] + - - [256, 3584, 1, 37888] + - [0, 131.82] + - - [500, 3584, 1, 37888] + - [1, 122.22] + - - [256, 3224984, 1, 256] + - [3, 0.0] + - - [128, 4608, 1, 3584] + - [4, 0.0] + - - [128, 3584, 1, 3584] + - [5, 0.0] + - - [128, 18944, 1, 3584] + - [6, 0.0] + - - [256, 4608, 1, 3584] + - [7, 0.0] + - - [256, 3584, 1, 3584] + - [8, 0.0] + - - [256, 18944, 1, 3584] + - [9, 0.0] + - - [500, 4608, 1, 3584] + - [10, 0.0] + - - [500, 3584, 1, 3584] + - [11, 0.0] + - - [500, 18944, 1, 3584] + - [9, 0.0] +- null +- null +- DeviceEfficiency +- Equality diff --git a/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942_80cu/Equality/aquavanjaram_Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs.yaml b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942_80cu/Equality/aquavanjaram_Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs.yaml new file mode 100644 index 0000000000..9599432e72 --- /dev/null +++ b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942_80cu/Equality/aquavanjaram_Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs.yaml @@ -0,0 +1,655 @@ +- {MinimumRequiredVersion: 4.33.0} +- aquavanjaram +- {Architecture: gfx942, CUCount: 80} +- [Device 0049, Device 0050] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSUAMB_K1_LBSPPA1024_LBSPPB256_LPA16_LPB16_LRVW8_MIWT8_6_SVW8_ULSGRO0_VWA8_VWB2_WSGRA1_WSGRB0_WG32_8_1 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 + LdsInitCVgprs: false + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: -1 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: false + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_K1_LBSPPA1024_LBSPPB256_LPA16_LPB16_LRVW8_MIWT8_6_SU0_SUM0_SUS0_SVW8_ULSGRO0_VWA8_VWB2_WSGRA1_WSGRB0_WG32_8_1_WGM1_WGMXCC1_WGMXCCG80 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: 80 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + tailLoopOpt: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 0 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT32x96x16_MI16x16x1_SN_CLR0_GRVWA2_GRVWB2_K1_LBSPPA128_LBSPPB128_MIWT1_3_NTB4_PLR0_SVW1_ULSGRO0_VWA1_VWB1_WSGRA0_WSGRB0_WG32_8_1 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 4480 + LdsInitCVgprs: false + LdsNumBytes: 4480 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 3328 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 9344 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4480 + LdsOffsetMetadata_Blk: 9344 + LdsPadA: 4 + LdsPadB: 4 + LdsPadMetadata: -1 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 16 + MFMA_BF16_1K: true + MIArchVgpr: false + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 1 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT32x96x16_MI16x16x1_SN_CLR0_GRVWA2_GRVWB2_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA128_LBSPPB128_MIWT1_3_NTB4_PLR0_SU0_SUM0_SUS0_SVW1_ULSGRO0_VWA1_VWB1_WSGRA0_WSGRB0_WG32_8_1_WGM1_WGMXCC1_WGMXCCG80 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: 80 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 16 + _DepthUA: 16 + _DepthUB: 16 + _DepthUMetadata: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + tailLoopOpt: true +- [2, 3, 0, 1] +- - - [200, 1200000, 1, 256] + - [0, 48.03] + - - [256, 3224984, 1, 256] + - [0, 65.53] + - - [32, 12000, 1, 3] + - [1, 0.0] +- null +- null +- DeviceEfficiency +- Equality