From 1a013c4aac32834ba7fb80a4825a6738247e7634 Mon Sep 17 00:00:00 2001 From: Feroz Date: Wed, 27 Nov 2024 02:30:39 +0000 Subject: [PATCH] Tune Aquavanjaram942X BBS TN sizes into grid based library --- ...lik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs.yaml | 4247 ++++++++++++++++- 1 file changed, 4231 insertions(+), 16 deletions(-) diff --git a/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/GridBased/aquavanjaram_Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs.yaml b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/GridBased/aquavanjaram_Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs.yaml index dd2e11f5d3..2d553ba7ee 100644 --- a/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/GridBased/aquavanjaram_Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs.yaml +++ b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/GridBased/aquavanjaram_Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs.yaml @@ -1393006,6 +1393006,4221 @@ _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT384x64x64_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIAV0_MIWT12_2_NTA4_SVW4_VWA4_VWB2_WG32_8_1 + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 + LdsInitCVgprs: false + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 52224 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52224 + LdsOffsetB_Blk: 117760 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 117760 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [12, 2] + MIWaveTileA: 12 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 64 + MacroTileA: 384 + MacroTileB: 64 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 12 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5312 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT384x64x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIAV0_MIWT12_2_NTA4_SU8_SUM0_SUS256_SVW4_VWA4_VWB2_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 4 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 + TotalVgprNumber: 512 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT192x192x64_MI16x16x1_SN_GRVWA2_GRVWB8_K1_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIAV0_MIWT6_6_SVW2_VWA2_VWB2_WG32_8_1 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 55296 + LdsInitCVgprs: false + LdsNumBytes: 55296 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 24 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 24 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5313 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT192x192x64_MI16x16x1_SN_GRVWA2_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIAV0_MIWT6_6_SU8_SUM0_SUS256_SVW2_VWA2_VWB2_WG32_8_1_WGMn48_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 4 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + TotalVgprNumber: 512 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: -48 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA512_LBSPPB512_LPA16_LPB16_LRVW8_MIAV1_MIWT4_4_SVW4_VWA4_VWB4_WG32_8_1 + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 1 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5314 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA512_LBSPPB512_LPA16_LPB16_LRVW8_MIAV1_MIWT4_4_SU8_SUM0_SUS256_SVW4_VWA4_VWB4_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 4 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TotalVgprNumber: 512 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT208x256x64_MI16x16x1_SN_GRVWA2_GRVWB8_K1_LBSPPA128_LBSPPB512_LPA4_LPB8_LRVW4_MIWT13_4_SVW1_VWA1_VWB4_WG16_16_1 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 62080 + LdsInitCVgprs: false + LdsNumBytes: 62080 + LdsNumElementsAlignedA: 28288 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 28288 + LdsOffsetB_Blk: 93824 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 62080 + LdsOffsetMetadata_Blk: 93824 + LdsPadA: 4 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [13, 4] + MIWaveTileA: 13 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 208 + MacroTile1: 256 + MacroTileA: 208 + MacroTileB: 256 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 208 + NumGlobalWriteVectorsPerThread: 208 + NumLoadsA: 26 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 26 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5315 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT208x256x64_MI16x16x1_SN_GRVWA2_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA128_LBSPPB512_LPA4_LPB8_LRVW4_MIWT13_4_SU8_SUM0_SUS256_SVW1_VWA1_VWB4_WG16_16_1_WGMn48_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 4 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 52 + ThreadTile1: 4 + ThreadTileA: 52 + ThreadTileB: 4 + TotalVgprNumber: 512 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: -48 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT192x224x64_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA256_LBSPPB128_LPA16_LPB16_LRVW8_MIAV0_MIWT6_7_SVW2_VWA2_VWB1_WG32_8_1 + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63488 + LdsInitCVgprs: false + LdsNumBytes: 63488 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 63488 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 7] + MIWaveTileA: 6 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 224 + MacroTileA: 192 + MacroTileB: 224 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 168 + NumGlobalWriteVectorsPerThread: 84 + NumLoadsA: 6 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5316 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT192x224x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA256_LBSPPB128_LPA16_LPB16_LRVW8_MIAV0_MIWT6_7_SU8_SUM0_SUS256_SVW2_VWA2_VWB1_WG32_8_1_WGMn48_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 4 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 7 + ThreadTileA: 24 + ThreadTileB: 7 + TotalVgprNumber: 512 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: -48 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT192x160x64_MI16x16x1_SN_GRVWA2_GRVWB8_K1_LBSPPA256_LBSPPB128_LPA16_LPB16_LRVW8_MIAV0_MIWT6_5_SVW2_VWA2_VWB1_WG32_8_1 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53248 + LdsInitCVgprs: false + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 5] + MIWaveTileA: 6 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 160 + MacroTileA: 192 + MacroTileB: 160 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 120 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 24 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 24 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5317 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT192x160x64_MI16x16x1_SN_GRVWA2_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA256_LBSPPB128_LPA16_LPB16_LRVW8_MIAV0_MIWT6_5_SU8_SUM0_SUS256_SVW2_VWA2_VWB1_WG32_8_1_WGMn48_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 4 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 5 + ThreadTileA: 24 + ThreadTileB: 5 + TotalVgprNumber: 512 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: -48 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 2 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_GRVWA2_GRVWB8_K1_LBSPPA512_LBSPPB128_LPA16_LPB16_LRVW8_MIAV0_MIWT4_7_NTC3_NTD3_SVW4_VWA4_VWB1_WG32_8_1 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53248 + LdsInitCVgprs: false + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 224 + MacroTileA: 128 + MacroTileB: 224 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 3 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 16 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5318 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_GRVWA2_GRVWB8_GSU2_GSUC0_GSUWGMRR0_K1_LBSPPA512_LBSPPB128_LPA16_LPB16_LRVW8_MIAV0_MIWT4_7_NTC3_NTD3_SU0_SUM0_SUS0_SVW4_VWA4_VWB1_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 + TotalVgprNumber: 512 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 2] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA256_LBSPPB512_LPA16_LPB16_LRVW8_MIAV0_MIWT6_4_SVW2_VWA2_VWB4_WG32_8_1 + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 45056 + LdsInitCVgprs: false + LdsNumBytes: 45056 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 45056 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5319 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA256_LBSPPB512_LPA16_LPB16_LRVW8_MIAV0_MIWT6_4_SU8_SUM0_SUS256_SVW2_VWA2_VWB4_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 4 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + TotalVgprNumber: 512 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT192x192x64_MI16x16x1_SN_GRVWA2_GRVWB8_K1_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIAV0_MIWT6_6_SVW2_VWA2_VWB2_WG32_8_1 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 55296 + LdsInitCVgprs: false + LdsNumBytes: 55296 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 24 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 24 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5320 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT192x192x64_MI16x16x1_SN_GRVWA2_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIAV0_MIWT6_6_SU8_SUM0_SUS256_SVW2_VWA2_VWB2_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 4 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + TotalVgprNumber: 512 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA256_LBSPPB1024_LPA16_LPB16_LRVW8_MIAV0_MIWT6_8_SVW2_VWA2_VWB8_WG32_8_1 + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 + LdsInitCVgprs: false + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5321 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA256_LBSPPB1024_LPA16_LPB16_LRVW8_MIAV0_MIWT6_8_SU8_SUM0_SUS256_SVW2_VWA2_VWB8_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 4 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + TotalVgprNumber: 512 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT192x160x64_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA256_LBSPPB128_LPA16_LPB16_LRVW8_MIAV0_MIWT6_5_SVW2_VWA2_VWB1_WG32_8_1 + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53248 + LdsInitCVgprs: false + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 5] + MIWaveTileA: 6 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 160 + MacroTileA: 192 + MacroTileB: 160 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 120 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 6 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5322 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT192x160x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA256_LBSPPB128_LPA16_LPB16_LRVW8_MIAV0_MIWT6_5_SU8_SUM0_SUS256_SVW2_VWA2_VWB1_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 4 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 5 + ThreadTileA: 24 + ThreadTileB: 5 + TotalVgprNumber: 512 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_GRVWA2_GRVWB8_K1_LBSPPA1024_LBSPPB512_LPA16_LPB16_LRVW8_MIAV0_MIWT8_4_SVW8_VWA8_VWB4_WG16_16_1 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51712 + LdsInitCVgprs: false + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5323 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_GRVWA2_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA1024_LBSPPB512_LPA16_LPB16_LRVW8_MIAV0_MIWT8_4_SU8_SUM0_SUS256_SVW8_VWA8_VWB4_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TotalVgprNumber: 512 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_GRVWA2_GRVWB8_K1_LBSPPA1024_LBSPPB128_LPA16_LPB16_LRVW8_MIWT8_5_SVW8_VWA8_VWB1_WG32_8_1 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59392 + LdsInitCVgprs: false + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 59392 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 32 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5324 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_GRVWA2_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA1024_LBSPPB128_LPA16_LPB16_LRVW8_MIWT8_5_SU8_SUM0_SUS256_SVW8_VWA8_VWB1_WG32_8_1_WGMn48_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 4 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TotalVgprNumber: 512 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: -48 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_GRVWA2_GRVWB8_K1_LBSPPA256_LBSPPB512_LPA16_LPB16_LRVW8_MIAV0_MIWT10_4_SVW2_VWA2_VWB4_WG16_16_1 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57856 + LdsInitCVgprs: false + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 57856 + LdsOffsetMetadata_Blk: 88576 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 20 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 20 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5325 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_GRVWA2_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA256_LBSPPB512_LPA16_LPB16_LRVW8_MIAV0_MIWT10_4_SU8_SUM0_SUS256_SVW2_VWA2_VWB4_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 4 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + TotalVgprNumber: 512 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: default + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIAV0_MIWT4_6_SVW4_VWA4_VWB2_WG32_8_1 + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 45056 + LdsInitCVgprs: false + LdsNumBytes: 45056 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 45056 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeAmaxD: 0 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5326 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIAV0_MIWT4_6_SU8_SUM0_SUS256_SVW4_VWA4_VWB2_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 4 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + TotalVgprNumber: 512 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 - [2, 3, 0, 1] - - - [8, 48, 1, 128] - [16, 23.3179] @@ -1421486,7 +1425701,7 @@ - - [106496, 64, 1, 128] - [246, 84088.8] - - [106496, 64, 1, 8192] - - [547, 215072.0] + - [5312, 0.0] - - [106496, 128, 1, 4096] - [592, 324313.0] - - [106496, 64, 1, 512] @@ -1427658,7 +1431873,7 @@ - - [6144, 704, 1, 128] - [534, 121823.0] - - [6144, 704, 1, 8192] - - [325, 428914.0] + - [5314, 0.0] - - [6144, 768, 1, 4096] - [354, 380025.0] - - [6144, 832, 1, 2048] @@ -1427774,7 +1431989,7 @@ - - [6144, 608, 1, 128] - [534, 115240.0] - - [6144, 608, 1, 8192] - - [325, 359520.0] + - [5318, 0.0] - - [6144, 640, 1, 4096] - [354, 337981.0] - - [6144, 704, 1, 2048] @@ -1427796,7 +1432011,7 @@ - - [6144, 1152, 1, 128] - [531, 142720.0] - - [6144, 1152, 1, 8192] - - [403, 487633.0] + - [5326, 0.0] - - [6144, 1216, 1, 4096] - [345, 443250.0] - - [6144, 1280, 1, 2048] @@ -1427826,7 +1432041,7 @@ - - [6144, 2304, 1, 8192] - [451, 564704.0] - - [6144, 2432, 1, 4096] - - [483, 505676.0] + - [5319, 0.0] - - [6144, 2560, 1, 4096] - [483, 518141.0] - - [6144, 24, 1, 128] @@ -1427924,7 +1432139,7 @@ - - [6144, 1520, 1, 8192] - [407, 508255.0] - - [6144, 1536, 1, 4096] - - [531, 476232.0] + - [5323, 0.0] - - [6144, 1664, 1, 4096] - [407, 478325.0] - - [6144, 1792, 1, 2048] @@ -1439432,7 +1443647,7 @@ - - [6144, 3040, 1, 2048] - [483, 531211.0] - - [6144, 3040, 1, 4096] - - [483, 593429.0] + - [5322, 0.0] - - [6144, 3072, 1, 4096] - [483, 574454.0] - - [6144, 3328, 1, 128] @@ -1439490,7 +1443705,7 @@ - - [6144, 4096, 1, 2048] - [710, 473039.0] - - [6144, 4096, 1, 4096] - - [713, 522307.0] + - [5321, 0.0] - - [6144, 4256, 1, 128] - [713, 169871.0] - - [6144, 4256, 1, 512] @@ -1440918,7 +1445133,7 @@ - - [7680, 3072, 1, 2048] - [612, 491372.0] - - [7680, 3072, 1, 4096] - - [627, 534592.0] + - [5325, 0.0] - - [7680, 3648, 1, 128] - [652, 174401.0] - - [7680, 3648, 1, 512] @@ -1440934,7 +1445149,7 @@ - - [7680, 4096, 1, 2048] - [661, 494261.0] - - [7680, 4096, 1, 4096] - - [766, 546953.0] + - [5320, 0.0] - - [7680, 4864, 1, 128] - [747, 174576.0] - - [7680, 4864, 1, 512] @@ -1448938,7 +1453153,7 @@ - - [40960, 608, 1, 2048] - [653, 383033.0] - - [40960, 608, 1, 4096] - - [620, 420711.0] + - [5317, 0.0] - - [40960, 640, 1, 128] - [747, 159406.0] - - [40960, 640, 1, 512] @@ -1448946,7 +1453161,7 @@ - - [40960, 640, 1, 2048] - [727, 394921.0] - - [40960, 640, 1, 4096] - - [620, 437202.0] + - [5317, 0.0] - - [40960, 768, 1, 128] - [475, 158258.0] - - [40960, 768, 1, 512] @@ -1448954,7 +1453169,7 @@ - - [40960, 768, 1, 2048] - [482, 436443.0] - - [40960, 768, 1, 4096] - - [621, 472778.0] + - [5313, 0.0] - - [40960, 896, 1, 128] - [747, 168209.0] - - [40960, 896, 1, 512] @@ -1448962,7 +1453177,7 @@ - - [40960, 896, 1, 2048] - [615, 432692.0] - - [40960, 896, 1, 4096] - - [615, 496820.0] + - [5316, 0.0] - - [40960, 1024, 1, 128] - [769, 161020.0] - - [40960, 1024, 1, 512] @@ -1448994,7 +1453209,7 @@ - - [40960, 1536, 1, 2048] - [650, 490439.0] - - [40960, 1536, 1, 4096] - - [615, 492340.0] + - [5315, 0.0] - - [40960, 1792, 1, 128] - [475, 183171.0] - - [40960, 1792, 1, 512] @@ -1449026,7 +1453241,7 @@ - - [40960, 2432, 1, 2048] - [652, 512914.0] - - [40960, 2432, 1, 4096] - - [482, 533593.0] + - [5324, 0.0] - - [40960, 2560, 1, 128] - [747, 187387.0] - - [40960, 2560, 1, 512]