diff --git a/source/lib/rocprofiler-sdk/counters/tests/metrics_test.h b/source/lib/rocprofiler-sdk/counters/tests/metrics_test.h index 897fcfff..d1be630f 100644 --- a/source/lib/rocprofiler-sdk/counters/tests/metrics_test.h +++ b/source/lib/rocprofiler-sdk/counters/tests/metrics_test.h @@ -35,43 +35,57 @@ static const std::unordered_map", - "Number of VMEM write instructions issued (including FLAT). (per-simd, emulated)"}, + "The number of VMEM (GPU Memory) write instructions issued (including FLAT/scratch memory). " + "The value is returned per-SE (aggregate of values in SIMDs in the SE)."}, {"SQ_INSTS_VMEM_RD", "SQ", "29", "", - "Number of VMEM read instructions issued (including FLAT). (per-simd, emulated)"}, + "The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch memory). " + "The value is returned per-SE (aggregate of values in SIMDs in the SE)."}, {"SQ_INSTS_SALU", "SQ", "31", "", - "Number of SALU instructions issued. (per-simd, emulated)"}, + "Total Number of SALU (Scalar ALU) instructions issued. This value is returned per-SE " + "(aggregate of values in SIMDs in the SE). See AMD ISAs for more information on SALU " + "instructions."}, {"SQ_INSTS_SMEM", "SQ", "32", "", - "Number of SMEM instructions issued. (per-simd, emulated)"}, + "Total number of SMEM (Scalar Memory Read) instructions issued. This value is returned " + "per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on SMEM " + "instructions."}, {"SQ_INSTS_FLAT", "SQ", "33", "", - "Number of FLAT instructions issued. (per-simd, emulated)"}, + "Total number of FLAT instructions issued. When used in combination with " + "SQ_ACTIVE_INST_FLAT (cycle count for executing instructions) the average latency of FLAT " + "instruction execution can be calculated (SQ_ACTIVE_INST_FLAT / SQ_INSTS). 
This value is " + "returned per-SE (aggregate of values in SIMDs in the SE)."}, {"SQ_INSTS_FLAT_LDS_ONLY", "SQ", "34", "", - "Number of FLAT instructions issued that read/wrote only from/to LDS (only works if " - "EARLY_TA_DONE is enabled). (per-simd, emulated)"}, + "Total number of FLAT instructions issued that read/wrote only from/to LDS (scratch " + "memory). Values are only populated if EARLY_TA_DONE is enabled. This value is returned " + "per-SE (aggregate of values in SIMDs in the SE)."}, {"SQ_INSTS_LDS", "SQ", "35", "", - "Number of LDS instructions issued (including FLAT). (per-simd, emulated)"}, + "Total number of LDS instructions issued (including FLAT). This value is returned per-SE " + "(aggregate of values in SIMDs in the SE). See AMD ISAs for more information on LDS " + "instructions."}, {"SQ_INSTS_GDS", "SQ", "36", "", - "Number of GDS instructions issued. (per-simd, emulated)"}, + "Total number of GDS (global data sync) instructions issued. This value is returned per-SE " + "(aggregate of values in SIMDs in the SE). See AMD ISAs for more information on GDS (global " + "data sync) instructions."}, {"SQ_WAIT_INST_LDS", "SQ", "64", @@ -82,14 +96,18 @@ static const std::unordered_map", - "Number of cycles the SQ instruction arbiter is working on a VALU instruction. " - "(per-simd, emulated). Units in quad-cycles(4 cycles)"}, + "Number of cycles each wave spends working on a VALU instructions. This value represents " + "the number of cycles each wave spends executing vector ALU instructions. On MI200 " + "platforms, there are 4 VALUs per CU. High values indicates a large amount of time spent " + "executing vector instructions. This value is returned on a per-SE (aggregate of values in " + "SIMDs in the SE) basis with units in quad-cycles(4 cycles)."}, {"SQ_INST_CYCLES_SALU", "SQ", "85", "", - "Number of cycles needed to execute non-memory read scalar operations. (per-simd, " - "emulated). 
Units in quad-cycles(4 cycles)"}, + "The number of cycles needed to execute non-memory read scalar operations (SALU). This " + "value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis with units in " + "quad-cycles(4 cycles)."}, {"SQ_THREAD_CYCLES_VALU", "SQ", "86", @@ -100,7 +118,8 @@ static const std::unordered_map", - "Number of cycles LDS is stalled by bank conflicts. (emulated)"}, + "The number of cycles LDS (local data store) is stalled by bank conflicts. This value is " + "returned on a per-SE (aggregate of values in SIMDs in the SE) basis."}, {"TCC_HIT", "TCC", "17", "", "Number of cache hits."}, {"TCC_MISS", "TCC", "19", "", "Number of cache misses. UC reads count as misses."}, {"TCC_EA_WRREQ", @@ -133,12 +152,19 @@ static const std::unordered_map", - "Count number of waves sent to SQs. (per-simd, emulated, global)"}, + "Count number of waves sent to distributed sequencers (SQs). This value represents the " + "number of waves that are sent to each SQ. This only counts new waves sent since the start " + "of collection (for dispatch profiling this is the timeframe of kernel execution, for agent " + "profiling it is the timeframe between start_context and read counter data). A sum of all " + "SQ_WAVES values will give the total number of waves started by the application during the " + "collection timeframe. Returns one value per-SE (aggregates of SIMD values)."}, {"SQ_INSTS_VALU", "SQ", "26", "", - "Number of VALU instructions issued. (per-simd, emulated)"}, + "The number of VALU (Vector ALU) instructions issued. The value is returned per-SE " + "(aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU " + "instructions."}, {"TA_TA_BUSY", "TA", "15", @@ -220,8 +246,10 @@ static const std::unordered_map int). The value + is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on + VALU instructions. 
SQ_INSTS_VALU_FMA_F16: architectures: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 29 - description: Number of VALU FMA/MAD instructions on float16. (per-simd, emulated) + description: The number of VALU (Vector ALU) FMA (Fused-Multiply-Add)/MAD(Multiply-Add) instructions + on float16. For maximum performance lower precision floating point ops are preferred to higher precision + ones. The value is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more + information on VALU instructions. SQ_INSTS_VALU_FMA_F32: architectures: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 33 - description: Number of VALU FMA/MAD instructions on float32. (per-simd, emulated) + description: The number of VALU (Vector ALU) FMA (Fused-Multiply-Add)/MAD(Multiply-Add) instructions + on float32. For maximum performance lower precision floating point ops are preferred to higher precision + ones. The value is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more + information on VALU instructions. SQ_INSTS_VALU_FMA_F64: architectures: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 37 - description: Number of VALU FMA/MAD instructions on float64. (per-simd, emulated) + description: The number of VALU (Vector ALU) FMA (Fused-Multiply-Add)/MAD(Multiply-Add) instructions + on float64. For maximum performance lower precision floating point ops are preferred to higher precision + ones. The value is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more + information on VALU instructions. SQ_INSTS_VALU_INT32: architectures: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 39 - description: Number of VALU 32-bit integer (signed or unsigned) instructions. (per-simd, - emulated) + description: The number of VALU (Vector ALU) 32-bit integer (signed or unsigned) instructions. The value + is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on + VALU instructions.
SQ_INSTS_VALU_INT64: architectures: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 40 - description: Number of VALU 64-bit integer (signed or unsigned) instructions. (per-simd, - emulated) + description: The number of VALU (Vector ALU) 64-bit integer (signed or unsigned) instructions. The value + is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on + VALU instructions. SQ_INSTS_VALU_MFMA_BF16: architectures: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 44 - description: Number of VALU V_MFMA_*_BF16 instructions. (per-simd, emulated) + description: The number of VALU (Vector ALU) MFMA (Matrix-Fused-Multiply-Add) BF16 (outputting bfloat16 + format) instructions (V_MFMA_*_BF16). For maximum performance lower precision floating point ops are + preferred to higher precision ones. The value is returned per-SE (aggregate of values in SIMDs in + the SE). See AMD ISAs for more information on MFMA instructions. SQ_INSTS_VALU_MFMA_F16: architectures: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 43 - description: Number of VALU V_MFMA_*_F16 instructions. (per-simd, emulated) + description: The number of VALU (Vector ALU) MFMA (Matrix-Fused-Multiply-Add) F16 (outputting float16 + format) instructions (V_MFMA_*_F16). For maximum performance lower precision floating point ops are + preferred to higher precision ones. The value is returned per-SE (aggregate of values in SIMDs in + the SE). See AMD ISAs for more information on MFMA instructions. SQ_INSTS_VALU_MFMA_F32: architectures: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 45 - description: Number of VALU V_MFMA_*_F32 instructions. (per-simd, emulated) + description: The number of VALU (Vector ALU) MFMA (Matrix-Fused-Multiply-Add) F32 (outputting float32 + format) instructions (V_MFMA_*_F32). For maximum performance lower precision floating point ops are + preferred to higher precision ones. The value is returned per-SE (aggregate of values in SIMDs in + the SE).
See AMD ISAs for more information on MFMA instructions. SQ_INSTS_VALU_MFMA_F64: architectures: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 46 - description: Number of VALU V_MFMA_*_F64 instructions. (per-simd, emulated) + description: The number of VALU (Vector ALU) MFMA (Matrix-Fused-Multiply-Add) F64 (outputting float64 + format) instructions (V_MFMA_*_F64). For maximum performance lower precision floating point ops are + preferred to higher precision ones. The value is returned per-SE (aggregate of values in SIMDs in + the SE). See AMD ISAs for more information on MFMA instructions. SQ_INSTS_VALU_MFMA_I8: architectures: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 42 - description: Number of VALU V_MFMA_*_I8 instructions. (per-simd, emulated) + description: The number of VALU (Vector ALU) MFMA (Matrix-Fused-Multiply-Add) I8 (outputting 8-bit integers) + instructions (V_MFMA_*_I8). See AMD ISAs for more information on MFMA instructions. SQ_INSTS_VALU_MFMA_MOPS_BF16: architectures: gfx90a: @@ -1360,8 +1431,11 @@ SQ_INSTS_VALU_MFMA_MOPS_BF16: gfx942/gfx941/gfx940: block: SQ event: 51 - description: Number of VALU matrix math operations (add or mul) performed dividied - by 512, assuming a full EXEC mask, of data type BF16. (per-simd, emulated) + description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add) + and operating on BF16 (bfloat16) data. Captures add or mul ops performed divided by 512. For maximum + performance lower precision floating point ops are preferred to higher precision ones. The value is + returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on MFMA + instructions. SQ_INSTS_VALU_MFMA_MOPS_F16: architectures: gfx90a: @@ -1370,8 +1444,11 @@ SQ_INSTS_VALU_MFMA_MOPS_F16: gfx942/gfx941/gfx940: block: SQ event: 50 - description: Number of VALU matrix math operations (add or mul) performed dividied - by 512, assuming a full EXEC mask, of data type F16.
(per-simd, emulated) + description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add) + and operating on F16 (float16) data. Captures add or mul ops performed divided by 512. For maximum + performance lower precision floating point ops are preferred to higher precision ones. The value is + returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on MFMA + instructions. SQ_INSTS_VALU_MFMA_MOPS_F32: architectures: gfx90a: @@ -1380,8 +1457,11 @@ SQ_INSTS_VALU_MFMA_MOPS_F32: gfx942/gfx941/gfx940: block: SQ event: 52 - description: Number of VALU matrix math operations (add or mul) performed dividied - by 512, assuming a full EXEC mask, of data type F32. (per-simd, emulated) + description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add) + and operating on F32 (float32) data. Captures add or mul ops performed divided by 512. For maximum + performance lower precision floating point ops are preferred to higher precision ones. The value is + returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on MFMA + instructions. SQ_INSTS_VALU_MFMA_MOPS_F64: architectures: gfx90a: @@ -1390,8 +1470,11 @@ SQ_INSTS_VALU_MFMA_MOPS_F64: gfx942/gfx941/gfx940: block: SQ event: 53 - description: Number of VALU matrix math operations (add or mul) performed dividied - by 512, assuming a full EXEC mask, of data type F64. (per-simd, emulated) + description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add) + and operating on F64 (float64) data. Captures add or mul ops performed divided by 512. For maximum + performance lower precision floating point ops are preferred to higher precision ones. The value is + returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on MFMA + instructions.
SQ_INSTS_VALU_MFMA_MOPS_I8: architectures: gfx90a: @@ -1400,44 +1483,61 @@ SQ_INSTS_VALU_MFMA_MOPS_I8: gfx942/gfx941/gfx940: block: SQ event: 49 - description: Number of VALU matrix math operations (add or mul) performed dividied - by 512, assuming a full EXEC mask, of data type I8. (per-simd, emulated) + description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add) + and operating on I8 (8 bit int) data. Captures add or mul ops performed divided by 512. The value + is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on + MFMA instructions. SQ_INSTS_VALU_MUL_F16: architectures: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 28 - description: Number of VALU MUL instructions on float16. (per-simd, emulated) + description: The number of VALU MUL instructions on float16 data. For maximum performance lower precision + floating point ops are preferred to higher precision ones. The value is returned per-SE (aggregate + of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions. SQ_INSTS_VALU_MUL_F32: architectures: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 32 - description: Number of VALU MUL instructions on float32. (per-simd, emulated) + description: The number of VALU MUL instructions on float32 data. For maximum performance lower precision + floating point ops are preferred to higher precision ones. The value is returned per-SE (aggregate + of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions. SQ_INSTS_VALU_MUL_F64: architectures: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 36 - description: Number of VALU MUL instructions on float64. (per-simd, emulated) + description: The number of VALU MUL instructions on float64 data. For maximum performance lower precision + floating point ops are preferred to higher precision ones. The value is returned per-SE (aggregate + of values in SIMDs in the SE).
See AMD ISAs for more information on VALU instructions. SQ_INSTS_VALU_TRANS_F16: architectures: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 30 - description: Number of VALU transcendental instructions on float16. (per-simd, emulated) + description: The number of VALU transcendental instructions on float16 data. Transcendental instructions + include sin, cos, exp, log, etc. For maximum performance lower precision floating point ops are preferred + to higher precision ones. The value is returned per-SE (aggregate of values in SIMDs in the SE). See + AMD ISAs for more information on VALU instructions. SQ_INSTS_VALU_TRANS_F32: architectures: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 34 - description: Number of VALU transcendental instructions on float32. (per-simd, emulated) + description: The number of VALU transcendental instructions on float32 data. Transcendental instructions + include sin, cos, exp, log, etc. For maximum performance lower precision floating point ops are preferred + to higher precision ones. The value is returned per-SE (aggregate of values in SIMDs in the SE). See + AMD ISAs for more information on VALU instructions. SQ_INSTS_VALU_TRANS_F64: architectures: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 38 - description: Number of VALU transcendental instructions on float64. (per-simd, emulated) + description: The number of VALU transcendental instructions on float64 data. Transcendental instructions + include sin, cos, exp, log, etc. For maximum performance lower precision floating point ops are preferred + to higher precision ones. The value is returned per-SE (aggregate of values in SIMDs in the SE). See + AMD ISAs for more information on VALU instructions. SQ_INSTS_VMEM: architectures: gfx90a: @@ -1446,7 +1546,8 @@ SQ_INSTS_VMEM: gfx942/gfx941/gfx940: block: SQ event: 59 - description: Number of VMEM instructions issued. (per-simd, emulated) + description: The number of VMEM (GPU Memory) instructions issued.
The value is returned per-SE (aggregate + of values in SIMDs in the SE). SQ_INSTS_VMEM_RD: architectures: gfx906/gfx8/gfx900/gfx9: @@ -1461,8 +1562,8 @@ SQ_INSTS_VMEM_RD: gfx942/gfx941/gfx940: block: SQ event: 58 - description: Number of VMEM read instructions issued (including FLAT). (per-simd, - emulated) + description: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch memory). + The value is returned per-SE (aggregate of values in SIMDs in the SE). SQ_INSTS_VMEM_WR: architectures: gfx906/gfx8/gfx900/gfx9: @@ -1477,8 +1578,8 @@ SQ_INSTS_VMEM_WR: gfx942/gfx941/gfx940: block: SQ event: 57 - description: Number of VMEM write instructions issued (including FLAT). (per-simd, - emulated) + description: The number of VMEM (GPU Memory) write instructions issued (including FLAT/scratch memory). + The value is returned per-SE (aggregate of values in SIMDs in the SE). SQ_INSTS_VSKIPPED: architectures: gfx90a: @@ -1487,7 +1588,10 @@ SQ_INSTS_VSKIPPED: gfx942/gfx941/gfx940: block: SQ event: 71 - description: Number of vector instructions skipped. (per-simd, emulated) + description: The number of vector instructions skipped. This can occur when the S_SETVSKIP bit is enabled + on certain instructions. Often this is used as an alternative to branching (a compiler may replace + a branch with setting this bit to skip the operation, typically as a performance optimization). The + value is returned per-SE (aggregate of values in SIMDs in the SE). SQ_INSTS_WAVE32: architectures: gfx10/gfx1010/gfx1030/gfx1031/gfx1032: @@ -1496,8 +1600,7 @@ SQ_INSTS_WAVE32: gfx11/gfx1102/gfx1100/gfx1101: block: SQ event: 70 - description: Number of wave32 instructions issued, for flat, lds, valu, tex. {emulated, - C1} + description: Number of wave32 instructions issued, for flat, lds, valu, tex.
{emulated, C1} SQ_INSTS_WAVE32_LDS: architectures: gfx10/gfx1010/gfx1030/gfx1031/gfx1032: @@ -1506,8 +1609,8 @@ SQ_INSTS_WAVE32_LDS: gfx11/gfx1102/gfx1100/gfx1101: block: SQ event: 72 - description: Number of wave32 LDS indexed instructions issued. Wave64 may count - 1 or 2, depending on what gets issued. {emulated, C1} + description: Number of wave32 LDS indexed instructions issued. Wave64 may count 1 or 2, depending on + what gets issued. {emulated, C1} SQ_INSTS_WAVE32_VALU: architectures: gfx10/gfx1010/gfx1030/gfx1031/gfx1032: @@ -1516,8 +1619,8 @@ SQ_INSTS_WAVE32_VALU: gfx11/gfx1102/gfx1100/gfx1101: block: SQ event: 73 - description: Number of wave32 valu instructions issued. Wave64 may count 1 or 2, - depending on what gets issued. {emulated, C1} + description: Number of wave32 valu instructions issued. Wave64 may count 1 or 2, depending on what gets + issued. {emulated, C1} SQ_INST_CYCLES_SALU: architectures: gfx8: @@ -1535,8 +1638,9 @@ SQ_INST_CYCLES_SALU: gfx942/gfx941/gfx940: block: SQ event: 117 - description: Number of cycles needed to execute non-memory read scalar operations. - (per-simd, emulated). Units in quad-cycles(4 cycles) + description: The number of cycles needed to execute non-memory read scalar operations (SALU). This value + is returned on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 + cycles). SQ_INST_CYCLES_SMEM: architectures: gfx90a: @@ -1545,8 +1649,8 @@ SQ_INST_CYCLES_SMEM: gfx942/gfx941/gfx940: block: SQ event: 116 - description: Number of cycles needed to execute scalar memory reads. (per-simd, - emulated) + description: The number of cycles needed to execute scalar memory reads (SMEM). This value is returned + on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles). 
SQ_INST_CYCLES_VMEM: architectures: gfx10/gfx1010/gfx1030/gfx1031/gfx1032: @@ -1555,8 +1659,9 @@ SQ_INST_CYCLES_VMEM: gfx11/gfx1102/gfx1100/gfx1101: block: SQ event: 106 - description: Number of cycles needed to send addr and data for VMEM (lds, buffer, - image, flat, scratch, global) instructions, windowed by perf_en. {emulated, C1} + description: The number of cycles needed to send addr and data for VMEM (lds, buffer, image, flat, scratch, + global) instructions, windowed by perf_en. This value is returned on a per-SE (aggregate of values + in SIMDs in the SE) basis with units in quad-cycles(4 cycles). SQ_INST_CYCLES_VMEM_RD: architectures: gfx90a: @@ -1565,8 +1670,9 @@ SQ_INST_CYCLES_VMEM_RD: gfx942/gfx941/gfx940: block: SQ event: 110 - description: Number of cycles needed to send addr and cmd data for VMEM read instructions. - (per-simd, emulated). Units in quad-cycles(4 cycles) + description: The number of cycles needed to send addr and cmd data for VMEM read instructions. This + value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 + cycles). SQ_INST_CYCLES_VMEM_WR: architectures: gfx90a: @@ -1575,8 +1681,9 @@ SQ_INST_CYCLES_VMEM_WR: gfx942/gfx941/gfx940: block: SQ event: 109 - description: Number of cycles needed to send addr and cmd data for VMEM write instructions. - (per-simd, emulated). Units in quad-cycles(4 cycles) + description: The number of cycles needed to send addr and cmd data for VMEM write instructions. This + value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 + cycles). SQ_INST_LEVEL_GDS: architectures: gfx10/gfx1010/gfx1030/gfx1031/gfx1032: @@ -1585,8 +1692,10 @@ SQ_INST_LEVEL_GDS: gfx11/gfx1102/gfx1100/gfx1101: block: SQ event: 87 - description: Number of in-flight GDS instructions. Set next counter to ACCUM_PREV - and divide by INSTS_GDS for average latency. 
{level, nondeterministic, C1} + description: Number of in-flight GDS (global) instructions. This value represents the number of instructions + each wave spends synchronizing workgroups across the device (global data sync). Set next counter to + ACCUM_PREV and divide by INSTS_GDS for average latency. This value is returned on a per-SE (aggregate + of values in SIMDs in the SE) basis. SQ_INST_LEVEL_LDS: architectures: gfx10/gfx1010/gfx1030/gfx1031/gfx1032: @@ -1601,9 +1710,10 @@ SQ_INST_LEVEL_LDS: gfx942/gfx941/gfx940: block: SQ event: 74 - description: Number of in-flight LDS instructions. Set next counter to ACCUM_PREV - and divide by INSTS_LDS for average latency. Includes FLAT instructions. (per-simd, - level, nondeterministic) + description: Number of in-flight LDS instructions. This value represents the number of instructions + each wave spends executing instructions accessing the local data store (data shared between SIMDs + on the same CU). Set next counter to ACCUM_PREV and divide by INSTS_LDS for average latency. Includes + FLAT instructions. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis. SQ_INST_LEVEL_SMEM: architectures: gfx90a: @@ -1612,12 +1722,11 @@ SQ_INST_LEVEL_SMEM: gfx942/gfx941/gfx940: block: SQ event: 73 - description: Number of in-flight SMEM instructions (*2 load/store; *2 atomic; *2 - memtime; *4 wb/inv). Set next counter to ACCUM_PREV and divide by INSTS_SMEM for - average latency per smem request. Falls slightly short of total request latency - because some fetches are divided into two requests that may finish at different - times and this counter collects the average latency of the two. (per-simd, level, - nondeterministic) + description: Number of in-flight SMEM instructions (*2 load/store; *2 atomic; *2 memtime; *4 wb/inv). + Set next counter to ACCUM_PREV and divide by INSTS_SMEM for average latency per smem request. 
Falls + slightly short of total request latency because some fetches are divided into two requests that may + finish at different times and this counter collects the average latency of the two. This value is + returned on a per-SE (aggregate of values in SIMDs in the SE) basis. SQ_INST_LEVEL_VMEM: architectures: gfx90a: @@ -1626,15 +1735,16 @@ SQ_INST_LEVEL_VMEM: gfx942/gfx941/gfx940: block: SQ event: 72 - description: Number of in-flight VMEM instructions. Set next counter to ACCUM_PREV - and divide by INSTS_VMEM for average latency. Includes FLAT instructions. (per-simd, - level, nondeterministic) + description: Number of in-flight VMEM instructions. Set next counter to ACCUM_PREV and divide by INSTS_VMEM + for average latency. Includes FLAT instructions. This value is returned on a per-SE (aggregate of + values in SIMDs in the SE) basis. SQ_ITEMS: architectures: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 14 - description: Number of valid items per wave. (per-simd, global) + description: Number of valid items per wave. This value is returned on a per-SE (aggregate of values + in SIMDs in the SE) basis. SQ_LDS_ADDR_CONFLICT: architectures: gfx90a: @@ -1643,7 +1753,8 @@ SQ_LDS_ADDR_CONFLICT: gfx942/gfx941/gfx940: block: SQ event: 127 - description: Number of cycles LDS is stalled by address conflicts. (emulated,nondeterministic) + description: Number of cycles LDS (local data store) is stalled by address conflicts. This value is + returned on a per-SE (aggregate of values in SIMDs in the SE) basis. SQ_LDS_ATOMIC_RETURN: architectures: gfx90a: @@ -1652,7 +1763,8 @@ SQ_LDS_ATOMIC_RETURN: gfx942/gfx941/gfx940: block: SQ event: 130 - description: Number of atomic return cycles in LDS. (per-simd, emulated) + description: The number of atomic return cycles in LDS (local data store). This value is returned on + a per-SE (aggregate of values in SIMDs in the SE) basis. 
SQ_LDS_BANK_CONFLICT: architectures: gfx8: @@ -1670,7 +1782,8 @@ SQ_LDS_BANK_CONFLICT: gfx942/gfx941/gfx940: block: SQ event: 126 - description: Number of cycles LDS is stalled by bank conflicts. (emulated) + description: The number of cycles LDS (local data store) is stalled by bank conflicts. This value is + returned on a per-SE (aggregate of values in SIMDs in the SE) basis. SQ_LDS_IDX_ACTIVE: architectures: gfx90a: @@ -1679,8 +1792,8 @@ SQ_LDS_IDX_ACTIVE: gfx942/gfx941/gfx940: block: SQ event: 131 - description: Number of cycles LDS is used for indexed (non-direct,non-interpolation) - operations. (per-simd, emulated) + description: Number of cycles LDS (local data store) is used for indexed (non-direct,non-interpolation) + operations. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis. SQ_LDS_MEM_VIOLATIONS: architectures: gfx90a: @@ -1689,7 +1802,8 @@ SQ_LDS_MEM_VIOLATIONS: gfx942/gfx941/gfx940: block: SQ event: 129 - description: Number of threads that have a memory violation in the LDS.(emulated) + description: Number of threads that have a memory violation in the LDS (local data store). This value + is returned on a per-SE (aggregate of values in SIMDs in the SE) basis. SQ_LDS_UNALIGNED_STALL: architectures: gfx90a: @@ -1698,8 +1812,8 @@ SQ_LDS_UNALIGNED_STALL: gfx942/gfx941/gfx940: block: SQ event: 128 - description: Number of cycles LDS is stalled processing flat unaligned load/store - ops. (emulated) + description: Number of cycles LDS (local data store) is stalled processing flat unaligned load/store + ops. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis. SQ_LEVEL_WAVES: architectures: gfx10/gfx1010/gfx1030/gfx1031/gfx1032: @@ -1708,8 +1822,8 @@ SQ_LEVEL_WAVES: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 5 - description: Track the number of waves. Set ACCUM_PREV for the next counter to use - this. (level, per-simd, global) + description: Track the number of waves. 
Set ACCUM_PREV for the next counter to use this. This value + is returned on a per-SIMD basis. SQ_THREAD_CYCLES_VALU: architectures: gfx8: @@ -1727,8 +1841,8 @@ SQ_THREAD_CYCLES_VALU: gfx942/gfx941/gfx940: block: SQ event: 118 - description: 'Number of thread-cycles used to execute VALU operations (similar to - INST_CYCLES_VALU but multiplied by # of active threads). (per-simd)' + description: 'Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but + multiplied by # of active threads). (per-simd)' SQ_VALU_MFMA_BUSY_CYCLES: architectures: gfx90a: @@ -1737,7 +1851,8 @@ SQ_VALU_MFMA_BUSY_CYCLES: gfx942/gfx941/gfx940: block: SQ event: 77 - description: Number of cycles the MFMA ALU is busy (per-simd, emulated) + description: Number of cycles the MFMA (Matrix-Fused-Multiply-Add) ALU is busy. This value is returned + on a per-SIMD basis. SQ_WAIT_ANY: architectures: gfx10/gfx1010/gfx1030/gfx1031/gfx1032: @@ -1752,8 +1867,8 @@ SQ_WAIT_ANY: gfx942/gfx941/gfx940: block: SQ event: 90 - description: Number of wave-cycles spent waiting for anything (per-simd, nondeterministic). - Units in quad-cycles(4 cycles) + description: Number of wave-cycles spent waiting for anything (per-simd, nondeterministic). Units in + quad-cycles(4 cycles) SQ_WAIT_INST_ANY: architectures: gfx10/gfx1010/gfx1030/gfx1031/gfx1032: @@ -1768,8 +1883,7 @@ SQ_WAIT_INST_ANY: gfx942/gfx941/gfx940: block: SQ event: 93 - description: Number of wave-cycles spent waiting for any instruction issue. In units - of 4 cycles. (per-simd, nondeterministic) + description: Number of wave-cycles spent waiting for any instruction issue. Units in quad-cycles(4 cycles). SQ_WAIT_INST_LDS: architectures: gfx10/gfx1010/gfx1030/gfx1031/gfx1032: @@ -1793,8 +1907,8 @@ SQ_WAIT_INST_LDS: gfx942/gfx941/gfx940: block: SQ event: 96 - description: Number of wave-cycles spent waiting for LDS instruction issue. In units - of 4 cycles.
(per-simd, nondeterministic) + description: Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, + nondeterministic) SQ_WAVE32_INSTS: architectures: gfx10/gfx1010/gfx1030/gfx1031/gfx1032: @@ -1803,8 +1917,7 @@ SQ_WAVE32_INSTS: gfx11/gfx1102/gfx1100/gfx1101: block: SQ event: 82 - description: Number of instructions issued by wave32 waves. Skipped instructions - are not counted. {emulated} + description: Number of instructions issued by wave32 waves. Skipped instructions are not counted. {emulated} SQ_WAVE64_INSTS: architectures: gfx10/gfx1010/gfx1030/gfx1031/gfx1032: @@ -1813,49 +1926,75 @@ SQ_WAVE64_INSTS: gfx11/gfx1102/gfx1100/gfx1101: block: SQ event: 83 - description: Number of instructions issued by wave64 waves. Skipped instructions - are not counted. {emulated} + description: Number of instructions issued by wave64 waves. Skipped instructions are not counted. {emulated} SQ_WAVES: architectures: - gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx8/gfx900/gfx90a/gfx9: + gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx8/gfx900/gfx90a/gfx9: block: SQ event: 4 - description: Count number of waves sent to SQs. (per-simd, emulated, global) + description: Count number of waves sent to distributed sequencers (SQs). This value represents the number + of waves that are sent to each SQ. This only counts new waves sent since the start of collection (for + dispatch profiling this is the timeframe of kernel execution, for agent profiling it is the timeframe + between start_context and read counter data). A sum of all SQ_WAVES values will give the total number + of waves started by the application during the collection timeframe. Returns one value per-SE (aggregates + of SIMD values). 
SQ_WAVES_EQ_64: architectures: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 6 - description: Count number of waves with exactly 64 active threads sent to SQs. (per-simd, - emulated, global) + description: Count number of waves with exactly 64 active threads sent to SQs. This value represents + the number of waves that each individual SIMD has enqueued during the collection timeframe (for + dispatch profiling this is the timeframe of kernel execution, for agent profiling it is the timeframe + between start_context and read counter data) with exactly 64 threads. A sum of all SQ_WAVES_EQ_64 + values will give the total number of waves with 64 threads enqueued during the collection timeframe + by the application. Returns one value per-SE (aggregates of SIMD values). Useful for checking for + wavefront occupancy. SQ_WAVES_LT_16: architectures: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 10 - description: Count number of waves sent <16 active threads sent to SQs. (per-simd, - emulated, global) + description: Count number of waves with <16 active threads sent to SQs. (per-simd, emulated, global). + This value represents the number of waves that each individual SIMD has enqueued during the collection + timeframe (for dispatch profiling this is the timeframe of kernel execution, for agent profiling it + is the timeframe between start_context and read counter data) with less than 16 threads. A sum of + all SQ_WAVES_LT_16 values will give the total number of waves with less than 16 threads enqueued during the + collection timeframe by the application. Returns one value per-SE (aggregates of SIMD values). Useful + for checking for wavefront occupancy. SQ_WAVES_LT_32: architectures: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 9 - description: Count number of waves sent <32 active threads sent to SQs. (per-simd, - emulated, global) + description: Count number of waves with <32 active threads sent to SQs.
This value represents the number + of waves that each individual SIMD has enqueued during the collection timeframe (for dispatch profiling + this is the timeframe of kernel execution, for agent profiling it is the timeframe between start_context + and read counter data) with less than 32 threads. A sum of all SQ_WAVES_LT_32 values will give the + total number of waves with less than 32 threads enqueued during the collection timeframe by the application. + Returns one value per-SE (aggregates of SIMD values). Useful for checking for wavefront occupancy. SQ_WAVES_LT_48: architectures: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 8 - description: Count number of waves with <48 active threads sent to SQs. (per-simd, - emulated, global) + description: Count number of waves with <48 active threads sent to SQs. This value represents the number + of waves that each individual SIMD has enqueued during the collection timeframe (for dispatch profiling + this is the timeframe of kernel execution, for agent profiling it is the timeframe between start_context + and read counter data) with less than 48 threads. A sum of all SQ_WAVES_LT_48 values will give the + total number of waves with less than 48 threads enqueued during the collection timeframe by the application. + Returns one value per-SE (aggregates of SIMD values). Useful for checking for wavefront occupancy. SQ_WAVES_LT_64: architectures: gfx942/gfx941/gfx940/gfx90a: block: SQ event: 7 - description: Count number of waves with <64 active threads sent to SQs. (per-simd, - emulated, global) + description: Count number of waves with <64 active threads sent to SQs. This value represents the number + of waves that each individual SIMD has enqueued during the collection timeframe (for dispatch profiling + this is the timeframe of kernel execution, for agent profiling it is the timeframe between start_context + and read counter data) with less than 64 threads.
A sum of all SQ_WAVES_LT_64 values will give the + total number of waves with less than 64 threads enqueued during the collection timeframe by the application. + Returns one value per-SE (aggregates of SIMD values). Useful for checking for wavefront occupancy. SQ_WAVES_RESTORED: architectures: gfx90a: @@ -1864,8 +2003,12 @@ SQ_WAVES_RESTORED: gfx942/gfx941/gfx940: block: SQ event: 185 - description: Count number of context-restored waves sent to SQs. (per-simd, emulated, - global) + description: Count number of context-restored waves sent to SQs. This value represents the number of + waves whose current register state has been restored from a register bank during the collection timeframe + (for dispatch profiling this is the timeframe of kernel execution, for agent profiling it is the timeframe + between start_context and read counter data). Context saving/restoring is a slow operation and should + be limited. High values can also indicate that stalling may be taking place (waiting for free register + space). Returns one value per-SE (aggregates of SIMD values). SQ_WAVES_SAVED: architectures: gfx90a: @@ -1874,13 +2017,19 @@ SQ_WAVES_SAVED: gfx942/gfx941/gfx940: block: SQ event: 186 - description: Count number of context-saved waves. (per-simd, emulated, global) + description: Count number of context-saved waves sent to SQs. This value represents the number of waves + whose current register state has been saved to a register bank during the collection timeframe (for + dispatch profiling this is the timeframe of kernel execution, for agent profiling it is the timeframe + between start_context and read counter data). Context saving/restoring is a slow operation and should + be limited. High values can also indicate that stalling may be taking place (waiting for free register + space). Returns one value per-SE (aggregates of SIMD values).
SQ_WAVES_sum: architectures: gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx8/gfx90a/gfx9: expression: reduce(SQ_WAVES,sum) - description: Count number of waves sent to SQs. (per-simd, emulated, global). Sum - over SQ instances. + description: Gives the total number of waves currently enqueued by the application during the collection + timeframe (for dispatch profiling this is the timeframe of kernel execution, for agent profiling it + is the timeframe between start_context and read counter data). See SQ_WAVES for more details. SQ_WAVE_CYCLES: architectures: gfx10/gfx1010/gfx1030/gfx1031/gfx1032: @@ -1895,8 +2044,10 @@ SQ_WAVE_CYCLES: gfx942/gfx941/gfx940: block: SQ event: 79 - description: Number of wave-cycles spent by waves in the CUs (per-simd, nondeterministic). - Units in quad-cycles(4 cycles) + description: The cycles spent executing waves in the CUs. This value is reported per-SE (aggregates + of SIMD values) and is nondeterministic. Units are in quad-cycles (4 cycles). Useful for determining + how much time is spent executing wave code vs overhead/waiting. Low cycle count relative to actual + number of cycles processed by the CU can indicate that the CU is stalling or is overloaded. ScaPipeIssueUtil: architectures: gfx90a: @@ -1920,14 +2071,13 @@ TA_ADDR_STALLED_BY_TC_CYCLES: gfx942/gfx941/gfx940: block: TA event: 42 - description: Number of cycles addr path stalled by TC. Perf_Windowing not supported - for this counter. + description: Number of cycles addr path stalled by TC. Perf_Windowing not supported for this counter. TA_ADDR_STALLED_BY_TC_CYCLES_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TA_ADDR_STALLED_BY_TC_CYCLES,sum) - description: Number of cycles addr path stalled by TC. Perf_Windowing not supported - for this counter. Sum over TA instances. + description: Number of cycles addr path stalled by TC. Perf_Windowing not supported for this counter. 
+ Sum over TA instances. TA_ADDR_STALLED_BY_TD_CYCLES: architectures: gfx90a: @@ -1936,14 +2086,13 @@ TA_ADDR_STALLED_BY_TD_CYCLES: gfx942/gfx941/gfx940: block: TA event: 43 - description: Number of cycles addr path stalled by TD. Perf_Windowing not supported - for this counter. + description: Number of cycles addr path stalled by TD. Perf_Windowing not supported for this counter. TA_ADDR_STALLED_BY_TD_CYCLES_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TA_ADDR_STALLED_BY_TD_CYCLES,sum) - description: Number of cycles addr path stalled by TD. Perf_Windowing not supported - for this counter. Sum over TA instances. + description: Number of cycles addr path stalled by TD. Perf_Windowing not supported for this counter. + Sum over TA instances. TA_BUFFER_ATOMIC_WAVEFRONTS: architectures: gfx90a: @@ -1996,8 +2145,7 @@ TA_BUFFER_LOAD_WAVEFRONTS_sum: architectures: gfx11/gfx1102/gfx1100/gfx1101: expression: reduce(TA_BUFFER_LOAD_WAVEFRONTS,sum) - description: Number of buffer load vec32 packets processed by the TA. Sum over TA - instances. + description: Number of buffer load vec32 packets processed by the TA. Sum over TA instances. TA_BUFFER_READ_WAVEFRONTS: architectures: gfx90a: @@ -2022,8 +2170,7 @@ TA_BUFFER_STORE_WAVEFRONTS_sum: architectures: gfx11/gfx1102/gfx1100/gfx1101: expression: reduce(TA_BUFFER_STORE_WAVEFRONTS,sum) - description: Number of buffer store vec32 packets processed by the TA. Sum over - TA instances. + description: Number of buffer store vec32 packets processed by the TA. Sum over TA instances. TA_BUFFER_TOTAL_CYCLES: architectures: gfx90a: @@ -2089,14 +2236,13 @@ TA_DATA_STALLED_BY_TC_CYCLES: gfx942/gfx941/gfx940: block: TA event: 44 - description: Number of cycles data path stalled by TC. Perf_Windowing not supported - for this counter. + description: Number of cycles data path stalled by TC. Perf_Windowing not supported for this counter. 
TA_DATA_STALLED_BY_TC_CYCLES_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TA_DATA_STALLED_BY_TC_CYCLES,sum) - description: Number of cycles data path stalled by TC. Perf_Windowing not supported - for this counter. Sum over TA instances. + description: Number of cycles data path stalled by TC. Perf_Windowing not supported for this counter. + Sum over TA instances. TA_FLAT_ATOMIC_WAVEFRONTS: architectures: gfx90a: @@ -2116,14 +2262,13 @@ TA_FLAT_LOAD_WAVEFRONTS: gfx10/gfx1010/gfx1030/gfx1031/gfx1032: block: TA event: 101 - description: ' Number of flat load vec32 packets processed by TA, same as flat_read_wavefronts - in earlier IP' + description: ' Number of flat load vec32 packets processed by TA, same as flat_read_wavefronts in earlier + IP' TA_FLAT_LOAD_WAVEFRONTS_sum: architectures: gfx10/gfx1010/gfx1030/gfx1031/gfx1032: expression: reduce(TA_FLAT_LOAD_WAVEFRONTS,sum) - description: Number of flat load vec32 packets processed by the TA. Sum over TA - instances. + description: Number of flat load vec32 packets processed by the TA. Sum over TA instances. TA_FLAT_READ_WAVEFRONTS: architectures: gfx906/gfx908/gfx8/gfx900/gfx90a/gfx9: @@ -2143,14 +2288,13 @@ TA_FLAT_STORE_WAVEFRONTS: gfx10/gfx1010/gfx1030/gfx1031/gfx1032: block: TA event: 102 - description: Number of flat store vec32 packets processed by TA, same as flat_write_wavefronts - in earlier IP + description: Number of flat store vec32 packets processed by TA, same as flat_write_wavefronts in earlier + IP TA_FLAT_STORE_WAVEFRONTS_sum: architectures: gfx10/gfx1010/gfx1030/gfx1031/gfx1032: expression: reduce(TA_FLAT_STORE_WAVEFRONTS,sum) - description: Number of flat store vec32 packets processed by the TA. Sum over TA - instances. + description: Number of flat store vec32 packets processed by the TA. Sum over TA instances. 
TA_FLAT_WAVEFRONTS: architectures: gfx90a: @@ -2192,8 +2336,7 @@ TA_TA_BUSY_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TA_TA_BUSY,sum) - description: TA block is busy. Perf_Windowing not supported for this counter. Sum - over TA instances. + description: TA block is busy. Perf_Windowing not supported for this counter. Sum over TA instances. TA_TOTAL_WAVEFRONTS: architectures: gfx90a: @@ -2212,8 +2355,8 @@ TA_UTIL: architectures: gfx10/gfx1010/gfx1030/gfx1031/gfx1032: expression: 100*GRBM_TA_BUSY/GRBM_GUI_ACTIVE - description: Percentage of the GRBM_GUI_ACTIVE time that any of the Texture Pipes - (TA) are busy in the shader engine(s). + description: Percentage of the GRBM_GUI_ACTIVE time that any of the Texture Pipes (TA) are busy in the + shader engine(s). TCA_BUSY: architectures: gfx942/gfx941/gfx940/gfx90a: @@ -2246,8 +2389,7 @@ TCC_ALL_TC_OP_INV_EVICT_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_ALL_TC_OP_INV_EVICT,sum) - description: Number of evictions due to all TC_OP invalidate requests. Sum over - TCC instances. + description: Number of evictions due to all TC_OP invalidate requests. Sum over TCC instances. TCC_ALL_TC_OP_WB_WRITEBACK: architectures: gfx942/gfx941/gfx940/gfx90a: @@ -2258,8 +2400,7 @@ TCC_ALL_TC_OP_WB_WRITEBACK_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_ALL_TC_OP_WB_WRITEBACK,sum) - description: Number of writebacks due to all TC_OP writeback requests. Sum over - TCC instances. + description: Number of writebacks due to all TC_OP writeback requests. Sum over TCC instances. TCC_ATOMIC: architectures: gfx942/gfx941/gfx940/gfx90a: @@ -2286,21 +2427,19 @@ TCC_BUSY_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_BUSY,sum) - description: Number of cycles we have a request pending. Not windowable. Sum over - TCC instances. + description: Number of cycles we have a request pending. Not windowable. Sum over TCC instances. 
TCC_CC_REQ: architectures: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 7 - description: The number of coherently cached requests. This is measured at the tag - block. + description: The number of coherently cached requests. This is measured at the tag block. TCC_CC_REQ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_CC_REQ,sum) - description: The number of coherently cached requests. This is measured at the tag - block. Sum over TCC instances. + description: The number of coherently cached requests. This is measured at the tag block. Sum over TCC + instances. TCC_CYCLE: architectures: gfx942/gfx941/gfx940/gfx90a: @@ -2317,28 +2456,27 @@ TCC_EA0_ATOMIC: gfx942/gfx941/gfx940: block: TCC event: 36 - description: Number of transactions going over the TC_EA_wrreq interface that are - actually atomic requests. + description: Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests. TCC_EA0_ATOMIC_LEVEL: architectures: gfx942/gfx941/gfx940: block: TCC event: 37 - description: The sum of the number of EA atomics in flight. This is primarily meant - for measure average EA atomic latency. Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC. + description: The sum of the number of EA atomics in flight. This is primarily meant for measure average + EA atomic latency. Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC. TCC_EA0_ATOMIC_LEVEL_sum: architectures: gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_ATOMIC_LEVEL,sum) - description: The sum of the number of EA atomics in flight. This is primarily meant - for measure average EA atomic latency. Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC. + description: The sum of the number of EA atomics in flight. This is primarily meant for measure average + EA atomic latency. 
Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC. Sum over TCC instances. TCC_EA0_ATOMIC_sum: architectures: gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_ATOMIC,sum) - description: Number of transactions going over the TC_EA_wrreq interface that are - actually atomic requests. Sum over TCC instances. + description: Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests. + Sum over TCC instances. TCC_EA0_RDREQ: architectures: gfx942/gfx941/gfx940: @@ -2361,178 +2499,165 @@ TCC_EA0_RDREQ_DRAM: gfx942/gfx941/gfx940: block: TCC event: 102 - description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined - for DRAM (MC). + description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC). TCC_EA0_RDREQ_DRAM_CREDIT_STALL: architectures: gfx942/gfx941/gfx940: block: TCC event: 43 - description: Number of cycles there was a stall because the read request interface - was out of DRAM credits. Stalls occur regardless of whether a read needed to be - performed or not. + description: Number of cycles there was a stall because the read request interface was out of DRAM credits. + Stalls occur regardless of whether a read needed to be performed or not. TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum: architectures: gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_RDREQ_DRAM_CREDIT_STALL,sum) - description: Number of cycles there was a stall because the read request interface - was out of DRAM credits. Stalls occur regardless of whether a read needed to be - performed or not. Sum over TCC instances. + description: Number of cycles there was a stall because the read request interface was out of DRAM credits. + Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances. 
TCC_EA0_RDREQ_DRAM_sum: architectures: gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_RDREQ_DRAM,sum) - description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined - for DRAM (MC). Sum over TCC instances. + description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC). Sum + over TCC instances. TCC_EA0_RDREQ_GMI_CREDIT_STALL: architectures: gfx942/gfx941/gfx940: block: TCC event: 42 - description: Number of cycles there was a stall because the read request interface - was out of GMI credits. Stalls occur regardless of whether a read needed to be - performed or not. + description: Number of cycles there was a stall because the read request interface was out of GMI credits. + Stalls occur regardless of whether a read needed to be performed or not. TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum: architectures: gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_RDREQ_GMI_CREDIT_STALL,sum) - description: Number of cycles there was a stall because the read request interface - was out of GMI credits. Stalls occur regardless of whether a read needed to be - performed or not. Sum over TCC instances. + description: Number of cycles there was a stall because the read request interface was out of GMI credits. + Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances. TCC_EA0_RDREQ_IO_CREDIT_STALL: architectures: gfx942/gfx941/gfx940: block: TCC event: 41 - description: Number of cycles there was a stall because the read request interface - was out of IO credits. Stalls occur regardless of whether a read needed to be - performed or not. + description: Number of cycles there was a stall because the read request interface was out of IO credits. + Stalls occur regardless of whether a read needed to be performed or not. 
TCC_EA0_RDREQ_IO_CREDIT_STALL_sum: architectures: gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_RDREQ_IO_CREDIT_STALL,sum) - description: Number of cycles there was a stall because the read request interface - was out of IO credits. Stalls occur regardless of whether a read needed to be - performed or not. Sum over TCC instances. + description: Number of cycles there was a stall because the read request interface was out of IO credits. + Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances. TCC_EA0_RDREQ_LEVEL: architectures: gfx942/gfx941/gfx940: block: TCC event: 44 - description: The sum of the number of TCC/EA read requests in flight. This is primarily - meant for measure average EA read latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ. + description: The sum of the number of TCC/EA read requests in flight. This is primarily meant for measure + average EA read latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ. TCC_EA0_RDREQ_LEVEL_sum: architectures: gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_RDREQ_LEVEL,sum) - description: The sum of the number of TCC/EA read requests in flight. This is primarily - meant for measure average EA read latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ. + description: The sum of the number of TCC/EA read requests in flight. This is primarily meant for measure + average EA read latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ. Sum over TCC instances. TCC_EA0_RDREQ_sum: architectures: gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_RDREQ,sum) - description: Number of TCC/EA read requests (either 32-byte or 64-byte) Sum over - TCC instances. + description: Number of TCC/EA read requests (either 32-byte or 64-byte) Sum over TCC instances. 
TCC_EA0_RD_UNCACHED_32B: architectures: gfx942/gfx941/gfx940: block: TCC event: 40 - description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request - will be counted as 2 + description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted + as 2 TCC_EA0_RD_UNCACHED_32B_sum: architectures: gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_RD_UNCACHED_32B,sum) - description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request - will be counted as 2 Sum over TCC instances. + description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted + as 2 Sum over TCC instances. TCC_EA0_WRREQ: architectures: gfx942/gfx941/gfx940: block: TCC event: 26 - description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq - interface. Atomics may travel over the same interface and are generally classified - as write requests. This does not include probe commands. + description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. + Atomics may travel over the same interface and are generally classified as write requests. This does + not include probe commands. TCC_EA0_WRREQ_64B: architectures: gfx942/gfx941/gfx940: block: TCC event: 27 - description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over - the TC_EA_wrreq interface. + description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. TCC_EA0_WRREQ_64B_sum: architectures: gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_WRREQ_64B,sum) - description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over - the TC_EA_wrreq interface. Sum over TCC instances. + description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. + Sum over TCC instances. 
TCC_EA0_WRREQ_DRAM: architectures: gfx942/gfx941/gfx940: block: TCC event: 103 - description: Number of TCC/EA write requests (either 32-byte of 64-byte) destined - for DRAM (MC). + description: Number of TCC/EA write requests (either 32-byte of 64-byte) destined for DRAM (MC). TCC_EA0_WRREQ_DRAM_CREDIT_STALL: architectures: gfx942/gfx941/gfx940: block: TCC event: 33 - description: Number of cycles a EA write request was stalled because the interface - was out of DRAM credits. + description: Number of cycles a EA write request was stalled because the interface was out of DRAM credits. TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum: architectures: gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_WRREQ_DRAM_CREDIT_STALL,sum) - description: Number of cycles a EA write request was stalled because the interface - was out of DRAM credits. Sum over TCC instances. + description: Number of cycles a EA write request was stalled because the interface was out of DRAM credits. + Sum over TCC instances. TCC_EA0_WRREQ_DRAM_sum: architectures: gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_WRREQ_DRAM,sum) - description: Number of TCC/EA write requests (either 32-byte of 64-byte) destined - for DRAM (MC). Sum over TCC instances. + description: Number of TCC/EA write requests (either 32-byte of 64-byte) destined for DRAM (MC). Sum + over TCC instances. TCC_EA0_WRREQ_GMI_CREDIT_STALL: architectures: gfx942/gfx941/gfx940: block: TCC event: 32 - description: Number of cycles a EA write request was stalled because the interface - was out of GMI credits. + description: Number of cycles a EA write request was stalled because the interface was out of GMI credits. TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum: architectures: gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_WRREQ_GMI_CREDIT_STALL,sum) - description: Number of cycles a EA write request was stalled because the interface - was out of GMI credits. Sum over TCC instances. 
+ description: Number of cycles a EA write request was stalled because the interface was out of GMI credits. + Sum over TCC instances. TCC_EA0_WRREQ_IO_CREDIT_STALL: architectures: gfx942/gfx941/gfx940: block: TCC event: 31 - description: Number of cycles a EA write request was stalled because the interface - was out of IO credits. + description: Number of cycles a EA write request was stalled because the interface was out of IO credits. TCC_EA0_WRREQ_IO_CREDIT_STALL_sum: architectures: gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_WRREQ_IO_CREDIT_STALL,sum) - description: Number of cycles a EA write request was stalled because the interface - was out of IO credits. Sum over TCC instances. + description: Number of cycles a EA write request was stalled because the interface was out of IO credits. + Sum over TCC instances. TCC_EA0_WRREQ_LEVEL: architectures: gfx942/gfx941/gfx940: block: TCC event: 35 - description: The sum of the number of EA write requests in flight. This is primarily - meant for measure average EA write latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ. + description: The sum of the number of EA write requests in flight. This is primarily meant for measure + average EA write latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ. TCC_EA0_WRREQ_LEVEL_sum: architectures: gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_WRREQ_LEVEL,sum) - description: The sum of the number of EA write requests in flight. This is primarily - meant for measure average EA write latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ. + description: The sum of the number of EA write requests in flight. This is primarily meant for measure + average EA write latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ. Sum over TCC instances. 
TCC_EA0_WRREQ_PROBE_COMMAND: architectures: @@ -2555,25 +2680,24 @@ TCC_EA0_WRREQ_sum: architectures: gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_WRREQ,sum) - description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq - interface. Atomics may travel over the same interface and are generally classified - as write requests. This does not include probe commands. Sum over TCC instances. + description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. + Atomics may travel over the same interface and are generally classified as write requests. This does + not include probe commands. Sum over TCC instances. TCC_EA0_WR_UNCACHED_32B: architectures: gfx942/gfx941/gfx940: block: TCC event: 29 - description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface - due to uncached traffic. Note that CC mtypes can produce uncached requests, and - those are included in this. A 64-byte request will be counted as 2 + description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic. + Note that CC mtypes can produce uncached requests, and those are included in this. A 64-byte request + will be counted as 2 TCC_EA0_WR_UNCACHED_32B_sum: architectures: gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_WR_UNCACHED_32B,sum) - description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface - due to uncached traffic. Note that CC mtypes can produce uncached requests, and - those are included in this. A 64-byte request will be counted as 2. Sum over TCC - instances. + description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic. + Note that CC mtypes can produce uncached requests, and those are included in this. A 64-byte request + will be counted as 2. Sum over TCC instances. 
TCC_EA1_RDREQ: architectures: gfx906: @@ -2595,29 +2719,27 @@ TCC_EA1_RDREQ_sum: architectures: gfx906: expression: reduce(TCC_EA1_RDREQ,sum) - description: Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over - TCC EA1s. + description: Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC EA1s. TCC_EA1_WRREQ: architectures: gfx906: block: TCC event: 256 - description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq - interface. Atomics may travel over the same interface and are generally classified - as write requests. This does not include probe commands. + description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. + Atomics may travel over the same interface and are generally classified as write requests. This does + not include probe commands. TCC_EA1_WRREQ_64B: architectures: gfx906: block: TCC event: 257 - description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over - the TC_EA_wrreq interface. + description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. TCC_EA1_WRREQ_64B_sum: architectures: gfx906: expression: reduce(TCC_EA1_WRREQ_64B,sum) - description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over - the TC_EA_wrreq interface. Sum over TCC EA1s. + description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. + Sum over TCC EA1s. TCC_EA1_WRREQ_STALL: architectures: gfx906: @@ -2628,35 +2750,34 @@ TCC_EA1_WRREQ_sum: architectures: gfx906: expression: reduce(TCC_EA1_WRREQ,sum) - description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq - interface. Sum over TCC EA1s. + description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. + Sum over TCC EA1s. 
TCC_EA_ATOMIC: architectures: gfx90a: block: TCC event: 36 - description: Number of transactions going over the TC_EA_wrreq interface that are - actually atomic requests. + description: Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests. TCC_EA_ATOMIC_LEVEL: architectures: gfx90a: block: TCC event: 37 - description: The sum of the number of EA atomics in flight. This is primarily meant - for measure average EA atomic latency. Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC. + description: The sum of the number of EA atomics in flight. This is primarily meant for measure average + EA atomic latency. Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC. TCC_EA_ATOMIC_LEVEL_sum: architectures: gfx90a: expression: reduce(TCC_EA_ATOMIC_LEVEL,sum) - description: The sum of the number of EA atomics in flight. This is primarily meant - for measure average EA atomic latency. Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC. + description: The sum of the number of EA atomics in flight. This is primarily meant for measure average + EA atomic latency. Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC. Sum over TCC instances. TCC_EA_ATOMIC_sum: architectures: gfx90a: expression: reduce(TCC_EA_ATOMIC,sum) - description: Number of transactions going over the TC_EA_wrreq interface that are - actually atomic requests. Sum over TCC instances. + description: Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests. + Sum over TCC instances. TCC_EA_RDREQ: architectures: gfx906/gfx900/gfx9: @@ -2685,92 +2806,84 @@ TCC_EA_RDREQ_DRAM: gfx90a: block: TCC event: 102 - description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined - for DRAM (MC). 
+ description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC). TCC_EA_RDREQ_DRAM_CREDIT_STALL: architectures: gfx90a: block: TCC event: 43 - description: Number of cycles there was a stall because the read request interface - was out of DRAM credits. Stalls occur regardless of whether a read needed to be - performed or not. + description: Number of cycles there was a stall because the read request interface was out of DRAM credits. + Stalls occur regardless of whether a read needed to be performed or not. TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum: architectures: gfx90a: expression: reduce(TCC_EA_RDREQ_DRAM_CREDIT_STALL,sum) - description: Number of cycles there was a stall because the read request interface - was out of DRAM credits. Stalls occur regardless of whether a read needed to be - performed or not. Sum over TCC instances. + description: Number of cycles there was a stall because the read request interface was out of DRAM credits. + Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances. TCC_EA_RDREQ_DRAM_sum: architectures: gfx90a: expression: reduce(TCC_EA_RDREQ_DRAM,sum) - description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined - for DRAM (MC). Sum over TCC instances. + description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC). Sum + over TCC instances. TCC_EA_RDREQ_GMI_CREDIT_STALL: architectures: gfx90a: block: TCC event: 42 - description: Number of cycles there was a stall because the read request interface - was out of GMI credits. Stalls occur regardless of whether a read needed to be - performed or not. + description: Number of cycles there was a stall because the read request interface was out of GMI credits. + Stalls occur regardless of whether a read needed to be performed or not. 
TCC_EA_RDREQ_GMI_CREDIT_STALL_sum: architectures: gfx90a: expression: reduce(TCC_EA_RDREQ_GMI_CREDIT_STALL,sum) - description: Number of cycles there was a stall because the read request interface - was out of GMI credits. Stalls occur regardless of whether a read needed to be - performed or not. Sum over TCC instances. + description: Number of cycles there was a stall because the read request interface was out of GMI credits. + Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances. TCC_EA_RDREQ_IO_CREDIT_STALL: architectures: gfx90a: block: TCC event: 41 - description: Number of cycles there was a stall because the read request interface - was out of IO credits. Stalls occur regardless of whether a read needed to be - performed or not. + description: Number of cycles there was a stall because the read request interface was out of IO credits. + Stalls occur regardless of whether a read needed to be performed or not. TCC_EA_RDREQ_IO_CREDIT_STALL_sum: architectures: gfx90a: expression: reduce(TCC_EA_RDREQ_IO_CREDIT_STALL,sum) - description: Number of cycles there was a stall because the read request interface - was out of IO credits. Stalls occur regardless of whether a read needed to be - performed or not. Sum over TCC instances. + description: Number of cycles there was a stall because the read request interface was out of IO credits. + Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances. TCC_EA_RDREQ_LEVEL: architectures: gfx90a: block: TCC event: 44 - description: The sum of the number of TCC/EA read requests in flight. This is primarily - meant for measure average EA read latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ. + description: The sum of the number of TCC/EA read requests in flight. This is primarily meant for measure + average EA read latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ. 
TCC_EA_RDREQ_LEVEL_sum: architectures: gfx90a: expression: reduce(TCC_EA_RDREQ_LEVEL,sum) - description: The sum of the number of TCC/EA read requests in flight. This is primarily - meant for measure average EA read latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ. + description: The sum of the number of TCC/EA read requests in flight. This is primarily meant for measure + average EA read latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ. Sum over TCC instances. TCC_EA_RDREQ_sum: architectures: gfx906/gfx908/gfx90a/gfx9: expression: reduce(TCC_EA_RDREQ,sum) - description: Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over - TCC instances. + description: Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances. TCC_EA_RD_UNCACHED_32B: architectures: gfx90a: block: TCC event: 40 - description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request - will be counted as 2 + description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted + as 2 TCC_EA_RD_UNCACHED_32B_sum: architectures: gfx90a: expression: reduce(TCC_EA_RD_UNCACHED_32B,sum) - description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request - will be counted as 2 Sum over TCC instances. + description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted + as 2 Sum over TCC instances. TCC_EA_WRREQ: architectures: gfx906/gfx900/gfx9: @@ -2779,9 +2892,9 @@ TCC_EA_WRREQ: gfx908/gfx90a: block: TCC event: 26 - description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq - interface. Atomics may travel over the same interface and are generally classified - as write requests. This does not include probe commands. + description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. 
+ Atomics may travel over the same interface and are generally classified as write requests. This does + not include probe commands. TCC_EA_WRREQ_64B: architectures: gfx906/gfx900/gfx9: @@ -2790,79 +2903,74 @@ TCC_EA_WRREQ_64B: gfx908/gfx90a: block: TCC event: 27 - description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over - the TC_EA_wrreq interface. + description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. TCC_EA_WRREQ_64B_sum: architectures: gfx906/gfx908/gfx90a/gfx9: expression: reduce(TCC_EA_WRREQ_64B,sum) - description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over - the TC_EA_wrreq interface. Sum over TCC instances. + description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. + Sum over TCC instances. TCC_EA_WRREQ_DRAM: architectures: gfx90a: block: TCC event: 103 - description: Number of TCC/EA write requests (either 32-byte of 64-byte) destined - for DRAM (MC). + description: Number of TCC/EA write requests (either 32-byte of 64-byte) destined for DRAM (MC). TCC_EA_WRREQ_DRAM_CREDIT_STALL: architectures: gfx90a: block: TCC event: 33 - description: Number of cycles a EA write request was stalled because the interface - was out of DRAM credits. + description: Number of cycles a EA write request was stalled because the interface was out of DRAM credits. TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum: architectures: gfx90a: expression: reduce(TCC_EA_WRREQ_DRAM_CREDIT_STALL,sum) - description: Number of cycles a EA write request was stalled because the interface - was out of DRAM credits. Sum over TCC instances. + description: Number of cycles a EA write request was stalled because the interface was out of DRAM credits. + Sum over TCC instances. 
TCC_EA_WRREQ_DRAM_sum: architectures: gfx90a: expression: reduce(TCC_EA_WRREQ_DRAM,sum) - description: Number of TCC/EA write requests (either 32-byte of 64-byte) destined - for DRAM (MC). Sum over TCC instances. + description: Number of TCC/EA write requests (either 32-byte of 64-byte) destined for DRAM (MC). Sum + over TCC instances. TCC_EA_WRREQ_GMI_CREDIT_STALL: architectures: gfx90a: block: TCC event: 32 - description: Number of cycles a EA write request was stalled because the interface - was out of GMI credits. + description: Number of cycles a EA write request was stalled because the interface was out of GMI credits. TCC_EA_WRREQ_GMI_CREDIT_STALL_sum: architectures: gfx90a: expression: reduce(TCC_EA_WRREQ_GMI_CREDIT_STALL,sum) - description: Number of cycles a EA write request was stalled because the interface - was out of GMI credits. Sum over TCC instances. + description: Number of cycles a EA write request was stalled because the interface was out of GMI credits. + Sum over TCC instances. TCC_EA_WRREQ_IO_CREDIT_STALL: architectures: gfx90a: block: TCC event: 31 - description: Number of cycles a EA write request was stalled because the interface - was out of IO credits. + description: Number of cycles a EA write request was stalled because the interface was out of IO credits. TCC_EA_WRREQ_IO_CREDIT_STALL_sum: architectures: gfx90a: expression: reduce(TCC_EA_WRREQ_IO_CREDIT_STALL,sum) - description: Number of cycles a EA write request was stalled because the interface - was out of IO credits. Sum over TCC instances. + description: Number of cycles a EA write request was stalled because the interface was out of IO credits. + Sum over TCC instances. TCC_EA_WRREQ_LEVEL: architectures: gfx90a: block: TCC event: 35 - description: The sum of the number of EA write requests in flight. This is primarily - meant for measure average EA write latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ. 
+ description: The sum of the number of EA write requests in flight. This is primarily meant for measure + average EA write latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ. TCC_EA_WRREQ_LEVEL_sum: architectures: gfx90a: expression: reduce(TCC_EA_WRREQ_LEVEL,sum) - description: The sum of the number of EA write requests in flight. This is primarily - meant for measure average EA write latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ. + description: The sum of the number of EA write requests in flight. This is primarily meant for measure + average EA write latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ. Sum over TCC instances. TCC_EA_WRREQ_STALL: architectures: @@ -2882,24 +2990,23 @@ TCC_EA_WRREQ_sum: architectures: gfx906/gfx908/gfx90a/gfx9: expression: reduce(TCC_EA_WRREQ,sum) - description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq - interface. Sum over TCC instances. + description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. + Sum over TCC instances. TCC_EA_WR_UNCACHED_32B: architectures: gfx90a: block: TCC event: 29 - description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface - due to uncached traffic. Note that CC mtypes can produce uncached requests, and - those are included in this. A 64-byte request will be counted as 2 + description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic. + Note that CC mtypes can produce uncached requests, and those are included in this. A 64-byte request + will be counted as 2 TCC_EA_WR_UNCACHED_32B_sum: architectures: gfx90a: expression: reduce(TCC_EA_WR_UNCACHED_32B,sum) - description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface - due to uncached traffic. Note that CC mtypes can produce uncached requests, and - those are included in this. 
A 64-byte request will be counted as 2. Sum over TCC - instances. + description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic. + Note that CC mtypes can produce uncached requests, and those are included in this. A 64-byte request + will be counted as 2. Sum over TCC instances. TCC_HIT: architectures: gfx8: @@ -2922,15 +3029,14 @@ TCC_INTERNAL_PROBE: gfx942/gfx941/gfx940: block: TCC event: 11 - description: Number of self-probes spawned by TCC for CC writes/atomic operations. - Not windowable. + description: Number of self-probes spawned by TCC for CC writes/atomic operations. Not windowable. TCC_MC_RDREQ: architectures: gfx8: block: TCC event: 35 - description: Number of 32-byte reads. The hardware actually does 64-byte reads but - the number is adjusted to provide uniformity. + description: Number of 32-byte reads. The hardware actually does 64-byte reads but the number is adjusted + to provide uniformity. TCC_MC_RDREQ_sum: architectures: gfx8: @@ -2941,9 +3047,8 @@ TCC_MC_WRREQ: gfx8: block: TCC event: 26 - description: Number of 32-byte transactions going over the TC_MC_wrreq interface. - Atomics may travel over the same interface and are generally classified as write - requests. + description: Number of 32-byte transactions going over the TC_MC_wrreq interface. Atomics may travel + over the same interface and are generally classified as write requests. TCC_MC_WRREQ_STALL: architectures: gfx8: @@ -2954,8 +3059,7 @@ TCC_MC_WRREQ_sum: architectures: gfx8: expression: reduce(TCC_MC_WRREQ,sum) - description: Number of 32-byte transactions going over the TC_MC_wrreq interface. - Sum over TCC instaces. + description: Number of 32-byte transactions going over the TC_MC_wrreq interface. Sum over TCC instaces. TCC_MISS: architectures: gfx906/gfx900/gfx9: @@ -2975,27 +3079,25 @@ TCC_NC_REQ: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 5 - description: The number of noncoherently cached requests. 
This is measured at the - tag block. + description: The number of noncoherently cached requests. This is measured at the tag block. TCC_NC_REQ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_NC_REQ,sum) - description: The number of noncoherently cached requests. This is measured at the - tag block. Sum over TCC instances. + description: The number of noncoherently cached requests. This is measured at the tag block. Sum over + TCC instances. TCC_NORMAL_EVICT: architectures: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 74 - description: Number of evictions due to requests that are not invalidate or probe - requests. + description: Number of evictions due to requests that are not invalidate or probe requests. TCC_NORMAL_EVICT_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_NORMAL_EVICT,sum) - description: Number of evictions due to requests that are not invalidate or probe - requests. Sum over TCC instances. + description: Number of evictions due to requests that are not invalidate or probe requests. Sum over + TCC instances. TCC_NORMAL_WRITEBACK: architectures: gfx942/gfx941/gfx940/gfx90a: @@ -3006,8 +3108,7 @@ TCC_NORMAL_WRITEBACK_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_NORMAL_WRITEBACK,sum) - description: Number of writebacks due to requests that are not writeback requests. - Sum over TCC instances. + description: Number of writebacks due to requests that are not writeback requests. Sum over TCC instances. TCC_PROBE: architectures: gfx942/gfx941/gfx940/gfx90a: @@ -3019,14 +3120,13 @@ TCC_PROBE_ALL: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 10 - description: Number of external probe requests with with EA_TCC_preq_all== 1. Not - windowable. + description: Number of external probe requests with with EA_TCC_preq_all== 1. Not windowable. 
TCC_PROBE_ALL_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_PROBE_ALL,sum) - description: Number of external probe requests with with EA_TCC_preq_all== 1. Not - windowable. Sum over TCC instances. + description: Number of external probe requests with with EA_TCC_preq_all== 1. Not windowable. Sum over + TCC instances. TCC_PROBE_EVICT: architectures: gfx942/gfx941/gfx940: @@ -3043,30 +3143,29 @@ TCC_READ: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 12 - description: Number of read requests. Compressed reads are included in this, but - metadata reads are not included. + description: Number of read requests. Compressed reads are included in this, but metadata reads are + not included. TCC_READ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_READ,sum) - description: Number of read requests. Compressed reads are included in this, but - metadata reads are not included. Sum over TCC instances. + description: Number of read requests. Compressed reads are included in this, but metadata reads are + not included. Sum over TCC instances. TCC_REQ: architectures: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 3 - description: Number of requests of all types. This is measured at the tag block. - This may be more than the number of requests arriving at the TCC, but it is a - good indication of the total amount of work that needs to be performed. + description: Number of requests of all types. This is measured at the tag block. This may be more than + the number of requests arriving at the TCC, but it is a good indication of the total amount of work + that needs to be performed. TCC_REQ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_REQ,sum) - description: Number of requests of all types. This is measured at the tag block. - This may be more than the number of requests arriving at the TCC, but it is a - good indication of the total amount of work that needs to be performed. Sum over - TCC instances. 
+ description: Number of requests of all types. This is measured at the tag block. This may be more than + the number of requests arriving at the TCC, but it is a good indication of the total amount of work + that needs to be performed. Sum over TCC instances. TCC_RW_REQ: architectures: gfx942/gfx941/gfx940/gfx90a: @@ -3077,8 +3176,7 @@ TCC_RW_REQ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_RW_REQ,sum) - description: The number of RW requests. This is measured at the tag block. Sum over - TCC instances. + description: The number of RW requests. This is measured at the tag block. Sum over TCC instances. TCC_STREAMING_REQ: architectures: gfx942/gfx941/gfx940/gfx90a: @@ -3089,38 +3187,34 @@ TCC_STREAMING_REQ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_STREAMING_REQ,sum) - description: Number of streaming requests. This is measured at the tag block. Sum - over TCC instances. + description: Number of streaming requests. This is measured at the tag block. Sum over TCC instances. TCC_TAG_STALL: architectures: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 45 - description: Number of cycles the normal request pipeline in the tag was stalled - for any reason. Normally, stalls of this nature are measured exactly from one - point the pipeline, but that is not the case for this counter. Probes can stall - the pipeline at a variety of places, and there is no single point that can reasonably - measure the total stalls accurately. + description: Number of cycles the normal request pipeline in the tag was stalled for any reason. Normally, + stalls of this nature are measured exactly from one point the pipeline, but that is not the case for + this counter. Probes can stall the pipeline at a variety of places, and there is no single point that + can reasonably measure the total stalls accurately. 
TCC_TAG_STALL_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_TAG_STALL,sum) - description: Total number of cycles the normal request pipeline in the tag is stalled - for any reason. + description: Total number of cycles the normal request pipeline in the tag is stalled for any reason. TCC_TOO_MANY_EA_WRREQS_STALL: architectures: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 34 - description: Number of cycles the TCC could not send a EA write request because - it already reached its maximum number of pending EA write requests. + description: Number of cycles the TCC could not send a EA write request because it already reached its + maximum number of pending EA write requests. TCC_TOO_MANY_EA_WRREQS_STALL_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_TOO_MANY_EA_WRREQS_STALL,sum) - description: Number of cycles the TCC could not send a EA write request because - it already reached its maximum number of pending EA write requests. Sum over TCC - instances. + description: Number of cycles the TCC could not send a EA write request because it already reached its + maximum number of pending EA write requests. Sum over TCC instances. TCC_UC_REQ: architectures: gfx942/gfx941/gfx940/gfx90a: @@ -3131,8 +3225,7 @@ TCC_UC_REQ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_UC_REQ,sum) - description: The number of uncached requests. This is measured at the tag block. - Sum over TCC instances. + description: The number of uncached requests. This is measured at the tag block. Sum over TCC instances. TCC_WRITE: architectures: gfx942/gfx941/gfx940/gfx90a: @@ -3144,14 +3237,14 @@ TCC_WRITEBACK: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 22 - description: Number of lines written back to main memory. This includes writebacks - of dirty lines and uncached write/atomic requests. + description: Number of lines written back to main memory. 
This includes writebacks of dirty lines and + uncached write/atomic requests. TCC_WRITEBACK_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_WRITEBACK,sum) - description: Number of lines written back to main memory. This includes writebacks - of dirty lines and uncached write/atomic requests. Sum over TCC instances. + description: Number of lines written back to main memory. This includes writebacks of dirty lines and + uncached write/atomic requests. Sum over TCC instances. TCC_WRITE_sum: architectures: gfx942/gfx941/gfx940/gfx90a: @@ -3262,8 +3355,7 @@ TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_ATOMIC_WITHOUT_RET_REQ,sum) - description: Total atomic without return requests from TCP to all TCCs Sum over - TCP instances. + description: Total atomic without return requests from TCP to all TCCs Sum over TCP instances. TCP_TCC_ATOMIC_WITH_RET_REQ: architectures: gfx90a: @@ -3277,8 +3369,7 @@ TCP_TCC_ATOMIC_WITH_RET_REQ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_ATOMIC_WITH_RET_REQ,sum) - description: Total atomic with return requests from TCP to all TCCs Sum over TCP - instances. + description: Total atomic with return requests from TCP to all TCCs Sum over TCP instances. TCP_TCC_CC_ATOMIC_REQ: architectures: gfx90a: @@ -3292,8 +3383,7 @@ TCP_TCC_CC_ATOMIC_REQ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_CC_ATOMIC_REQ,sum) - description: Total atomic requests with CC mtype from this TCP to all TCCs Sum over - TCP instances. + description: Total atomic requests with CC mtype from this TCP to all TCCs Sum over TCP instances. TCP_TCC_CC_READ_REQ: architectures: gfx90a: @@ -3307,8 +3397,7 @@ TCP_TCC_CC_READ_REQ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_CC_READ_REQ,sum) - description: Total write requests with CC mtype from this TCP to all TCCs Sum over - TCP instances. 
+ description: Total write requests with CC mtype from this TCP to all TCCs Sum over TCP instances. TCP_TCC_CC_WRITE_REQ: architectures: gfx90a: @@ -3322,8 +3411,7 @@ TCP_TCC_CC_WRITE_REQ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_CC_WRITE_REQ,sum) - description: Total write requests with CC mtype from this TCP to all TCCs Sum over - TCP instances. + description: Total write requests with CC mtype from this TCP to all TCCs Sum over TCP instances. TCP_TCC_NC_ATOMIC_REQ: architectures: gfx90a: @@ -3337,8 +3425,7 @@ TCP_TCC_NC_ATOMIC_REQ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_NC_ATOMIC_REQ,sum) - description: Total atomic requests with NC mtype from this TCP to all TCCs Sum over - TCP instances. + description: Total atomic requests with NC mtype from this TCP to all TCCs Sum over TCP instances. TCP_TCC_NC_READ_REQ: architectures: gfx90a: @@ -3352,8 +3439,7 @@ TCP_TCC_NC_READ_REQ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_NC_READ_REQ,sum) - description: Total read requests with NC mtype from this TCP to all TCCs Sum over - TCP instances. + description: Total read requests with NC mtype from this TCP to all TCCs Sum over TCP instances. TCP_TCC_NC_WRITE_REQ: architectures: gfx90a: @@ -3367,8 +3453,7 @@ TCP_TCC_NC_WRITE_REQ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_NC_WRITE_REQ,sum) - description: Total write requests with NC mtype from this TCP to all TCCs Sum over - TCP instances. + description: Total write requests with NC mtype from this TCP to all TCCs Sum over TCP instances. TCP_TCC_READ_REQ: architectures: gfx90a: @@ -3383,14 +3468,13 @@ TCP_TCC_READ_REQ_LATENCY: gfx90a: block: TCP event: 66 - description: Total TCP->TCC request latency for reads and atomics with return. Not - Windowed. + description: Total TCP->TCC request latency for reads and atomics with return. Not Windowed. 
TCP_TCC_READ_REQ_LATENCY_sum: architectures: gfx90a: expression: reduce(TCP_TCC_READ_REQ_LATENCY,sum) - description: Total TCP->TCC request latency for reads and atomics with return. Not - Windowed. Sum over TCP instances. + description: Total TCP->TCC request latency for reads and atomics with return. Not Windowed. Sum over + TCP instances. TCP_TCC_READ_REQ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: @@ -3409,8 +3493,7 @@ TCP_TCC_RW_ATOMIC_REQ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_RW_ATOMIC_REQ,sum) - description: Total atomic requests with RW mtype from this TCP to all TCCs. Sum - over TCP instances. + description: Total atomic requests with RW mtype from this TCP to all TCCs. Sum over TCP instances. TCP_TCC_RW_READ_REQ: architectures: gfx90a: @@ -3424,8 +3507,7 @@ TCP_TCC_RW_READ_REQ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_RW_READ_REQ,sum) - description: Total write requests with RW mtype from this TCP to all TCCs. Sum over - TCP instances. + description: Total write requests with RW mtype from this TCP to all TCCs. Sum over TCP instances. TCP_TCC_RW_WRITE_REQ: architectures: gfx90a: @@ -3439,8 +3521,7 @@ TCP_TCC_RW_WRITE_REQ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_RW_WRITE_REQ,sum) - description: Total write requests with RW mtype from this TCP to all TCCs. Sum over - TCP instances. + description: Total write requests with RW mtype from this TCP to all TCCs. Sum over TCP instances. TCP_TCC_UC_ATOMIC_REQ: architectures: gfx90a: @@ -3454,8 +3535,7 @@ TCP_TCC_UC_ATOMIC_REQ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_UC_ATOMIC_REQ,sum) - description: Total atomic requests with UC mtype from this TCP to all TCCs Sum over - TCP instances. + description: Total atomic requests with UC mtype from this TCP to all TCCs Sum over TCP instances. 
TCP_TCC_UC_READ_REQ: architectures: gfx90a: @@ -3469,8 +3549,7 @@ TCP_TCC_UC_READ_REQ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_UC_READ_REQ,sum) - description: Total read requests with UC mtype from this TCP to all TCCs Sum over - TCP instances. + description: Total read requests with UC mtype from this TCP to all TCCs Sum over TCP instances. TCP_TCC_UC_WRITE_REQ: architectures: gfx90a: @@ -3484,8 +3563,7 @@ TCP_TCC_UC_WRITE_REQ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_UC_WRITE_REQ,sum) - description: Total write requests with UC mtype from this TCP to all TCCs Sum over - TCP instances. + description: Total write requests with UC mtype from this TCP to all TCCs Sum over TCP instances. TCP_TCC_WRITE_REQ: architectures: gfx90a: @@ -3500,14 +3578,13 @@ TCP_TCC_WRITE_REQ_LATENCY: gfx90a: block: TCP event: 67 - description: Total TCP->TCC request latency for writes and atomics without return. - Not Windowed. + description: Total TCP->TCC request latency for writes and atomics without return. Not Windowed. TCP_TCC_WRITE_REQ_LATENCY_sum: architectures: gfx90a: expression: reduce(TCP_TCC_WRITE_REQ_LATENCY,sum) - description: Total TCP->TCC request latency for writes and atomics without return. - Not Windowed. Sum over TCP instances. + description: Total TCP->TCC request latency for writes and atomics without return. Not Windowed. Sum + over TCP instances. 
TCP_TCC_WRITE_REQ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: @@ -3518,15 +3595,14 @@ TCP_TCP_LATENCY: gfx90a: block: TCP event: 65 - description: Total TCP wave latency (from first clock of wave entering to first - clock of wave leaving), divide by TA_TCP_STATE_READ to avg wave latency + description: Total TCP wave latency (from first clock of wave entering to first clock of wave leaving), + divide by TA_TCP_STATE_READ to avg wave latency TCP_TCP_LATENCY_sum: architectures: gfx90a: expression: reduce(TCP_TCP_LATENCY,sum) - description: Total TCP wave latency (from first clock of wave entering to first - clock of wave leaving), divide by TA_TCP_STATE_READ to avg wave latency Sum over - TCP instances. + description: Total TCP wave latency (from first clock of wave entering to first clock of wave leaving), + divide by TA_TCP_STATE_READ to avg wave latency Sum over TCP instances. TCP_TCP_TA_DATA_STALL_CYCLES: architectures: gfx8: @@ -3596,8 +3672,7 @@ TCP_TOTAL_ATOMIC_WITHOUT_RET_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TOTAL_ATOMIC_WITHOUT_RET,sum) - description: Total number of atomic without return pixels/buffers from TA Sum over - TCP instances. + description: Total number of atomic without return pixels/buffers from TA Sum over TCP instances. TCP_TOTAL_ATOMIC_WITH_RET: architectures: gfx90a: @@ -3611,8 +3686,7 @@ TCP_TOTAL_ATOMIC_WITH_RET_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TOTAL_ATOMIC_WITH_RET,sum) - description: Total number of atomic with return pixels/buffers from TA. Sum over - TCP instances. + description: Total number of atomic with return pixels/buffers from TA. Sum over TCP instances. TCP_TOTAL_CACHE_ACCESSES: architectures: gfx942/gfx941/gfx940/gfx90a: @@ -3623,8 +3697,7 @@ TCP_TOTAL_CACHE_ACCESSES_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TOTAL_CACHE_ACCESSES,sum) - description: Count of total cache line (tag) accesses (includes hits and misses). 
- Sum over TCP instances. + description: Count of total cache line (tag) accesses (includes hits and misses). Sum over TCP instances. TCP_TOTAL_READ: architectures: gfx90a: @@ -3633,15 +3706,14 @@ TCP_TOTAL_READ: gfx942/gfx941/gfx940: block: TCP event: 28 - description: Total number of read pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_HIT_LRU_READ - + TCP_PERF_SEL_TOTAL_MISS_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_EVICT_READ + description: Total number of read pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_HIT_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_LRU_READ + + TCP_PERF_SEL_TOTAL_MISS_EVICT_READ TCP_TOTAL_READ_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TOTAL_READ,sum) - description: Total number of read pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_HIT_LRU_READ - + TCP_PERF_SEL_TOTAL_MISS_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_EVICT_READ. Sum over - TCP instances. + description: Total number of read pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_HIT_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_LRU_READ + + TCP_PERF_SEL_TOTAL_MISS_EVICT_READ. Sum over TCP instances. TCP_TOTAL_WRITE: architectures: gfx90a: @@ -3660,16 +3732,14 @@ TCP_TOTAL_WRITEBACK_INVALIDATES: gfx942/gfx941/gfx940: block: TCP event: 43 - description: Total number of cache invalidates. Equals TCP_PERF_SEL_TOTAL_WBINVL1+ - TCP_PERF_SEL_TOTAL_WBINVL1_VOL+ TCP_PERF_SEL_CP_TCP_INVALIDATE+ TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL. - Not Windowed. + description: Total number of cache invalidates. Equals TCP_PERF_SEL_TOTAL_WBINVL1+ TCP_PERF_SEL_TOTAL_WBINVL1_VOL+ + TCP_PERF_SEL_CP_TCP_INVALIDATE+ TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL. Not Windowed. TCP_TOTAL_WRITEBACK_INVALIDATES_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TOTAL_WRITEBACK_INVALIDATES,sum) - description: Total number of cache invalidates. Equals TCP_PERF_SEL_TOTAL_WBINVL1+ - TCP_PERF_SEL_TOTAL_WBINVL1_VOL+ TCP_PERF_SEL_CP_TCP_INVALIDATE+ TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL. - Not Windowed. 
Sum over TCP instances. + description: Total number of cache invalidates. Equals TCP_PERF_SEL_TOTAL_WBINVL1+ TCP_PERF_SEL_TOTAL_WBINVL1_VOL+ + TCP_PERF_SEL_CP_TCP_INVALIDATE+ TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL. Not Windowed. Sum over TCP instances. TCP_TOTAL_WRITE_sum: architectures: gfx942/gfx941/gfx940/gfx90a: @@ -3801,8 +3871,7 @@ TD_LOAD_WAVEFRONT_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TD_LOAD_WAVEFRONT,sum) - description: Count the wavefronts with opcode = load, include atomics and store. - Sum over TD instances. + description: Count the wavefronts with opcode = load, include atomics and store. Sum over TD instances. TD_SPI_STALL: architectures: gfx90a: @@ -3850,14 +3919,13 @@ TD_TD_BUSY: gfx942/gfx941/gfx940/gfx90a: block: TD event: 1 - description: TD is processing or waiting for data. Perf_Windowing not supported - for this counter. + description: TD is processing or waiting for data. Perf_Windowing not supported for this counter. TD_TD_BUSY_sum: architectures: gfx942/gfx941/gfx940/gfx90a: expression: reduce(TD_TD_BUSY,sum) - description: TD is processing or waiting for data. Perf_Windowing not supported - for this counter. Sum over TD instances. + description: TD is processing or waiting for data. Perf_Windowing not supported for this counter. Sum + over TD instances. TOTAL_16_OPS: architectures: gfx942/gfx941/gfx940/gfx90a: @@ -3889,35 +3957,33 @@ VALUBusy: expression: 100*SQ_ACTIVE_INST_VALU*4/SIMD_NUM/GRBM_GUI_ACTIVE gfx942/gfx941/gfx940: expression: 100*reduce(SQ_ACTIVE_INST_VALU,sum)*4/SIMD_NUM/reduce(GRBM_GUI_ACTIVE,sum) - description: 'The percentage of GPUTime vector ALU instructions are processed. Value - range: 0% (bad) to 100% (optimal).' + description: 'The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) + to 100% (optimal).' 
VALUInsts: architectures: gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx908/gfx8/gfx90a/gfx9: expression: SQ_INSTS_VALU/SQ_WAVES - description: The average number of vector ALU instructions executed per work-item - (affected by flow control). + description: The average number of vector ALU instructions executed per work-item (affected by flow + control). VALUUtilization: architectures: gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9: expression: 100*SQ_THREAD_CYCLES_VALU/(SQ_ACTIVE_INST_VALU*MAX_WAVE_SIZE) - description: 'The percentage of active vector ALU threads in a wave. A lower number - can mean either more thread divergence in a wave or that the work-group size is - not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence).' + description: 'The percentage of active vector ALU threads in a wave. A lower number can mean either + more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: + 0% (bad), 100% (ideal - no thread divergence).' VFetchInsts: architectures: gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9: expression: (SQ_INSTS_VMEM_RD-TA_FLAT_READ_WAVEFRONTS_sum)/SQ_WAVES - description: The average number of vector fetch instructions from the video memory - executed per work-item (affected by flow control). Excludes FLAT instructions - that fetch from video memory. + description: The average number of vector fetch instructions from the video memory executed per work-item + (affected by flow control). Excludes FLAT instructions that fetch from video memory. VWriteInsts: architectures: gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9: expression: (SQ_INSTS_VMEM_WR-TA_FLAT_WRITE_WAVEFRONTS_sum)/SQ_WAVES - description: The average number of vector write instructions to the video memory - executed per work-item (affected by flow control). Excludes FLAT instructions - that write to video memory. 
+ description: The average number of vector write instructions to the video memory executed per work-item + (affected by flow control). Excludes FLAT instructions that write to video memory. ValuIops: architectures: gfx90a: @@ -3947,14 +4013,12 @@ WAVE_ISSUE_WAIT: architectures: gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx1100/gfx1101: expression: 100*SQ_WAIT_INST_ANY/SQ_WAVE_CYCLES - description: Percentage of the SQ_WAVE_CYCLE time spent waiting for any instruction - issue. + description: Percentage of the SQ_WAVE_CYCLE time spent waiting for any instruction issue. WDATA1_SIZE: architectures: gfx906: expression: ((TCC_EA1_WRREQ_sum-TCC_EA1_WRREQ_64B_sum)*32+TCC_EA1_WRREQ_64B_sum*64) - description: The total kilobytes written to the video memory. This is measured on - EA1s. + description: The total kilobytes written to the video memory. This is measured on EA1s. WRITE_REQ_32B: architectures: gfx8: @@ -3976,8 +4040,8 @@ WRITE_SIZE: expression: ((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64)/1024 gfx942/gfx941/gfx940: expression: ((TCC_EA0_WRREQ_sum-TCC_EA0_WRREQ_64B_sum)*32+TCC_EA0_WRREQ_64B_sum*64)/1024 - description: The total kilobytes written to the video memory. This is measured with - all extra fetches and any cache or memory effects taken into account. + description: The total kilobytes written to the video memory. This is measured with all extra fetches + and any cache or memory effects taken into account. WaveDepWait: architectures: gfx90a: @@ -4012,16 +4076,15 @@ WriteSize: architectures: gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9: expression: WRITE_SIZE - description: The total kilobytes written to the video memory. This is measured with - all extra fetches and any cache or memory effects taken into account. + description: The total kilobytes written to the video memory. This is measured with all extra fetches + and any cache or memory effects taken into account. 
WriteUnitStalled: architectures: gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx1100/gfx1101: expression: 100*GL2C_WRREQ_STALL_max/GRBM_GUI_ACTIVE gfx906/gfx908/gfx8/gfx90a/gfx9: expression: 100*TCC_WRREQ_STALL_max/GRBM_GUI_ACTIVE - description: 'The percentage of GPUTime the Write unit is stalled. Value range: - 0% to 100% (bad).' + description: 'The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad).' sL1dCacheHitRate: architectures: gfx90a: