Skip to content

Commit

Permalink
feat(validator): add support to validate essential metrics produced b…
Browse files Browse the repository at this point in the history
…y Kepler

This commit introduces functionality to validate essential metrics produced by Kepler.
The following comparisons are included:

- Node Exporter Comparison
  - Validates `node_rapl_<package|core|dram>` metrics against `kepler_node_<package|core|dram>{dev}`

- Kepler Process Comparison
  - Compares `kepler_process_<package|core|dram|platform|other|uncore>{latest}` metrics to
  `kepler_process_<package|core|dram|platform|other|uncore>{dev}`

- Kepler Node Comparison
  - Validates `kepler_node_<package|core|dram|platform|other|uncore>{latest}` against
  `kepler_node_<package|core|dram|platform|other|uncore>{dev}`

Additionally, a stressor script has been added to induce system load,
allowing for real-time validation of Kepler under stress conditions.

Signed-off-by: vprashar2929 <[email protected]>
  • Loading branch information
vprashar2929 committed Nov 9, 2024
1 parent 82dc44a commit 5fa8028
Show file tree
Hide file tree
Showing 8 changed files with 498 additions and 5 deletions.
354 changes: 354 additions & 0 deletions e2e/tools/validator/metric_validations.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,354 @@
# Top-level series mapping: "actual" points at the `latest` query key and
# "predicted" at the `dev` query key.
# NOTE(review): presumably this is the default for validations that do not
# declare their own `mapping` — confirm against the validator's config loader.
config:
mapping:
actual: latest
predicted: dev

validations:
# node rapl comparison
- name: node-rapl - kepler-package
units: Watts
mapping:
actual: node-rapl
predicted: kepler-package

node-rapl: |
sum(
rate(
node_rapl_package_joules_total[{rate_interval}]
)
)
kepler-package: |
sum(
rate(
kepler_node_package_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
)
max_mae: 1.00

- name: node-rapl - kepler-core
units: Watts
mapping:
actual: node-rapl
predicted: kepler-core

node-rapl: |
sum(
rate(
node_rapl_core_joules_total[{rate_interval}]
)
)
kepler-core: |
sum(
rate(
kepler_node_core_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
)
max_mae: 1.00

- name: node-rapl - kepler-dram
units: Watts
mapping:
actual: node-rapl
predicted: kepler-dram

node-rapl: |
sum(
rate(
node_rapl_dram_joules_total[{rate_interval}]
)
)
kepler-dram: |
sum(
rate(
kepler_node_dram_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
)
max_mae: 1.00

# absolute power comparison
- name: Total - absolute
latest: |
sum(
rate(
kepler_process_joules_total{{
job="latest",
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_joules_total{{
job="dev",
}}[{rate_interval}]
)
)
max_mae: 0.59

# CPU time comparison
- name: cpu-time
units: Milliseconds
latest: |
sum(
rate(
kepler_process_bpf_cpu_time_ms_total{{
job="latest"
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_bpf_cpu_time_ms_total{{
job="dev",
}}[{rate_interval}]
)
)
# max_mae: 20.0

# process comparison
- name: platform - dynamic
latest: |
sum(
rate(
kepler_process_platform_joules_total{{
job="latest", mode="dynamic",
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_platform_joules_total{{
job="dev", mode="dynamic",
}}[{rate_interval}]
)
)
max_mae: 0.59

- name: package - dynamic
units: Watts
latest: |
sum(
rate(
kepler_process_package_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_package_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
)
max_mae: 0.59

- name: core - dynamic
units: Watts
latest: |
sum(
rate(
kepler_process_core_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_core_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
)
max_mae: 0.59

- name: dram - dynamic
units: Watts
latest: |
sum(
rate(
kepler_process_dram_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_dram_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
)
max_mae: 0.59

- name: other - dynamic
units: Watts
latest: |
sum(
rate(
kepler_process_other_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_other_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
)
max_mae: 0.59

- name: uncore - dynamic
units: Watts
latest: |
sum(
rate(
kepler_process_uncore_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_uncore_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
)
max_mae: 0.59

# node comparison
- name: node platform - dynamic
units: Watts
latest: |
rate(kepler_node_platform_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
dev: |
rate(kepler_node_platform_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
max_mae: 0.59

- name: node package - dynamic
units: Watts
latest: |
rate(kepler_node_package_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
dev: |
rate(kepler_node_package_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
max_mae: 0.59

- name: node core - dynamic
units: Watts
latest: |
rate(kepler_node_core_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
dev: |
rate(kepler_node_core_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
max_mae: 0.59

- name: node dram - dynamic
units: Watts
latest: |
rate(kepler_node_dram_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
dev: |
rate(kepler_node_dram_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
max_mae: 0.59

- name: node other - dynamic
units: Watts
latest: |
rate(kepler_node_other_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
dev: |
rate(kepler_node_other_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
max_mae: 0.59

- name: node uncore - dynamic
units: Watts
latest: |
rate(kepler_node_uncore_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
dev: |
rate(kepler_node_uncore_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
max_mae: 0.59
44 changes: 44 additions & 0 deletions e2e/tools/validator/scripts/regression-stressor.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env bash
# Generates CPU load with stress-ng by stepping through a fixed load curve
# (see main), echoing every command before it is executed.

# Strict mode: stop on a failing command (-e), on use of an unset variable (-u),
# and when any stage of a pipeline fails (pipefail).
set -eu -o pipefail

# On SIGINT (Ctrl-C) invoke exit_all, which is defined right below; the function
# only needs to exist by the time the signal is delivered.
trap exit_all INT
# SIGINT handler: terminate every direct child of this script (for example a
# running stress-ng) and then stop the script itself.
# Globals:   $$ (read) - PID of this shell, used as the parent filter for pkill
# Returns:   never returns; exits the shell with status 130
exit_all() {
  # pkill exits non-zero when no child process matched. That is not an error
  # for a cleanup handler, so it must not decide the exit status or trip
  # `set -e` before the explicit exit below is reached.
  pkill -P $$ || true
  # Without an explicit exit the shell would resume the script after the
  # handler returns and move on to the next load step.
  # 130 = 128 + 2 (SIGINT), the conventional "interrupted" status.
  exit 130
}

# Print a command line, execute it, then draw an overline rule beneath its output.
# Arguments: $@ - the command and its arguments, executed exactly as given
# Outputs:   the space-joined command line, the command's own output, the rule
run() {
  local rule=" ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾"

  echo "$*"
  "$@"
  printf '%s\n' "$rule"
}

# Drive stress-ng through a short zero-load warmup and then a fixed load curve
# that ramps up and back down. Every invocation goes through `run`, starts one
# worker per CPU reported by nproc, and uses the "ackermann" CPU method.
# Arguments: none are read
# Outputs:   "Warmup .." plus whatever `run` prints for each step
main() {
  local ncpu
  ncpu=$(nproc)

  # Each step is "<load>:<seconds>": <load> is handed to --cpu-load and
  # <seconds> to --timeout (with an "s" suffix appended).
  local -a steps=(
    0:5
    10:20
    25:20
    50:20
    75:20
    50:20
    25:20
    10:20
    0:5
  )

  # Arguments shared by every stress-ng invocation below.
  local -a stressor=(stress-ng --cpu "$ncpu" --cpu-method ackermann)

  # Five seconds at zero load before the curve starts.
  echo "Warmup .."
  run "${stressor[@]}" --cpu-load 0 --timeout 5

  local step pct secs
  for step in "${steps[@]}"; do
    # Split "<load>:<seconds>" on the colon.
    IFS=: read -r pct secs <<<"$step"
    run "${stressor[@]}" --cpu-load "$pct" --timeout "${secs}s"
  done
}

# Script entry point: forward the command-line arguments to main.
main "$@"
Loading

0 comments on commit 5fa8028

Please sign in to comment.