feat(validator): add support to validate essential metrics produced by Kepler

This commit introduces functionality to validate essential metrics produced by Kepler. The following comparisons are included:
- Node Exporter Comparison: validates `node_rapl_<package|core|dram>` metrics against `kepler_node_<package|core|dram>{dev}`.
- Kepler Process Comparison: compares `kepler_process_<package|core|dram|platform|other|uncore>{latest}` metrics to `kepler_process_<package|core|dram|platform|other|uncore>{dev}`.
- Kepler Node Comparison: validates `kepler_node_<package|core|dram|platform|other|uncore>{latest}` against `kepler_node_<package|core|dram|platform|other|uncore>{dev}`.

Additionally, a stressor script has been added to generate system load, allowing for real-time validation of Kepler under stress conditions.

Signed-off-by: vprashar2929 <[email protected]>
sustainable-computing-io · Nov 9, 2024 · 5fa8028 · 5fa8028
1 parent 82dc44a
commit 5fa8028
Show file tree

Hide file tree

Showing 8 changed files with 498 additions and 5 deletions.
diff --git a/e2e/tools/validator/metric_validations.yaml b/e2e/tools/validator/metric_validations.yaml
@@ -0,0 +1,354 @@
+config:
+  mapping:
+    actual: latest
+    predicted: dev
+
+validations:
+  # node-exporter RAPL vs. Kepler (dev, dynamic mode) node power comparison
+  - name: node-rapl - kepler-package
+    units: Watts
+    mapping:
+      actual: node-rapl
+      predicted: kepler-package
+
+    node-rapl: |
+      sum(
+        rate(
+          node_rapl_package_joules_total[{rate_interval}]
+        )
+      )
+
+    kepler-package: |
+      sum(
+        rate(
+          kepler_node_package_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 1.00
+
+  - name: node-rapl - kepler-core
+    units: Watts
+    mapping:
+      actual: node-rapl
+      predicted: kepler-core
+
+    node-rapl: |
+      sum(
+        rate(
+          node_rapl_core_joules_total[{rate_interval}]
+        )
+      )
+
+    kepler-core: |
+      sum(
+        rate(
+          kepler_node_core_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 1.00
+
+  - name: node-rapl - kepler-dram
+    units: Watts
+    mapping:
+      actual: node-rapl
+      predicted: kepler-dram
+
+    node-rapl: |
+      sum(
+        rate(
+          node_rapl_dram_joules_total[{rate_interval}]
+        )
+      )
+
+    kepler-dram: |
+      sum(
+        rate(
+          kepler_node_dram_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 1.00
+
+  # absolute power comparison
+  - name: Total - absolute
+    latest: |
+      sum(
+        rate(
+        kepler_process_joules_total{{
+          job="latest",
+        }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_joules_total{{
+            job="dev",
+          }}[{rate_interval}]
+        )
+      )
+
+    max_mae: 0.59
+
+  # CPU time comparison
+  - name: cpu-time
+    units: Milliseconds
+    latest: |
+      sum(
+        rate(
+          kepler_process_bpf_cpu_time_ms_total{{
+            job="latest"
+          }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_bpf_cpu_time_ms_total{{
+            job="dev",
+          }}[{rate_interval}]
+        )
+      )
+    # max_mae: 20.0
+
+  # Kepler process comparison: latest vs. dev, dynamic mode
+  - name: platform - dynamic
+    latest: |
+      sum(
+        rate(
+        kepler_process_platform_joules_total{{
+          job="latest", mode="dynamic",
+        }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_platform_joules_total{{
+            job="dev", mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+
+    max_mae: 0.59
+
+  - name: package - dynamic
+    units: Watts
+    latest: |
+      sum(
+        rate(
+          kepler_process_package_joules_total{{
+            job="latest",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_package_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 0.59
+
+  - name: core - dynamic
+    units: Watts
+    latest: |
+      sum(
+        rate(
+          kepler_process_core_joules_total{{
+            job="latest",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_core_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 0.59
+
+  - name: dram - dynamic
+    units: Watts
+    latest: |
+      sum(
+        rate(
+          kepler_process_dram_joules_total{{
+            job="latest",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_dram_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 0.59
+
+  - name: other - dynamic
+    units: Watts
+    latest: |
+      sum(
+        rate(
+          kepler_process_other_joules_total{{
+            job="latest",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_other_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 0.59
+
+  - name: uncore - dynamic
+    units: Watts
+    latest: |
+      sum(
+        rate(
+          kepler_process_uncore_joules_total{{
+            job="latest",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_uncore_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 0.59
+
+  # Kepler node comparison: latest vs. dev, dynamic mode
+  - name: node platform - dynamic
+    units: Watts
+    latest: |
+      rate(kepler_node_platform_joules_total{{
+          job="latest",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+
+    dev: |
+      rate(kepler_node_platform_joules_total{{
+          job="dev",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+    max_mae: 0.59
+
+  - name: node package - dynamic
+    units: Watts
+    latest: |
+      rate(kepler_node_package_joules_total{{
+          job="latest",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+
+    dev: |
+      rate(kepler_node_package_joules_total{{
+          job="dev",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+    max_mae: 0.59
+
+  - name: node core - dynamic
+    units: Watts
+    latest: |
+      rate(kepler_node_core_joules_total{{
+          job="latest",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+
+    dev: |
+      rate(kepler_node_core_joules_total{{
+          job="dev",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+    max_mae: 0.59
+
+  - name: node dram - dynamic
+    units: Watts
+    latest: |
+      rate(kepler_node_dram_joules_total{{
+          job="latest",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+
+    dev: |
+      rate(kepler_node_dram_joules_total{{
+          job="dev",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+    max_mae: 0.59
+
+  - name: node other - dynamic
+    units: Watts
+    latest: |
+      rate(kepler_node_other_joules_total{{
+          job="latest",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+
+    dev: |
+      rate(kepler_node_other_joules_total{{
+          job="dev",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+    max_mae: 0.59
+
+  - name: node uncore - dynamic
+    units: Watts
+    latest: |
+      rate(kepler_node_uncore_joules_total{{
+          job="latest",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+
+    dev: |
+      rate(kepler_node_uncore_joules_total{{
+          job="dev",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+    max_mae: 0.59
diff --git a/e2e/tools/validator/scripts/regression-stressor.sh b/e2e/tools/validator/scripts/regression-stressor.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+
+set -eu -o pipefail # stop on command failure, on unset variables, and when any pipeline stage fails
+
+trap exit_all INT # on SIGINT (e.g. Ctrl-C) run exit_all, defined just below
+exit_all() { # SIGINT handler registered by `trap exit_all INT`
+	pkill -P $$ # signal every process whose parent PID is this shell ($$)
+} # NOTE(review): there is no explicit `exit` here; confirm the script is meant to stop after Ctrl-C
+
+run() { # print the command line, execute it, then print a separator rule
+	echo "❯ $*" # "$*" joins all arguments into a single string for display
+	"$@" # execute the arguments as a command, each argument kept as its own word
+	echo "      ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾"
+}
+
+main() { # run stress-ng for a warmup and then once per step of a CPU-load curve
+
+	local cpus
+	cpus=$(nproc) # value printed by nproc; passed to stress-ng as the --cpu worker count
+
+	# each entry is "<load>:<seconds>": load is passed to --cpu-load, seconds (plus an "s" suffix) to --timeout
+	local -a load_curve=(
+		0:5
+		10:20
+		25:20
+		50:20
+		75:20
+		50:20
+		25:20
+		10:20
+		0:5
+	)
+	# warmup: a stress-ng run with --cpu-load 0 and --timeout 5 (instead of a plain sleep 5) so that the first run and the second run look the same
+	echo "Warmup .."
+	run stress-ng --cpu "$cpus" --cpu-method ackermann --cpu-load 0 --timeout 5
+
+	for x in "${load_curve[@]}"; do
+		local load="${x%%:*}" # text before the first ":" is the load value
+		local time="${x##*:}s" # text after the last ":" with "s" appended is the timeout
+		run stress-ng --cpu "$cpus" --cpu-method ackermann --cpu-load "$load" --timeout "$time"
+	done
+}
+
+main "$@" # entry point; forwards the script arguments, which main does not read