cloudflare · bobrik · Aug 19, 2024 · Jul 29, 2024 · Aug 1, 2024 · Aug 1, 2024
diff --git a/examples/cgroup-rstat-flushing.bpf.c b/examples/cgroup-rstat-flushing.bpf.c
@@ -0,0 +1,139 @@
+/*
+ * Measure cgroup rstat (recursive stats) flushing overhead and latency.
+ *
+ * Loosely based on bpftrace script cgroup_rstat_tracepoint.bt
+ *  - https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt
+ *
+ * Depends on tracepoints added in kernel v6.10
+ */
+
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include "maps.bpf.h"
+
+/* Highest cgroup level that gets its own counter slot. Deeper levels are
+ * clamped to this value before being used as a map key, so per-level maps
+ * are sized MAX_CGRP_LEVELS + 1.
+ */
+#define MAX_CGRP_LEVELS	5
+
+/* Per-CPU counters of rstat flushes, recorded per cgroup level.
+ *
+ * Slots cover levels 0..MAX_CGRP_LEVELS; the program updating this map
+ * clamps deeper levels into the last slot.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(max_entries, MAX_CGRP_LEVELS + 1);
+	__type(key, u32);
+	__type(value, u64);
+} cgroup_rstat_flush_total SEC(".maps");
+
+/* Total counter for obtaining the lock, keyed by contended state.
+ *
+ * This counter also includes the "yield" case. To determine the "normal"
+ * lock case, subtract the "yield" counter in the Prometheus query.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(max_entries, 2); /* contended state used as key */
+	__type(key, u32);
+	__type(value, u64);
+} cgroup_rstat_locked_total SEC(".maps");
+
+/* Counter for obtaining the lock again after a yield, keyed by contended
+ * state.
+ *
+ * The kernel can yield the rstat lock when walking individual CPU stats.
+ * This leads to "interesting" concurrency issues. Thus, keep a separate
+ * counter for the "yield" cases to help diagnose them.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(max_entries, 2); /* contended state used as key */
+	__type(key, u32);
+	__type(value, u64);
+} cgroup_rstat_locked_yield SEC(".maps");
+
+/* Counter for the lock contended case, recorded per cgroup level.
+ *
+ * Slots cover levels 0..MAX_CGRP_LEVELS; the program updating this map
+ * clamps deeper levels into the last slot.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(max_entries, MAX_CGRP_LEVELS + 1);
+	__type(key, u32);
+	__type(value, u64);
+} cgroup_rstat_lock_contended SEC(".maps");
+
+/** Measurement#1: lock rates
+ *  =========================
+ For locks, the problematic/interesting case is when the lock was contended.
+
+ Simply counting the lock+unlock is complicated by yielding the lock in the main
+ flushing loop (in cgroup_rstat_flush_locked()). The tracepoint's "cpu" argument
+ will be -1 (minus one) when the lock is not a yielded lock.
+
+ Concern: The lock rates will vary (a lot) and aggregating this as an average
+ will not capture spikes, especially given that Prometheus scrapes only
+ happen every 53 seconds.
+
+ Q: will lock "type=yield" be a good idea?
+
+*/
+
+/* tp_btf handler for the cgroup_rstat_locked tracepoint.
+ *
+ * Bumps the lock counter for the reported contended state, additionally
+ * bumps the yield counter when a CPU number is supplied (cpu >= 0), and for
+ * contended acquisitions also bumps the counter of the cgroup's level.
+ *
+ * NOTE(review): read_array_ptr() comes from maps.bpf.h, which is not visible
+ * here; each use is kept in its own scope in case the helper expands to more
+ * than a plain lookup. Confirm before restructuring.
+ */
+SEC("tp_btf/cgroup_rstat_locked")
+int BPF_PROG(rstat_locked, struct cgroup *cgrp, int cpu, bool contended)
+{
+	u64 *slot;
+	u32 state = contended; /* bool widened to the u32 map key: 0 or 1 */
+
+	/* Every acquisition is counted here, yielded or not */
+	read_array_ptr(&cgroup_rstat_locked_total, &state, slot);
+	*slot += 1;
+
+	/* A non-negative cpu marks the lock being re-taken after a yield */
+	if (cpu >= 0) {
+		read_array_ptr(&cgroup_rstat_locked_yield, &state, slot);
+		*slot += 1;
+	}
+
+	/* The cgroup level could not be folded into the two counters above,
+	 * so it is tracked separately, and only for the contended case since
+	 * that is the one of most interest.
+	 */
+	if (contended) {
+		u32 depth = cgrp->level;
+
+		/* Levels beyond MAX_CGRP_LEVELS all share the last slot */
+		depth = depth > MAX_CGRP_LEVELS ? MAX_CGRP_LEVELS : depth;
+
+		read_array_ptr(&cgroup_rstat_lock_contended, &depth, slot);
+		*slot += 1;
+	}
+
+	return 0;
+}
+
+
+/** Measurement#2: latency/delay caused by flush
+ *  ============================================
+ Measure both time waiting for the lock, and time spent holding the lock.
+
+ This should be a histogram (for later latency heatmap).
+
+ */
+
+
+/** Measurement#3: flush rate
+ *  =========================
+ Simply count invocations of cgroup_rstat_flush_locked().
+
+ Concern: The flush rates will vary (a lot), e.g. when cadvisor collects stats
+ for all cgroups in the system, or when kswapd does concurrent flushing (of root
+ cgroup). Averaging this (over approx 1 minute) gives the wrong impression.
+
+ Mitigation workaround: Store counters per cgroup "level" (level=0 is root).
+ This will allow us to separate root-cgroup flushes from cadvisor walking all
+ cgroup levels.
+
+ */
+/* fentry handler for cgroup_rstat_flush_locked(): count one flush in the
+ * slot for the flushed cgroup's level.
+ *
+ * The key variable is u32 to match the key type declared for
+ * cgroup_rstat_flush_total. A wider variable would hand the map lookup a
+ * pointer whose first four bytes are not the level on big-endian targets,
+ * folding every flush into slot 0.
+ */
+SEC("fentry/cgroup_rstat_flush_locked")
+int BPF_PROG(cgroup_rstat_flush_locked, struct cgroup *cgrp)
+{
+	u32 level_key = cgrp->level;
+
+	/* Levels beyond MAX_CGRP_LEVELS all share the last slot */
+	if (level_key > MAX_CGRP_LEVELS)
+		level_key = MAX_CGRP_LEVELS;
+
+	/* NOTE(review): increment_map_nosync() is defined in maps.bpf.h;
+	 * presumably a non-atomic add, which suits a per-CPU map - confirm.
+	 */
+	increment_map_nosync(&cgroup_rstat_flush_total, &level_key, 1);
+
+	return 0;
+}
+
+/* License string placed in the "license" ELF section of the BPF object */
+char LICENSE[] SEC("license") = "GPL";
diff --git a/examples/cgroup-rstat-flushing.yaml b/examples/cgroup-rstat-flushing.yaml
@@ -0,0 +1,31 @@
+# Counters exported from the maps in cgroup-rstat-flushing.bpf.c.
+# Each metric name matches a BPF map name; every label is decoded from the
+# map's 4-byte (u32) key as an unsigned integer.
+metrics:
+  counters:
+    - name: cgroup_rstat_flush_total
+      help: Total number of times cgroup rstat were flushed (recorded per level)
+      labels:
+        - name: level
+          size: 4
+          decoders:
+            - name: uint
+    - name: cgroup_rstat_locked_total
+      help: Total number of times rstat lock was obtained and contended state
+      labels:
+        - name: contended
+          size: 4
+          decoders: # contended boolean converted to 0 and 1
+            - name: uint
+    - name: cgroup_rstat_locked_yield
+      help: Number of times rstat lock was obtained again after yield and contended state
+      labels:
+        - name: contended
+          size: 4
+          decoders: # contended boolean converted to 0 and 1
+            - name: uint
+    - name: cgroup_rstat_lock_contended
+      help: Lock contention counters per cgroup level
+      labels:
+        - name: level
+          size: 4
+          decoders:
+            - name: uint
+