forked from iovisor/bcc
-
Notifications
You must be signed in to change notification settings - Fork 2
/
cpudist.py
executable file
·290 lines (250 loc) · 8.21 KB
/
cpudist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
#!/usr/bin/env python
# @lint-avoid-python-3-compatibility-imports
#
# cpudist Summarize on- and off-CPU time per task as a histogram.
#
# USAGE: cpudist [-h] [-O] [-T] [-m] [-P] [-L] [-p PID] [-I] [-e] [interval] [count]
#
# This measures the time a task spends on or off the CPU, and shows this time
# as a histogram, optionally per-process.
#
# By default, CPU idle time is excluded by simply excluding PID 0.
#
# Copyright 2016 Sasha Goldshtein
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 27-Mar-2022 Rocky Xing Changed to exclude CPU idle time by default.
# 25-Jul-2022 Rocky Xing Added extension summary support.
from __future__ import print_function
from bcc import BPF
from time import sleep, strftime
import argparse
import os
# Usage examples; passed as the argparse epilog so they follow the option
# list in --help output (RawDescriptionHelpFormatter keeps the line breaks).
examples = """examples:
cpudist # summarize on-CPU time as a histogram
cpudist -O # summarize off-CPU time as a histogram
cpudist 1 10 # print 1 second summaries, 10 times
cpudist -mT 1 # 1s summaries, milliseconds, and timestamps
cpudist -P # show each PID separately
cpudist -p 185 # trace PID 185 only
cpudist -I # include CPU idle time
cpudist -e # show extension summary (average/total/count)
"""

parser = argparse.ArgumentParser(
    epilog=examples,
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description="Summarize on- and off-CPU time per task as a histogram.")

# Every command-line argument as (names, add_argument keyword options).
# The sequence is kept in declaration order because argparse lists the
# arguments in --help in the order they were added.
arg_specs = (
    (("-O", "--offcpu"),
     dict(action="store_true", help="measure off-CPU time")),
    (("-T", "--timestamp"),
     dict(action="store_true", help="include timestamp on output")),
    (("-m", "--milliseconds"),
     dict(action="store_true", help="millisecond histogram")),
    (("-P", "--pids"),
     dict(action="store_true", help="print a histogram per process ID")),
    (("-L", "--tids"),
     dict(action="store_true", help="print a histogram per thread ID")),
    (("-p", "--pid"),
     dict(help="trace this PID only")),
    (("-I", "--include-idle"),
     dict(action="store_true", help="include CPU idle time")),
    (("-e", "--extension"),
     dict(action="store_true",
          help="show extension summary (average/total/count)")),
    # Optional positionals; when omitted both fall back to 99999999.
    (("interval",),
     dict(nargs="?", default=99999999, help="output interval, in seconds")),
    (("count",),
     dict(nargs="?", default=99999999, help="number of outputs")),
    # Hidden from --help.
    (("--ebpf",),
     dict(action="store_true", help=argparse.SUPPRESS)),
)
for names, options in arg_specs:
    parser.add_argument(*names, **options)

args = parser.parse_args()

# Number of summaries still to print.
countdown = int(args.count)
# Set to a non-zero value to dump the generated BPF program before loading.
debug = 0
# The BPF program is C source kept in Python strings and assembled from three
# pieces: the kernel includes, an optional "#define ONCPU", and the program
# body. Upper-case tokens in the body (STORAGE, PID_FILTER, IDLE_FILTER,
# FACTOR, STORE, USE_PIDNS, PIDNS_DEV, PIDNS_INO, PID_STORE, PID_TRANSLATE)
# are not C: they are placeholders that this script substitutes with
# str.replace() before compiling. MAX_PID is supplied as a -D compiler flag
# when the program is loaded.
bpf_includes = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
"""

# With ONCPU defined, sched_switch() records a histogram sample for the task
# being switched out (prev) and a start timestamp for the current task;
# without it (-O / --offcpu) the two roles are swapped.
oncpu_define = "" if args.offcpu else "#define ONCPU\n"

bpf_program = """
typedef struct entry_key {
u32 pid;
u32 cpu;
} entry_key_t;
typedef struct pid_key {
u64 id;
u64 slot;
} pid_key_t;
typedef struct ext_val {
u64 total;
u64 count;
} ext_val_t;
BPF_HASH(start, entry_key_t, u64, MAX_PID);
STORAGE
static inline void store_start(u32 tgid, u32 pid, u32 cpu, u64 ts)
{
if (PID_FILTER)
return;
if (IDLE_FILTER)
return;
entry_key_t entry_key = { .pid = pid, .cpu = (pid == 0 ? cpu : 0xFFFFFFFF) };
start.update(&entry_key, &ts);
}
static inline void update_hist(u32 tgid, u32 pid, u32 cpu, u64 ts)
{
if (PID_FILTER)
return;
if (IDLE_FILTER)
return;
entry_key_t entry_key = { .pid = pid, .cpu = (pid == 0 ? cpu : 0xFFFFFFFF) };
u64 *tsp = start.lookup(&entry_key);
if (tsp == 0)
return;
if (ts < *tsp) {
// Probably a clock issue where the recorded on-CPU event had a
// timestamp later than the recorded off-CPU event, or vice versa.
return;
}
u64 delta = ts - *tsp;
FACTOR
STORE
}
int sched_switch(struct pt_regs *ctx, struct task_struct *prev)
{
u64 ts = bpf_ktime_get_ns();
u64 pid_tgid = bpf_get_current_pid_tgid();
u32 tgid = pid_tgid >> 32, pid = pid_tgid;
u32 cpu = bpf_get_smp_processor_id();
struct bpf_pidns_info ns = {};
if (USE_PIDNS && !bpf_get_ns_current_pid_tgid(PIDNS_DEV, PIDNS_INO, &ns, sizeof(struct bpf_pidns_info))) {
PID_STORE
tgid = ns.tgid;
pid = ns.pid;
}
u32 prev_pid = prev->pid;
u32 prev_tgid = prev->tgid;
PID_TRANSLATE
#ifdef ONCPU
update_hist(prev_tgid, prev_pid, cpu, ts);
#else
store_start(prev_tgid, prev_pid, cpu, ts);
#endif
BAIL:
#ifdef ONCPU
store_start(tgid, pid, cpu, ts);
#else
update_hist(tgid, pid, cpu, ts);
#endif
return 0;
}
"""

bpf_text = bpf_includes + oncpu_define + bpf_program
# PID_FILTER is the C condition under which a task is skipped: with -p only
# the given process (tgid) is kept, otherwise the condition is the constant
# 0 and nothing is filtered out.
pid_filter = ('tgid != %s' % args.pid) if args.pid else '0'
bpf_text = bpf_text.replace('PID_FILTER', pid_filter)

# set idle filter: PID 0 is skipped unless -I / --include-idle was given
idle_filter = '0' if args.include_idle else 'pid == 0'
bpf_text = bpf_text.replace('IDLE_FILTER', idle_filter)

# FACTOR divides the nanosecond delta down to the histogram unit; label is
# the unit name printed with the histogram.
if args.milliseconds:
    factor, label = 'delta /= 1000000;', "msecs"
else:
    factor, label = 'delta /= 1000;', "usecs"
bpf_text = bpf_text.replace('FACTOR', factor)
# C fragments accumulated here and spliced into the BPF program text in place
# of the STORAGE, STORE, PID_STORE and PID_TRANSLATE placeholders.
storage_str = ""
store_str = ""
pid_store = ""
pid_translate = ""

# Decide whether PIDs can be translated into this process's PID namespace.
# That needs the namespace's device/inode (from /proc/self/ns/pid) and a
# kernel that provides bpf_get_ns_current_pid_tgid().
#
# The decision is an explicit boolean rather than an assert, so it still
# works when Python runs with -O (which strips assert statements), and only
# the expected failures are caught so Ctrl-C or real bugs are not swallowed.
use_pidns = False
try:
    devinfo = os.stat("/proc/self/ns/pid")
    # "5.15.0-generic" -> "05150-generic"; the first four characters are the
    # zero-padded major and minor numbers, so a string comparison orders them.
    version = "".join([ver.zfill(2) for ver in os.uname().release.split(".")])
    # Need Linux >= 5.7 to have helper bpf_get_ns_current_pid_tgid() available:
    use_pidns = version[:4] >= "0507"
except (OSError, AttributeError):
    # OSError: the namespace file could not be stat'ed.
    # AttributeError: os.uname() is missing or its result has no .release
    # attribute (e.g. the plain tuple returned by Python 2).
    use_pidns = False

if use_pidns:
    bpf_text = bpf_text.replace('USE_PIDNS', "1")
    bpf_text = bpf_text.replace('PIDNS_DEV', str(devinfo.st_dev))
    bpf_text = bpf_text.replace('PIDNS_INO', str(devinfo.st_ino))
    # One map holds both translations: it is updated with the current thread
    # id and the current tgid as keys, each mapped to its namespace value.
    storage_str = "BPF_HASH(ns_pid, u32, u32, MAX_PID);\n"
    pid_store = """ns_pid.update(&pid, &ns.pid);
ns_pid.update(&tgid, &ns.tgid);"""
    # The previous task's ids are translated through that map, and only when
    # both lookups succeed.
    pid_translate = """
u32 *ns_pid_val = ns_pid.lookup(&prev_pid);
u32 *ns_tgid_val = ns_pid.lookup(&prev_tgid);
if (ns_pid_val && ns_tgid_val) {
prev_pid = *ns_pid_val;
prev_tgid = *ns_tgid_val;
}
"""
else:
    # Disable the namespace branch in the C code; the remaining placeholders
    # still need syntactically valid values.
    bpf_text = bpf_text.replace('USE_PIDNS', "0")
    bpf_text = bpf_text.replace('PIDNS_DEV', "0")
    bpf_text = bpf_text.replace('PIDNS_INO', "0")
# Choose the histogram layout. Without -P/-L there is a single log2
# histogram; with them the histogram is keyed by (id, slot), where id is the
# tgid for -P or the thread id for -L (-L wins when both are given).
# `section` is the per-section label handed to print_log2_hist() later.
if not (args.pids or args.tids):
    section = ""
    storage_str += "BPF_HISTOGRAM(dist);"
    store_str += "dist.atomic_increment(bpf_log2l(delta));"
else:
    if args.tids:
        section, pid = "tid", "pid"
    else:
        section, pid = "pid", "tgid"
    storage_str += "BPF_HISTOGRAM(dist, pid_key_t, MAX_PID);"
    store_str += """
pid_key_t key = {.id = """ + pid + """, .slot = bpf_log2l(delta)};
dist.increment(key);
"""

# -e: additionally accumulate the total time and the sample count in a
# one-element array so an average can be printed with each summary.
if args.extension:
    storage_str += "BPF_ARRAY(extension, ext_val_t, 1);"
    store_str += """
u32 index = 0;
ext_val_t *ext_val = extension.lookup(&index);
if (ext_val) {
lock_xadd(&ext_val->total, delta);
lock_xadd(&ext_val->count, 1);
}
"""

# Splice the generated fragments into the program. Order matters: "STORE" is
# a substring of "PID_STORE", so PID_STORE has to be substituted first.
for placeholder, fragment in (
        ("PID_STORE", pid_store),
        ("PID_TRANSLATE", pid_translate),
        ("STORAGE", storage_str),
        ("STORE", store_str)):
    bpf_text = bpf_text.replace(placeholder, fragment)

# Dump the final program when debugging or when --ebpf was given; --ebpf
# stops here, before anything is loaded.
if debug or args.ebpf:
    print(bpf_text)
if args.ebpf:
    exit()
# MAX_PID in the BPF program is set from the system PID limit. The file is
# read inside a with-block so the handle is closed deterministically.
with open("/proc/sys/kernel/pid_max") as pid_max_file:
    max_pid = int(pid_max_file.read())

# Compile the program and attach sched_switch() to finish_task_switch,
# including the ".isra.N" variant of that symbol name.
b = BPF(text=bpf_text, cflags=["-DMAX_PID=%d" % max_pid])
b.attach_kprobe(event_re=r'^finish_task_switch$|^finish_task_switch\.isra\.\d$',
                fn_name="sched_switch")

print("Tracing %s-CPU time... Hit Ctrl-C to end." %
      ("off" if args.offcpu else "on"))

# A falsy interval means: print one summary and exit.
exiting = 0 if args.interval else 1
dist = b.get_table("dist")
if args.extension:
    extension = b.get_table("extension")
def pid_to_comm(pid):
    """Return the histogram section label for *pid*.

    The label is "<pid> <comm>", where comm is the content of
    /proc/<pid>/comm exactly as read (it is not stripped). If that file
    cannot be read, the label is just the PID as a string.
    """
    try:
        # with-block so the /proc handle is closed right away; this function
        # is called once per histogram section on every summary.
        with open("/proc/%d/comm" % pid, "r") as comm_file:
            comm = comm_file.read()
    except IOError:
        return str(pid)
    return "%d %s" % (pid, comm)


# Main loop: sleep for one interval, print the histogram(s) collected during
# it, reset the tables, and stop after `count` summaries or after Ctrl-C.
while True:
    try:
        sleep(int(args.interval))
    except KeyboardInterrupt:
        # Still print what was collected so far, then exit below.
        exiting = 1

    print()
    if args.timestamp:
        print("%-8s\n" % strftime("%H:%M:%S"), end="")

    dist.print_log2_hist(label, section, section_print_fn=pid_to_comm)

    if args.extension:
        total = extension[0].total
        count = extension[0].count
        if count > 0:
            # Integer division: keeps the average exact for large totals and
            # gives the same result on Python 2 and Python 3.
            print("\navg = %ld %s, total: %ld %s, count: %ld\n" %
                  (total // count, label, total, label, count))
        extension.clear()

    dist.clear()

    countdown -= 1
    if exiting or countdown == 0:
        exit()