From 283fed86e589396f6828f93649a98f5b7c15115f Mon Sep 17 00:00:00 2001
From: RoiKol
Date: Mon, 23 Oct 2023 16:52:27 +0300
Subject: [PATCH] feat(events): add io_uring visibility

Add events that provide visibility into io_uring operations in the system.
The added events are:

io_uring_create - creation of an io_uring queue
io_issue_sqe    - submission of an io_uring request
io_write        - a write operation performed through io_uring
---
 go.mod                                |   4 +-
 go.sum                                |   8 +-
 pkg/ebpf/c/maps.h                     |   3 +
 pkg/ebpf/c/tracee.bpf.c               | 515 +++++++++++++++++++++++++-
 pkg/ebpf/c/types.h                    |   3 +
 pkg/ebpf/c/vmlinux.h                  |  53 +++
 pkg/ebpf/c/vmlinux_flavors.h          |  64 ++++
 pkg/ebpf/c/vmlinux_missing.h          |   2 +
 pkg/ebpf/probes/probe_group.go        |   8 +
 pkg/ebpf/probes/probes.go             |   8 +
 pkg/events/core.go                    |  85 +++++
 pkg/events/definition_dependencies.go |   1 +
 pkg/events/parse_args.go              |  20 +
 13 files changed, 764 insertions(+), 10 deletions(-)

diff --git a/go.mod b/go.mod
index 766668db2d34..dcb88de3a6e2 100644
--- a/go.mod
+++ b/go.mod
@@ -5,8 +5,8 @@ go 1.19
 require (
 	github.com/IBM/fluent-forward-go v0.2.1
 	github.com/Masterminds/sprig/v3 v3.2.3
-	github.com/aquasecurity/libbpfgo v0.5.0-libbpf-1.2
-	github.com/aquasecurity/libbpfgo/helpers v0.4.6-0.20230321190037-f591a2c5734f
+	github.com/aquasecurity/libbpfgo v0.5.1-libbpf-1.2.0.20230928114152-cf2d0bea778b
+	github.com/aquasecurity/libbpfgo/helpers v0.4.6-0.20230928114152-cf2d0bea778b
 	github.com/aquasecurity/tracee/api v0.0.0-20231013014739-b32a168ee6a8
 	github.com/aquasecurity/tracee/types v0.0.0-20231013014739-b32a168ee6a8
 	github.com/containerd/containerd v1.7.0
diff --git a/go.sum b/go.sum
index c263d09ffc85..8e61f438df8e 100644
--- a/go.sum
+++ b/go.sum
@@ -65,10 +65,10 @@ github.com/agnivade/levenshtein v1.1.1/go.mod h1:veldBMzWxcCG2ZvUTKD2kJNRdCk5hVb
 github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY=
 github.com/antlr/antlr4/runtime/Go/antlr/v4 v4.0.0-20230321174746-8dcc6526cfb1 h1:X8MJ0fnN5FPdcGF5Ij2/OW+HgiJrRg3AfHAx1PJtIzM=
 github.com/antlr/antlr4/runtime/Go/antlr/v4 v4.0.0-20230321174746-8dcc6526cfb1/go.mod h1:pSwJ0fSY5KhvocuWSx4fz3BA8OrA1bQn+K1Eli3BRwM=
-github.com/aquasecurity/libbpfgo v0.5.0-libbpf-1.2 h1:Yywi9wC3GPDOgR8wr6P9geY2qvFqKxH5sctMOssw+MQ=
-github.com/aquasecurity/libbpfgo v0.5.0-libbpf-1.2/go.mod h1:0rEApF1YBHGuZ4C8OYI9q5oDBVpgqtRqYATePl9mCDk=
-github.com/aquasecurity/libbpfgo/helpers v0.4.6-0.20230321190037-f591a2c5734f h1:l127H3NqJBmw+XMt+haBOeZIrBppuw7TJz26cWMI9kY=
-github.com/aquasecurity/libbpfgo/helpers v0.4.6-0.20230321190037-f591a2c5734f/go.mod h1:j/TQLmsZpOIdF3CnJODzYngG4yu1YoDCoRMELxkQSSA=
+github.com/aquasecurity/libbpfgo v0.5.1-libbpf-1.2.0.20230928114152-cf2d0bea778b h1:waJ52oNyHnFIOwTKbw8EMZ7ZsvZ7rPFqFcMMyi9EWqA=
+github.com/aquasecurity/libbpfgo v0.5.1-libbpf-1.2.0.20230928114152-cf2d0bea778b/go.mod h1:0rEApF1YBHGuZ4C8OYI9q5oDBVpgqtRqYATePl9mCDk=
+github.com/aquasecurity/libbpfgo/helpers v0.4.6-0.20230928114152-cf2d0bea778b h1:IWC4AIIhcZJhkRRJiWg9G8cx2/8ntePqmsnEbq7N1XY=
+github.com/aquasecurity/libbpfgo/helpers v0.4.6-0.20230928114152-cf2d0bea778b/go.mod h1:9gxdXex9MiHcJmPEybiO3JSHnNyWe7X8FtNtHQ4Evns=
 github.com/aquasecurity/tracee/api v0.0.0-20231013014739-b32a168ee6a8 h1:NGzPDvQofEG04CoPZjSSRoFMxnSd3Brh39BY1dmdyZM=
 github.com/aquasecurity/tracee/api v0.0.0-20231013014739-b32a168ee6a8/go.mod h1:l1W65+m4KGg2i61fiPaQ/o4OQCrNtNnkPTEdysF5Zpw=
 github.com/aquasecurity/tracee/types v0.0.0-20231013014739-b32a168ee6a8 h1:vW+N1VscyPwQCbOMPUZhrWckPzX+mLiseeiskYA09NQ=
diff --git a/pkg/ebpf/c/maps.h b/pkg/ebpf/c/maps.h
index f881b4f843aa..46d3f7dd13d9 100644
--- a/pkg/ebpf/c/maps.h
+++ b/pkg/ebpf/c/maps.h
@@ -70,6 +70,7 @@ enum tail_call_id_e
     TAIL_HIDDEN_KERNEL_MODULE_KSET,
     TAIL_HIDDEN_KERNEL_MODULE_MOD_TREE,
     TAIL_HIDDEN_KERNEL_MODULE_NEW_MOD_ONLY,
+    TAIL_IO_WRITE,
     MAX_TAIL_CALL
 };

@@ -119,6 +120,8 @@ BPF_PERCPU_ARRAY(scratch_map, scratch_t, 1); // scratch sp
 BPF_LRU_HASH(file_modification_map, file_mod_key_t, int, 10240); // hold file data to decide if should submit file modification event
 BPF_LRU_HASH(io_file_path_cache_map, file_id_t, path_buf_t, 5);  // store cache for IO operations path
 BPF_LRU_HASH(elf_files_map, file_id_t, bool, 64);                // store cache for file ELF type check
+BPF_LRU_HASH(uring_poll_ctx_map, u32, event_context_t, 1024);    // store real context of io_uring polling operations
+BPF_LRU_HASH(uring_worker_ctx_map, u64, event_context_t, 1024);  // store real context for io_uring worker operations

 // clang-format on
diff --git a/pkg/ebpf/c/tracee.bpf.c b/pkg/ebpf/c/tracee.bpf.c
index 33fccddf3790..85967e1ea240 100644
--- a/pkg/ebpf/c/tracee.bpf.c
+++ b/pkg/ebpf/c/tracee.bpf.c
@@ -2839,7 +2839,7 @@ submit_magic_write(program_data_t *p, file_info_t *file_info, io_data_t io_data,
 statfunc bool should_submit_io_event(u32 event_id, program_data_t *p)
 {
     return ((event_id == VFS_READ || event_id == VFS_READV || event_id == VFS_WRITE ||
-             event_id == VFS_WRITEV || event_id == __KERNEL_WRITE) &&
+             event_id == VFS_WRITEV || event_id == __KERNEL_WRITE || event_id == IO_WRITE) &&
             should_submit(event_id, p->event));
 }

@@ -3110,6 +3110,167 @@ int BPF_KPROBE(trace_ret_kernel_write_tail)
     return capture_file_write(ctx, __KERNEL_WRITE, true);
 }

+statfunc int common_submit_io_write(program_data_t *p,
+                                    struct io_kiocb *req,
+                                    struct kiocb *kiocb,
+                                    u32 host_tid,
+                                    void *buf,
+                                    u32 len,
+                                    int ret)
+{
+    // get the write position
+    // (reusing io_kiocb struct flavors to get the correct data for the current kernel version)
+    loff_t ki_pos = kiocb->ki_pos;
+    u32 bytes_done = 0;
+    if (bpf_core_field_exists(req->cqe)) { // Version >= v5.19
+        struct io_cqe cqe = BPF_CORE_READ(req, cqe);
+        bytes_done = cqe.res;
+    } else { // Version < v5.19
+        struct io_kiocb___older_v6 *req_55 = (void *) req;
+        if (bpf_core_field_exists(req_55->result)) { // Version >= v5.3
+            bytes_done = BPF_CORE_READ(req_55, result);
+        } else { // Version >= v5.1
+            bytes_done = BPF_CORE_READ(req_55, error);
+        }
+    }
+    loff_t pos = ki_pos - bytes_done;
+
+    // get file info
+    struct file *file = kiocb->ki_filp;
+    file_info_t file_info = get_file_info(file);
+
+    save_str_to_buf(&p->event->args_buf, file_info.pathname_p, 0);
+    save_to_submit_buf(&p->event->args_buf, &pos, sizeof(loff_t), 1);
+    save_to_submit_buf(&p->event->args_buf, &buf, sizeof(void *), 2);
+    save_to_submit_buf(&p->event->args_buf, &len, sizeof(u32), 3);
+    save_to_submit_buf(&p->event->args_buf, &host_tid, sizeof(u32), 4);
+    save_to_submit_buf(&p->event->args_buf, &file_info.id.device, sizeof(dev_t), 5);
+    save_to_submit_buf(&p->event->args_buf, &file_info.id.inode, sizeof(unsigned long), 6);
+
+    return events_perf_submit(p, IO_WRITE, bytes_done);
+}
+
+SEC("kprobe/io_write")
+int BPF_KPROBE(trace_io_write)
+{
+    // this is a version of TRACE_ENT_FUNC without should_trace,
+    // so that we can obtain the correct context later.
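+    // the saved args are consumed by the matching kretprobe below, which first
+    // restores the real submitter context from uring_worker_ctx_map and only
+    // then applies the should_trace filter.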
+
+    program_data_t p = {};
+    if (!init_program_data(&p, ctx))
+        return 0;
+
+    args_t args = {};
+    args.args[0] = PT_REGS_PARM1(ctx);
+    args.args[1] = PT_REGS_PARM2(ctx);
+    args.args[2] = PT_REGS_PARM3(ctx);
+    args.args[3] = PT_REGS_PARM4(ctx);
+    args.args[4] = PT_REGS_PARM5(ctx);
+    args.args[5] = PT_REGS_PARM6(ctx);
+
+    return save_args(&args, IO_WRITE);
+}
+
+SEC("kretprobe/io_write")
+int BPF_KPROBE(trace_ret_io_write)
+{
+    args_t saved_args;
+    if (load_args(&saved_args, IO_WRITE) != 0) {
+        // missed entry or not traced
+        return 0;
+    }
+
+    program_data_t p = {};
+    if (!init_program_data(&p, ctx)) {
+        del_args(IO_WRITE);
+        return 0;
+    }
+
+    // io_uring was introduced in kernel v5.1.
+    // this check is to satisfy the verifier in older kernels.
+    if (!bpf_core_type_exists(struct io_kiocb)) {
+        del_args(IO_WRITE);
+        return 0;
+    }
+
+    bool should_submit = should_submit_io_event(IO_WRITE, &p);
+    if (!should_submit) {
+        bpf_tail_call(ctx, &prog_array, TAIL_IO_WRITE);
+        del_args(IO_WRITE);
+        return 0;
+    }
+
+    // don't proceed if the write operation wasn't successful
+    int ret = PT_REGS_RC(ctx);
+    if (ret < 0) {
+        del_args(IO_WRITE);
+        return 0;
+    }
+
+    struct io_kiocb *req = (struct io_kiocb *) saved_args.args[0];
+
+    u32 host_tid = p.task_info->context.host_tid;
+    // get the real task info from uring_worker_ctx_map
+    event_context_t *real_ctx = bpf_map_lookup_elem(&uring_worker_ctx_map, &req);
+    if (real_ctx != NULL) {
+        p.event->context = *real_ctx;
+        bpf_map_delete_elem(&uring_worker_ctx_map, &req);
+    }
+
+    // should_trace is applied only now, after the real context was obtained
+    if (!should_trace(&p)) {
+        del_args(IO_WRITE);
+        return 0;
+    }
+
+    // get write info from req
+    struct io_rw *rw = NULL;
+    struct kiocb kiocb;
+    u64 addr;
+    void *buf;
+    u32 len;
+    if (bpf_core_field_exists(req->cmd)) { // Version >= v6.0
+        struct io_cmd_data io_cmd = BPF_CORE_READ(req, cmd);
+        rw = (struct io_rw *) &io_cmd;
+        kiocb = BPF_CORE_READ(rw, kiocb);
+
+        addr = BPF_CORE_READ(rw, addr);
+        buf = (void *) addr;
+        len = BPF_CORE_READ(rw, len);
+    } else {
+        struct io_kiocb___older_v6 *req_55 = (void *) req;
+        if (bpf_core_field_exists(req_55->connect)) { // Version >= v5.5
+            rw = &req_55->rw;
+            kiocb = BPF_CORE_READ(rw, kiocb);
+
+            addr = BPF_CORE_READ(rw, addr);
+            buf = (void *) addr;
+            len = BPF_CORE_READ(rw, len);
+        } else { // Version >= v5.1
+            struct io_kiocb___older_v55 *req_51 = (void *) req_55;
+            kiocb = BPF_CORE_READ(req_51, rw);
+            struct sqe_submit submit = BPF_CORE_READ(req_51, submit);
+            const struct io_uring_sqe *sqe = submit.sqe;
+
+            addr = BPF_CORE_READ(sqe, addr);
+            buf = (void *) addr;
+            len = BPF_CORE_READ(sqe, len);
+        }
+    }
+
+    // submit event
+    common_submit_io_write(&p, req, &kiocb, host_tid, buf, len, ret);
+
+    // TODO: don't del if passing to send_bin
+    del_args(IO_WRITE);
+
+    return 0;
+}
+
+SEC("kretprobe/io_write_tail")
+int BPF_KPROBE(trace_ret_io_write_tail)
+{
+    return capture_file_write(ctx, IO_WRITE, false);
+}
+
 SEC("kprobe/vfs_read")
 TRACE_ENT_FUNC(vfs_read, VFS_READ);

@@ -4860,6 +5021,352 @@ int BPF_KPROBE(trace_ret_exec_binprm2)
     return events_perf_submit(&p, PROCESS_EXECUTION_FAILED, ret);
 }

+statfunc int common_io_uring_create(
+    program_data_t *p, struct io_ring_ctx *io_uring_ctx, u32 sq_entries, u32 cq_entries, u32 flags)
+{
+    // get the task_struct of the kernel thread if polling is used on this ring.
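+    // (with IORING_SETUP_SQPOLL the submission side runs from that kernel
+    //  thread, so its events must be attributed to the ring creator's context)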
+    struct task_struct *thread = NULL;
+    if (!bpf_core_field_exists(io_uring_ctx->sq_data)) { // Version <= v5.9
+        struct io_ring_ctx___older_v59 *io_uring_ctx_51 = (void *) io_uring_ctx;
+        thread = BPF_CORE_READ(io_uring_ctx_51, sqo_thread);
+    } else {
+        struct io_sq_data *sq_data = BPF_CORE_READ(io_uring_ctx, sq_data);
+        if (sq_data != NULL) {
+            thread = BPF_CORE_READ(sq_data, thread);
+        }
+    }
+
+    // update uring_poll_ctx_map with the real task info
+    bool polling = false;
+    u32 host_tid = p->task_info->context.host_tid;
+    if (thread != NULL) {
+        polling = true;
+        host_tid = BPF_CORE_READ(thread, pid);
+    }
+    bpf_map_update_elem(&uring_poll_ctx_map, &host_tid, &p->event->context, BPF_ANY);
+
+    if (!should_submit(IO_URING_CREATE, p->event))
+        return 0;
+
+    save_to_submit_buf(&p->event->args_buf, &io_uring_ctx, sizeof(struct io_ring_ctx *), 0);
+    save_to_submit_buf(&p->event->args_buf, &sq_entries, sizeof(u32), 1);
+    save_to_submit_buf(&p->event->args_buf, &cq_entries, sizeof(u32), 2);
+    save_to_submit_buf(&p->event->args_buf, &flags, sizeof(u32), 3);
+    save_to_submit_buf(&p->event->args_buf, &polling, sizeof(bool), 4);
+
+    return events_perf_submit(p, IO_URING_CREATE, 0);
+}
+
+SEC("raw_tracepoint/io_uring_create")
+int tracepoint__io_uring__io_uring_create(struct bpf_raw_tracepoint_args *ctx)
+{
+    program_data_t p = {};
+    if (!init_program_data(&p, ctx))
+        return 0;
+
+    if (!should_trace(&p))
+        return 0;
+
+    // io_uring was introduced in kernel v5.1.
+    // this check is to satisfy the verifier in older kernels.
+    if (!bpf_core_type_exists(struct io_kiocb)) {
+        return 0;
+    }
+
+    // this tracepoint was introduced in kernel v5.5.
+    // for older kernels, we use the io_sq_offload_start probe.
+
+    struct io_ring_ctx *io_uring_ctx = (struct io_ring_ctx *) ctx->args[1];
+    u32 sq_entries = ctx->args[2];
+    u32 cq_entries = ctx->args[3];
+    u32 flags = ctx->args[4];
+
+    return common_io_uring_create(&p, io_uring_ctx, sq_entries, cq_entries, flags);
+}
+
+SEC("kprobe/io_sq_offload_start")
+TRACE_ENT_FUNC(io_sq_offload_start, IO_URING_CREATE);
+
+SEC("kretprobe/io_sq_offload_start")
+int BPF_KPROBE(trace_ret_io_sq_offload_start)
+{
+    args_t saved_args;
+    if (load_args(&saved_args, IO_URING_CREATE) != 0) {
+        // missed entry or not traced
+        return 0;
+    }
+    del_args(IO_URING_CREATE);
+
+    program_data_t p = {};
+    if (!init_program_data(&p, ctx))
+        return 0;
+
+    if (!should_trace(&p))
+        return 0;
+
+    // io_uring was introduced in kernel v5.1.
+    // this check is to satisfy the verifier in older kernels.
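+    // (bpf_core_type_exists() is resolved at load time through CO-RE/BTF, so on
+    //  kernels without io_uring the verifier sees the code below as dead code)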
+    if (!bpf_core_type_exists(struct io_kiocb)) {
+        return 0;
+    }
+
+    if (!bpf_core_field_exists(((struct io_kiocb___older_v55 *) 0)->submit)) { // Version >= v5.5
+        // this case is handled by the io_uring_create tracepoint
+        return 0;
+    }
+
+    struct io_ring_ctx *io_uring_ctx = (struct io_ring_ctx *) saved_args.args[0];
+    struct io_uring_params *params = (struct io_uring_params *) saved_args.args[1];
+
+    u32 sq_entries = BPF_CORE_READ(params, sq_entries);
+    u32 cq_entries = BPF_CORE_READ(params, cq_entries);
+    u32 flags = BPF_CORE_READ(io_uring_ctx, flags);
+
+    return common_io_uring_create(&p, io_uring_ctx, sq_entries, cq_entries, flags);
+}
+
+statfunc int
+common_submit_io_issue_sqe(program_data_t *p, struct io_kiocb *req, u8 opcode, u64 *user_data)
+{
+    u32 flags = BPF_CORE_READ(req, flags);
+    struct io_ring_ctx *uring_ctx = BPF_CORE_READ(req, ctx);
+    u32 ctx_flags = BPF_CORE_READ(uring_ctx, flags);
+    bool sq_thread = ctx_flags & IORING_SETUP_SQPOLL;
+    struct file *file = BPF_CORE_READ(req, file);
+    if (file == NULL) {
+        // file is NULL in the first invocation of io_issue_sqe.
+        // in the second invocation file is valid, but the context is that of an
+        // async worker. we take care of the context below.
+        return 0;
+    }
+    file_info_t file_info = get_file_info(file);
+
+    u32 host_tid = p->task_info->context.host_tid;
+
+    // submit event
+    save_str_to_buf(&p->event->args_buf, file_info.pathname_p, 0);
+    save_to_submit_buf(&p->event->args_buf, &file_info.id.device, sizeof(dev_t), 1);
+    save_to_submit_buf(&p->event->args_buf, &file_info.id.inode, sizeof(unsigned long), 2);
+    save_to_submit_buf(&p->event->args_buf, &opcode, sizeof(u8), 3);
+    save_to_submit_buf(&p->event->args_buf, user_data, sizeof(u64), 4);
+    save_to_submit_buf(&p->event->args_buf, &flags, sizeof(u32), 5);
+    save_to_submit_buf(&p->event->args_buf, &sq_thread, sizeof(bool), 6);
+    save_to_submit_buf(&p->event->args_buf, &host_tid, sizeof(u32), 7);
+
+    return events_perf_submit(p, IO_ISSUE_SQE, 0);
+}
+
+SEC("kprobe/io_issue_sqe")
+int BPF_KPROBE(trace_io_issue_sqe)
+{
+    program_data_t p = {};
+    if (!init_program_data(&p, ctx))
+        return 0;
+
+    // should_trace will be called after the real context is obtained.
+
+    // io_uring was introduced in kernel v5.1.
+    // this check is to satisfy the verifier in older kernels.
+    if (!bpf_core_type_exists(struct io_kiocb)) {
+        return 0;
+    }
+
+    if (!should_submit(IO_ISSUE_SQE, p.event))
+        return 0;
+
+    struct io_kiocb___io_issue_sqe *req = (struct io_kiocb___io_issue_sqe *) PT_REGS_PARM1(ctx);
+
+    if (!bpf_core_field_exists(req->opcode)) { // Version < v5.5
+        // this kprobe only exists in kernels >= v5.5;
+        // this check is to satisfy the verifier.
+        // this case is handled by the __io_submit_sqe kprobe.
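+        // (on kernels < v5.5 the opcode lives in req->submit.opcode, which is
+        //  read in trace__io_submit_sqe below)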
+        return 0;
+    }
+
+    // get the real task info from uring_worker_ctx_map
+    event_context_t *real_ctx = bpf_map_lookup_elem(&uring_worker_ctx_map, &req);
+    if (real_ctx != NULL) {
+        p.event->context = *real_ctx;
+    }
+
+    // should_trace is applied only now, after the real context was obtained
+    if (!should_trace(&p))
+        return 0;
+
+    // v5.1 - v5.4: handled in trace__io_submit_sqe
+    // v5.5 onwards: handled in trace_io_issue_sqe
+    // args:
+    // - opcode
+    //     v5.1 - v5.4: req->submit.opcode
+    //     v5.5 - v6.6: req->opcode
+    // - user_data
+    //     v5.1 - v5.18: req->user_data
+    //     v5.19 - v6.6: req->cqe.user_data
+    // - flags
+    //     v5.1 - v6.6: req->flags
+    // - file
+    //     v5.1 - v6.6: req->file
+    // - sq_thread
+    //     v5.1 - v6.6: req->ctx->flags & IORING_SETUP_SQPOLL
+    //
+    // version detection:
+    // v5.1 - v5.4: bpf_core_field_exists(req->submit)
+    // v5.19 - v6.6: bpf_core_field_exists(req->cqe)
+
+    u8 opcode = BPF_CORE_READ(req, opcode);
+    u64 user_data;
+    if (bpf_core_field_exists(req->cqe)) {
+        struct io_cqe cqe = BPF_CORE_READ(req, cqe);
+        user_data = cqe.user_data;
+    } else {
+        user_data = BPF_CORE_READ(req, user_data);
+    }
+
+    // submit event
+    return common_submit_io_issue_sqe(&p, (struct io_kiocb *) req, opcode, &user_data);
+}
+
+SEC("kprobe/__io_submit_sqe")
+int BPF_KPROBE(trace__io_submit_sqe)
+{
+    program_data_t p = {};
+    if (!init_program_data(&p, ctx))
+        return 0;
+
+    // should_trace will be called after the real context is obtained.
+
+    // io_uring was introduced in kernel v5.1.
+    // this check is to satisfy the verifier in older kernels.
+    if (!bpf_core_type_exists(struct io_kiocb)) {
+        return 0;
+    }
+
+    struct io_kiocb___older_v55 *req = (struct io_kiocb___older_v55 *) PT_REGS_PARM2(ctx);
+
+    if (!bpf_core_field_exists(req->submit)) { // Version >= v5.5
+        // this case is handled by the io_issue_sqe and io_write kprobes
+        return 0;
+    }
+
+    // get the real task info from uring_worker_ctx_map
+    event_context_t *real_ctx = bpf_map_lookup_elem(&uring_worker_ctx_map, &req);
+    if (real_ctx != NULL) {
+        p.event->context = *real_ctx;
+    }
+
+    // should_trace is applied only now, after the real context was obtained
+    if (!should_trace(&p))
+        return 0;
+
+    u32 host_tid = p.task_info->context.host_tid;
+    struct sqe_submit submit = BPF_CORE_READ(req, submit);
+
+    if (should_submit(IO_ISSUE_SQE, p.event)) {
+        // same event args as in trace_io_issue_sqe (see the version map there);
+        // on v5.1 - v5.4 the opcode lives in req->submit.opcode
+
+        // extract args for the event
+        u8 opcode = submit.opcode;
+        u64 user_data = BPF_CORE_READ(req, user_data);
+
+        // submit io_issue_sqe
+        common_submit_io_issue_sqe(&p, (struct io_kiocb *) req, opcode, &user_data);
+
+        // do not corrupt the buffer for the io_write event
+        reset_event_args(&p);
+    }
+
+    if (should_submit(IO_WRITE, p.event)) {
+        // get write info from req
+        struct kiocb kiocb = BPF_CORE_READ(req, rw);
+        const struct io_uring_sqe *sqe = submit.sqe;
+        u64 addr = BPF_CORE_READ(sqe, addr);
+        void *buf = (void *) addr;
+        u32 len = BPF_CORE_READ(sqe, len);
+
+        // submit io_write
+        common_submit_io_write(&p, (struct io_kiocb *) req, &kiocb, host_tid, buf, len, 0);
+    }
+
+    return 0;
+}
+
+SEC("raw_tracepoint/io_uring_queue_async_work")
+int tracepoint__io_uring__io_uring_queue_async_work(struct bpf_raw_tracepoint_args *ctx)
+{
+    program_data_t p = {};
+    if (!init_program_data(&p, ctx))
+        return 0;
+
+    // should_trace will be called after the real context is obtained.
+
+    // io_uring was introduced in kernel v5.1.
+    // this check is to satisfy the verifier in older kernels.
+    if (!bpf_core_type_exists(struct io_kiocb)) {
+        return 0;
+    }
+
+    if (!should_submit(IO_WRITE, p.event))
+        return 0;
+
+    // kernel versions v5.5 - v5.17: arg 2
+    // kernel versions v5.18 - v5.19: arg 1
+    //     bpf_core_field_exists(req->msg)
+    // kernel versions v6.0 onwards: arg 0
+    //     bpf_core_field_exists(req->cmd)
+    struct io_kiocb___io_uring_queue_async_work *req;
+    if (bpf_core_field_exists(
+            ((struct io_kiocb___io_uring_queue_async_work *) 0)->cmd)) { // Version >= v6.0
+        req = (struct io_kiocb___io_uring_queue_async_work *) ctx->args[0];
+    } else if (bpf_core_field_exists(
+                   ((struct io_kiocb___io_uring_queue_async_work *) 0)->msg)) { // Version >= v5.18
+        req = (struct io_kiocb___io_uring_queue_async_work *) ctx->args[1];
+    } else { // Version >= v5.5
+        req = (struct io_kiocb___io_uring_queue_async_work *) ctx->args[2];
+    }
+
+    // get the real task info from uring_poll_ctx_map
+    u32 host_tid = p.task_info->context.host_tid;
+    event_context_t *real_ctx = bpf_map_lookup_elem(&uring_poll_ctx_map, &host_tid);
+    if (real_ctx != NULL) {
+        p.event->context = *real_ctx;
+    }
+
+    // should_trace is applied only now, after the real context was obtained
+    if (!should_trace(&p))
+        return 0;
+
+    // update uring_worker_ctx_map with the real task info
+    bpf_map_update_elem(&uring_worker_ctx_map, &req, &p.event->context, BPF_ANY);
+
+    return 0;
+}
+
 // clang-format off

 // Network Packets (works from ~5.2 and beyond)
diff --git a/pkg/ebpf/c/types.h b/pkg/ebpf/c/types.h
index d8cc3d103572..b7b224752437 100644
--- a/pkg/ebpf/c/types.h
+++ b/pkg/ebpf/c/types.h
@@ -122,6 +122,9 @@ enum event_id_e
     HIDDEN_KERNEL_MODULE_SEEKER,
     MODULE_LOAD,
     MODULE_FREE,
+    IO_URING_CREATE,
+    IO_ISSUE_SQE,
+    IO_WRITE,
     MAX_EVENT_ID,
 };

diff --git a/pkg/ebpf/c/vmlinux.h b/pkg/ebpf/c/vmlinux.h
index 80696012a909..be01f8520b17 100644
--- a/pkg/ebpf/c/vmlinux.h
+++ b/pkg/ebpf/c/vmlinux.h
@@ -425,7 +425,10 @@ struct dir_context {
 };

 struct iov_iter {
 };
+
 struct kiocb {
+    struct file *ki_filp;
+    loff_t ki_pos;
 };

 struct file_operations {
@@ -1116,6 +1119,56 @@ struct bpf_insn {
 const int TRACE_EVENT_FL_TRACEPOINT_BIT = 4;
 const int TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT);

+struct io_uring_params {
+    __u32 sq_entries;
+    __u32 cq_entries;
+};
+
+struct io_sq_data {
+    struct task_struct *thread;
+};
+
+struct io_ring_ctx {
+    unsigned int flags;
+    struct io_sq_data *sq_data;
+};
+
+struct io_cmd_data {
+    struct file *file;
+    /* each command gets 56 bytes of data */
+    __u8 data[56];
+};
+
+struct io_cqe {
+    __u64 user_data;
+    __s32 res;
+};
+
+struct io_kiocb {
+    union {
+        /*
+         * NOTE! Each of the io_kiocb union members has the file pointer
+         * as the first entry in their struct definition. So you can
+         * access the file pointer through any of the sub-structs,
+         * or directly as just 'file' in this struct.
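+         * (Only the members the probes actually read are declared here; CO-RE
+         * relocations match them to the full struct in the kernel's BTF.)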
+         */
+        struct file *file;
+        struct io_cmd_data cmd;
+    };
+    u8 opcode;
+    unsigned int flags;
+    struct io_cqe cqe;
+    struct io_ring_ctx *ctx;
+    void *async_data;
+};
+
+struct io_rw {
+    /* NOTE: kiocb has the file as the first member, so don't do it here */
+    struct kiocb kiocb;
+    u64 addr;
+    u32 len;
+};
+
 //
 // COMPLETE NETWORK TYPES
 //
diff --git a/pkg/ebpf/c/vmlinux_flavors.h b/pkg/ebpf/c/vmlinux_flavors.h
index 5f3a8c74bed1..d55f4d65d94c 100644
--- a/pkg/ebpf/c/vmlinux_flavors.h
+++ b/pkg/ebpf/c/vmlinux_flavors.h
@@ -97,6 +97,70 @@ struct module___older_v64 {

 ///////////////////

+struct io_ring_ctx___older_v59 {
+    struct task_struct *sqo_thread;
+};
+
+struct io_connect {
+};
+
+struct io_kiocb___older_v6 {
+    union {
+        struct file *file;
+        struct io_rw rw;
+        struct io_connect connect;
+    };
+    u32 result;
+    u32 error;
+};
+
+struct io_uring_sqe {
+    __u64 addr;
+    __u32 len;
+};
+
+struct sqe_submit {
+    const struct io_uring_sqe *sqe;
+    u8 opcode;
+};
+
+struct io_kiocb___older_v55 {
+    union {
+        struct file *file;
+        struct kiocb rw;
+    };
+    struct sqe_submit submit;
+    unsigned int flags;
+    u64 user_data;
+};
+
+// this flavor is a combination of variants of the
+// io_kiocb struct that is used in the io_issue_sqe probe.
+struct io_kiocb___io_issue_sqe {
+    u8 opcode;
+    u64 user_data;
+    struct io_cqe cqe;
+    union {
+        u32 cflags;
+        int fd;
+    };
+};
+
+struct io_msg {
+};
+
+// this flavor is a combination of variants of the
+// io_kiocb struct that is used in the io_uring_queue_async_work probe.
+struct io_kiocb___io_uring_queue_async_work {
+    union {
+        struct file *file;
+        struct io_msg msg;
+        struct io_cmd_data cmd;
+    };
+};
+
 #pragma clang attribute pop

 #endif
diff --git a/pkg/ebpf/c/vmlinux_missing.h b/pkg/ebpf/c/vmlinux_missing.h
index a8bc59470ac2..44657613fa50 100644
--- a/pkg/ebpf/c/vmlinux_missing.h
+++ b/pkg/ebpf/c/vmlinux_missing.h
@@ -96,6 +96,8 @@ enum perf_type_id
     PERF_TYPE_MAX, /* non-ABI */
 };

+#define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */
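+// (same value as IORING_SETUP_SQPOLL in include/uapi/linux/io_uring.h, which
+// vmlinux.h lacks because BTF does not carry UAPI defines)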
+
 /*=============================== ARCH SPECIFIC ===========================*/

 #if defined(__TARGET_ARCH_x86)
diff --git a/pkg/ebpf/probes/probe_group.go b/pkg/ebpf/probes/probe_group.go
index fa71f47cf289..c46661b40e84 100644
--- a/pkg/ebpf/probes/probe_group.go
+++ b/pkg/ebpf/probes/probe_group.go
@@ -215,6 +215,14 @@ func NewDefaultProbeGroup(module *bpf.Module, netEnabled bool) (*ProbeGroup, err
 		SignalSchedProcessFork: NewTraceProbe(RawTracepoint, "sched:sched_process_fork", "sched_process_fork_signal"),
 		SignalSchedProcessExec: NewTraceProbe(RawTracepoint, "sched:sched_process_exec", "sched_process_exec_signal"),
 		SignalSchedProcessExit: NewTraceProbe(RawTracepoint, "sched:sched_process_exit", "sched_process_exit_signal"),
+		IoUringCreate:          NewTraceProbe(RawTracepoint, "io_uring:io_uring_create", "tracepoint__io_uring__io_uring_create"),
+		IoSqOffloadStart:       NewTraceProbe(KProbe, "io_sq_offload_start", "trace_io_sq_offload_start"),
+		IoSqOffloadStartRet:    NewTraceProbe(KretProbe, "io_sq_offload_start", "trace_ret_io_sq_offload_start"),
+		IoSubmitSqe:            NewTraceProbe(KProbe, "__io_submit_sqe", "trace__io_submit_sqe"),
+		IoIssueSqe:             NewTraceProbe(KProbe, "io_issue_sqe", "trace_io_issue_sqe"),
+		IoUringQueueAsyncWork:  NewTraceProbe(RawTracepoint, "io_uring:io_uring_queue_async_work", "tracepoint__io_uring__io_uring_queue_async_work"),
+		IoWrite:                NewTraceProbe(KProbe, "io_write", "trace_io_write"),
+		IoWriteRet:             NewTraceProbe(KretProbe, "io_write", "trace_ret_io_write"),
 	}

 	if !netEnabled {
diff --git a/pkg/ebpf/probes/probes.go b/pkg/ebpf/probes/probes.go
index 4a3091605d34..33c299e40e3a 100644
--- a/pkg/ebpf/probes/probes.go
+++ b/pkg/ebpf/probes/probes.go
@@ -132,4 +132,12 @@ const (
 	SignalSchedProcessFork
 	SignalSchedProcessExec
 	SignalSchedProcessExit
+	IoUringCreate
+	IoSqOffloadStart
+	IoSqOffloadStartRet
+	IoSubmitSqe
+	IoIssueSqe
+	IoUringQueueAsyncWork
+	IoWrite
+	IoWriteRet
 )
diff --git a/pkg/events/core.go b/pkg/events/core.go
index f27490986dab..a87c28dd7a6d 100644
--- a/pkg/events/core.go
+++ b/pkg/events/core.go
@@ -104,6 +104,9 @@ const (
 	HiddenKernelModuleSeeker
 	ModuleLoad
 	ModuleFree
+	IoUringCreate
+	IoIssueSqe
+	IoWrite
 	MaxCommonID
 )
@@ -10514,12 +10517,15 @@ var CoreEvents = map[ID]Definition{
 			{handle: probes.KernelWrite, required: false},
 			{handle: probes.KernelWriteRet, required: false},
 			{handle: probes.SecurityInodeUnlink, required: false}, // Used for ELF filter
+			{handle: probes.IoWrite, required: false},
+			{handle: probes.IoWriteRet, required: false},
 		},
 		tailCalls: []TailCall{
 			{"prog_array", "trace_ret_vfs_write_tail", []uint32{TailVfsWrite}},
 			{"prog_array", "trace_ret_vfs_writev_tail", []uint32{TailVfsWritev}},
 			{"prog_array", "trace_ret_kernel_write_tail", []uint32{TailKernelWrite}},
 			{"prog_array", "send_bin", []uint32{TailSendBin}},
+			{"prog_array", "trace_ret_io_write_tail", []uint32{TailIoWrite}},
 		},
 		kSymbols: []KSymbol{
 			{symbol: "pipe_write", required: true},
@@ -11116,6 +11122,85 @@ var CoreEvents = map[ID]Definition{
 			{Type: "unsigned long", Name: "count"},
 		},
 	},
+	IoUringCreate: {
+		id:      IoUringCreate,
+		id32Bit: Sys32Undefined,
+		name:    "io_uring_create",
+		version: NewVersion(1, 0, 0),
+		sets:    []string{},
+		dependencies: Dependencies{
+			probes: []Probe{
+				{handle: probes.IoSqOffloadStart, required: false},    // exists in kernels v5.1 - v5.4
+				{handle: probes.IoSqOffloadStartRet, required: false}, // exists in kernels v5.1 - v5.4
+				{handle: probes.IoUringCreate, required: false},       // exists in kernels v5.5 onwards
+			},
+		},
+		params: []trace.ArgMeta{
+			{Type: "void*", Name: "ctx"},
+			{Type: "u32", Name: "sq_entries"},
+			{Type: "u32", Name: "cq_entries"},
+			{Type: "u32", Name: "flags"},
+			{Type: "bool", Name: "polling"},
+		},
+	},
+	IoIssueSqe: {
+		id:      IoIssueSqe,
+		id32Bit: Sys32Undefined,
+		name:    "io_issue_sqe",
+		version: NewVersion(1, 0, 0),
+		sets:    []string{},
+		dependencies: Dependencies{
+			probes: []Probe{
+				// io_uring_create probes, to get the correct context for io_uring events
+				{handle: probes.IoSqOffloadStart, required: false},    // exists in kernels v5.1 - v5.4
+				{handle: probes.IoSqOffloadStartRet, required: false}, // exists in kernels v5.1 - v5.4
+				{handle: probes.IoUringCreate, required: false},       // exists in kernels v5.5 onwards
+				// probes to tell if an io_uring task is being issued
+				{handle: probes.IoSubmitSqe, required: false}, // exists in kernels v5.1 - v5.4
+				{handle: probes.IoIssueSqe, required: false},  // exists in kernels v5.5 onwards
+			},
+		},
+		params: []trace.ArgMeta{
+			{Type: "const char*", Name: "path"},
+			{Type: "dev_t", Name: "device"},
+			{Type: "unsigned long", Name: "inode"},
+			{Type: "u8", Name: "opcode"},
+			{Type: "u64", Name: "user_data"},
+			{Type: "u32", Name: "flags"},
+			{Type: "bool", Name: "sq_thread"},
+			{Type: "u32", Name: "sq_thread_id"},
+		},
+	},
+	IoWrite: {
+		id:      IoWrite,
+		id32Bit: Sys32Undefined,
+		name:    "io_write",
+		version: NewVersion(1, 0, 0),
+		sets:    []string{},
+		dependencies: Dependencies{
+			probes: []Probe{
+				// io_uring_create probes, to get the correct context for io_uring events
+				{handle: probes.IoSqOffloadStart, required: false},    // exists in kernels v5.1 - v5.4
+				{handle: probes.IoSqOffloadStartRet, required: false}, // exists in kernels v5.1 - v5.4
+				{handle: probes.IoUringCreate, required: false},       // exists in kernels v5.5 onwards
+				// probes that populate the io_write event: the io_write kprobe fails to
+				// attach on older kernels, where __io_submit_sqe is used instead
+				{handle: probes.IoWrite, required: false},
+				{handle: probes.IoWriteRet, required: false},
+				{handle: probes.IoSubmitSqe, required: false},
+				// get the correct context if async
+				{handle: probes.IoUringQueueAsyncWork, required: false}, // this tracepoint exists from v5.5 onwards
+			},
+		},
+		params: []trace.ArgMeta{
+			{Type: "const char*", Name: "path"},
+			{Type: "long", Name: "pos"},
+			{Type: "void*", Name: "buf"},
+			{Type: "u32", Name: "len"},
+			{Type: "u32", Name: "worker_host_tid"},
+			{Type: "dev_t", Name: "device"},
+			{Type: "unsigned long", Name: "inode"},
+		},
+	},
 	//
 	// Begin of Signal Events (Control Plane)
 	//
diff --git a/pkg/events/definition_dependencies.go b/pkg/events/definition_dependencies.go
index faf3b938fa57..95e15f2aac6e 100644
--- a/pkg/events/definition_dependencies.go
+++ b/pkg/events/definition_dependencies.go
@@ -131,6 +131,7 @@ const (
 	TailHiddenKernelModuleKset
 	TailHiddenKernelModuleModTree
 	TailHiddenKernelModuleNewModOnly
+	TailIoWrite
 	MaxTail
 )
diff --git a/pkg/events/parse_args.go b/pkg/events/parse_args.go
index 550c32a7cadb..b022204678bd 100644
--- a/pkg/events/parse_args.go
+++ b/pkg/events/parse_args.go
@@ -276,6 +276,26 @@ func ParseArgs(event *trace.Event) error {
 				helpersArg.Value = parsedHelpersList
 			}
 		}
+	case IoUringCreate:
+		if flagsArg := GetArg(event, "flags"); flagsArg != nil {
+			if flags, isUint32 := flagsArg.Value.(uint32); isUint32 {
+				flagsParsed := helpers.ParseIoUringSetupFlags(uint64(flags))
+				parseOrEmptyString(flagsArg, flagsParsed, nil)
+			}
+		}
+	case IoIssueSqe:
+		if opcodeArg := GetArg(event, "opcode"); opcodeArg != nil {
+			if opcode, isUint8 := opcodeArg.Value.(uint8); isUint8 {
+				opcodeParsed, err := helpers.ParseIoUringOp(uint64(opcode))
+				parseOrEmptyString(opcodeArg, opcodeParsed, err)
+			}
+		}
+		if flagsArg := GetArg(event, "flags"); flagsArg != nil {
+			if flags, isUint32 := flagsArg.Value.(uint32); isUint32 {
+				flagsParsed := helpers.ParseIoUringRequestFlags(uint64(flags))
+				parseOrEmptyString(flagsArg, flagsParsed, nil)
+			}
+		}
 	}

 	return nil
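
Testing sketch (illustrative only, not part of this patch): a minimal liburing
writer that should trigger the three new events on a supported kernel.
io_uring_create fires when the queue is initialized; io_issue_sqe and io_write
fire when the request is submitted. liburing is assumed to be installed (build
with: gcc test_io_uring_write.c -luring); the file path and buffer are arbitrary.

#include <fcntl.h>
#include <liburing.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    struct io_uring ring;

    // ring setup invokes io_uring_setup(2) -> io_uring_create event
    if (io_uring_queue_init(8, &ring, 0) < 0)
        return 1;

    int fd = open("/tmp/io_uring_test", O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd < 0)
        return 1;

    char buf[] = "hello io_uring\n";
    struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
    if (!sqe)
        return 1;
    io_uring_prep_write(sqe, fd, buf, strlen(buf), 0);

    // submission drives io_issue_sqe -> io_write in the kernel
    io_uring_submit(&ring);

    struct io_uring_cqe *cqe;
    io_uring_wait_cqe(&ring, &cqe);
    io_uring_cqe_seen(&ring, cqe);

    close(fd);
    io_uring_queue_exit(&ring);
    return 0;
}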